From a0ddc0e3a8e53874cd5152bbfca8fd60553f50ff Mon Sep 17 00:00:00 2001 From: codesausage Date: Tue, 14 Jul 2020 09:56:39 +0800 Subject: [PATCH 01/68] add directory in model_zoo update directory and delete some add hub directory --- hub/docs/.gitkeep | 0 hub/images/.gitkeep | 0 hub/scripts/.gitkeep | 0 model_zoo/community/README.md | 0 model_zoo/official/README.md | 0 model_zoo/official/audio/.gitkeep | 0 model_zoo/official/cv/googlenet_quant/.gitkeep | 0 model_zoo/official/lite/.gitkeep | 0 model_zoo/official/nlp/.gitkeep | 0 model_zoo/official/recommend/.gitkeep | 0 model_zoo/official/utils/.gitkeep | 0 model_zoo/research/.gitkeep | 0 12 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 hub/docs/.gitkeep create mode 100644 hub/images/.gitkeep create mode 100644 hub/scripts/.gitkeep create mode 100644 model_zoo/community/README.md create mode 100644 model_zoo/official/README.md create mode 100644 model_zoo/official/audio/.gitkeep create mode 100644 model_zoo/official/cv/googlenet_quant/.gitkeep create mode 100644 model_zoo/official/lite/.gitkeep create mode 100644 model_zoo/official/nlp/.gitkeep create mode 100644 model_zoo/official/recommend/.gitkeep create mode 100644 model_zoo/official/utils/.gitkeep create mode 100644 model_zoo/research/.gitkeep diff --git a/hub/docs/.gitkeep b/hub/docs/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/hub/images/.gitkeep b/hub/images/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/hub/scripts/.gitkeep b/hub/scripts/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/community/README.md b/model_zoo/community/README.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/official/README.md b/model_zoo/official/README.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/official/audio/.gitkeep b/model_zoo/official/audio/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/official/cv/googlenet_quant/.gitkeep b/model_zoo/official/cv/googlenet_quant/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/official/lite/.gitkeep b/model_zoo/official/lite/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/official/nlp/.gitkeep b/model_zoo/official/nlp/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/official/recommend/.gitkeep b/model_zoo/official/recommend/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/official/utils/.gitkeep b/model_zoo/official/utils/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/research/.gitkeep b/model_zoo/research/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d From c772b9e291e9ff48ec750020db05f475caac0e66 Mon Sep 17 00:00:00 2001 From: liuxiao93 Date: Sat, 11 Jul 2020 15:47:03 +0800 Subject: [PATCH 02/68] Add TBE ops Tan/TruncateDiv/TruncateMod for VM. 
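Unlike FloorDiv/FloorMod, these ops round the quotient toward zero, so the
remainder keeps the sign of the dividend. A minimal NumPy sketch of the
intended semantics (illustrative only; np.fix and np.fmod are assumed
stand-ins for the TBE kernels):

    import numpy as np

    x = np.array([2, 4, -1], dtype=np.int32)
    y = np.array([3, 3, 3], dtype=np.int32)

    trunc_div = np.fix(x / y).astype(np.int32)  # [0, 1, 0]   rounds toward zero
    trunc_mod = np.fmod(x, y).astype(np.int32)  # [2, 1, -1]  sign of dividend
    floor_div = x // y                          # [0, 1, -1]  floors instead
    floor_mod = x % y                           # [2, 1, 2]   sign of divisor

The identity x == y * trunc_div + trunc_mod holds element-wise, which is what
the TruncateMod bprop below relies on.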
--- mindspore/ops/_grad/grad_math_ops.py | 44 +++++++++++ mindspore/ops/_op_impl/tbe/__init__.py | 3 + mindspore/ops/_op_impl/tbe/tan.py | 38 ++++++++++ mindspore/ops/_op_impl/tbe/truncate_div.py | 41 ++++++++++ mindspore/ops/_op_impl/tbe/truncate_mod.py | 41 ++++++++++ mindspore/ops/operations/__init__.py | 7 +- mindspore/ops/operations/math_ops.py | 87 ++++++++++++++++++++++ tests/ut/python/ops/test_ops.py | 12 +++ 8 files changed, 271 insertions(+), 2 deletions(-) create mode 100644 mindspore/ops/_op_impl/tbe/tan.py create mode 100644 mindspore/ops/_op_impl/tbe/truncate_div.py create mode 100644 mindspore/ops/_op_impl/tbe/truncate_mod.py diff --git a/mindspore/ops/_grad/grad_math_ops.py b/mindspore/ops/_grad/grad_math_ops.py index 975e9188174..ae0b06745e3 100755 --- a/mindspore/ops/_grad/grad_math_ops.py +++ b/mindspore/ops/_grad/grad_math_ops.py @@ -306,6 +306,34 @@ def get_bprop_floormod(self): return bprop +@bprop_getters.register(P.TruncateDiv) +def get_bprop_truncate_div(self): + """Grad definition for `TruncateDiv` operation.""" + div_op = P.TruncateDiv() + neg = P.Neg() + mul_op = P.Mul() + + def bprop(x, y, out, dout): + bc_x = div_op(dout, y) + bc_y = neg(mul_op(bc_x, out)) + return binop_grad_common(x, y, bc_x, bc_y) + + return bprop + + +@bprop_getters.register(P.TruncateMod) +def get_bprop_truncate_mod(self): + """Grad definition for `TruncateMod` operation.""" + div_op = P.TruncateDiv() + + def bprop(x, y, out, dout): + bc_x = dout + bc_y = -dout * div_op(x, y) + return binop_grad_common(x, y, bc_x, bc_y) + + return bprop + + @bprop_getters.register(P.Mod) def get_bprop_mod(self): """Grad definition for `Mod` operation.""" @@ -1027,6 +1055,22 @@ def get_bprop_atan(self): return bprop +@bprop_getters.register(P.Tan) +def get_bprop_tan(self): + """Grad definition for `Tan` operation.""" + reciprocal = P.Reciprocal() + square = P.Square() + cos = P.Cos() + + def bprop(x, out, dout): + cosx = cos(x) + secx2 = square(reciprocal(cosx)) + dx = secx2 * dout + return (dx,) + + return bprop + + @bprop_getters.register(P.BesselI1e) def get_bprop_bessel_i1e(self): """Generate bprop for BesselI1e""" diff --git a/mindspore/ops/_op_impl/tbe/__init__.py b/mindspore/ops/_op_impl/tbe/__init__.py index f768cfe58cf..55c6e595beb 100644 --- a/mindspore/ops/_op_impl/tbe/__init__.py +++ b/mindspore/ops/_op_impl/tbe/__init__.py @@ -132,6 +132,8 @@ from .sparse_apply_ftrl_d import _sparse_apply_ftrl_d from .sparse_apply_proximal_adagrad import _sparse_apply_proximal_adagrad from .apply_proximal_adagrad import _apply_proximal_adagrad from .transpose_d import _transpose_d_tbe +from .truncate_div import _truncate_div_tbe +from .truncate_mod import _truncate_mod_tbe from .unsorted_segment_sum import _unsorted_segment_sum_tbe from .unsorted_segment_prod import _unsorted_segment_prod_tbe from .logsoftmax_grad import _logsoftmax_grad_tbe @@ -222,6 +224,7 @@ from .binary_cross_entropy import _binary_cross_entropy_tbe from .binary_cross_entropy_grad import _binary_cross_entropy_grad_tbe from .sin import _sin_tbe from .cos import _cos_tbe +from .tan import _tan_tbe from .cum_sum import _cum_sum_tbe from .apply_rms_prop import _apply_rms_prop_tbe from .cumprod import _cumprop_tbe diff --git a/mindspore/ops/_op_impl/tbe/tan.py b/mindspore/ops/_op_impl/tbe/tan.py new file mode 100644 index 00000000000..2287e4bc07a --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/tan.py @@ -0,0 +1,38 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may 
not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""Tan op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +tan_op_info = TBERegOp("Tan") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("tan.so") \ + .compute_cost(10) \ + .kernel_name("tan") \ + .partial_flag(True) \ + .op_pattern("formatAgnostic") \ + .input(0, "x", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_None, DataType.F16_None) \ + .dtype_format(DataType.F32_None, DataType.F32_None) \ + .dtype_format(DataType.I32_None, DataType.I32_None) \ + .get_op_info() + + +@op_info_register(tan_op_info) +def _tan_tbe(): + """Tan TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/truncate_div.py b/mindspore/ops/_op_impl/tbe/truncate_div.py new file mode 100644 index 00000000000..583d96b7f31 --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/truncate_div.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""TruncateDiv op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +truncate_div_op_info = TBERegOp("TruncateDiv") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("truncate_div.so") \ + .compute_cost(10) \ + .kernel_name("truncate_div") \ + .partial_flag(True) \ + .op_pattern("broadcast") \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_None, DataType.F16_None, DataType.F16_None) \ + .dtype_format(DataType.F32_None, DataType.F32_None, DataType.F32_None) \ + .dtype_format(DataType.I32_None, DataType.I32_None, DataType.I32_None) \ + .dtype_format(DataType.I8_None, DataType.I8_None, DataType.I8_None) \ + .dtype_format(DataType.U8_None, DataType.U8_None, DataType.U8_None) \ + .get_op_info() + + +@op_info_register(truncate_div_op_info) +def _truncate_div_tbe(): + """TruncateDiv TBE register""" + return diff --git a/mindspore/ops/_op_impl/tbe/truncate_mod.py b/mindspore/ops/_op_impl/tbe/truncate_mod.py new file mode 100644 index 00000000000..b8cfa991e2e --- /dev/null +++ b/mindspore/ops/_op_impl/tbe/truncate_mod.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""TruncateMod op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +truncate_mod_op_info = TBERegOp("TruncateMod") \ + .fusion_type("ELEMWISE") \ + .async_flag(False) \ + .binfile_name("truncate_mod.so") \ + .compute_cost(10) \ + .kernel_name("truncate_mod") \ + .partial_flag(True) \ + .op_pattern("broadcast") \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_None, DataType.F16_None, DataType.F16_None) \ + .dtype_format(DataType.F32_None, DataType.F32_None, DataType.F32_None) \ + .dtype_format(DataType.I32_None, DataType.I32_None, DataType.I32_None) \ + .dtype_format(DataType.I8_None, DataType.I8_None, DataType.I8_None) \ + .dtype_format(DataType.U8_None, DataType.U8_None, DataType.U8_None) \ + .get_op_info() + + +@op_info_register(truncate_mod_op_info) +def _truncate_mod_tbe(): + """TruncateMod TBE register""" + return diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index 423ef89f928..a992c0edd52 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -52,8 +52,8 @@ from .math_ops import (Abs, ACos, Asin, Asinh, AddN, AccumulateNV2, AssignAdd, A NPUAllocFloatStatus, NPUClearFloatStatus, NPUGetFloatStatus, Pow, RealDiv, IsNan, IsInf, IsFinite, FloatStatus, Reciprocal, CumSum, HistogramFixedWidth, - Sin, Sqrt, Rsqrt, BesselI0e, BesselI1e, - Square, Sub, TensorAdd, Sign, Round, SquareSumAll, Atan, Atanh, Cosh, Sinh, Eps) + Sin, Sqrt, Rsqrt, BesselI0e, BesselI1e, TruncateDiv, TruncateMod, + Square, Sub, TensorAdd, Sign, Round, SquareSumAll, Atan, Atanh, Cosh, Sinh, Eps, Tan) from .random_ops import (RandomChoiceWithMask, Normal) from .nn_ops import (LSTM, SGD, Adam, SparseApplyAdam, SparseApplyLazyAdam, ApplyMomentum, BatchNorm, @@ -267,6 +267,8 @@ __all__ = [ 'SigmoidCrossEntropyWithLogits', 'FloorDiv', 'FloorMod', + 'TruncateDiv', + 'TruncateMod', 'Ceil', 'Acosh', 'Asinh', @@ -323,6 +325,7 @@ __all__ = [ "BesselI1e", "Atan", "Atanh", + "Tan", "BasicLSTMCell", "BroadcastTo", "DataFormatDimMap", diff --git a/mindspore/ops/operations/math_ops.py b/mindspore/ops/operations/math_ops.py index b4a684d2f77..10dc7cbeec1 100644 --- a/mindspore/ops/operations/math_ops.py +++ b/mindspore/ops/operations/math_ops.py @@ -1744,6 +1744,65 @@ class FloorDiv(_MathBinaryOp): """ +class TruncateDiv(_MathBinaryOp): + """ + Divide the first input tensor by the second input tensor element-wise for integer types, negative numbers will + round fractional quantities towards zero. + + The inputs must be two tensors or one tensor and one scalar. + When the inputs are two tensors, + both dtypes cannot be bool, and the shapes of them could be broadcast. + When the inputs are one tensor and one scalar, + the scalar only could be a constant. + + Inputs: + - **input_x** (Union[Tensor, Number, bool]) - The first input is a number or + a bool or a tensor whose data type is number or bool. 
+        - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or
+          a bool when the first input is a tensor, or a tensor whose data type is number or bool.
+
+    Outputs:
+        Tensor, the shape is the same as the shape after broadcasting,
+        and the data type is the one with the higher precision or more digits of the two inputs.
+
+    Examples:
+        >>> input_x = Tensor(np.array([2, 4, -1]), mindspore.int32)
+        >>> input_y = Tensor(np.array([3, 3, 3]), mindspore.int32)
+        >>> truncate_div = P.TruncateDiv()
+        >>> truncate_div(input_x, input_y)
+        [0, 1, 0]
+    """
+
+
+class TruncateMod(_MathBinaryOp):
+    """
+    Returns the element-wise remainder of truncated division; the result has the same sign as the dividend.
+
+    The inputs must be two tensors or one tensor and one scalar.
+    When the inputs are two tensors,
+    both dtypes cannot be bool, and their shapes can be broadcast.
+    When the inputs are one tensor and one scalar,
+    the scalar can only be a constant.
+
+    Inputs:
+        - **input_x** (Union[Tensor, Number, bool]) - The first input is a number,
+          a bool, or a tensor whose data type is number or bool.
+        - **input_y** (Union[Tensor, Number, bool]) - The second input is a number or
+          a bool when the first input is a tensor, or a tensor whose data type is number or bool.
+
+    Outputs:
+        Tensor, the shape is the same as the shape after broadcasting,
+        and the data type is the one with the higher precision or more digits of the two inputs.
+
+    Examples:
+        >>> input_x = Tensor(np.array([2, 4, -1]), mindspore.int32)
+        >>> input_y = Tensor(np.array([3, 3, 3]), mindspore.int32)
+        >>> truncate_mod = P.TruncateMod()
+        >>> truncate_mod(input_x, input_y)
+        [2, 1, -1]
+    """
+
+
 class Mod(_MathBinaryOp):
     """
     Computes the remainder of dividing the first input tensor by the second input tensor element-wise.
@@ -2870,6 +2929,34 @@ class Round(PrimitiveWithInfer):
         return x_type
 
 
+class Tan(PrimitiveWithInfer):
+    """
+    Computes tangent of `input_x` element-wise.
+
+    Inputs:
+        - **input_x** (Tensor) - The shape of tensor is :math:`(x_1, x_2, ..., x_R)`.
+
+    Outputs:
+        Tensor, has the same shape as `input_x`.
+
+    Examples:
+        >>> tan = P.Tan()
+        >>> input_x = Tensor(np.array([-1.0, 0.0, 1.0]), mindspore.float32)
+        >>> output = tan(input_x)
+    """
+
+    @prim_attr_register
+    def __init__(self):
+        """init Tan"""
+
+    def infer_shape(self, x_shape):
+        return x_shape
+
+    def infer_dtype(self, x_type):
+        validator.check_tensor_type_same({'x': x_type}, mstype.number_type, self.name)
+        return x_type
+
+
 class Atan(PrimitiveWithInfer):
     """
     Computes the trigonometric inverse tangent of x element-wise.
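A quick numeric check of the Tan bprop registered in grad_math_ops.py above,
which uses the identity d(tan x)/dx = sec^2(x) = 1/cos^2(x) (NumPy sketch,
illustrative only, not part of the patch):

    import numpy as np

    x = np.array([-1.0, 0.0, 1.0])
    eps = 1e-6
    # central finite difference of tan
    numeric = (np.tan(x + eps) - np.tan(x - eps)) / (2 * eps)
    # analytic gradient, matching get_bprop_tan: dx = dout / cos(x)^2
    analytic = 1.0 / np.cos(x) ** 2
    assert np.allclose(numeric, analytic, atol=1e-4)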
diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py index 31d89f0e42c..7bc6ee83eee 100755 --- a/tests/ut/python/ops/test_ops.py +++ b/tests/ut/python/ops/test_ops.py @@ -768,6 +768,10 @@ test_case_math_ops = [ 'block': P.Asinh(), 'desc_inputs': [[3, 4, 5]], 'desc_bprop': [[3, 4, 5]]}), + ('Tan', { + 'block': P.Tan(), + 'desc_inputs': [[2, 3]], + 'desc_bprop': [[2, 3]]}), ('Reciprocal', { 'block': P.Reciprocal(), 'desc_inputs': [[2, 3, 3, 5]], @@ -850,6 +854,14 @@ test_case_math_ops = [ 'block': P.FloorMod(), 'desc_inputs': [[3, 4, 5], [2, 3, 4, 5]], 'desc_bprop': [[2, 3, 4, 5]]}), + ('TruncateDiv', { + 'block': P.TruncateDiv(), + 'desc_inputs': [[3, 4, 5], [2, 3, 4, 5]], + 'desc_bprop': [[2, 3, 4, 5]]}), + ('TruncateMod', { + 'block': P.TruncateMod(), + 'desc_inputs': [[3, 4, 5], [2, 3, 4, 5]], + 'desc_bprop': [[2, 3, 4, 5]]}), ('identity', { 'block': ops.functional.identity, 'desc_inputs': [[2, 2]], From 3f03480efe345992f8bacfa33f16310fed9da1e9 Mon Sep 17 00:00:00 2001 From: caifubi Date: Wed, 15 Jul 2020 16:14:03 +0800 Subject: [PATCH 03/68] fix dump device address cannot access --- mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc | 6 +++--- mindspore/ccsrc/runtime/device/device_address.h | 1 + mindspore/core/ir/device_sync.h | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc index ab2c6b27486..ca9a74022ac 100644 --- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc +++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc @@ -107,9 +107,9 @@ void DataDumper::SetOpMappingInfo(NotNull dump_inf MS_EXCEPTION_IF_NULL(currnet_epoch_tensor->device_address()); MS_EXCEPTION_IF_NULL(steps_per_epoch_tensor->device_address()); - void *current_step = current_step_tensor->device_address()->ptr_; - void *current_epoch = currnet_epoch_tensor->device_address()->ptr_; - void *steps_per_epoch = steps_per_epoch_tensor->device_address()->ptr_; + void *current_step = current_step_tensor->device_address()->GetMutablePtr(); + void *current_epoch = currnet_epoch_tensor->device_address()->GetMutablePtr(); + void *steps_per_epoch = steps_per_epoch_tensor->device_address()->GetMutablePtr(); if (current_epoch != nullptr && current_step != nullptr && steps_per_epoch != nullptr) { dump_info->set_step_id_addr(reinterpret_cast(current_epoch)); diff --git a/mindspore/ccsrc/runtime/device/device_address.h b/mindspore/ccsrc/runtime/device/device_address.h index 32f5fcced9e..fb3cf63488d 100644 --- a/mindspore/ccsrc/runtime/device/device_address.h +++ b/mindspore/ccsrc/runtime/device/device_address.h @@ -65,6 +65,7 @@ class DeviceAddress : public mindspore::DeviceSync { virtual void set_status(DeviceAddressStatus status) {} virtual DeviceAddressStatus status() const { return DeviceAddressStatus::kInDevice; } virtual DeviceAddressType DeviceType() const { return DeviceAddressType::kUnknown; } + void *GetMutablePtr() const override { return ptr_; } protected: const void *ptr() const { return ptr_; } diff --git a/mindspore/core/ir/device_sync.h b/mindspore/core/ir/device_sync.h index a6bbe92233a..708b6b0e565 100644 --- a/mindspore/core/ir/device_sync.h +++ b/mindspore/core/ir/device_sync.h @@ -32,6 +32,7 @@ class DeviceSync { virtual bool SyncDeviceToHost(const std::vector &shape, size_t size, TypeId type, void *host_ptr) const = 0; virtual bool SyncHostToDevice(const std::vector &shape, size_t size, TypeId type, const void *host_ptr) const = 
0; + virtual void *GetMutablePtr() const = 0; }; using DeviceSyncPtr = std::shared_ptr; } // namespace mindspore From 072b09b3fd1e8ed1422fd6408eed66cc5a646235 Mon Sep 17 00:00:00 2001 From: VectorSL Date: Wed, 15 Jul 2020 16:22:17 +0800 Subject: [PATCH 04/68] gpu add fusion: 1 replace bn cast 2 replace addn by tensoradd --- .../ccsrc/backend/optimizer/gpu/adam_fusion.h | 6 +- .../optimizer/gpu/adam_weight_decay_fusion.h | 6 +- .../optimizer/gpu/replace_addn_fusion.cc | 65 +++++++++++++ .../optimizer/gpu/replace_addn_fusion.h | 40 ++++++++ .../optimizer/gpu/replace_bn_cast_fusion.cc | 92 +++++++++++++++++++ .../optimizer/gpu/replace_bn_cast_fusion.h | 58 ++++++++++++ 6 files changed, 261 insertions(+), 6 deletions(-) create mode 100644 mindspore/ccsrc/backend/optimizer/gpu/replace_addn_fusion.cc create mode 100644 mindspore/ccsrc/backend/optimizer/gpu/replace_addn_fusion.h create mode 100644 mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.cc create mode 100644 mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.h diff --git a/mindspore/ccsrc/backend/optimizer/gpu/adam_fusion.h b/mindspore/ccsrc/backend/optimizer/gpu/adam_fusion.h index f87defc04ca..1fa339c3f3e 100644 --- a/mindspore/ccsrc/backend/optimizer/gpu/adam_fusion.h +++ b/mindspore/ccsrc/backend/optimizer/gpu/adam_fusion.h @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ -#define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_ #include #include "backend/optimizer/common/optimizer.h" @@ -53,4 +53,4 @@ class AdamFusion : public PatternProcessPass { }; } // namespace opt } // namespace mindspore -#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_ diff --git a/mindspore/ccsrc/backend/optimizer/gpu/adam_weight_decay_fusion.h b/mindspore/ccsrc/backend/optimizer/gpu/adam_weight_decay_fusion.h index 53477ec898c..015ce632061 100644 --- a/mindspore/ccsrc/backend/optimizer/gpu/adam_weight_decay_fusion.h +++ b/mindspore/ccsrc/backend/optimizer/gpu/adam_weight_decay_fusion.h @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ -#define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_ #include #include "backend/optimizer/common/optimizer.h" @@ -55,4 +55,4 @@ class AdamWeightDecayFusion : public PatternProcessPass { }; } // namespace opt } // namespace mindspore -#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_ diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_addn_fusion.cc b/mindspore/ccsrc/backend/optimizer/gpu/replace_addn_fusion.cc new file mode 100644 index 00000000000..575a01cc24b --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_addn_fusion.cc @@ -0,0 +1,65 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "backend/optimizer/gpu/replace_addn_fusion.h" + +#include +#include +#include + +#include "backend/session/anf_runtime_algorithm.h" +#include "ir/primitive.h" +#include "utils/utils.h" +#include "backend/optimizer/common/helper.h" + +namespace mindspore { +namespace opt { +const BaseRef ReplaceAddNFusion::DefinePattern() const { + VectorRef addn = VectorRef({prim::kPrimAddN, A, B}); + return addn; +} + +const AnfNodePtr ReplaceAddNFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, + const EquivPtr &equiv) const { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(equiv); + + auto A = AnfAlgo::GetInputNode(utils::cast(node), 0); + auto B = AnfAlgo::GetInputNode(utils::cast(node), 1); + MS_EXCEPTION_IF_NULL(A); + MS_EXCEPTION_IF_NULL(B); + int num_input = AnfAlgo::GetNodeAttr(node, "n"); + + if (num_input == 2) { + auto prim = std::make_shared(prim::kPrimTensorAdd->name()); + MS_EXCEPTION_IF_NULL(prim); + std::vector inputs = {NewValueNode(prim), A, B}; + auto add_new = graph->NewCNode(inputs); + std::vector outputs_type; + std::vector> outputs_shape; + outputs_type.push_back(AnfAlgo::GetOutputInferDataType(A, 0)); + outputs_shape.push_back(AnfAlgo::GetOutputInferShape(A, 0)); + AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, add_new.get()); + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + manager->Replace(utils::cast(node), utils::cast(add_new)); + return add_new; + } else { + return nullptr; + } +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_addn_fusion.h b/mindspore/ccsrc/backend/optimizer/gpu/replace_addn_fusion.h new file mode 100644 index 00000000000..d83da2b0676 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_addn_fusion.h @@ -0,0 +1,40 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not 
use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_ + +#include +#include "backend/optimizer/common/optimizer.h" + +namespace mindspore { +namespace opt { +class ReplaceAddNFusion : public PatternProcessPass { + public: + explicit ReplaceAddNFusion(bool multigraph = true) : PatternProcessPass("replace_addn", multigraph) { + A = std::make_shared(); + B = std::make_shared(); + } + ~ReplaceAddNFusion() override = default; + const BaseRef DefinePattern() const override; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + + private: + VarPtr A; + VarPtr B; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_ diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.cc b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.cc new file mode 100644 index 00000000000..8e90f044fad --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.cc @@ -0,0 +1,92 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "backend/optimizer/gpu/replace_bn_cast_fusion.h" + +#include +#include +#include + +#include "backend/session/anf_runtime_algorithm.h" +#include "ir/primitive.h" +#include "utils/utils.h" +#include "backend/optimizer/common/helper.h" + +namespace mindspore { +namespace opt { +const BaseRef ReplaceBNCastFusion::DefinePattern() const { + VectorRef in_cast = VectorRef({prim::kPrimCast, x_}); + VectorRef fbn2 = VectorRef({prim::kPrimFusedBatchNorm, in_cast, scale_, bias_, mean_, var_}); + VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2, index_}); + VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget}); + return out_cast; +} + +const AnfNodePtr ReplaceBNCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, + const EquivPtr &equiv) const { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(equiv); + + auto tuple = AnfAlgo::GetInputNode(utils::cast(node), 0); + auto index_node = AnfAlgo::GetInputNode(utils::cast(tuple), 1); + MS_EXCEPTION_IF_NULL(index_node); + auto value_node = index_node->cast(); + MS_EXCEPTION_IF_NULL(value_node); + int item_idx = GetValue(value_node->value()); + + auto fbn2 = AnfAlgo::GetInputNode(utils::cast(tuple), 0); + auto x_after = AnfAlgo::GetInputNode(utils::cast(fbn2), 0); + auto x_before = AnfAlgo::GetInputNode(utils::cast(x_after), 0); + if (item_idx != 0) { + return nullptr; + } + auto scale = AnfAlgo::GetInputNode(utils::cast(fbn2), 1); + auto bias = AnfAlgo::GetInputNode(utils::cast(fbn2), 2); + auto mean = AnfAlgo::GetInputNode(utils::cast(fbn2), 3); + auto var = AnfAlgo::GetInputNode(utils::cast(fbn2), 4); + + MS_EXCEPTION_IF_NULL(fbn2); + MS_EXCEPTION_IF_NULL(x_after); + MS_EXCEPTION_IF_NULL(x_before); + MS_EXCEPTION_IF_NULL(scale); + MS_EXCEPTION_IF_NULL(bias); + MS_EXCEPTION_IF_NULL(mean); + MS_EXCEPTION_IF_NULL(var); + + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + manager->Replace(utils::cast(x_after), utils::cast(x_before)); + manager->Replace(utils::cast(node), utils::cast(tuple)); + + std::vector outputs_type; + std::vector> outputs_shape; + auto output_num = AnfAlgo::GetOutputTensorNum(fbn2); + for (size_t i = 0; i < output_num; i++) { + outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2, i)); + outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2, i)); + } + outputs_type[0] = kNumberTypeFloat16; + AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2.get()); + + outputs_type.clear(); + outputs_shape.clear(); + outputs_type.push_back(kNumberTypeFloat16); + outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0)); + AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get()); + return tuple; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.h b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.h new file mode 100644 index 00000000000..6b1e2ad7b12 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.h @@ -0,0 +1,58 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_ + +#include +#include "backend/optimizer/common/optimizer.h" + +namespace mindspore { +namespace opt { +class ReplaceBNCastFusion : public PatternProcessPass { + public: + explicit ReplaceBNCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_cast", multigraph) { + x_ = std::make_shared(); + scale_ = std::make_shared(); + bias_ = std::make_shared(); + mean_ = std::make_shared(); + var_ = std::make_shared(); + y_ = std::make_shared(); + running_mean_ = std::make_shared(); + running_var_ = std::make_shared(); + save_mean_ = std::make_shared(); + save_var_ = std::make_shared(); + index_ = std::make_shared(); + } + ~ReplaceBNCastFusion() override = default; + const BaseRef DefinePattern() const override; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + + private: + VarPtr x_; + VarPtr scale_; + VarPtr bias_; + VarPtr mean_; + VarPtr var_; + VarPtr y_; + VarPtr running_mean_; + VarPtr running_var_; + VarPtr save_mean_; + VarPtr save_var_; + VarPtr index_; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_ From 4cf7faeae6fbbe782f1fb8feb56c3802fa2fe364 Mon Sep 17 00:00:00 2001 From: VectorSL Date: Wed, 15 Jul 2020 16:26:24 +0800 Subject: [PATCH 05/68] gpu add fusion: replace batchnorm grad cast --- .../gpu/replace_bn_grad_cast2_fusion.cc | 88 ++++++++++++++++++ .../gpu/replace_bn_grad_cast2_fusion.h | 54 +++++++++++ .../gpu/replace_bn_grad_cast_fusion.cc | 91 +++++++++++++++++++ .../gpu/replace_bn_grad_cast_fusion.h | 54 +++++++++++ 4 files changed, 287 insertions(+) create mode 100644 mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.cc create mode 100644 mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h create mode 100644 mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.cc create mode 100644 mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.h diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.cc b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.cc new file mode 100644 index 00000000000..a5b2a92bb74 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.cc @@ -0,0 +1,88 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h" + +#include +#include +#include + +#include "backend/session/anf_runtime_algorithm.h" +#include "ir/primitive.h" +#include "utils/utils.h" +#include "backend/optimizer/common/helper.h" + +namespace mindspore { +namespace opt { +const BaseRef ReplaceBNGradCast2Fusion::DefinePattern() const { + VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_, x_, scale_, mean_, var_}); + VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_}); + VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget}); + return out_cast; +} + +const AnfNodePtr ReplaceBNGradCast2Fusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, + const EquivPtr &equiv) const { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(equiv); + auto tuple = AnfAlgo::GetInputNode(utils::cast(node), 0); + auto index_node = AnfAlgo::GetInputNode(utils::cast(tuple), 1); + MS_EXCEPTION_IF_NULL(index_node); + auto value_node = index_node->cast(); + MS_EXCEPTION_IF_NULL(value_node); + int item_idx = GetValue(value_node->value()); + if (item_idx != 0) { + return nullptr; + } + auto fbn2g = AnfAlgo::GetInputNode(utils::cast(tuple), 0); + + auto dy_ = AnfAlgo::GetInputNode(utils::cast(fbn2g), 0); + auto x_ = AnfAlgo::GetInputNode(utils::cast(fbn2g), 1); + + auto scale = AnfAlgo::GetInputNode(utils::cast(fbn2g), 2); + auto mean = AnfAlgo::GetInputNode(utils::cast(fbn2g), 3); + auto var = AnfAlgo::GetInputNode(utils::cast(fbn2g), 4); + + MS_EXCEPTION_IF_NULL(fbn2g); + MS_EXCEPTION_IF_NULL(dy_); + MS_EXCEPTION_IF_NULL(scale); + MS_EXCEPTION_IF_NULL(x_); + MS_EXCEPTION_IF_NULL(mean); + MS_EXCEPTION_IF_NULL(var); + + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + manager->Replace(utils::cast(node), utils::cast(tuple)); + std::vector outputs_type; + std::vector> outputs_shape; + auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g); + for (size_t i = 0; i < output_num; i++) { + outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i)); + outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i)); + } + outputs_type[0] = AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0); + AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get()); + + outputs_type.clear(); + outputs_shape.clear(); + outputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0)); + outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0)); + AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get()); + + return tuple; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h new file mode 100644 index 00000000000..fcb56be7123 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_ + +#include +#include "backend/optimizer/common/optimizer.h" + +namespace mindspore { +namespace opt { +class ReplaceBNGradCast2Fusion : public PatternProcessPass { + public: + explicit ReplaceBNGradCast2Fusion(bool multigraph = true) : PatternProcessPass("replace_grad_cast2", multigraph) { + dy_ = std::make_shared(); + x_ = std::make_shared(); + scale_ = std::make_shared(); + mean_ = std::make_shared(); + var_ = std::make_shared(); + dx_ = std::make_shared(); + bn_scale_ = std::make_shared(); + bn_bias_ = std::make_shared(); + index_ = std::make_shared(); + } + ~ReplaceBNGradCast2Fusion() override = default; + const BaseRef DefinePattern() const override; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + + private: + VarPtr dy_; + VarPtr x_; + VarPtr scale_; + VarPtr mean_; + VarPtr var_; + VarPtr dx_; + VarPtr bn_scale_; + VarPtr bn_bias_; + VarPtr index_; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_ diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.cc b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.cc new file mode 100644 index 00000000000..9dba16bf860 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.cc @@ -0,0 +1,91 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h" + +#include +#include +#include + +#include "backend/session/anf_runtime_algorithm.h" +#include "ir/primitive.h" +#include "utils/utils.h" +#include "backend/optimizer/common/helper.h" + +namespace mindspore { +namespace opt { +const BaseRef ReplaceBNGradCastFusion::DefinePattern() const { + VectorRef dy_cast = VectorRef({prim::kPrimCast, dy_}); + VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_cast, x_, scale_, mean_, var_}); + VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_}); + VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget}); + return out_cast; +} + +const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, + const EquivPtr &equiv) const { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(equiv); + + auto tuple = AnfAlgo::GetInputNode(utils::cast(node), 0); + auto index_node = AnfAlgo::GetInputNode(utils::cast(tuple), 1); + MS_EXCEPTION_IF_NULL(index_node); + auto value_node = index_node->cast(); + MS_EXCEPTION_IF_NULL(value_node); + int item_idx = GetValue(value_node->value()); + if (item_idx != 0) { + return nullptr; + } + auto fbn2g = AnfAlgo::GetInputNode(utils::cast(tuple), 0); + + auto dy_after = AnfAlgo::GetInputNode(utils::cast(fbn2g), 0); + auto dy_before = AnfAlgo::GetInputNode(utils::cast(dy_after), 0); + auto x_ = AnfAlgo::GetInputNode(utils::cast(fbn2g), 1); + + auto scale = AnfAlgo::GetInputNode(utils::cast(fbn2g), 2); + auto mean = AnfAlgo::GetInputNode(utils::cast(fbn2g), 3); + auto var = AnfAlgo::GetInputNode(utils::cast(fbn2g), 4); + + MS_EXCEPTION_IF_NULL(fbn2g); + MS_EXCEPTION_IF_NULL(dy_after); + MS_EXCEPTION_IF_NULL(dy_before); + MS_EXCEPTION_IF_NULL(scale); + MS_EXCEPTION_IF_NULL(x_); + MS_EXCEPTION_IF_NULL(mean); + MS_EXCEPTION_IF_NULL(var); + + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + manager->Replace(utils::cast(dy_after), utils::cast(dy_before)); + manager->Replace(utils::cast(node), utils::cast(tuple)); + std::vector outputs_type; + std::vector> outputs_shape; + auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g); + for (size_t i = 0; i < output_num; i++) { + outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i)); + outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i)); + } + outputs_type[0] = kNumberTypeFloat16; + AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get()); + outputs_type.clear(); + outputs_shape.clear(); + outputs_type.push_back(kNumberTypeFloat16); + outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0)); + AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get()); + return tuple; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.h b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.h new file mode 100644 index 00000000000..b937aa25bf6 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.h @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_ + +#include +#include "backend/optimizer/common/optimizer.h" + +namespace mindspore { +namespace opt { +class ReplaceBNGradCastFusion : public PatternProcessPass { + public: + explicit ReplaceBNGradCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_grad_cast", multigraph) { + dy_ = std::make_shared(); + x_ = std::make_shared(); + scale_ = std::make_shared(); + mean_ = std::make_shared(); + var_ = std::make_shared(); + dx_ = std::make_shared(); + bn_scale_ = std::make_shared(); + bn_bias_ = std::make_shared(); + index_ = std::make_shared(); + } + ~ReplaceBNGradCastFusion() override = default; + const BaseRef DefinePattern() const override; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + + private: + VarPtr dy_; + VarPtr x_; + VarPtr scale_; + VarPtr mean_; + VarPtr var_; + VarPtr dx_; + VarPtr bn_scale_; + VarPtr bn_bias_; + VarPtr index_; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_ From dfb958de1e575f6fc81ec69b62fdcd421d40c995 Mon Sep 17 00:00:00 2001 From: wilfChen Date: Wed, 15 Jul 2020 16:44:31 +0800 Subject: [PATCH 06/68] Gpu support BroadcastTo kernel --- .../gpu/arrays/broadcast_to_gpu_kernel.cc | 26 ++++++ .../gpu/arrays/broadcast_to_gpu_kernel.h | 83 +++++++++++++++++++ .../gpu/cuda_impl/broadcast_impl.cu | 40 +++++++-- .../gpu/cuda_impl/broadcast_impl.cuh | 4 + tests/st/ops/gpu/test_broadcast_to_ops.py | 40 +++++++++ 5 files changed, 187 insertions(+), 6 deletions(-) create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.cc create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h create mode 100644 tests/st/ops/gpu/test_broadcast_to_ops.py diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.cc new file mode 100644 index 00000000000..96e82bc5f3d --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.cc @@ -0,0 +1,26 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h" + +namespace mindspore { +namespace kernel { +MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + BroadcastToGpuKernel, float) +MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + BroadcastToGpuKernel, half) +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h new file mode 100644 index 00000000000..459471ed763 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h @@ -0,0 +1,83 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_ + +#include +#include "backend/kernel_compiler/gpu/gpu_kernel.h" +#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" +#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh" + +namespace mindspore { +namespace kernel { +template +class BroadcastToGpuKernel : public GpuKernel { + public: + BroadcastToGpuKernel() {} + ~BroadcastToGpuKernel() = default; + + const std::vector &GetInputSizeList() const override { return input_size_list_; } + const std::vector &GetOutputSizeList() const override { return output_size_list_; } + const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } + + bool Launch(const std::vector &inputs, const std::vector &, + const std::vector &outputs, void *stream_ptr) override { + T *input_addr = GetDeviceAddress(inputs, 0); + T *output_addr = GetDeviceAddress(outputs, 0); + + BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], output_shape_[0], output_shape_[1], + output_shape_[2], output_shape_[3], input_addr, output_addr, + reinterpret_cast(stream_ptr)); + return true; + } + bool Init(const CNodePtr &kernel_node) override { + auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0); + if (input_shapes.size() > 4 || output_shapes.size() > 4) { + MS_LOG(EXCEPTION) << "BroadcastTo operation not support dim greater than 4"; + } + + for (int i = input_shapes.size() - 1; i >= 0; i--) { + input_shape_[i] = input_shapes[i]; + } + + for (int j = output_shapes.size() - 1; j >= 0; j--) { + output_shape_[j] = output_shapes[j]; + } + + InitSizeLists(); + return true; + } + + protected: + void InitSizeLists() override { + input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T)); + output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T)); + } + + private: + int input_shape_[4] = {1, 
1, 1, 1}; + int output_shape_[4] = {1, 1, 1, 1}; + + std::vector input_size_list_; + std::vector output_size_list_; + std::vector workspace_size_list_; +}; +} // namespace kernel +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu index a72daa42346..f5c88e7ebfc 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu @@ -116,16 +116,16 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const output); case BROADCAST_TYPE_REALDIV: return BroadcastOperator>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, - output); + output); case BROADCAST_TYPE_MUL: return BroadcastOperator>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, - output); + output); case BROADCAST_TYPE_SUB: return BroadcastOperator>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, - output); + output); case BROADCAST_TYPE_ADD: return BroadcastOperator>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, - output); + output); } } @@ -176,6 +176,28 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons NoBroadcastKernel<<>>(nums, op, input0, input1, output); } +template +__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, + const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) { + int i = pos / (o1 * o2 * o3) % o0; + int j = pos / (o2 * o3) % o1; + int k = pos / o3 % o2; + int l = pos % o3; + + int input_idx = Index(i, i0) * i1 * i2 * i3 + Index(j, i1) * i2 * i3 + Index(k, i2) * i3 + Index(l, i3); + output_addr[pos] = input_addr[input_idx]; + } +} + +template +void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, + const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) { + int nums = o0 * o1 * o2 * o3; + BroadcastToKernel<<>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr, + output_addr); +} + template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, enum BroadcastOpType op, const float *input0, const float *input1, bool *output, @@ -204,5 +226,11 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half * bool *output, cudaStream_t stream); template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1, half *output, cudaStream_t stream); -template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, - int *output, cudaStream_t stream); +template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output, + cudaStream_t stream); + +template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, + const int &o2, const int &o3, const float *input_addr, float *output_addr, + cudaStream_t stream); +template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, 
const int &o1, + const int &o2, const int &o3, const half *input_addr, half *output_addr, cudaStream_t stream); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh index dfc4c75c932..62a3baad0e6 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh @@ -41,4 +41,8 @@ template void NoBroadcast(const int &size, enum BroadcastOpType op, const T *input0, const T *input1, S *output, cudaStream_t stream); +template +void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, + const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream); + #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_ diff --git a/tests/st/ops/gpu/test_broadcast_to_ops.py b/tests/st/ops/gpu/test_broadcast_to_ops.py new file mode 100644 index 00000000000..828e72c4d00 --- /dev/null +++ b/tests/st/ops/gpu/test_broadcast_to_ops.py @@ -0,0 +1,40 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import numpy as np +import pytest + +import mindspore.context as context +from mindspore.common.tensor import Tensor +from mindspore.ops import operations as P + + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_broadcast(): + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + + x_np = np.random.rand(3, 1, 5, 1).astype(np.float32) + shape = (3, 4, 5, 6) + + output = P.BroadcastTo(shape)(Tensor(x_np)) + expect = np.broadcast_to(x_np, shape) + assert np.allclose(output.asnumpy(), expect) + + x1_np = np.random.rand(3, 1, 5, 1).astype(np.float16) + output = P.BroadcastTo(shape)(Tensor(x1_np)) + expect = np.broadcast_to(x1_np, shape) + assert np.allclose(output.asnumpy(), expect) From 380a57f3c1b86df4d7a9249c55c16dba9bc043f4 Mon Sep 17 00:00:00 2001 From: liuxiao93 Date: Wed, 15 Jul 2020 14:58:50 +0800 Subject: [PATCH 07/68] Adapt ApplyCenteredRmsProp. 
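For reference, this op implements the centered variant of RMSProp, and the
TBE registration below now also returns the updated accumulators (var, mg,
ms, mom). A hedged single-step NumPy sketch of the update rule (the helper
name is ours, for illustration only):

    import numpy as np

    def centered_rms_prop_step(var, mg, ms, mom, grad, lr, rho, momentum, epsilon):
        mg = rho * mg + (1 - rho) * grad         # running mean of gradients
        ms = rho * ms + (1 - rho) * grad * grad  # running mean of squared gradients
        mom = momentum * mom + lr * grad / np.sqrt(ms - mg * mg + epsilon)
        var = var - mom
        return var, mg, ms, mom                  # the four outputs now registered

Centering subtracts mg^2 so the denominator estimates the gradient's variance
rather than its raw second moment.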
--- .../kernel_compiler/tbe/tbe_adapter.cc | 1 + mindspore/ccsrc/transform/graph_ir/convert.cc | 2 +- .../ccsrc/transform/graph_ir/op_declare.cc | 13 +++---- .../ccsrc/transform/graph_ir/op_declare.h | 4 +-- .../_op_impl/tbe/apply_centered_rms_prop.py | 35 ++++++++++++------- mindspore/ops/operations/nn_ops.py | 5 +++ 6 files changed, 39 insertions(+), 21 deletions(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.cc index 449a9f45564..e24663fb10a 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_adapter.cc @@ -81,6 +81,7 @@ static std::map tbe_func_adapter_map = { {"sparse_apply_proximal_adagrad", "sparse_apply_proximal_adagrad_d"}, {"apply_add_sign", "apply_add_sign_d"}, {"apply_power_sign", "apply_power_sign_d"}, + {"apply_centered_rms_prop", "apply_centered_rms_prop_d"}, {"transpose", "transpose_d"}, {"fill", "fill_d"}, {"unsorted_segment_sum", "unsorted_segment_sum_d"}, diff --git a/mindspore/ccsrc/transform/graph_ir/convert.cc b/mindspore/ccsrc/transform/graph_ir/convert.cc index 7419dd2cc98..56028bbdd90 100644 --- a/mindspore/ccsrc/transform/graph_ir/convert.cc +++ b/mindspore/ccsrc/transform/graph_ir/convert.cc @@ -409,7 +409,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma {string(kNameBatchToSpace), ADPT_DESC(BatchToSpaceD)}, {string(kNameAtan2), ADPT_DESC(Atan2)}, {string(kNameApplyRMSProp), ADPT_DESC(ApplyRMSPropD)}, - {string(kNameApplyCenteredRMSProp), ADPT_DESC(ApplyCenteredRMSProp)}, + {string(kNameApplyCenteredRMSProp), ADPT_DESC(ApplyCenteredRMSPropD)}, {string(kNameL2Loss), ADPT_DESC(L2Loss)}, {string(kNameCTCLoss), ADPT_DESC(CTCLoss)}, {string(kNameRange), ADPT_DESC(RangeD)}, diff --git a/mindspore/ccsrc/transform/graph_ir/op_declare.cc b/mindspore/ccsrc/transform/graph_ir/op_declare.cc index e3751e0c925..939e5feba18 100644 --- a/mindspore/ccsrc/transform/graph_ir/op_declare.cc +++ b/mindspore/ccsrc/transform/graph_ir/op_declare.cc @@ -1284,12 +1284,13 @@ INPUT_ATTR_MAP(ApplyRMSPropD) = {{6, ATTR_DESC(rho, AnyTraits())}, ATTR_MAP(ApplyRMSPropD) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; OUTPUT_MAP(ApplyRMSPropD) = {{0, OUTPUT_DESC(var)}}; -// ApplyCenteredRMSProp -INPUT_MAP(ApplyCenteredRMSProp) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(mg)}, {3, INPUT_DESC(ms)}, - {4, INPUT_DESC(mom)}, {5, INPUT_DESC(grad)}, {6, INPUT_DESC(lr)}, - {7, INPUT_DESC(rho)}, {8, INPUT_DESC(momentum)}, {9, INPUT_DESC(epsilon)}}; -ATTR_MAP(ApplyCenteredRMSProp) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; -OUTPUT_MAP(ApplyCenteredRMSProp) = {{0, OUTPUT_DESC(var)}}; +// ApplyCenteredRMSPropD +INPUT_MAP(ApplyCenteredRMSPropD) = {{1, INPUT_DESC(var)}, {2, INPUT_DESC(mg)}, {3, INPUT_DESC(ms)}, + {4, INPUT_DESC(mom)}, {5, INPUT_DESC(grad)}, {6, INPUT_DESC(lr)}, + {7, INPUT_DESC(rho)}, {8, INPUT_DESC(momentum)}, {9, INPUT_DESC(epsilon)}}; +ATTR_MAP(ApplyCenteredRMSPropD) = {{"use_locking", ATTR_DESC(use_locking, AnyTraits())}}; +OUTPUT_MAP(ApplyCenteredRMSPropD) = { + {0, OUTPUT_DESC(var)}, {1, OUTPUT_DESC(mg)}, {2, OUTPUT_DESC(ms)}, {3, OUTPUT_DESC(mom)}}; // L2Loss INPUT_MAP(L2Loss) = {{1, INPUT_DESC(x)}}; diff --git a/mindspore/ccsrc/transform/graph_ir/op_declare.h b/mindspore/ccsrc/transform/graph_ir/op_declare.h index e493ea0e528..2774ac1ff83 100755 --- a/mindspore/ccsrc/transform/graph_ir/op_declare.h +++ b/mindspore/ccsrc/transform/graph_ir/op_declare.h @@ -486,8 +486,8 @@ DECLARE_OP_USE_OUTPUT(Atan2) 
DECLARE_OP_ADAPTER(ApplyRMSPropD) DECLARE_OP_USE_INPUT_ATTR(ApplyRMSPropD) DECLARE_OP_USE_OUTPUT(ApplyRMSPropD) -DECLARE_OP_ADAPTER(ApplyCenteredRMSProp) -DECLARE_OP_USE_OUTPUT(ApplyCenteredRMSProp) +DECLARE_OP_ADAPTER(ApplyCenteredRMSPropD) +DECLARE_OP_USE_OUTPUT(ApplyCenteredRMSPropD) DECLARE_OP_ADAPTER(L2Loss) DECLARE_OP_USE_OUTPUT(L2Loss) DECLARE_OP_ADAPTER(CTCLoss) diff --git a/mindspore/ops/_op_impl/tbe/apply_centered_rms_prop.py b/mindspore/ops/_op_impl/tbe/apply_centered_rms_prop.py index 8499614324c..f372583a85d 100644 --- a/mindspore/ops/_op_impl/tbe/apply_centered_rms_prop.py +++ b/mindspore/ops/_op_impl/tbe/apply_centered_rms_prop.py @@ -13,15 +13,15 @@ # limitations under the License. # ============================================================================ -"""ApplyCenteredRMSProp op""" +"""ApplyCenteredRMSPropD op""" from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType apply_centered_rms_prop_op_info = TBERegOp("ApplyCenteredRMSProp") \ .fusion_type("OPAQUE") \ .async_flag(False) \ - .binfile_name("apply_centered_rms_prop.so") \ + .binfile_name("apply_centered_rms_prop_d.so") \ .compute_cost(10) \ - .kernel_name("apply_centered_rms_prop") \ + .kernel_name("apply_centered_rms_prop_d") \ .partial_flag(True) \ .input(0, "var", False, "required", "all") \ .input(1, "mg", False, "required", "all") \ @@ -33,34 +33,45 @@ apply_centered_rms_prop_op_info = TBERegOp("ApplyCenteredRMSProp") \ .input(7, "epsilon", False, "required", "all") \ .input(8, "grad", False, "required", "all") \ .output(0, "var", False, "required", "all") \ + .output(1, "mg", False, "required", "all") \ + .output(2, "ms", False, "required", "all") \ + .output(3, "mom", False, "required", "all") \ .dtype_format(DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_5HD, DataType.F16_5HD) \ + DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, DataType.F16_5HD, + DataType.F16_5HD) \ .dtype_format(DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_FracZ, DataType.F16_FracZ) \ + DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ, DataType.F16_FracZ, + DataType.F16_FracZ) \ .dtype_format(DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0) \ + DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, DataType.F16_C1HWNCoC0, + DataType.F16_C1HWNCoC0) \ .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, - DataType.F16_Default, DataType.F16_Default) \ + DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, DataType.F16_Default, + DataType.F16_Default) \ .dtype_format(DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_5HD, DataType.F32_5HD) \ + DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, DataType.F32_5HD, + DataType.F32_5HD) \ .dtype_format(DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_Default, DataType.F32_Default, 
DataType.F32_Default, DataType.F32_Default, - DataType.F32_FracZ, DataType.F32_FracZ) \ + DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, DataType.F32_FracZ, + DataType.F32_FracZ) \ .dtype_format(DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0) \ + DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, DataType.F32_C1HWNCoC0, + DataType.F32_C1HWNCoC0) \ .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, - DataType.F32_Default, DataType.F32_Default) \ + DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, DataType.F32_Default, + DataType.F32_Default) \ .get_op_info() @op_info_register(apply_centered_rms_prop_op_info) def _apply_centered_rms_prop_tbe(): - """ApplyCenteredRMSProp TBE register""" + """ApplyCenteredRMSPropD TBE register""" return diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py index e97c4c91c8c..d2b47357d93 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -1962,6 +1962,7 @@ class ApplyCenteredRMSProp(PrimitiveWithInfer): @prim_attr_register def __init__(self, use_locking=False): self.use_locking = validator.check_value_type("use_locking", use_locking, [bool], self.name) + self.is_ascend = context.get_context("device_target") == "Ascend" def infer_shape(self, var_shape, mean_gradient_shape, mean_square_shape, moment_shape, grad_shape, learning_rate_shape, decay_shape, momentum_shape, epsilon_shape): @@ -1969,6 +1970,8 @@ class ApplyCenteredRMSProp(PrimitiveWithInfer): validator.check("var_shape", var_shape, "mean_square_shape", mean_square_shape, Rel.EQ, self.name) validator.check("var_shape", var_shape, "moment_shape", moment_shape, Rel.EQ, self.name) validator.check("var_shape", var_shape, "grad_shape", grad_shape, Rel.EQ, self.name) + if self.is_ascend: + return var_shape, mean_gradient_shape, mean_square_shape, moment_shape return var_shape def infer_dtype(self, var_dtype, mean_gradient_dtype, mean_square_dtype, moment_dtype, grad_dtype, @@ -1982,6 +1985,8 @@ class ApplyCenteredRMSProp(PrimitiveWithInfer): validator.check_type_same(args_rho, valid_types, self.name) args_lr = {"learning_rate": learning_rate_dtype, "rho": rho_dtype} validator.check_scalar_or_tensor_type_same(args_lr, valid_types, self.name, allow_mix=True) + if self.is_ascend: + return var_dtype, mean_gradient_dtype, mean_square_dtype, moment_dtype return var_dtype From c10e07734cd6d92038402310298e19ad3d88f5e5 Mon Sep 17 00:00:00 2001 From: wilfChen Date: Wed, 15 Jul 2020 19:51:30 +0800 Subject: [PATCH 08/68] gpu support TopK kernel --- mindspore/ccsrc/CMakeLists.txt | 2 +- .../gpu/arrays/topk_gpu_kernel.cc | 29 ++++ .../gpu/arrays/topk_gpu_kernel.h | 110 ++++++++++++ .../gpu/cuda_impl/topk_impl.cu | 162 ++++++++++++++++++ .../gpu/cuda_impl/topk_impl.cuh | 32 ++++ .../ccsrc/runtime/device/gpu/cuda_common.h | 4 + tests/st/ops/gpu/test_topk_op.py | 82 +++++++++ 7 files changed, 420 insertions(+), 1 deletion(-) create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.cc create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/topk_impl.cu create 
mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh create mode 100644 tests/st/ops/gpu/test_topk_op.py diff --git a/mindspore/ccsrc/CMakeLists.txt b/mindspore/ccsrc/CMakeLists.txt index 53300acda4a..472783c5012 100644 --- a/mindspore/ccsrc/CMakeLists.txt +++ b/mindspore/ccsrc/CMakeLists.txt @@ -44,7 +44,7 @@ if(ENABLE_GPU) "backend/kernel_compiler/akg/akg_kernel_attrs_process.cc" ) - list(APPEND CUDA_NVCC_FLAGS -arch=sm_53) + list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr) list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/blocking_queue.cc" "runtime/device/gpu/gpu_buffer_mgr.cc") list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/mpi/mpi_initializer.cc" "runtime/device/gpu/distribution/collective_wrapper.cc" diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.cc new file mode 100644 index 00000000000..59503128e90 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.cc @@ -0,0 +1,29 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h" + +namespace mindspore { +namespace kernel { +MS_REG_GPU_KERNEL_TWO(TopK, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeInt32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeInt32), + TopKGpuKernel, float, int) +} +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h new file mode 100644 index 00000000000..8b16552c5a8 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h @@ -0,0 +1,110 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
+#define MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
+
+#include <vector>
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
+#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
+
+namespace mindspore {
+namespace kernel {
+template <typename T, typename S>
+class TopKGpuKernel : public GpuKernel {
+ public:
+  TopKGpuKernel() : sorted_(false), outer_size_(1), inner_size_(1), k_(1), use_share_mem_(true), ceil_power2_(0) {}
+  ~TopKGpuKernel() override = default;
+
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    T *input_addr = GetDeviceAddress<T>(inputs, 0);
+    S *k = GetDeviceAddress<S>(inputs, 1);
+    T *output_addr = GetDeviceAddress<T>(outputs, 0);
+    S *indices = GetDeviceAddress<S>(outputs, 1);
+    T *data_buff = nullptr;
+    S *index_buff = nullptr;
+    if (use_share_mem_ == false) {
+      data_buff = GetDeviceAddress<T>(workspaces, 0);
+      index_buff = GetDeviceAddress<S>(workspaces, 1);
+    }
+
+    TopK(outer_size_, inner_size_, input_addr, k, output_addr, indices, data_buff, index_buff,
+         reinterpret_cast<cudaStream_t>(stream_ptr));
+
+    if (sorted_ == false) {
+      BitonicSortByKey(outer_size_, k_, output_addr, indices, data_buff, index_buff,
+                       reinterpret_cast<cudaStream_t>(stream_ptr));
+    }
+    return true;
+  }
+
+  bool Init(const CNodePtr &kernel_node) override {
+    auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    for (size_t i = 0; i < input_shapes.size() - 1; i++) {
+      outer_size_ *= input_shapes[i];
+    }
+    inner_size_ = input_shapes[input_shapes.size() - 1];
+    k_ = output_shapes[output_shapes.size() - 1];
+
+    sorted_ = GetAttr<bool>(kernel_node, "sorted");
+
+    ceil_power2_ = RoundUpPower2(inner_size_);
+    size_t buffer_size = ceil_power2_ * (sizeof(T) + sizeof(S));
+    if (buffer_size > SHARED_MEM_PER_BLOCK) {
+      use_share_mem_ = false;
+      MS_LOG(WARNING) << "CUDA shared memory is insufficient, sorting via global memory instead.";
+    }
+
+    InitSizeLists();
+    return true;
+  }
+
+ protected:
+  void InitSizeLists() override {
+    input_size_list_.push_back(outer_size_ * inner_size_ * sizeof(T));
+    input_size_list_.push_back(sizeof(S));
+    output_size_list_.push_back(outer_size_ * k_ * sizeof(T));
+    output_size_list_.push_back(outer_size_ * k_ * sizeof(S));
+    if (use_share_mem_ == false) {
+      workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(T));
+      workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(S));
+    }
+  }
+
+ private:
+  bool sorted_;
+  int outer_size_;
+  int inner_size_;
+  int k_;
+  bool use_share_mem_;
+  int ceil_power2_;
+
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/topk_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/topk_impl.cu
new file mode 100644
index 00000000000..6e5ac52903b
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/topk_impl.cu
@@ -0,0 +1,162 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh" +#include +#include + +int RoundUpPower2(int v) { + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + +template +__inline__ __device__ void Swap(T *lhs, T *rhs) { + T tmp = lhs[0]; + lhs[0] = rhs[0]; + rhs[0] = tmp; +} + +template +__global__ void TopkKernel(const int outer, const int inner, const int ceil_power2, const T *input, const S *k, + T *output, S *indices, T *data_buff, S *index_buff) { + // default: sort with share memory + extern __shared__ T share_mem[]; + T *data_arr = share_mem; + S *index_arr = reinterpret_cast(data_arr + ceil_power2); + // sort with RAM + if (data_buff != nullptr && index_buff != nullptr) { + data_arr = data_buff + blockIdx.x * ceil_power2; + index_arr = index_buff + blockIdx.x * ceil_power2; + } + + for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) { + data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits::max(); + index_arr[i] = i; + } + __syncthreads(); + + for (size_t i = 2; i <= ceil_power2; i <<= 1) { + for (size_t j = (i >> 1); j > 0; j >>= 1) { + for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) { + size_t tid_comp = tid ^ j; + if (tid_comp > tid) { + if ((tid & i) == 0) { + if (data_arr[tid] > data_arr[tid_comp]) { + Swap(&data_arr[tid], &data_arr[tid_comp]); + Swap(&index_arr[tid], &index_arr[tid_comp]); + } + } else { + if (data_arr[tid] < data_arr[tid_comp]) { + Swap(&data_arr[tid], &data_arr[tid_comp]); + Swap(&index_arr[tid], &index_arr[tid_comp]); + } + } + } + } + __syncthreads(); + } + } + + for (size_t tid = threadIdx.x; tid < k[0]; tid += blockDim.x) { + output[blockIdx.x * k[0] + tid] = data_arr[inner - tid - 1]; + indices[blockIdx.x * k[0] + tid] = index_arr[inner - tid - 1]; + } +} + +template +void TopK(const int &outer, const int &inner, const T *input, const S *k, T *output, S *indices, T *data_buff, + S *index_buff, cudaStream_t stream) { + int ceil_power2 = RoundUpPower2(inner); + int share_mem = (data_buff == nullptr) ? ceil_power2 * (sizeof(T) + sizeof(S)) : 0; + int thread = std::min(ceil_power2, GET_THREADS); + TopkKernel<<>>(outer, inner, ceil_power2, input, k, output, indices, data_buff, + index_buff); +} + +template +__global__ void BitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input, + S *indices, T *data_buff, S *index_buff) { + // default: sort with share memory + extern __shared__ T share_mem[]; + T *data_arr = share_mem; + S *index_arr = reinterpret_cast(data_arr + ceil_power2); + // sort with RAM + if (data_buff != nullptr && index_buff != nullptr) { + data_arr = data_buff + blockIdx.x * ceil_power2; + index_arr = index_buff + blockIdx.x * ceil_power2; + } + + for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) { + data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits::max(); + index_arr[i] = (i < inner) ? 
indices[blockIdx.x * inner + i] : std::numeric_limits::max();; + } + __syncthreads(); + + for (size_t i = 2; i <= ceil_power2; i <<= 1) { + for (size_t j = (i >> 1); j > 0; j >>= 1) { + for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) { + size_t tid_comp = tid ^ j; + if (tid_comp > tid) { + if ((tid & i) == 0) { + if (index_arr[tid] > index_arr[tid_comp]) { + Swap(&data_arr[tid], &data_arr[tid_comp]); + Swap(&index_arr[tid], &index_arr[tid_comp]); + } + } else { + if (index_arr[tid] < index_arr[tid_comp]) { + Swap(&data_arr[tid], &data_arr[tid_comp]); + Swap(&index_arr[tid], &index_arr[tid_comp]); + } + } + } + } + __syncthreads(); + } + } + + for (size_t tid = threadIdx.x; tid < inner; tid += blockDim.x) { + input[blockIdx.x * inner + tid] = data_arr[tid]; + indices[blockIdx.x * inner + tid] = index_arr[tid]; + } +} + +template +void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff, + cudaStream_t stream) { + int ceil_power2 = RoundUpPower2(inner); + size_t share_mem = ceil_power2 * (sizeof(T) + sizeof(S)); + if (share_mem > SHARED_MEM_PER_BLOCK) { + share_mem = 0; + } else { + data_buff = nullptr; + index_buff = nullptr; + } + int thread = std::min(ceil_power2, GET_THREADS); + BitonicSortByKeyKernel<<>>(outer, inner, ceil_power2, input, indices, data_buff, + index_buff); +} + +template void TopK(const int &outer, const int &inner, const float *input_addr, const int *k, float *output, + int *indices, float *data_buff, int *index_buff, cudaStream_t stream); +template void BitonicSortByKey(const int &outer, const int &inner, float *input, int *indices, float *data_buff, + int *index_buff, cudaStream_t stream); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh new file mode 100644 index 00000000000..014044296a4 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh @@ -0,0 +1,32 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_ + +#include +#include "runtime/device/gpu/cuda_common.h" + +template +void TopK(const int &outer, const int &inner, const T *input_addr, const S *k, T *output, S *indices, T *data_buff, + S *index_buff, cudaStream_t stream); + +template +void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff, + cudaStream_t stream); +int RoundUpPower2(int v); + +#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_ diff --git a/mindspore/ccsrc/runtime/device/gpu/cuda_common.h b/mindspore/ccsrc/runtime/device/gpu/cuda_common.h index 2689fdbacab..ffe237ab6bd 100644 --- a/mindspore/ccsrc/runtime/device/gpu/cuda_common.h +++ b/mindspore/ccsrc/runtime/device/gpu/cuda_common.h @@ -30,6 +30,7 @@ class CudaCommon { inline int blocks_num(const int total_threads) const { return std::min(((total_threads - 1) / threads_per_block_) + 1, max_blocks_); } + size_t share_memory_size() const { return max_share_memory_; } static CudaCommon &GetInstance() { static CudaCommon instance; @@ -44,6 +45,7 @@ class CudaCommon { threads_per_block_ = prop.maxThreadsPerBlock; max_blocks_ = prop.multiProcessorCount; major_sm_ = prop.major; + max_share_memory_ = prop.sharedMemPerBlock; } ~CudaCommon() = default; CudaCommon(const CudaCommon &) = delete; @@ -52,10 +54,12 @@ class CudaCommon { int max_blocks_; int threads_per_block_; int major_sm_; + size_t max_share_memory_; }; #define GET_BLOCKS(total_threads) mindspore::device::gpu::CudaCommon::GetInstance().blocks_num(total_threads) #define GET_THREADS mindspore::device::gpu::CudaCommon::GetInstance().threads_num() #define GET_MAJOR_SM mindspore::device::gpu::CudaCommon::GetInstance().major_sm() +#define SHARED_MEM_PER_BLOCK mindspore::device::gpu::CudaCommon::GetInstance().share_memory_size() #define MINIUM_SM 6 #define RECOMMEND_SM 7 } // namespace gpu diff --git a/tests/st/ops/gpu/test_topk_op.py b/tests/st/ops/gpu/test_topk_op.py new file mode 100644 index 00000000000..83cd8e6403e --- /dev/null +++ b/tests/st/ops/gpu/test_topk_op.py @@ -0,0 +1,82 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import numpy as np +import pytest + +import mindspore.context as context +from mindspore import Tensor +from mindspore.ops import operations as P + + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_topk(): + context.set_context(mode=context.GRAPH_MODE, device_target="GPU") + + x_np = np.random.rand(3, 4).astype(np.float32) + k = 4 + ms_output = P.TopK(True)(Tensor(x_np), k) + np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k] + assert np.allclose(ms_output[0].asnumpy(), np_output) + + x_np = np.random.rand(3, 4).astype(np.float32) + k = 4 + ms_output = P.TopK(False)(Tensor(x_np), k) + assert np.allclose(ms_output[0].asnumpy(), x_np) + + x_np = np.random.rand(2, 3, 4).astype(np.float32) + k = 2 + ms_output = P.TopK(True)(Tensor(x_np), k) + np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k] + assert np.allclose(ms_output[0].asnumpy(), np_output) + + x_np = np.random.rand(512, 1024).astype(np.float32) + k = 512 + ms_output = P.TopK(True)(Tensor(x_np), k) + np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k] + assert np.allclose(ms_output[0].asnumpy(), np_output) + + # sorted elements num greater than max thread per block + x_np = np.random.rand(512, 2048).astype(np.float32) + k = 1 + ms_output = P.TopK(True)(Tensor(x_np), k) + np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k] + assert np.allclose(ms_output[0].asnumpy(), np_output) + + x_np = np.random.rand(512, 2048).astype(np.float32) + k = 2048 + ms_output = P.TopK(True)(Tensor(x_np), k) + np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k] + assert np.allclose(ms_output[0].asnumpy(), np_output) + + # sorted elements num greater than max share memory per block + x_np = np.random.rand(512, 40960).astype(np.float32) + k = 1 + ms_output = P.TopK(True)(Tensor(x_np), k) + np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k] + assert np.allclose(ms_output[0].asnumpy(), np_output) + + x_np = np.random.rand(512, 40960).astype(np.float32) + k = 40960 + ms_output = P.TopK(True)(Tensor(x_np), k) + np_output = np.sort(x_np, axis=-1)[..., ::-1][..., 0:k] + assert np.allclose(ms_output[0].asnumpy(), np_output) + + x_np = np.random.rand(512, 40960).astype(np.float32) + k = 40960 + ms_output = P.TopK(False)(Tensor(x_np), k) + assert np.allclose(ms_output[0].asnumpy(), x_np) From 4136892a3ec2e92f0bc2f744ba87fa29f61e8f6d Mon Sep 17 00:00:00 2001 From: YangLuo Date: Wed, 1 Jul 2020 14:34:57 +0800 Subject: [PATCH 09/68] add SlidingWindow Op --- .../minddata/dataset/api/python_bindings.cc | 4 + .../minddata/dataset/kernels/tensor_op.h | 1 + .../dataset/text/kernels/CMakeLists.txt | 2 + .../dataset/text/kernels/data_utils.cc | 66 +++++++++++ .../dataset/text/kernels/data_utils.h | 43 +++++++ .../dataset/text/kernels/sliding_window_op.cc | 57 ++++++++++ .../dataset/text/kernels/sliding_window_op.h | 68 ++++++++++++ mindspore/dataset/text/__init__.py | 4 +- mindspore/dataset/text/transforms.py | 30 ++++- mindspore/dataset/text/validators.py | 13 ++- tests/ut/cpp/dataset/CMakeLists.txt | 1 + .../ut/cpp/dataset/sliding_window_op_test.cc | 69 ++++++++++++ .../ut/python/dataset/test_sliding_window.py | 105 ++++++++++++++++++ 13 files changed, 459 insertions(+), 4 deletions(-) create mode 100644 mindspore/ccsrc/minddata/dataset/text/kernels/data_utils.cc create mode 100644 mindspore/ccsrc/minddata/dataset/text/kernels/data_utils.h create mode 100644 
mindspore/ccsrc/minddata/dataset/text/kernels/sliding_window_op.cc create mode 100644 mindspore/ccsrc/minddata/dataset/text/kernels/sliding_window_op.h create mode 100644 tests/ut/cpp/dataset/sliding_window_op_test.cc create mode 100644 tests/ut/python/dataset/test_sliding_window.py diff --git a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc index 145291ec3be..36741637d13 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc @@ -77,6 +77,7 @@ #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h" #include "minddata/dataset/text/kernels/lookup_op.h" #include "minddata/dataset/text/kernels/ngram_op.h" +#include "minddata/dataset/text/kernels/sliding_window_op.h" #include "minddata/dataset/text/kernels/to_number_op.h" #include "minddata/dataset/text/kernels/unicode_char_tokenizer_op.h" #include "minddata/dataset/text/kernels/wordpiece_tokenizer_op.h" @@ -640,6 +641,9 @@ void bindTokenizerOps(py::module *m) { py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken, py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken), py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets); + (void)py::class_>( + *m, "SlidingWindowOp", "TensorOp to apply sliding window to a 1-D Tensor.") + .def(py::init(), py::arg("width"), py::arg("axis")); } void bindDependIcuTokenizerOps(py::module *m) { diff --git a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h index 3bcba4b4630..d4f5abc4b69 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h +++ b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h @@ -120,6 +120,7 @@ constexpr char kCaseFoldOp[] = "CaseFoldOp"; constexpr char kJiebaTokenizerOp[] = "JiebaTokenizerOp"; constexpr char kLookupOp[] = "LookupOp"; constexpr char kNgramOp[] = "NgramOp"; +constexpr char kSlidingWindowOp[] = "SlidingWindowOp"; constexpr char kNormalizeUTF8Op[] = "NormalizeUTF8Op"; constexpr char kRegexReplaceOp[] = "RegexReplaceOp"; constexpr char kRegexTokenizerOp[] = "RegexTokenizerOp"; diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/text/kernels/CMakeLists.txt index 449bb93d8b9..a932a2089eb 100644 --- a/mindspore/ccsrc/minddata/dataset/text/kernels/CMakeLists.txt +++ b/mindspore/ccsrc/minddata/dataset/text/kernels/CMakeLists.txt @@ -12,10 +12,12 @@ if (NOT (CMAKE_SYSTEM_NAME MATCHES "Windows")) whitespace_tokenizer_op.cc) endif() add_library(text-kernels OBJECT + data_utils.cc lookup_op.cc jieba_tokenizer_op.cc unicode_char_tokenizer_op.cc ngram_op.cc + sliding_window_op.cc wordpiece_tokenizer_op.cc truncate_sequence_pair_op.cc to_number_op.cc diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/data_utils.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/data_utils.cc new file mode 100644 index 00000000000..74b1d930775 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/text/kernels/data_utils.cc @@ -0,0 +1,66 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "minddata/dataset/text/kernels/data_utils.h" + +#include +#include +#include +#include + +#include "minddata/dataset/core/pybind_support.h" +#include "minddata/dataset/kernels/data/type_cast_op.h" +#include "minddata/dataset/kernels/data/slice_op.h" +#include "minddata/dataset/kernels/data/concatenate_op.h" + +namespace mindspore { +namespace dataset { +Status SlidingWindowHelper(const std::shared_ptr &input, std::shared_ptr *output, TensorShape out_shape, + uint32_t width, int32_t axis) { + // if the data row has fewer items than width, the corresponding result row will be empty + if (out_shape.Size() == 0) { + MS_LOG(WARNING) << "The data row has fewer items than width, the result will be empty."; + if (input->type().value() == DataType::DE_STRING) { + RETURN_IF_NOT_OK(Tensor::CreateTensor(output, std::vector{}, TensorShape({0}))); + } else { + RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, TensorShape({0}), input->type())); + } + return Status::OK(); + } + + axis = Tensor::HandleNeg(axis, input->shape().Size()); + int32_t axis_end = input->shape()[axis]; + std::shared_ptr tmp; + auto concatenate_op = std::make_unique(axis, nullptr, nullptr); + + // Slice on specified axis and concatenate on new axis + for (int32_t i = 0; i + width <= axis_end; i++) { + auto slice_op = std::make_unique(Slice(i, i + width, 1)); + slice_op->Compute(input, &tmp); + if (i == 0) { + *output = tmp; + } else { + TensorRow in({*output, tmp}); + TensorRow out_row; + concatenate_op->Compute(in, &out_row); + *output = out_row[0]; + } + } + (*output)->Reshape(out_shape); + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/data_utils.h b/mindspore/ccsrc/minddata/dataset/text/kernels/data_utils.h new file mode 100644 index 00000000000..2af69cd3d6f --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/text/kernels/data_utils.h @@ -0,0 +1,43 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef DATASET_KERNELS_TEXT_DATA_UTILS_H_
+#define DATASET_KERNELS_TEXT_DATA_UTILS_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "minddata/dataset/util/status.h"
+#include "minddata/dataset/core/constants.h"
+#include "minddata/dataset/core/data_type.h"
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/core/cv_tensor.h"
+#include "minddata/dataset/core/tensor_shape.h"
+#include "minddata/dataset/core/tensor_row.h"
+
+namespace mindspore {
+namespace dataset {
+/// \brief Helper method that performs sliding window on the input tensor.
+/// \param[in] input - Input tensor.
+/// \param[in] out_shape - Shape of the output tensor.
+/// \param[in] width - The width of the window.
+/// \param[in] axis - The axis along which the sliding window is computed.
+/// \param[out] output - Output tensor.
+/// \return Status return code
+Status SlidingWindowHelper(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
+                           TensorShape out_shape, uint32_t width, int32_t axis);
+}  // namespace dataset
+}  // namespace mindspore
+#endif  // DATASET_KERNELS_TEXT_DATA_UTILS_H_
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/sliding_window_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/sliding_window_op.cc
new file mode 100644
index 00000000000..f857f1ab966
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/sliding_window_op.cc
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "minddata/dataset/text/kernels/sliding_window_op.h"
+
+namespace mindspore {
+namespace dataset {
+Status SlidingWindowOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
+  IO_CHECK(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input->shape().Rank() == 1, "SlidingWindowOp supports 1D Tensors only for now.");
+  CHECK_FAIL_RETURN_UNEXPECTED(axis_ == 0 || axis_ == -1, "axis supports 0 or -1 only for now.");
+
+  std::vector<TensorShape> input_shape = {input->shape()};
+  std::vector<TensorShape> output_shape = {TensorShape({})};
+  RETURN_IF_NOT_OK(OutputShape(input_shape, output_shape));
+
+  RETURN_IF_NOT_OK(SlidingWindowHelper(input, output, output_shape[0], width_, axis_));
+  return Status::OK();
+}
+
+Status SlidingWindowOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) {
+  CHECK_FAIL_RETURN_UNEXPECTED(inputs.size() == NumInput(), "incorrect num of inputs\n");
+  int32_t axis = Tensor::HandleNeg(axis_, inputs[0].Size());
+  TensorShape input_shape = inputs[0];
+  std::vector<dsize_t> output_shape_initializer;
+
+  // if a data row has fewer items than width, the corresponding result row will be empty.
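+  // e.g. a 1-D input of shape [5] with width 3 on axis 0 yields output shape [3, 3]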
+  if (input_shape[axis] >= width_) {
+    for (int32_t idx = 0; idx < input_shape.Size(); ++idx) {
+      if (idx != axis) {
+        output_shape_initializer.push_back(input_shape[idx]);
+      } else {
+        output_shape_initializer.push_back(input_shape[idx] - (width_ - 1));
+        output_shape_initializer.push_back(width_);
+      }
+    }
+  }
+
+  outputs.pop_back();
+  outputs.emplace_back(TensorShape(output_shape_initializer));
+  CHECK_FAIL_RETURN_UNEXPECTED(outputs.size() == NumOutput(), "incorrect num of outputs\n");
+  return Status::OK();
+}
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/sliding_window_op.h b/mindspore/ccsrc/minddata/dataset/text/kernels/sliding_window_op.h
new file mode 100644
index 00000000000..a9340d12bd0
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/text/kernels/sliding_window_op.h
@@ -0,0 +1,68 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef DATASET_KERNELS_TEXT_SLIDING_WINDOW_OP_H_
+#define DATASET_KERNELS_TEXT_SLIDING_WINDOW_OP_H_
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/text/kernels/data_utils.h"
+
+namespace mindspore {
+namespace dataset {
+
+class SlidingWindowOp : public TensorOp {
+ public:
+  /// \brief Constructor of SlidingWindowOp.
+  /// \param[in] width - The width of the window.
+  /// \param[in] axis - The axis along which the sliding window is computed.
+  explicit SlidingWindowOp(uint32_t width, int32_t axis = 0) : width_(width), axis_(axis) {}
+
+  /// \brief Destructor of SlidingWindowOp.
+  ~SlidingWindowOp() override = default;
+
+  /// \brief Perform sliding window on the input tensor.
+  /// \param[in] input - Input tensor of Op.
+  /// \param[out] output - Output tensor of Op.
+  /// \return Status return code
+  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+
+  /// \brief Calculate tensor shape for output tensor.
+  /// \param[in] inputs - Input tensor shapes.
+  /// \param[out] outputs - Output tensor shapes.
+  /// \return Status return code
+  Status OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) override;
+
+  /// \brief Print args for debugging.
+  /// \param[in] out - std::ostream &out.
+  void Print(std::ostream &out) const override { out << "SlidingWindowOp"; }
+
+  /// \brief Print name of op.
+  std::string Name() const override { return kSlidingWindowOp; }
+
+ private:
+  uint32_t width_;  // The width of the window. Must be an integer and greater than zero.
+  int32_t axis_;    // The axis along which sliding window is computed, only support 0/-1 for now.
+}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_KERNELS_TEXT_SLIDING_WINDOW_OP_H_ diff --git a/mindspore/dataset/text/__init__.py b/mindspore/dataset/text/__init__.py index 04eb90a0b6d..048f345cfab 100644 --- a/mindspore/dataset/text/__init__.py +++ b/mindspore/dataset/text/__init__.py @@ -19,13 +19,13 @@ utils provides some general methods for nlp text processing. """ import platform from .transforms import Lookup, JiebaTokenizer, UnicodeCharTokenizer, Ngram, WordpieceTokenizer, TruncateSequencePair, \ - ToNumber + ToNumber, SlidingWindow from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm __all__ = [ "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram", "to_str", "to_bytes", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber", - "PythonTokenizer" + "PythonTokenizer", "SlidingWindow" ] if platform.system().lower() != 'windows': diff --git a/mindspore/dataset/text/transforms.py b/mindspore/dataset/text/transforms.py index 30fa2b8f429..7f60f05107b 100644 --- a/mindspore/dataset/text/transforms.py +++ b/mindspore/dataset/text/transforms.py @@ -54,7 +54,7 @@ from .utils import JiebaMode, NormalizeForm, to_str from .validators import check_lookup, check_jieba_add_dict, \ check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer,\ check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate,\ - check_to_number, check_bert_tokenizer, check_python_tokenizer + check_to_number, check_bert_tokenizer, check_python_tokenizer, check_slidingwindow from ..core.datatypes import mstype_to_detype @@ -72,6 +72,34 @@ class Lookup(cde.LookupOp): def __init__(self, vocab, unknown_token=None): super().__init__(vocab, unknown_token) +class SlidingWindow(cde.SlidingWindowOp): + """ + TensorOp to construct a tensor from data (only 1-D for now), where each element in the dimension axis + is a slice of data starting at the corresponding position, with a specified width. + + Args: + width (int): The width of the window. Must be an integer and greater than zero. + axis (int, optional): The axis along which sliding window is computed (default=0). 
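+            Currently only 0 and -1 are supported; a data row with fewer items
+            than width produces an empty result row.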
+ + Examples: + >>> # Data before + >>> # | col1 | + >>> # +-------------+ + >>> # | [1,2,3,4,5] | + >>> # +-------------+ + >>> data = data.map(operations=SlidingWindow(3, 0)) + >>> # Data after + >>> # | col1 | + >>> # +-------------+ + >>> # | [[1,2,3], | + >>> # | [2,3,4], | + >>> # | [3,4,5]] | + >>> # +--------------+ + """ + + @check_slidingwindow + def __init__(self, width, axis=0): + super().__init__(width=width, axis=axis) class Ngram(cde.NgramOp): """ diff --git a/mindspore/dataset/text/validators.py b/mindspore/dataset/text/validators.py index b0327f5609c..71f48a1238a 100644 --- a/mindspore/dataset/text/validators.py +++ b/mindspore/dataset/text/validators.py @@ -23,7 +23,7 @@ import mindspore._c_dataengine as cde from mindspore._c_expression import typing from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, \ - INT32_MAX, check_value, check_positive + INT32_MAX, check_value, check_positive, check_pos_int32 def check_unique_list_of_words(words, arg_name): @@ -328,6 +328,17 @@ def check_from_dataset(method): return new_method +def check_slidingwindow(method): + """A wrapper that wrap a parameter checker to the original function(sliding window operation).""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [width, axis], _ = parse_user_args(method, *args, **kwargs) + check_pos_int32(width, "width") + type_check(axis, (int,), "axis") + return method(self, *args, **kwargs) + + return new_method def check_ngram(method): """A wrapper that wraps a parameter checker to the original function.""" diff --git a/tests/ut/cpp/dataset/CMakeLists.txt b/tests/ut/cpp/dataset/CMakeLists.txt index 8bbf42a6404..084bd05ab41 100644 --- a/tests/ut/cpp/dataset/CMakeLists.txt +++ b/tests/ut/cpp/dataset/CMakeLists.txt @@ -92,6 +92,7 @@ SET(DE_UT_SRCS perf_data_test.cc c_api_test.cc tensor_op_fusion_pass_test.cc + sliding_window_op_test.cc ) add_executable(de_ut_tests ${DE_UT_SRCS}) diff --git a/tests/ut/cpp/dataset/sliding_window_op_test.cc b/tests/ut/cpp/dataset/sliding_window_op_test.cc new file mode 100644 index 00000000000..7020229d9af --- /dev/null +++ b/tests/ut/cpp/dataset/sliding_window_op_test.cc @@ -0,0 +1,69 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "common/common.h" +#include "minddata/dataset/text/kernels/sliding_window_op.h" +#include "utils/log_adapter.h" + +using namespace mindspore::dataset; +using mindspore::MsLogLevel::INFO; +using mindspore::ExceptionType::NoExceptionType; +using mindspore::LogStream; + +class MindDataTestSlidingWindowOp : public UT::Common { + protected: + MindDataTestSlidingWindowOp() {} +}; + +TEST_F(MindDataTestSlidingWindowOp, Compute) { + MS_LOG(INFO) << "Doing MindDataTestSlidingWindowOp->Compute."; + std::vector strings = {"one", "two", "three", "four", "five", "six", "seven", "eight"}; + TensorShape shape({static_cast(strings.size())}); + std::shared_ptr input = std::make_shared(strings, shape); + std::shared_ptr output; + + std::unique_ptr op(new SlidingWindowOp(3, 0)); + Status s = op->Compute(input, &output); + + std::vector out = {"one", "two", "three", "two", "three", "four", "three", "four", "five", + "four", "five", "six", "five", "six", "seven", "six", "seven", "eight"}; + std::shared_ptr expected = std::make_shared(out, TensorShape({6, 3})); + + ASSERT_TRUE(output->shape() == expected->shape()); + ASSERT_TRUE(output->type() == expected->type()); + MS_LOG(DEBUG) << *output << std::endl; + MS_LOG(DEBUG) << *expected << std::endl; + ASSERT_TRUE(*output == *expected); + + MS_LOG(INFO) << "MindDataTestSlidingWindowOp end."; +} + +TEST_F(MindDataTestSlidingWindowOp, OutputShape) { + MS_LOG(INFO) << "Doing MindDataTestSlidingWindowOp->OutputShape."; + std::vector strings = {"one", "two", "three", "four", "five", "six", "seven", "eight"}; + TensorShape shape({static_cast(strings.size())}); + std::shared_ptr input = std::make_shared(strings, shape); + std::vector input_shape = {input->shape()}; + std::vector output_shape = {TensorShape({})}; + + std::unique_ptr op(new SlidingWindowOp(3, 0)); + Status s = op->OutputShape(input_shape, output_shape); + + MS_LOG(DEBUG) << "input_shape" << input_shape[0]; + MS_LOG(DEBUG) << "output_shape" << output_shape[0]; + ASSERT_TRUE(output_shape[0] == TensorShape({6, 3})); + + MS_LOG(INFO) << "MindDataTestSlidingWindowOp end."; +} diff --git a/tests/ut/python/dataset/test_sliding_window.py b/tests/ut/python/dataset/test_sliding_window.py new file mode 100644 index 00000000000..4fdd7a25c07 --- /dev/null +++ b/tests/ut/python/dataset/test_sliding_window.py @@ -0,0 +1,105 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""
+Testing SlidingWindow in mindspore.dataset
+"""
+import numpy as np
+import mindspore.dataset as ds
+import mindspore.dataset.text as text
+
+def test_sliding_window_string():
+    """Test sliding_window with string type."""
+    inputs = [["大", "家", "早", "上", "好"]]
+    expect = np.array([['大', '家'], ['家', '早'], ['早', '上'], ['上', '好']])
+
+    dataset = ds.NumpySlicesDataset(inputs, column_names=["text"], shuffle=False)
+    dataset = dataset.map(input_columns=["text"], operations=text.SlidingWindow(2, 0))
+
+    result = []
+    for data in dataset.create_dict_iterator():
+        for i in range(data['text'].shape[0]):
+            result.append([])
+            for j in range(data['text'].shape[1]):
+                result[i].append(data['text'][i][j].decode('utf8'))
+        result = np.array(result)
+        np.testing.assert_array_equal(result, expect)
+
+def test_sliding_window_number():
+    inputs = [1]
+    expect = np.array([[1]])
+
+    def gen(nums):
+        yield (np.array(nums),)
+
+    dataset = ds.GeneratorDataset(gen(inputs), column_names=["number"])
+    dataset = dataset.map(input_columns=["number"], operations=text.SlidingWindow(1, -1))
+
+    for data in dataset.create_dict_iterator():
+        np.testing.assert_array_equal(data['number'], expect)
+
+def test_sliding_window_big_width():
+    inputs = [[1, 2, 3, 4, 5]]
+    expect = np.array([])
+
+    dataset = ds.NumpySlicesDataset(inputs, column_names=["number"], shuffle=False)
+    dataset = dataset.map(input_columns=["number"], operations=text.SlidingWindow(30, 0))
+
+    for data in dataset.create_dict_iterator():
+        np.testing.assert_array_equal(data['number'], expect)
+
+def test_sliding_window_exception():
+    try:
+        _ = text.SlidingWindow(0, 0)
+        assert False
+    except ValueError:
+        pass
+
+    try:
+        _ = text.SlidingWindow("1", 0)
+        assert False
+    except TypeError:
+        pass
+
+    try:
+        _ = text.SlidingWindow(1, "0")
+        assert False
+    except TypeError:
+        pass
+
+    try:
+        inputs = [[1, 2, 3, 4, 5]]
+        dataset = ds.NumpySlicesDataset(inputs, column_names=["text"], shuffle=False)
+        dataset = dataset.map(input_columns=["text"], operations=text.SlidingWindow(3, -100))
+        for _ in dataset.create_dict_iterator():
+            pass
+        assert False
+    except RuntimeError as e:
+        assert "axis supports 0 or -1 only for now." in str(e)
+
+    try:
+        inputs = ["aa", "bb", "cc"]
+        dataset = ds.NumpySlicesDataset(inputs, column_names=["text"], shuffle=False)
+        dataset = dataset.map(input_columns=["text"], operations=text.SlidingWindow(2, 0))
+        for _ in dataset.create_dict_iterator():
+            pass
+        assert False
+    except RuntimeError as e:
+        assert "SlidingWindowOp supports 1D Tensors only for now."
in str(e) + +if __name__ == '__main__': + test_sliding_window_string() + test_sliding_window_number() + test_sliding_window_big_width() + test_sliding_window_exception() From d233c541396f5b176eb20416d72997d6cc0038b6 Mon Sep 17 00:00:00 2001 From: Zirui Wu Date: Tue, 7 Jul 2020 15:58:46 -0400 Subject: [PATCH 10/68] implemented cpp random choice, apply and compos python part of random ops added random select sub policy validators added comments added, remaining issues addressed add more python test cases fix ci fix CI fix order of include files addr review cmts addr review cmts reorg file fix compile err address review cmts address review cmts --- .../ccsrc/minddata/dataset/api/de_pipeline.cc | 2 - .../minddata/dataset/api/python_bindings.cc | 83 +++++++++++++++- .../minddata/dataset/kernels/CMakeLists.txt | 7 +- .../minddata/dataset/kernels/compose_op.cc | 66 +++++++++++++ .../minddata/dataset/kernels/compose_op.h | 70 +++++++++++++ .../dataset/kernels/image/CMakeLists.txt | 1 + .../image/random_select_subpolicy_op.cc | 96 ++++++++++++++++++ .../image/random_select_subpolicy_op.h | 79 +++++++++++++++ .../dataset/kernels/random_apply_op.cc | 68 +++++++++++++ .../dataset/kernels/random_apply_op.h | 79 +++++++++++++++ .../dataset/kernels/random_choice_op.cc | 97 +++++++++++++++++++ .../dataset/kernels/random_choice_op.h | 77 +++++++++++++++ .../minddata/dataset/kernels/tensor_op.h | 4 + mindspore/dataset/core/validator_helpers.py | 8 ++ mindspore/dataset/transforms/c_transforms.py | 51 +++++++++- mindspore/dataset/transforms/validators.py | 22 ++++- .../dataset/transforms/vision/c_transforms.py | 8 +- .../dataset/transforms/vision/validators.py | 25 ++++- tests/ut/python/dataset/test_c_compose.py | 50 ++++++++++ .../ut/python/dataset/test_c_random_apply.py | 48 +++++++++ .../ut/python/dataset/test_c_random_choice.py | 48 +++++++++ .../dataset/test_random_select_subpolicy.py | 51 ++++++++++ 22 files changed, 1029 insertions(+), 11 deletions(-) create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/compose_op.cc create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/compose_op.h create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.cc create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.h create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.cc create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.h create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.cc create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.h create mode 100644 tests/ut/python/dataset/test_c_compose.py create mode 100644 tests/ut/python/dataset/test_c_random_apply.py create mode 100644 tests/ut/python/dataset/test_c_random_choice.py create mode 100644 tests/ut/python/dataset/test_random_select_subpolicy.py diff --git a/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc b/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc index 2a6166f868c..c780d8f645b 100644 --- a/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc +++ b/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc @@ -42,8 +42,6 @@ #include "minddata/dataset/util/status.h" #include "minddata/mindrecord/include/shard_category.h" #include "minddata/mindrecord/include/shard_distributed_sample.h" -#include "minddata/mindrecord/include/shard_sample.h" -#include "minddata/mindrecord/include/shard_shuffle.h" #include "pybind11/stl.h" #include "utils/log_adapter.h" diff --git 
diff --git a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc
index 145291ec3be..173c1af2f22 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc
@@ -16,6 +16,7 @@
 #include
 #include "minddata/dataset/api/de_pipeline.h"
+#include "minddata/dataset/engine/cache/cache_client.h"
 #include "minddata/dataset/engine/datasetops/source/cifar_op.h"
 #include "minddata/dataset/engine/datasetops/source/clue_op.h"
 #include "minddata/dataset/engine/datasetops/source/coco_op.h"
@@ -35,9 +36,9 @@
 #include "minddata/dataset/engine/datasetops/source/text_file_op.h"
 #include "minddata/dataset/engine/datasetops/source/tf_reader_op.h"
 #include "minddata/dataset/engine/datasetops/source/voc_op.h"
-#include "minddata/dataset/engine/cache/cache_client.h"
 #include "minddata/dataset/engine/gnn/graph.h"
 #include "minddata/dataset/engine/jagged_connector.h"
+#include "minddata/dataset/kernels/compose_op.h"
 #include "minddata/dataset/kernels/data/concatenate_op.h"
 #include "minddata/dataset/kernels/data/duplicate_op.h"
 #include "minddata/dataset/kernels/data/fill_op.h"
@@ -61,11 +62,12 @@
 #include "minddata/dataset/kernels/image/random_crop_decode_resize_op.h"
 #include "minddata/dataset/kernels/image/random_crop_op.h"
 #include "minddata/dataset/kernels/image/random_crop_with_bbox_op.h"
-#include "minddata/dataset/kernels/image/random_horizontal_flip_with_bbox_op.h"
 #include "minddata/dataset/kernels/image/random_horizontal_flip_op.h"
+#include "minddata/dataset/kernels/image/random_horizontal_flip_with_bbox_op.h"
 #include "minddata/dataset/kernels/image/random_resize_op.h"
 #include "minddata/dataset/kernels/image/random_resize_with_bbox_op.h"
 #include "minddata/dataset/kernels/image/random_rotation_op.h"
+#include "minddata/dataset/kernels/image/random_select_subpolicy_op.h"
 #include "minddata/dataset/kernels/image/random_vertical_flip_op.h"
 #include "minddata/dataset/kernels/image/random_vertical_flip_with_bbox_op.h"
 #include "minddata/dataset/kernels/image/rescale_op.h"
@@ -74,6 +76,9 @@
 #include "minddata/dataset/kernels/image/resize_with_bbox_op.h"
 #include "minddata/dataset/kernels/image/uniform_aug_op.h"
 #include "minddata/dataset/kernels/no_op.h"
+#include "minddata/dataset/kernels/py_func_op.h"
+#include "minddata/dataset/kernels/random_apply_op.h"
+#include "minddata/dataset/kernels/random_choice_op.h"
 #include "minddata/dataset/text/kernels/jieba_tokenizer_op.h"
 #include "minddata/dataset/text/kernels/lookup_op.h"
 #include "minddata/dataset/text/kernels/ngram_op.h"
@@ -88,6 +93,7 @@
 #include "minddata/mindrecord/include/shard_sample.h"
 #include "minddata/mindrecord/include/shard_sequential_sample.h"
 #include "mindspore/ccsrc/minddata/dataset/text/kernels/truncate_sequence_pair_op.h"
+
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
 #include "pybind11/stl_bind.h"
@@ -113,6 +119,24 @@ namespace dataset {
     if (rc.IsError()) throw std::runtime_error(rc.ToString()); \
   } while (false)

+Status PyListToTensorOps(const py::list &py_ops, std::vector<std::shared_ptr<TensorOp>> *ops) {
+  RETURN_UNEXPECTED_IF_NULL(ops);
+  for (auto op : py_ops) {
+    if (py::isinstance<TensorOp>(op)) {
+      ops->emplace_back(op.cast<std::shared_ptr<TensorOp>>());
+    } else if (py::isinstance<py::function>(op)) {
+      ops->emplace_back(std::make_shared<PyFuncOp>(op.cast<py::function>()));
+    } else {
+      RETURN_STATUS_UNEXPECTED("element is neither a TensorOp nor a pyfunc.");
+    }
+  }
+  CHECK_FAIL_RETURN_UNEXPECTED(!ops->empty(), "TensorOp list is empty.");
+  for (auto const &op : *ops) {
+    RETURN_UNEXPECTED_IF_NULL(op);
+  }
+  return Status::OK();
+}
+
 void bindDEPipeline(py::module *m) {
   (void)py::class_<DEPipeline>(*m, "DEPipeline")
     .def(py::init<>())
@@ -623,7 +647,7 @@ void bindTokenizerOps(py::module *m) {
       WordIdType default_id = vocab->Lookup(word);
       if (default_id == Vocab::kNoTokenExists) {
         THROW_IF_ERROR(
-          Status(StatusCode::kUnexpectedError, "default unknown token:" + word + " doesn't exist in vocab."));
+          Status(StatusCode::kUnexpectedError, "default unknown token: " + word + " doesn't exist in vocab."));
       }
       return std::make_shared<LookupOp>(vocab, default_id);
     }));
@@ -868,6 +892,58 @@ void bindGraphData(py::module *m) {
     });
 }

+void bindRandomTransformTensorOps(py::module *m) {
+  (void)py::class_<ComposeOp, TensorOp, std::shared_ptr<ComposeOp>>(*m, "ComposeOp")
+    .def(py::init([](const py::list &ops) {
+      std::vector<std::shared_ptr<TensorOp>> t_ops;
+      THROW_IF_ERROR(PyListToTensorOps(ops, &t_ops));
+      return std::make_shared<ComposeOp>(t_ops);
+    }));
+  (void)py::class_<RandomChoiceOp, TensorOp, std::shared_ptr<RandomChoiceOp>>(*m, "RandomChoiceOp")
+    .def(py::init([](const py::list &ops) {
+      std::vector<std::shared_ptr<TensorOp>> t_ops;
+      THROW_IF_ERROR(PyListToTensorOps(ops, &t_ops));
+      return std::make_shared<RandomChoiceOp>(t_ops);
+    }));
+  (void)py::class_<RandomApplyOp, TensorOp, std::shared_ptr<RandomApplyOp>>(*m, "RandomApplyOp")
+    .def(py::init([](double prob, const py::list &ops) {
+      std::vector<std::shared_ptr<TensorOp>> t_ops;
+      THROW_IF_ERROR(PyListToTensorOps(ops, &t_ops));
+      if (prob < 0 || prob > 1) {
+        THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "prob needs to be within [0,1]."));
+      }
+      return std::make_shared<RandomApplyOp>(prob, t_ops);
+    }));
+  (void)py::class_<RandomSelectSubpolicyOp, TensorOp, std::shared_ptr<RandomSelectSubpolicyOp>>(
+    *m, "RandomSelectSubpolicyOp")
+    .def(py::init([](const py::list &py_policy) {
+      std::vector<Subpolicy> cpp_policy;
+      for (auto &py_sub : py_policy) {
+        cpp_policy.push_back({});
+        for (auto handle : py_sub.cast<py::list>()) {
+          py::tuple tp = handle.cast<py::tuple>();
+          if (tp.is_none() || tp.size() != 2) {
+            THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "Each tuple in subpolicy should be (op, prob)."));
+          }
+          std::shared_ptr<TensorOp> t_op;
+          if (py::isinstance<TensorOp>(tp[0])) {
+            t_op = (tp[0]).cast<std::shared_ptr<TensorOp>>();
+          } else if (py::isinstance<py::function>(tp[0])) {
+            t_op = std::make_shared<PyFuncOp>((tp[0]).cast<py::function>());
+          } else {
+            THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "op is neither a TensorOp nor a pyfunc."));
+          }
+          double prob = (tp[1]).cast<double>();
+          if (prob < 0 || prob > 1) {
+            THROW_IF_ERROR(Status(StatusCode::kUnexpectedError, "prob needs to be within [0,1]."));
+          }
+          cpp_policy.back().emplace_back(std::make_pair(t_op, prob));
+        }
+      }
+      return std::make_shared<RandomSelectSubpolicyOp>(cpp_policy);
+    }));
+}
+
 // This is where we externalize the C logic as python modules
 PYBIND11_MODULE(_c_dataengine, m) {
   m.doc() = "pybind11 for _c_dataengine";
@@ -949,6 +1025,7 @@ PYBIND11_MODULE(_c_dataengine, m) {
   bindVocabObjects(&m);
   bindGraphData(&m);
   bindDependIcuTokenizerOps(&m);
+  bindRandomTransformTensorOps(&m);
 }
 }  // namespace dataset
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/kernels/CMakeLists.txt
index 8a9096ff23d..0d072d1d854 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/kernels/CMakeLists.txt
@@ -4,11 +4,16 @@ file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc"
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
 if (ENABLE_PYTHON)
     add_library(kernels OBJECT
+            compose_op.cc
+            random_apply_op.cc
+            random_choice_op.cc
             py_func_op.cc
             tensor_op.cc)
     target_include_directories(kernels PRIVATE ${pybind11_INCLUDE_DIRS})
 else()
     add_library(kernels OBJECT
+            compose_op.cc
+            random_apply_op.cc
+            random_choice_op.cc
             tensor_op.cc)
 endif()
-
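From the Python side, PyListToTensorOps means an op list may freely mix C tensor ops with plain Python callables; the binding wraps a callable in a PyFuncOp and keeps a TensorOp as-is. A minimal sketch of what the bindings above accept, modeled on the mixed-use test case added later in this patch (test_c_compose.py):

    import mindspore.common.dtype as mstype
    import mindspore.dataset.transforms.c_transforms as ops
    import mindspore.dataset.transforms.py_transforms as py_ops

    # A Python transform (a callable) followed by a C transform in one list:
    # the ComposeOp binding wraps the callable in a PyFuncOp internally.
    mixed = [py_ops.OneHotOp(2), ops.TypeCast(mstype.int32)]
    compose = ops.Compose(mixed)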
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/compose_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/compose_op.cc
new file mode 100644
index 00000000000..35128d3e886
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/kernels/compose_op.cc
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "minddata/dataset/kernels/compose_op.h"
+
+#include <utility>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/py_func_op.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+
+Status ComposeOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) {
+  std::vector<TensorShape> in_shapes = inputs;
+  for (auto &op : ops_) {
+    RETURN_IF_NOT_OK(op->OutputShape(in_shapes, outputs));
+    in_shapes = std::move(outputs);  // outputs become empty after move
+  }
+  outputs = std::move(in_shapes);
+  return Status::OK();
+}
+
+Status ComposeOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
+  std::vector<DataType> in_types = inputs;
+  for (auto &op : ops_) {
+    RETURN_IF_NOT_OK(op->OutputType(in_types, outputs));
+    in_types = std::move(outputs);  // outputs become empty after move
+  }
+  outputs = std::move(in_types);
+  return Status::OK();
+}
+
+Status ComposeOp::Compute(const TensorRow &inputs, TensorRow *outputs) {
+  IO_CHECK_VECTOR(inputs, outputs);
+  TensorRow in_rows = inputs;
+  for (auto &op : ops_) {
+    RETURN_IF_NOT_OK(op->Compute(in_rows, outputs));
+    in_rows = std::move(*outputs);  // after the move, *outputs becomes empty
+  }
+  (*outputs) = std::move(in_rows);
+  return Status::OK();
+}
+
+ComposeOp::ComposeOp(const std::vector<std::shared_ptr<TensorOp>> &ops) : ops_(ops) {
+  if (ops_.empty()) {
+    MS_LOG(ERROR) << "op_list is empty; this might lead to a segmentation fault.";
+  } else if (ops_.size() == 1) {
+    MS_LOG(WARNING) << "op_list has only 1 op. Compose is probably not needed.";
+  }
+}
+
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/compose_op.h b/mindspore/ccsrc/minddata/dataset/kernels/compose_op.h
new file mode 100644
index 00000000000..3f0d994a7ec
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/kernels/compose_op.h
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DATASET_KERNELS_COMPOSE_OP_
+#define DATASET_KERNELS_COMPOSE_OP_
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+
+namespace mindspore {
+namespace dataset {
+class ComposeOp : public TensorOp {
+ public:
+  /// constructor
+  /// \param[in] ops list of TensorOps to compose into 1 TensorOp
+  explicit ComposeOp(const std::vector<std::shared_ptr<TensorOp>> &ops);
+
+  /// default destructor
+  ~ComposeOp() override = default;
+
+  /// return the number of inputs the first tensorOp in compose takes
+  /// \return number of input tensors
+  uint32_t NumInput() override { return ops_.front()->NumInput(); }
+
+  /// return the number of outputs the last tensorOp in compose produces
+  /// \return number of output tensors
+  uint32_t NumOutput() override { return ops_.back()->NumOutput(); }
+
+  /// \param[in] inputs
+  /// \param[out] outputs
+  /// \return Status code
+  Status OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) override;
+
+  /// \param[in] inputs
+  /// \param[out] outputs
+  /// \return Status code
+  Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
+
+  /// \param[in] input
+  /// \param[out] output
+  /// \return Status code
+  Status Compute(const TensorRow &input, TensorRow *output) override;
+
+  std::string Name() const override { return kComposeOp; }
+
+ private:
+  std::vector<std::shared_ptr<TensorOp>> ops_;
+};
+}  // namespace dataset
+}  // namespace mindspore
+
+#endif  // DATASET_KERNELS_COMPOSE_OP_
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt
index c0c575de9af..bfc27a920bd 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt
@@ -19,6 +19,7 @@ add_library(kernels-image OBJECT
         bounding_box_augment_op.cc
         random_resize_op.cc
         random_rotation_op.cc
+        random_select_subpolicy_op.cc
         random_vertical_flip_op.cc
        random_vertical_flip_with_bbox_op.cc
        rescale_op.cc
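As a usage sketch (mirroring tests/ut/python/dataset/test_c_compose.py added later in this patch), ComposeOp chains its op list left to right over each row:

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as ops

    data = ds.NumpySlicesDataset([[1, 0]], column_names="col", shuffle=False)
    # 1 column -> 2 (Duplicate) -> 1 (Concatenate) -> 2 -> 1
    data = data.map(input_columns=["col"],
                    operations=ops.Compose([ops.Duplicate(), ops.Concatenate(),
                                            ops.Duplicate(), ops.Concatenate()]))
    for row in data.create_dict_iterator():
        print(row["col"])  # expected: [1, 0, 1, 0, 1, 0, 1, 0]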
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.cc
new file mode 100644
index 00000000000..3a789ab3444
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.cc
@@ -0,0 +1,96 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "minddata/dataset/kernels/image/random_select_subpolicy_op.h"
+
+#include <utility>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+
+Status RandomSelectSubpolicyOp::Compute(const TensorRow &input, TensorRow *output) {
+  TensorRow in_row = input;
+  size_t rand_num = rand_int_(gen_);
+  CHECK_FAIL_RETURN_UNEXPECTED(rand_num < policy_.size(), "invalid rand_num:" + std::to_string(rand_num));
+  for (auto &sub : policy_[rand_num]) {
+    if (rand_double_(gen_) <= sub.second) {
+      RETURN_IF_NOT_OK(sub.first->Compute(in_row, output));
+      in_row = std::move(*output);
+    }
+  }
+  *output = std::move(in_row);
+  return Status::OK();
+}
+
+uint32_t RandomSelectSubpolicyOp::NumInput() {
+  uint32_t num_in = policy_.front().front().first->NumInput();
+  for (auto &sub : policy_) {
+    for (auto p : sub) {
+      if (num_in != p.first->NumInput()) {
+        MS_LOG(WARNING) << "Unable to determine NumInput.";
+        return 0;
+      }
+    }
+  }
+  return num_in;
+}
+
+uint32_t RandomSelectSubpolicyOp::NumOutput() {
+  uint32_t num_out = policy_.front().front().first->NumOutput();
+  for (auto &sub : policy_) {
+    for (auto p : sub) {
+      if (num_out != p.first->NumOutput()) {
+        MS_LOG(WARNING) << "Unable to determine NumOutput.";
+        return 0;
+      }
+    }
+  }
+  return num_out;
+}
+
+Status RandomSelectSubpolicyOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) {
+  outputs.clear();
+  outputs.resize(NumOutput(), TensorShape::CreateUnknownRankShape());
+  return Status::OK();
+}
+
+Status RandomSelectSubpolicyOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
+  RETURN_IF_NOT_OK(policy_.front().front().first->OutputType(inputs, outputs));
+  for (auto &sub : policy_) {
+    for (auto p : sub) {
+      std::vector<DataType> tmp_types;
+      RETURN_IF_NOT_OK(p.first->OutputType(inputs, tmp_types));
+      if (outputs != tmp_types) {
+        outputs.clear();
+        outputs.resize(NumOutput(), DataType(DataType::DE_UNKNOWN));
+        return Status::OK();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+RandomSelectSubpolicyOp::RandomSelectSubpolicyOp(const std::vector<Subpolicy> &policy)
+    : gen_(GetSeed()), policy_(policy), rand_int_(0, policy.size() - 1), rand_double_(0, 1) {
+  if (policy_.empty()) {
+    MS_LOG(ERROR) << "policy in RandomSelectSubpolicyOp is empty.";
+  }
+}
+
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.h
new file mode 100644
index 00000000000..55482a9818a
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.h
@@ -0,0 +1,79 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DATASET_KERNELS_IMAGE_RANDOM_SELECT_SUBPOLICY_OP_
+#define DATASET_KERNELS_IMAGE_RANDOM_SELECT_SUBPOLICY_OP_
+
+#include <memory>
+#include <random>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/util/random.h"
+
+namespace mindspore {
+namespace dataset {
+
+using Subpolicy = std::vector<std::pair<std::shared_ptr<TensorOp>, double>>;
+
+class RandomSelectSubpolicyOp : public TensorOp {
+ public:
+  /// constructor
+  /// \param[in] policy policy to choose subpolicy from
+  explicit RandomSelectSubpolicyOp(const std::vector<Subpolicy> &policy);
+
+  /// destructor
+  ~RandomSelectSubpolicyOp() override = default;
+
+  /// return number of input tensors
+  /// \return number of inputs if all ops in policy have the same NumInput, otherwise return 0
+  uint32_t NumInput() override;
+
+  /// return number of output tensors
+  /// \return number of outputs if all ops in policy have the same NumOutput, otherwise return 0
+  uint32_t NumOutput() override;
+
+  /// return unknown shapes
+  /// \param[in] inputs
+  /// \param[out] outputs
+  /// \return Status code
+  Status OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) override;
+
+  /// return output type if all ops in policy return the same type, otherwise return unknown type
+  /// \param[in] inputs
+  /// \param[out] outputs
+  /// \return Status code
+  Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
+
+  /// \param[in] input
+  /// \param[out] output
+  /// \return Status code
+  Status Compute(const TensorRow &input, TensorRow *output) override;
+
+  std::string Name() const override { return kRandomSelectSubpolicyOp; }
+
+ private:
+  std::vector<Subpolicy> policy_;
+  std::mt19937 gen_;  // mersenne_twister_engine
+  std::uniform_int_distribution<size_t> rand_int_;
+  std::uniform_real_distribution<double> rand_double_;
+};
+}  // namespace dataset
+}  // namespace mindspore
+
+#endif  // DATASET_KERNELS_IMAGE_RANDOM_SELECT_SUBPOLICY_OP_
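A usage sketch for RandomSelectSubpolicyOp, taken from tests/ut/python/dataset/test_random_select_subpolicy.py later in this patch: one subpolicy is drawn uniformly per row, then each (op, prob) pair in it fires independently:

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as ops
    import mindspore.dataset.transforms.vision.c_transforms as visions

    ds.config.set_seed(0)
    policy = [[(ops.PadEnd([4], 0), 0.5), (ops.Compose([ops.Duplicate(), ops.Concatenate()]), 1)],
              [(ops.Slice([0, 1]), 0.5), (ops.Duplicate(), 1), (ops.Concatenate(), 1)]]
    data = ds.NumpySlicesDataset([[1, 2, 3]], column_names="col", shuffle=False)
    data = data.map(input_columns=["col"], operations=visions.RandomSelectSubpolicy(policy))
    # each row comes out as one of [1,2,1,2], [1,2,3,1,2,3] or [1,2,3,0,1,2,3,0]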
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.cc
new file mode 100644
index 00000000000..783d5077ccb
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.cc
@@ -0,0 +1,68 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "minddata/dataset/kernels/random_apply_op.h"
+
+#include <memory>
+#include <utility>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+
+uint32_t RandomApplyOp::NumOutput() {
+  if (compose_->NumOutput() != NumInput()) {
+    MS_LOG(WARNING) << "NumOutput != NumInput (RandomApply would randomly affect the number of outputs).";
+    return 0;
+  }
+  return compose_->NumOutput();
+}
+
+Status RandomApplyOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) {
+  RETURN_IF_NOT_OK(compose_->OutputShape(inputs, outputs));
+  // RandomApply either runs all the ops or does nothing; if the two cases don't give the same result,
+  // return an unknown shape.
+  if (inputs != outputs) {  // when RandomApply is not applied, input should be the same as output
+    outputs.clear();
+    outputs.resize(NumOutput(), TensorShape::CreateUnknownRankShape());
+  }
+  return Status::OK();
+}
+
+Status RandomApplyOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
+  RETURN_IF_NOT_OK(compose_->OutputType(inputs, outputs));
+  if (inputs != outputs) {  // when RandomApply is not applied, input should be the same as output
+    outputs.clear();
+    outputs.resize(NumOutput(), DataType(DataType::DE_UNKNOWN));
+  }
+  return Status::OK();
+}
+
+Status RandomApplyOp::Compute(const TensorRow &input, TensorRow *output) {
+  if (rand_double_(gen_) <= prob_) {
+    RETURN_IF_NOT_OK(compose_->Compute(input, output));
+  } else {
+    IO_CHECK_VECTOR(input, output);
+    *output = input;  // copy over the tensors
+  }
+  return Status::OK();
+}
+
+RandomApplyOp::RandomApplyOp(double prob, const std::vector<std::shared_ptr<TensorOp>> &ops)
+    : prob_(prob), gen_(GetSeed()), rand_double_(0, 1) {
+  compose_ = std::make_unique<ComposeOp>(ops);
+}
+
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.h b/mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.h
new file mode 100644
index 00000000000..117b95a22e8
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.h
@@ -0,0 +1,79 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DATASET_KERNELS_RANDOM_APPLY_OP_
+#define DATASET_KERNELS_RANDOM_APPLY_OP_
+
+#include <memory>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/compose_op.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/util/random.h"
+
+namespace mindspore {
+namespace dataset {
+class RandomApplyOp : public TensorOp {
+ public:
+  /// constructor
+  /// \param[in] prob probability that the list of TensorOps will be applied
+  /// \param[in] ops the list of TensorOps to apply with prob likelihood
+  explicit RandomApplyOp(double prob, const std::vector<std::shared_ptr<TensorOp>> &ops);
+
+  /// default destructor
+  ~RandomApplyOp() override = default;
+
+  /// return the number of inputs the first tensorOp in compose takes
+  /// \return number of input tensors
+  uint32_t NumInput() override { return compose_->NumInput(); }
+
+  /// return the number of outputs
+  /// \return number of output tensors
+  uint32_t NumOutput() override;
+
+  /// return output shape if RandomApply won't affect the output shape, otherwise return unknown shape
+  /// \param[in] inputs
+  /// \param[out] outputs
+  /// \return Status code
+  Status OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) override;
+
+  /// return output type if RandomApply won't affect the output type, otherwise return unknown type
+  /// \param[in] inputs
+  /// \param[out] outputs
+  /// \return Status code
+  Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
+
+  /// \param[in] input
+  /// \param[out] output
+  /// \return Status code
+  Status Compute(const TensorRow &input, TensorRow *output) override;
+
+  std::string Name() const override { return kRandomApplyOp; }
+
+ private:
+  double prob_;
+  std::shared_ptr<ComposeOp> compose_;
+  std::mt19937 gen_;  // mersenne_twister_engine
+  std::uniform_real_distribution<double> rand_double_;
+};
+}  // namespace dataset
+}  // namespace mindspore
+
+#endif  // DATASET_KERNELS_RANDOM_APPLY_OP_
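Behaviorally (see tests/ut/python/dataset/test_c_random_apply.py below), RandomApplyOp runs the whole composed list with probability prob and passes the row through unchanged otherwise; a sketch:

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as ops

    ds.config.set_seed(0)
    data = ds.NumpySlicesDataset([[0, 1]], column_names="col", shuffle=False)
    data = data.map(input_columns=["col"],
                    operations=ops.RandomApply([ops.Duplicate(), ops.Concatenate()], prob=0.5))
    # each row is either untouched ([0, 1]) or fully transformed ([0, 1, 0, 1])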
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.cc
new file mode 100644
index 00000000000..fc81e85741f
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.cc
@@ -0,0 +1,97 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "minddata/dataset/kernels/random_choice_op.h"
+
+#include <string>
+#include <utility>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/util/status.h"
+
+namespace mindspore {
+namespace dataset {
+
+uint32_t RandomChoiceOp::NumInput() {
+  uint32_t num_input = ops_.front()->NumInput();
+  for (auto &op : ops_) {
+    uint32_t cur_num = op->NumInput();
+    if (num_input != cur_num && cur_num > 0) {
+      MS_LOG(WARNING) << "Unable to determine NumInput, ops in RandomChoice don't take the same number of inputs.";
+      return 0;
+    }
+  }
+  return num_input;
+}
+
+uint32_t RandomChoiceOp::NumOutput() {
+  uint32_t num_output = ops_.front()->NumOutput();
+  for (auto &op : ops_) {
+    uint32_t cur_num = op->NumOutput();
+    if (num_output != cur_num) {
+      MS_LOG(WARNING) << "Unable to determine NumOutput, ops in RandomChoice don't have the same number of outputs.";
+      return 0;
+    }
+  }
+  return num_output;
+}
+
+Status RandomChoiceOp::OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) {
+  RETURN_IF_NOT_OK(ops_.front()->OutputShape(inputs, outputs));
+  for (auto &op : ops_) {
+    std::vector<TensorShape> out_shapes;
+    RETURN_IF_NOT_OK(op->OutputShape(inputs, out_shapes));
+    if (outputs != out_shapes) {
+      MS_LOG(WARNING) << "TensorOps in RandomChoice don't return the same tensor shape.";
+      outputs.clear();
+      outputs.resize(NumOutput(), TensorShape::CreateUnknownRankShape());
+      return Status::OK();
+    }
+  }
+  return Status::OK();
+}
+
+Status RandomChoiceOp::OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) {
+  RETURN_IF_NOT_OK(ops_.front()->OutputType(inputs, outputs));
+  for (auto &op : ops_) {
+    std::vector<DataType> out_types;
+    RETURN_IF_NOT_OK(op->OutputType(inputs, out_types));
+    if (outputs != out_types) {
+      MS_LOG(WARNING) << "TensorOps in RandomChoice don't return the same tensor type.";
+      outputs.clear();
+      outputs.resize(NumOutput(), DataType(DataType::DE_UNKNOWN));
+      return Status::OK();
+    }
+  }
+  return Status::OK();
+}
+
+Status RandomChoiceOp::Compute(const TensorRow &input, TensorRow *output) {
+  size_t rand_num = rand_int_(gen_);
+  CHECK_FAIL_RETURN_UNEXPECTED(rand_num < ops_.size(), "invalid rand_num:" + std::to_string(rand_num));
+  RETURN_IF_NOT_OK(ops_[rand_num]->Compute(input, output));
+  return Status::OK();
+}
+
+RandomChoiceOp::RandomChoiceOp(const std::vector<std::shared_ptr<TensorOp>> &ops)
+    : ops_(ops), gen_(GetSeed()), rand_int_(0, ops.size() - 1) {
+  if (ops_.empty()) {
+    MS_LOG(ERROR) << "op_list in RandomChoiceOp is empty.";
+  } else if (ops_.size() == 1) {
+    MS_LOG(WARNING) << "op_list has only 1 op, this op would be picked every time.";
+  }
+}
+}  // namespace dataset
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.h b/mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.h
new file mode 100644
index 00000000000..f952046837e
--- /dev/null
+++ b/mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.h
@@ -0,0 +1,77 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DATASET_KERNELS_RANDOM_CHOICE_OP_
+#define DATASET_KERNELS_RANDOM_CHOICE_OP_
+
+#include <memory>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "minddata/dataset/core/tensor.h"
+#include "minddata/dataset/kernels/compose_op.h"
+#include "minddata/dataset/kernels/tensor_op.h"
+#include "minddata/dataset/util/random.h"
+
+namespace mindspore {
+namespace dataset {
+class RandomChoiceOp : public TensorOp {
+ public:
+  /// constructor
+  /// \param[in] ops list of TensorOps to randomly choose 1 from
+  explicit RandomChoiceOp(const std::vector<std::shared_ptr<TensorOp>> &ops);
+
+  /// default destructor
+  ~RandomChoiceOp() override = default;
+
+  /// return the number of inputs. All ops in ops_ should have the same number of inputs
+  /// \return number of input tensors
+  uint32_t NumInput() override;
+
+  /// return the number of outputs. All ops in ops_ should have the same number of outputs
+  /// \return number of output tensors
+  uint32_t NumOutput() override;
+
+  /// return output shape if all ops in ops_ return the same shape, otherwise return unknown shape
+  /// \param[in] inputs
+  /// \param[out] outputs
+  /// \return Status code
+  Status OutputShape(const std::vector<TensorShape> &inputs, std::vector<TensorShape> &outputs) override;
+
+  /// return output type if all ops in ops_ return the same type, otherwise return unknown type
+  /// \param[in] inputs
+  /// \param[out] outputs
+  /// \return Status code
+  Status OutputType(const std::vector<DataType> &inputs, std::vector<DataType> &outputs) override;
+
+  /// \param[in] input
+  /// \param[out] output
+  /// \return Status code
+  Status Compute(const TensorRow &input, TensorRow *output) override;
+
+  std::string Name() const override { return kRandomChoiceOp; }
+
+ private:
+  std::vector<std::shared_ptr<TensorOp>> ops_;
+  std::mt19937 gen_;  // mersenne_twister_engine
+  std::uniform_int_distribution<size_t> rand_int_;
+};
+}  // namespace dataset
+}  // namespace mindspore
+
+#endif  // DATASET_KERNELS_RANDOM_CHOICE_OP_
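And a sketch of RandomChoiceOp (mirroring tests/ut/python/dataset/test_c_random_choice.py below), which picks exactly one op from the list per row:

    import mindspore.dataset as ds
    import mindspore.dataset.transforms.c_transforms as ops

    ds.config.set_seed(0)
    data = ds.NumpySlicesDataset([[0, 1, 2]], column_names="col", shuffle=False)
    data = data.map(input_columns=["col"],
                    operations=ops.RandomChoice([ops.PadEnd([4], 0), ops.Slice([0, 2])]))
    # each row becomes either [0, 1, 2, 0] (padded) or [0, 2] (sliced)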
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
index 3bcba4b4630..00b4fa5efb3 100644
--- a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
+++ b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h
@@ -129,6 +129,10 @@ constexpr char kUnicodeCharTokenizerOp[] = "UnicodeCharTokenizerOp";
 constexpr char kUnicodeScriptTokenizerOp[] = "UnicodeScriptTokenizerOp";
 constexpr char kWhitespaceTokenizerOp[] = "WhitespaceTokenizerOp";
 constexpr char kWordpieceTokenizerOp[] = "WordpieceTokenizerOp";
+constexpr char kRandomChoiceOp[] = "RandomChoiceOp";
+constexpr char kRandomApplyOp[] = "RandomApplyOp";
+constexpr char kComposeOp[] = "ComposeOp";
+constexpr char kRandomSelectSubpolicyOp[] = "RandomSelectSubpolicyOp";

 // data
 constexpr char kConcatenateOp[] = "kConcatenateOp";
diff --git a/mindspore/dataset/core/validator_helpers.py b/mindspore/dataset/core/validator_helpers.py
index 8806babd639..1ded33a9f98 100644
--- a/mindspore/dataset/core/validator_helpers.py
+++ b/mindspore/dataset/core/validator_helpers.py
@@ -19,6 +19,8 @@ import inspect
 from multiprocessing import cpu_count
 import os
 import numpy as np
+
+import mindspore._c_dataengine as cde
 from ..engine import samplers

 # POS_INT_MIN is used to limit values from starting from 0
@@ -358,3 +360,9 @@ def check_gnn_list_or_ndarray(param, param_name):
         if not param.dtype == np.int32:
             raise TypeError("Each member in {0} should be of type int32. Got {1}.".format(
                 param_name, param.dtype))
+
+
+def check_tensor_op(param, param_name):
+    """Check whether param is a TensorOp or a callable Python function."""
+    if not isinstance(param, cde.TensorOp) and not callable(param):
+        raise TypeError("{0} is not a c_transform op (TensorOp) nor a callable pyfunc.".format(param_name))
diff --git a/mindspore/dataset/transforms/c_transforms.py b/mindspore/dataset/transforms/c_transforms.py
index 62496822e51..a2f9f5eee1e 100644
--- a/mindspore/dataset/transforms/c_transforms.py
+++ b/mindspore/dataset/transforms/c_transforms.py
@@ -22,7 +22,7 @@ import mindspore.common.dtype as mstype
 import mindspore._c_dataengine as cde

 from .validators import check_num_classes, check_de_type, check_fill_value, check_slice_op, check_mask_op, \
-    check_pad_end, check_concat_type
+    check_pad_end, check_concat_type, check_random_transform_ops
 from ..core.datatypes import mstype_to_detype


@@ -82,7 +82,7 @@ class Slice(cde.SliceOp):
         Maximum `n` number of arguments to slice a tensor of rank `n`.
         One object in slices can be one of:
             1. :py:obj:`int`: Slice this index only. Negative index is supported.
-            2. :py:obj:`list(int)`: Slice these indices ion the list only. Negative indices are supdeported.
+            2. :py:obj:`list(int)`: Slice these indices in the list only. Negative indices are supported.
             3. :py:obj:`slice`: Slice the generated indices from the slice object. Similar to `start:stop:step`.
             4. :py:obj:`None`: Slice the whole dimension. Similar to `:` in python indexing.
             5. :py:obj:`Ellipses`: Slice all dimensions between the two slices. Similar to `...` in python indexing.
@@ -232,3 +232,50 @@ class Duplicate(cde.DuplicateOp):
     >>> # | [1,2,3] | [1,2,3] |
     >>> # +---------+---------+
     """
+
+
+class Compose(cde.ComposeOp):
+    """
+    Compose a list of transforms into a single transform.
+
+    Args:
+        transforms (list): List of transformations to be applied.
+
+    Example:
+        >>> compose = Compose([vision.Decode(), vision.RandomCrop()])
+        >>> dataset = ds.map(operations=compose)
+    """
+
+    @check_random_transform_ops
+    def __init__(self, op_list):
+        super().__init__(op_list)
+
+
+class RandomApply(cde.RandomApplyOp):
+    """
+    Randomly perform a series of transforms with a given probability.
+
+    Args:
+        transforms (list): List of transformations to be applied.
+        prob (float, optional): The probability to apply the transformation list (default=0.5).
+
+    Example:
+        >>> rand_apply = RandomApply([vision.RandomCrop()])
+        >>> dataset = ds.map(operations=rand_apply)
+    """
+
+    @check_random_transform_ops
+    def __init__(self, op_list, prob=0.5):
+        super().__init__(prob, op_list)
+
+
+class RandomChoice(cde.RandomChoiceOp):
+    """
+    Randomly select one transform from a list of transforms to perform the operation.
+
+    Args:
+        transforms (list): List of transformations to be chosen from to apply.
+ Example: + >>> rand_choice = RandomChoice([vision.CenterCrop(), vision.RandomCrop()]) + >>> dataset = ds.map(operations=rand_choice) + """ + + @check_random_transform_ops + def __init__(self, op_list): + super().__init__(op_list) diff --git a/mindspore/dataset/transforms/validators.py b/mindspore/dataset/transforms/validators.py index 9fe0fa5f106..f44fd918eea 100644 --- a/mindspore/dataset/transforms/validators.py +++ b/mindspore/dataset/transforms/validators.py @@ -18,7 +18,8 @@ from functools import wraps import numpy as np from mindspore._c_expression import typing -from ..core.validator_helpers import parse_user_args, type_check, check_pos_int64, check_value, check_positive +from ..core.validator_helpers import parse_user_args, type_check, check_pos_int64, check_value, check_positive, \ + check_tensor_op # POS_INT_MIN is used to limit values from starting from 0 POS_INT_MIN = 1 @@ -180,3 +181,22 @@ def check_concat_type(method): return method(self, *args, **kwargs) return new_method + + +def check_random_transform_ops(method): + """Wrapper method to check the parameters of RandomChoice, RandomApply and Compose.""" + + @wraps(method) + def new_method(self, *args, **kwargs): + arg_list, _ = parse_user_args(method, *args, **kwargs) + type_check(arg_list[0], (list,), "op_list") + if not arg_list[0]: + raise ValueError("op_list can not be empty.") + for ind, op in enumerate(arg_list[0]): + check_tensor_op(op, "op_list[{0}]".format(ind)) + if len(arg_list) == 2: # random apply takes an additional arg + type_check(arg_list[1], (float, int), "prob") + check_value(arg_list[1], (0, 1), "prob") + return method(self, *args, **kwargs) + + return new_method diff --git a/mindspore/dataset/transforms/vision/c_transforms.py b/mindspore/dataset/transforms/vision/c_transforms.py index 8e3b7c72141..2de575d14d6 100644 --- a/mindspore/dataset/transforms/vision/c_transforms.py +++ b/mindspore/dataset/transforms/vision/c_transforms.py @@ -47,7 +47,7 @@ from .utils import Inter, Border from .validators import check_prob, check_crop, check_resize_interpolation, check_random_resize_crop, \ check_normalize_c, check_random_crop, check_random_color_adjust, check_random_rotation, check_range, \ check_resize, check_rescale, check_pad, check_cutout, check_uniform_augment_cpp, check_bounding_box_augment_cpp, \ - FLOAT_MAX_INTEGER + check_random_select_subpolicy_op, FLOAT_MAX_INTEGER DE_C_INTER_MODE = {Inter.NEAREST: cde.InterpolationMode.DE_INTER_NEAREST_NEIGHBOUR, Inter.LINEAR: cde.InterpolationMode.DE_INTER_LINEAR, @@ -712,3 +712,9 @@ class UniformAugment(cde.UniformAugOp): self.operations = operations self.num_ops = num_ops super().__init__(operations, num_ops) + + +class RandomSelectSubpolicy(cde.RandomSelectSubpolicyOp): + @check_random_select_subpolicy_op + def __init__(self, policy): + super().__init__(policy) diff --git a/mindspore/dataset/transforms/vision/validators.py b/mindspore/dataset/transforms/vision/validators.py index 4cb66133592..0f2bc2ce2e3 100644 --- a/mindspore/dataset/transforms/vision/validators.py +++ b/mindspore/dataset/transforms/vision/validators.py @@ -21,7 +21,7 @@ from mindspore._c_dataengine import TensorOp from .utils import Inter, Border from ...core.validator_helpers import check_value, check_uint8, FLOAT_MAX_INTEGER, check_pos_float32, \ - check_2tuple, check_range, check_positive, INT32_MAX, parse_user_args, type_check, type_check_list + check_2tuple, check_range, check_positive, INT32_MAX, parse_user_args, type_check, type_check_list, check_tensor_op def check_crop_size(size): @@ 
-588,3 +588,26 @@ def check_compose_list(method):
         return method(self, *args, **kwargs)

     return new_method
+
+
+def check_random_select_subpolicy_op(method):
+    """Wrapper method to check the parameters of RandomSelectSubpolicyOp."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        [policy], _ = parse_user_args(method, *args, **kwargs)
+        type_check(policy, (list,), "policy")
+        if not policy:
+            raise ValueError("policy can not be empty.")
+        for sub_ind, sub in enumerate(policy):
+            type_check(sub, (list,), "policy[{0}]".format(sub_ind))
+            if not sub:
+                raise ValueError("policy[{0}] can not be empty.".format(sub_ind))
+            for op_ind, tp in enumerate(sub):
+                check_2tuple(tp, "policy[{0}][{1}]".format(sub_ind, op_ind))
+                check_tensor_op(tp[0], "op of (op, prob) in policy[{0}][{1}]".format(sub_ind, op_ind))
+                check_value(tp[1], (0, 1), "prob of (op, prob) policy[{0}][{1}]".format(sub_ind, op_ind))
+
+        return method(self, *args, **kwargs)
+
+    return new_method
diff --git a/tests/ut/python/dataset/test_c_compose.py b/tests/ut/python/dataset/test_c_compose.py
new file mode 100644
index 00000000000..906d787f219
--- /dev/null
+++ b/tests/ut/python/dataset/test_c_compose.py
@@ -0,0 +1,50 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import mindspore.common.dtype as mstype
+import mindspore.dataset as ds
+import mindspore.dataset.transforms.c_transforms as ops
+import mindspore.dataset.transforms.py_transforms as py_ops
+
+
+def test_compose():
+    ds.config.set_seed(0)
+
+    def test_config(arr, op_list):
+        try:
+            data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False)
+            data = data.map(input_columns=["col"], operations=ops.Compose(op_list))
+            res = []
+            for i in data.create_dict_iterator():
+                res.append(i["col"].tolist())
+            return res
+        except (TypeError, ValueError) as e:
+            return str(e)
+
+    # test simple compose with only 1 op; this would generate a warning
+    assert test_config([[1, 0], [3, 4]], [ops.Fill(2)]) == [[2, 2], [2, 2]]
+    # test 1 column -> 2 columns -> 1 -> 2 -> 1
+    assert test_config([[1, 0]], [ops.Duplicate(), ops.Concatenate(), ops.Duplicate(), ops.Concatenate()]) == [
+        [1, 0] * 4]
+    # test one Python transform followed by a C transform; the type after OneHot is float (mixed use-case)
+    assert test_config([1, 0], [py_ops.OneHotOp(2), ops.TypeCast(mstype.int32)]) == [[[0, 1]], [[1, 0]]]
+    # test exceptions; Compose, RandomApply and RandomChoice use the same validator
+    assert "op_list[0] is not a c_transform op" in test_config([1, 0], [1, ops.TypeCast(mstype.int32)])
+    # test empty op list
+    assert "op_list can not be empty."
in test_config([1, 0], []) + + +if __name__ == "__main__": + test_compose() diff --git a/tests/ut/python/dataset/test_c_random_apply.py b/tests/ut/python/dataset/test_c_random_apply.py new file mode 100644 index 00000000000..8b4851aab57 --- /dev/null +++ b/tests/ut/python/dataset/test_c_random_apply.py @@ -0,0 +1,48 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import mindspore.common.dtype as mstype +import mindspore.dataset as ds +import mindspore.dataset.transforms.c_transforms as ops + + +def test_random_apply(): + ds.config.set_seed(0) + + def test_config(arr, op_list, prob=0.5): + try: + data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False) + data = data.map(input_columns=["col"], operations=ops.RandomApply(op_list, prob)) + res = [] + for i in data.create_dict_iterator(): + res.append(i["col"].tolist()) + return res + except (TypeError, ValueError) as e: + return str(e) + + res1 = test_config([[0, 1]], [ops.Duplicate(), ops.Concatenate()]) + assert res1 in [[[0, 1]], [[0, 1, 0, 1]]] + # test single nested compose + assert test_config([[0, 1, 2]], [ops.Compose([ops.Duplicate(), ops.Concatenate(), ops.Slice([0, 1, 2])])]) == [ + [0, 1, 2]] + # test exception + assert "is not of type (" in test_config([1, 0], ops.TypeCast(mstype.int32)) + assert "Input prob is not within the required interval" in test_config([0, 1], [ops.Slice([0, 1])], 1.1) + assert "is not of type (" in test_config([1, 0], [ops.TypeCast(mstype.int32)], None) + assert "op_list with value None is not of type (" in test_config([1, 0], None) + + +if __name__ == "__main__": + test_random_apply() diff --git a/tests/ut/python/dataset/test_c_random_choice.py b/tests/ut/python/dataset/test_c_random_choice.py new file mode 100644 index 00000000000..3faedeb26e0 --- /dev/null +++ b/tests/ut/python/dataset/test_c_random_choice.py @@ -0,0 +1,48 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + + +import mindspore.dataset as ds +import mindspore.dataset.transforms.c_transforms as ops + + +def test_random_choice(): + ds.config.set_seed(0) + + def test_config(arr, op_list): + try: + data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False) + data = data.map(input_columns=["col"], operations=ops.RandomChoice(op_list)) + res = [] + for i in data.create_dict_iterator(): + res.append(i["col"].tolist()) + return res + except (TypeError, ValueError) as e: + return str(e) + + # test whether a op would be randomly chosen. In order to prevent random failure, both results need to be checked + res1 = test_config([[0, 1, 2]], [ops.PadEnd([4], 0), ops.Slice([0, 2])]) + assert res1 in [[[0, 1, 2, 0]], [[0, 2]]] + + # test nested structure + res2 = test_config([[0, 1, 2]], [ops.Compose([ops.Duplicate(), ops.Concatenate()]), + ops.Compose([ops.Slice([0, 1]), ops.OneHot(2)])]) + assert res2 in [[[[1, 0], [0, 1]]], [[0, 1, 2, 0, 1, 2]]] + # test random_choice where there is only 1 op + assert test_config([[4, 3], [2, 1]], [ops.Slice([0])]) == [[4], [2]] + + +if __name__ == "__main__": + test_random_choice() diff --git a/tests/ut/python/dataset/test_random_select_subpolicy.py b/tests/ut/python/dataset/test_random_select_subpolicy.py new file mode 100644 index 00000000000..4248f9d0488 --- /dev/null +++ b/tests/ut/python/dataset/test_random_select_subpolicy.py @@ -0,0 +1,51 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import mindspore.dataset as ds +import mindspore.dataset.transforms.c_transforms as ops +import mindspore.dataset.transforms.vision.c_transforms as visions + + +def test_random_select_subpolicy(): + ds.config.set_seed(0) + + def test_config(arr, policy): + try: + data = ds.NumpySlicesDataset(arr, column_names="col", shuffle=False) + data = data.map(input_columns=["col"], operations=visions.RandomSelectSubpolicy(policy)) + res = [] + for i in data.create_dict_iterator(): + res.append(i["col"].tolist()) + return res + except (TypeError, ValueError) as e: + return str(e) + + # 3 possible outcomes + policy1 = [[(ops.PadEnd([4], 0), 0.5), (ops.Compose([ops.Duplicate(), ops.Concatenate()]), 1)], + [(ops.Slice([0, 1]), 0.5), (ops.Duplicate(), 1), (ops.Concatenate(), 1)]] + res1 = test_config([[1, 2, 3]], policy1) + assert res1 in [[[1, 2, 1, 2]], [[1, 2, 3, 1, 2, 3]], [[1, 2, 3, 0, 1, 2, 3, 0]]] + + # test exceptions + assert "policy can not be empty." in test_config([[1, 2, 3]], []) + assert "policy[0] can not be empty." 
in test_config([[1, 2, 3]], [[]]) + assert "op of (op, prob) in policy[1][0] is not a c_transform op (TensorOp) nor a callable pyfunc" in test_config( + [[1, 2, 3]], [[(ops.PadEnd([4], 0), 0.5)], [(1, 0.4)]]) + assert "prob of (op, prob) policy[1][0] is not within the required interval of (0 to 1)" in test_config([[1]], [ + [(ops.Duplicate(), 0)], [(ops.Duplicate(), -0.1)]]) + + +if __name__ == "__main__": + test_random_select_subpolicy() From 04eb4f89c76dabe1ff0838dbd9046462a869bb80 Mon Sep 17 00:00:00 2001 From: Cathy Wong Date: Wed, 15 Jul 2020 20:25:03 -0400 Subject: [PATCH 11/68] Cleanup dataset UT: Replace save_and_check --- .../data/dataset/golden/batch_01_result.npz | Bin 1509 -> 1961 bytes .../data/dataset/golden/batch_02_result.npz | Bin 1325 -> 1597 bytes .../data/dataset/golden/batch_03_result.npz | Bin 1509 -> 1871 bytes .../data/dataset/golden/batch_04_result.npz | Bin 1597 -> 1781 bytes .../data/dataset/golden/batch_05_result.npz | Bin 1509 -> 2123 bytes .../data/dataset/golden/batch_06_result.npz | Bin 1509 -> 1727 bytes .../data/dataset/golden/batch_07_result.npz | Bin 1509 -> 1826 bytes .../data/dataset/golden/batch_08_result.npz | Bin 1509 -> 1781 bytes .../data/dataset/golden/batch_09_result.npz | Bin 1509 -> 1727 bytes .../data/dataset/golden/batch_10_result.npz | Bin 264 -> 264 bytes .../data/dataset/golden/batch_11_result.npz | Bin 582 -> 767 bytes .../data/dataset/golden/batch_12_result.npz | Bin 1509 -> 2123 bytes .../data/dataset/golden/shuffle_01_result.npz | Bin 1507 -> 1691 bytes .../data/dataset/golden/shuffle_02_result.npz | Bin 1507 -> 1691 bytes .../data/dataset/golden/shuffle_03_result.npz | Bin 1507 -> 1691 bytes .../data/dataset/golden/shuffle_04_result.npz | Bin 635 -> 819 bytes .../data/dataset/golden/shuffle_05_result.npz | Bin 1507 -> 1691 bytes .../dataset/golden/test_2ops_batch_repeat.npz | Bin 3137 -> 3602 bytes .../golden/test_2ops_batch_shuffle.npz | Bin 1914 -> 2360 bytes .../dataset/golden/test_2ops_repeat_batch.npz | Bin 3149 -> 3614 bytes .../golden/test_2ops_repeat_shuffle.npz | Bin 3831 -> 4042 bytes .../golden/test_2ops_shuffle_batch.npz | Bin 1914 -> 2360 bytes .../golden/test_2ops_shuffle_repeat.npz | Bin 3831 -> 4042 bytes .../dataDistributionAll.json | 8 -- .../dataDistributionRandom.json | 8 -- .../dataDistributionUnique.json | 8 -- .../ut/data/dataset/testPK/distribution.json | 7 -- tests/ut/python/dataset/test_2ops.py | 34 ++--- tests/ut/python/dataset/test_batch.py | 48 ++------ tests/ut/python/dataset/test_datasets_clue.py | 4 + ...enerator.py => test_datasets_generator.py} | 116 +++++++++--------- ...reader_op.py => test_datasets_tfrecord.py} | 0 tests/ut/python/dataset/test_iterator.py | 8 +- tests/ut/python/dataset/test_shuffle.py | 18 +-- tests/ut/python/dataset/util.py | 82 ++++--------- 35 files changed, 117 insertions(+), 224 deletions(-) delete mode 100644 tests/ut/data/dataset/testImageNetData2/dataDistributionAll.json delete mode 100644 tests/ut/data/dataset/testImageNetData2/dataDistributionRandom.json delete mode 100644 tests/ut/data/dataset/testImageNetData2/dataDistributionUnique.json delete mode 100644 tests/ut/data/dataset/testPK/distribution.json rename tests/ut/python/dataset/{test_generator.py => test_datasets_generator.py} (92%) rename tests/ut/python/dataset/{test_tfreader_op.py => test_datasets_tfrecord.py} (100%) diff --git a/tests/ut/data/dataset/golden/batch_01_result.npz b/tests/ut/data/dataset/golden/batch_01_result.npz index 
[GIT binary patch data updating the golden *.npz result files omitted; the modified files are listed in the diffstat above.]
zKop%6>=)RBy%co)*3)fwfZMs94ss{A$sumy@((lLMRzmWrD_8EupeDGKrMO%4&o3F zqnnam0S!Iq#Su!52pmNpjv+?LQ2~K|4B$8=eFAZuKmsQzIrcA)&eL%1#XNh?_FZSf zbLQ;VHP!E#+Tv%x^RvauanH&Y7jc0>oWc-J;|$JX7$aougup1qFphILj|;en2`VN8 zbX>w^OyUZzVhRQoPYPVabtD1Yz%){rq2i!G8aFYETeyum+`(Nco)VbH0`B2H9w38< z$Wn0#rkRnakt02WJn2~!NQYsOj$n~=6pu;AutYkJWzutaLV6xgNiSf9^rG?kd!zA% zUNK50jESGd-&!-#4L#r`mkhoA>l=6GvY~f+Gn0ni_2I+E<~>&oz1v%M)zEwY`sYz| zX3Eg}yn75okNx=cr8#rW&<8xgb;H=SlT0f0MuU6-D(MaQNT=Z^O`(N!1_9DEf}}Um zN;-=W=`DmwZzDoFH{Ezmmpe)7R;S=Dv*ww#z^r@By3ec!%*rt9A+xf~dc-W#`9>XD zZu7&WEYHXSBP~WQGV(Daml(Os$R~_^%E%Q)uD%O7A3qI;TTIf1ogJ4LS!SeUq|L|* zBdd&DW8^v`|6t@ZM%M7!w5sxXE|G@ni~9fTWTpq=j=FOcZJyI<1cFXMrQZl$|Kzj! E7hs^8WB>pF diff --git a/tests/ut/data/dataset/golden/batch_09_result.npz b/tests/ut/data/dataset/golden/batch_09_result.npz index 6e8e923eb90df1d9468cb19b9fd5b64a5ecd0d57..5b1f3e7971ae91038996a92ba5bcec83e116c127 100644 GIT binary patch literal 1727 zcmbVNOIK4@7`->)mG}S^eBgtCVh)Irm%MxsMY)`z+RgzX>m`B$g? zJwE?@#?IMhI+n4MmeZazQ(24q*#&di;=LH{@%4B0_@4Ow|MC{xiS)|Saz2pA*j8X^ zC6!y`^O|{Y%(fDlbT((NBywKTN>6#y$lxLRA5G@+ z%a)hwqb~k8r$?v(8uWGY%#c^W9o4zJN?XF=aHz;j0mgH*HU3=EHWBsmJdsJovWw|l z-waBaB)mvH}+V35)ZRECz(%W%pCPd zIxZ=|hdE*V{ClzZmZyB|USs?(<40d<(`N*0E1P~J*!bI8c`{%GTPl-5BiQ=p&G)6x zoHT-MmB)sRV8?eq-78IojbLZxGb2XO|Hlt+OOsI}*j@R8F~j)hI92lXC+U==(~{2c ziO+75udLj-Qm!moP>EL7>zvZ7EY*1>RavD8rIHdz(glIN*oRi^XW1?a9Kb;wLK|-; z1$<~n2M+Vb5IBNP9EG1Zmjnd5(2Zlfxh&9w;|Sn{BX@PYh6X7l36iex>8IX#&w__l z#->%qA}V87RmNsi#;&Q1T~`^KRT;aXGImpCEV|L)B%7CtS~cT9Kg~(HCFwRZ-U+eJ z1tybz-}wP;iu6E|79=f7dcYhH|K*7jQ~(80ThIAc%7~j|p7BMNGorW=`M|E+Yis3Z@XoG&ffSBDjhfT*GzD;s$PV^N~Ol zbGU`uxC4p1h*kf@oGWF7-jnpcq&$oBL5MFD=SF#~3|LhLJW&RGs0?_j3@9iAK2ipJ ztPJ=>8StqxU=5pGjxVN7J5SFftxNihNuI-Gh7H6SKSzS`1(J*}VKIJzdB!hspYba! zFuuYfmEE`WC2x`@PN7r<1m-q%x^PQu#|QtG2q(K^3>N%<7T zz5+@}F)SJW!rQa%L%OMMlDrr_+&WII>W%%9fE& z7p$CVmnV(ol1cT_qER#{7sJtTKYa&!qTy%Z{~N)oH=AEwDOO_Hf@Q{5R+r05D`tpbf`qF|eV9{~!s3k0!=f{;KnHdD|humvp?>=0c6+#;(`heva+>VpCJui;^Br^tqaA8kU>A0y9eb!nr@&t9!+vy7 z(j^c^C%SNek^=$<(TzihP;yW}pa;D;Oi8yu6h{!lQA!T|<;!Z1c~3a4=fXE92}n1F_J zIFB)mV*-=Vsd!Z20xlvB;1Z^gz%&*61d_On8C=0t%;FlZQ?XxQ4)eHyo4AD(ZX->_ z0T@O~9z%xoII^TCkRv?_lXMUZq(it%I*diq5iF6O!adT{xKDZp%cN)Z&p&Fluk;tA zWK#b6Ywd5}zOUaiu4^6cvI$-5diUv0eP&YE zy4`zpU5otu@U=d3LDza+z(rkOw-b*m)mn{w0V?Swcu1$&%*C);zOrFzY6>ZZRvxtlP{=GwTks4Er6mX_@sG zld>!$bBr_@xxmP~j9g^o5+m<1@;)P%8M*SxZ@>IBY;HbI8@6{`U}TYzl93i8ON=Zt za+Q$}82ONqj~H3ObHgml$C+3Xswd?A*U3x|#2HolC|Wff5d Bn`r<5 diff --git a/tests/ut/data/dataset/golden/batch_10_result.npz b/tests/ut/data/dataset/golden/batch_10_result.npz index 5568b7e0cc208d6550f2d69b667437074e1246a7..426e1567800aad0b501708d1340554f496d24886 100644 GIT binary patch delta 47 vcmeBR>R=KI@MdNaVSoTd1~ZT7;)z0OYz8`-ItsND8!abp7hwSlp0@-5&<_hY delta 49 xcmeBR>R=KI@MdNaVSoTdh6j7nnI{URaaiam7-;G!)J|-)oVZhn1uT5t5&-k%Z6)m4)4gH?gs{ zk=O7IEPQ}!6d9$&+~ews(~<^8JAtdDj`N#BdlA5jo(hfrX{%WIr>kS^{3(aEVc;SgdM`} zbh;*(Wz5*vBg6^&goC2apJmL{a7Z{J91~7dx}`etm*8|lKsf6w{oE3M5pW0>mI#VD zXM#2+tJa&c-9wUaNk|dWLdD5)NCoSy-8EXW1DlW`WC^*#=+pB)l&DNj2l;BUBn)o) Q{yWt$#uLF8bv4Gm0pa_2!vFvP delta 288 zcmey*dW^+1z?+#xgaHB+8SZuLIJJ+FfdPa$8AKQoi;Ch6^zsTS85sn?f*^(a7$=&# zav16;7;5S$)J|-)n7Au|h0&X_eKHTDdwpynpGFr4)WimF#?;tCevQ~d0dzq@4R0oI z=2W1x5Kvkeh(*wqh{BbK0j0$?yji?if$AkRyxF|jQ$boKy%Q5-3#HtTzyJUL0npCG zSo=b0FNTKy|AE3XUOKka88xy)*B9=iPJ9%qvEGLYl*j0gl4K`wt&X zb~yAm)sg1#g+gVcGpcJFoQ{r>q+X~e_1Iasu%LK})3M&M!4nA9`9q!wZqLRdPrBPv zQ4

eAQ(&p+HclXZfn?f_A=cwXZg4$7+1KJ8M|Fdz1VB4_9Nnzq)>1Z9|5?CKSwA zS6>ybwE6lPTxFr4zoxn_9IE$+U38dLuIhl9r4x=VjGOK(XZ$uMlrT}jiB!aGWm5Cf zK)9hc=wfndoc)+ZxlDlr&TULBp-UmY)M;lovQ1%OVSbZ6B0tCrvu*iyVLMgovgiFZ zRb_RR)#1z>wik9#!j8gD`Hse>CZA8(S=a?mjnaf&5vNf%VRyu9)I-=42^#eh_C}&c zeT03Hq)|U%emWloN+AJ>Iv6d?R7gUy4zh&V3Moj{!5HCK1sB@rAV)Y( zp)K0!AXhkEp*=e2V1jU>LPvDc!6e~ih0f@rgDJv1g*0^4fmb+Hp&PmfZPuwaGhN4u zsv2LI(}dG);xmLZtwYC}nvU4B`Y~q-Ei3PYwa?w0m@S-Rc@txad?74vhof7fE9MFd zEN|D?ib7$L;!y72tBA*iW}YSay;uQ8iWyeGxUHB9sgL5fP%b4+w*i z+DA)XAzT$Hd9>22g_V&?n*s~h2-hm~Ku`2SZ(aT>;W~vr=!<^nuM^e68U;57U?2wR zM6HktgE0g{;n9hZuuef?7=~knPK1T^3h5Y$42;q;wzfmg7_JwdVn6Ar!iHG?X(3M& zZZMwMDBNT`ak_A`@kFEW4C9G2g=ZO0oGsj9Jh4@{&3NJ*;dbMR9l~>sC(aX|Z#;2< z@WLG_+O_tGSfe$&>?~d+yx3-UiEwAWtz)AmcB$|(L+o23 zw!=Q*-7W2)9T@F`dxZD4bb+>6RQ&sd_qT}OTJr~l54LFDTI&76hgzhLDJ(o7d{|*L zGLeOBjKNssV4UuSM}&_mEOHVu@d{SXH z<{%%yToj-XMS9{X;nNDmn1>R~#{w+GA}rPu&j=4Hlwt{%Vi}f0umWYRw-3Dy=ki(M zb9N!07aq#DH{r-G{DSaBBkW7UmyNKm2wyeAz9xL#2>XWcO(X1E!ncjEhlTGLVc!+L zXM}xU_<<4jL*Yk8*pG#u;G|9~udMck8u+R3Gn>#6;pgz_Hv2;OCCWAWO87PW8hs=D z76FaE6Mm1NMn4FTqC%q|g+F1HMn4OWVYNobg}XlWiY*Q}AfDe8OIs{q}pkSXsE7~a7FAzkCf&&6!?4lqd$ju%KItBKkoq~e`9oR>~A%Xok zK*3>w2s$Y^B5)9gDCqi6Pq*DsZs&G7#+}?I$GM5iKf!z#-OXs18WlKm0JbP;U zt~22|Q~PyI4S1$@_!;#4?67j$v$DfQTwn;p7{NK5#|2!(C|NrrFotnV;1Vul5?3%q z#e{%?tGI?~T*nM%VN&s|zzy6)62L9YA%%G=4hf`j8w3QTyFQ7nr5jN>4R!GP2lyn@cq!U;py@Y3^m+_o*66>T_%+G%{ z8e8-iqh!jQ`p5hjZ6=0k1ia*`X|(_O+MT&(8lB$Ev}ts`fB#4Gnd_#}?Jb)zjo#ls zJ!#I&nnu6(jAXKt9rpa-~Vn%j1gNv+;!kS{pyw2;mtK$=D? z>20);E+9yH2O-kC2$L?(HD1x}UXr@iDY(z9C1x!%>jASKGV2kuGR%6+tSqyhFw0Wk zsG{YzKTOKnT4TxaCQyP*2`X((E-|vqNXbZt zkrhT(8M(>G7mWOYkv}rBhF6wdl|SVYY3RO)|9dAhy%2ZQ)lsy4PNxxQRRx`Xqjdj` H&+2~xalf0d diff --git a/tests/ut/data/dataset/golden/shuffle_01_result.npz b/tests/ut/data/dataset/golden/shuffle_01_result.npz index 467ea74a4c48094f9294ee43a993c9cca451d65e..589afc1271adf03c2eefd2083a2864be85a35ab3 100644 GIT binary patch literal 1691 zcmbVNTUQfT7@bK7B=G`D@PZctiaB765RjV}Mo~tyB}5br>KG@t1mrkWzTmxIo~<^+jAxbZ(B%ej((}W z>HoF6U6SlyKnh8^X=X-ZrAkefxZMHC%o}!iUN2S+idWb4ia|b`j)vny(eT^w{~y7+H&@zNuhe3>vT4NDH;Pt) z_N&)|8Pmv>OI6F<$XP)yHD?7&`DT{OkhZvIPR?@gHg8+vJ_Z>9c;54NXKLQ6Rg56- zNPFm*i&NYW3Gy}%Eb$hA7JSq<#Xj?rdlXjqZ9m~ z#ygqDyWrI|eh41I!wIS0XpqqjnJ@%DVGja?N6v9aU30ly$BMXfKNBl zb_eVVqj`Q(}(M`r# z20sF13^MG;0W!`p9KlgCA`A!7Nd{vGp@)nih9LGCMmj)C+q+mTmUW8{Yy3QIB&sT% ze{6**MZ3MADn9jR$BC)G{n=>z-ppK7Xa9CHBdRj!I-{x*`RgZVT3l6n+{~D&^tnrq zt7>btOR6&P!!OU9OHZiE@W+ooHl0aT>2;hIZSg#x()cv(c1E1nIpJl&f;gPUuk4}E zVeYCR*J1h^Lm2%Sz!@TCmfaNh#g3we_XlhYt zDz)RcMN7k$);%GuB_XZ*LR!1UI~L~;G`>vL9*Sy@M74B+sIAkVS45%5qRx*grs%vTH}7)2ao7{?_{V3KAQ8PZt6 zV?4oAXn2MU1=ks-FpU{p#uZ$}HOx}5#E`;0Ea5&LU>Ohbh=OIe*{&Q~uakXkO!11w zIT2$D;;VvqD?yireQRt8;$I8mKM}-#Du}NM;y;7U`WwOereOW;p3Y_qCEcv?&o%yz z3e=IO*_$740uaG7D8U?~Fyd-=NgK!%!2){v*@B`KeckqhvTdeAaVe#*>u@q!a f$opU46OxTH>h=}d`nsgx6W8MM$2r} z8n;tr6&uSc_p_dF;MZ_Ji}lDEn^J2DZDyqt|h!;#gZ zU9mHTbkWXRDmd89v&HphF^vM_X^g$xx&_FsT#`_Z7a69wNY88 zelyiz+O~4VLb+mZ2l zfe<>$I3^IpJ~Fzv*hwE}gp)b(kG-93PjY8Yo2LZA=s_<|6R2CD5B)fU2svTy&S|Vi z;3$rv3&$zbD-gvw#BiRR(*h@O5~t8jPM?6l00wcEoPKW9Y5$B1*i#;Hw@@QFmEvDL ztAljf9dM^p=i`h#C}7|QZej|za2wMwY4@`NcW@VR0QWG11ZF81b*J4k=sAHQ3}XZr za1obq8KYDg6IjAB9$^KKk-`(CDR^FB4C9!<6NXPMvbOIZsSFlEU6)#AyVU_f{X|(J#8K48{EW-Y~Ovl<1Lne~WSE6jS#tQ51Js4r5{(oB2Gv<%a- zOv^DXzxzp2t;NVyMm}TY8Y9;k`TV_Ked@Fn_XTq|n7hf`0&|PZEyZbIHL+x*&B!t% zD~#M?Qz5zNxBql*UFm9*B>U%=f|72U>EURxT$QCxv1upTLEFAfEX^mEW$BglI-D~qSu;Ex3U4li zBcbqW$+S$pm@b()!``0L3l)R*D{Fe$pf#I{gkpn{(3{Zzw?NICEpDurtI=%9G@|Pp z1uIYI)vJNDX=F>qie+wOtpJxAS%G4%5#=(ZE$*3>Gu*q)TNZgMgA6}B?|7S&nzO29 
zBf#5J9(v4$N$!ILd7Jwed4R#2l4*C7?@J^S@j96R%vgMX=Do%buvCEJ*-{~0$rr7W zaeh$a9Zch$@aY;q1ex$~T&mUU~{kD-O|I0A${@DiRt zt8S$13D_M*a{Q#mPf_*L44pV+XuOxf11~utw&SBT+iH+$qmSV*y2$BgID(_(oMAYK z4sy;iw4jxo0fzlJK+ZV^AN=Hm8G`60hcUFFot#020QMP1%1@>3Q>+$By2Xbyex42z zQI!Gprx1ncv=`KwzuiSt{pF8({kKNsqN;@d{J~iat7DL${xPT}w(qe|;Bu?QpdTF5u@&rp7zs_(B$I*im6uBYB z+d@uvgq-5dNfYT1n!3B=v&Bhf zOKV<8DOWfyS4p+e6Xqk?59+6VEj&x-1$!w)JoK zcp`c{6+JZ3V@32xiyqHJ4}Fh(M{_)*@hsKPF(|l>8<@pS+`=4GqQr1%v_O z*wS@|_MG81?jR1}F6NQI0+m@~NMRX|@dQtyVFhX0%ri`43e&iZE4Yeln4!%V?sRO; zJ1@yghGC2#hEa^+62>t>nFUujy9BY$kiT*^OR_XUfrME71C%+`J^%m! literal 1507 zcmbW1OH&hB6vw+00!c(c4LUmWtAJtD)+OVZ!lkD_Ooy{uH(5UT@6!HZoH}d``mL*e)pVH>6?PTC89M~ul9TG z?dO$%rm1g8i)dNL$qdDd<(jT_vQs6hpjvN}i*u=EU3;$m94*+Dyc3-;qA%v7F(bNK za;i?Ym?=30Te)Ym8x@=Um9=cyro9-C8N(w3G2<8GyH~jG&lk5g%e8pEZTT|wXz+S zZA(6CbAC()p+SEwLklu2;9t_o`$FzYCXf0b zj85?;sAfj14xJo(1@>V-8T$kd;2;_M1@@qWjEF!B0%UXvv|~3J2Lyr$k#SHUf-W)+ z3ACb(jKcz9>>{IEz(6nhaDp~R7~!TI{mSkI>AIpj#sBTIS#Pf0IggLD*Yq+`gFj^hdG1Z>hvct&~|1=1^6CB15y z{nj4_{h??xX-$6dHm3F0=SJi2X5^Y>M*scfZmwI_c4W#j`#${nxS2L>nE?+lW0^hw zyfm7TS(izJLF*8M$GZ9bVc^%k1_d2?VJqKqtMG(CUo_8FL7cCg&Qz(qkTN zq$&7FZ=;oT0WG9=5GK6~Kj|U@r1zM0pIJ)@I=N2HWoA84Z=Ry9Fzq4J(oB2Av<%Z8 zGcC)s9MkelD{Q}##IPB;`X;DeXyDERm#Z%CI5&yUDWg3FJ>gp=mzNJ$Lv?_;ApHX_g H`G6I{m diff --git a/tests/ut/data/dataset/golden/shuffle_03_result.npz b/tests/ut/data/dataset/golden/shuffle_03_result.npz index 6a6e62f3ffa75ce7a915a0f92fccaef73dbe3933..297b54d9cac8ebe73a18a777f816f1689f22e7a5 100644 GIT binary patch literal 1691 zcmbW2+gB4;6vk%~0wi8Q3109*Krsi55dw1a!YImUK1m{q26c>+WRPe`awdaqD%RK* zskCo>@};Y5^}Wyi5BhKT-iP+3Z(S?hb21&$)zz1tweqcf&OVua^4nj|q~L7}O3g7K zC3^o%_Owfq{qajdNjJ^RXtY$R$x^piv^UyGdwiEzUQDgX(o5-8IB!&QW_UIf-b{ug zq3~+iv`oE}DVur2US80PRfE>6YkI|?Ih%=uV#ATp>(Kunfx0(W+E}mDqPenZMAtWp zR)Nl|*8&;S$dyY~%iPFW0WLLn1xop5mdlW~xMyC@a_=^8TjD+j8Gd-)@pflx-l|oM z0Pje9=w~LUc_$>u+uXmz0}S4@OskuGPa=_sH)sgKoW=KM-)npyO9v>QD;G1>LdhDN zm+Fm%uJHqqNe{w9+5<0X5N)KrXeT`cAL(Iqk{&?^=~4Je`w$>Kw%B;1 z8)>@(c7?G#Kd$i;^z@T#r;`@#QHN}3yr01XF9jinHuxwQU}#4N1%nKo@KbP#VK4Sk zaGD{2JroQvbmIU8XBhUQi-Iu2LG)0-7=q}fV3^?$4jV?=PsQw0tQO0<#YZ%LmJSk8 zfBCJ^_(&-_?m2bt5A~-|a}re*pF26PDqX*BIhj#a>2)(>s?zUfVye>j;lmHj2aK!A zkh^U{RR+KR`DrtAK~=)P|M;;vnN*b#_d8Rn`uA~mvBmOyTH`Zx+Sz|#bBJ9O#2i?c z1S|*7WdY0~b49Rn=*%%3!BO<#7!gtgDnUu(R~bSWz#vXh<{HCsoWM!+(`26EG=^{n zVVYcLU>L>-&eG%tLj>m##d*TqYNyEwzMyfHPJVN@-vs)W@KjuQ3c^#jg{KyUrxLjcM4{YOOXN`2ivqx3ktfzx!VI5zs6rMD7cDin8$V8zyeg-S!9^T3}$f=mv9+Z zFh`5)3^#ELaR9fmhy;?fSYk-w4wi5i_pprnctDF~hBQ|25RdQ}8lE82@^`YYjcH!d zI43Y>oGuOf)~E{bmH@vYz`qpWKM~+R72s;@@VYDaf9n f_rKmJWFcqN?JIQm?UI5|yi;UJmZm6?5VOAkB}>u} literal 1507 zcmbW1%TE(g6vk&-3T;I}Ek5u;L9rLC74X5s2csyX{iIYB1+ms?I|vGG@3g4(5xY^N zTN4vE#>BmA{{#Iq+_-X~8xwc+%;{vn7&qRg-<)&q?eBc|rgM|kRc|)GvRtghvIWbGEw3z<7s;N)^ge=^0Zq()(OCOi7=BiU69Q$wp^su3RaaLD@9p zp>0kMNk0^*&t+gr1_it`DpeoJb%{hGUZcT+(XwpLyhzFQA|*G7mwx+%XjGw^DcK_6 zftP|-fd=>}*eKA5CJHtQ_z|FBvp_S}QxFmeVjTr-0_V4FY) zZ4_)5*n+JTwExl5X}5#hIh}TLCx^X@yE)k1+{i(92yDZ4v||TR!`#Pd(J2r{C%UkQ zX1WA+Vi$I!gOWW0d(n-3h)}XuK%fV`*iT8fKokcM!$C^+{pHa<4aZ)@wP&A$<4m~D z?EN}EdR&KVEcCkO)>zpu(1(5u;1CYu2##WqTtr=eYpfj*&~OT;F@`fZi*e|*GA1yD zVT|Ayj^hMQVw8#p1h0i*Qh6IdcWsek%jt9_xr z7$u|n=uiDaxSnXb=5v!%y4LdTsWWq0*V^2fFVUJl*6l7E*R`&< zA8*!YbX|-5c>krIoYS>l_c!Ntebr7pu2gF^@&%}*7vLeCfR{9Z2GU71k|yCJy$C<) z6q-mcAwYT=LDK1o+7o(QiBq>K1y`9h!>n0mU1Qc9v#v8M#jG35N;B&wvkdzkwP~5v z7n8CqBXf*289C3$TZ~*_@~G;D4@P8+s&Twr98k&=-XBTI}d zGjfHIcNuw)k@p!{!4tzQ%LkcQ5~?TU{nx!r2jYyXeHE?V(sZDWc0r}jAU$96P5m3w C8l2|< diff --git a/tests/ut/data/dataset/golden/shuffle_04_result.npz 
b/tests/ut/data/dataset/golden/shuffle_04_result.npz index a3b9469f9cde16a39cd991b923f5817771b1c992..704cc82389786b0f5ec3de32057361a52b5d5c6b 100644 GIT binary patch delta 491 zcmY+B$xZ@65QcjeheZX{!Ce;DaRFI`c=L#%E*!aZNRNyO2>}YZaDuZ}9)rZh_z=E; z58%nGCleD7V>J?CD(R%V`paMSSN-TOMxg?lZnD2fGW}b98>@`T6JRD2-EJ+DZgu(! zi%gOd9IMZTm@=pa>3-L4)*c$IUbcuKj$w;qq|An{D>$xzBAlu*g|c;A<+w(GX+8R- z48;%>hhx+N6*R}G1}G&3@0)GW!*z}ugm0E!KVA2ozDRScbn{huO2|iark$(fHpd-; zcSjw?9PC-3!vYxKBhmRTi&q%q7$-0>>w#pGlUC=+d=B?H9uRq0rfNfijy!(ybv)*H zLV)G*Lskabe|e6l7Wg3mL0E((ScVXU=Zwn9UE>+YbJDpe)7<3XQl4(?MoV=2nC5s% jPzD4MS+IrU^e|_qE1;;R=GmZcy28<|nle6UlZ^cWI5~W5 delta 296 zcmdnY_M62tz?+#xgaHB+864j-EdR{NzyQLW3?dAPMMd!jdU*wvj0^%`L6E}Fj2lhm z80}*V`82vXpjsQe8B=2m`8B+myqSRv0TdaiprD2~i#Mw`n>Ra9h0x?ICYgF+4Q~!_ zPH!%6Zf_oMUT;2cexQ_ycVc2}p{REfkQ8%2{{H{}2S8hZ9C3I1ukPyBql*UFm9*B>U%=f|72U>EURxT$QCxv1upTLEFAfEX^mEW$BglI-D~qSu;Ex3U4li zBcbqW$+S$pm@b()!``0L3l)R*D{Fe$pf#I{gkpn{(3{Zzw?NICEpDurtI=%9G@|Pp z1uIYI)vJNDX=F>qie+wOtpJxAS%G4%5#=(ZE$*3>Gu*q)TNZgMgA6}B?|7S&nzO29 zBf#5J9(v4$N$!ILd7Jwed4R#2l4*C7?@J^S@j96R%vgMX=Do%buvCEJ*-{~0$rr7W zaeh$a9Zch$@aY;q1ex$~T&mUU~{kD-O|I0A${@DiRt zt8S$13D_M*a{Q#mPf_*L44pV+XuOxf11~utw&SBT+iH+$qmSV*y2$BgID(_(oMAYK z4sy;iw4jxo0fzlJK+ZV^AN=Hm8G`60hcUFFot#020QMP1%1@>3Q>+$By2Xbyex42z zQI!Gprx1ncv=`KwzuiSt{pF8({kKNsqN;@d{J~iat7DL${xPT}w(qe|;Bu?QpdTF5u@&rp7zs_(B$I*im6uBYB z+d@uvgq-5dNfYT1n!3B=v&Bhf zOKV<8DOWfyS4p+e6Xqk?59+6VEj&x-1$!w)JoK zcp`c{6+JZ3V@32xiyqHJ4}Fh(M{_)*@hsKPF(|l>8<@pS+`=4GqQr1%v_O z*wS@|_MG81?jR1}F6NQI0+m@~NMRX|@dQtyVFhX0%ri`43e&iZE4Yeln4!%V?sRO; zJ1@yghGC2#hEa^+62>t>nFUujy9BY$kiT*^OR_XUfrME71C%+`J^%m! literal 1507 zcmbW1OH&hB6vw+00!c(c4LUmWtAJtD)+OVZ!lkD_Ooy{uH(5UT@6!HZoH}d``mL*e)pVH>6?PTC89M~ul9TG z?dO$%rm1g8i)dNL$qdDd<(jT_vQs6hpjvN}i*u=EU3;$m94*+Dyc3-;qA%v7F(bNK za;i?Ym?=30Te)Ym8x@=Um9=cyro9-C8N(w3G2<8GyH~jG&lk5g%e8pEZTT|wXz+S zZA(6CbAC()p+SEwLklu2;9t_o`$FzYCXf0b zj85?;sAfj14xJo(1@>V-8T$kd;2;_M1@@qWjEF!B0%UXvv|~3J2Lyr$k#SHUf-W)+ z3ACb(jKcz9>>{IEz(6nhaDp~R7~!TI{mSkI>AIpj#sBTIS#Pf0IggLD*Yq+`gFj^hdG1Z>hvct&~|1=1^6CB15y z{nj4_{h??xX-$6dHm3F0=SJi2X5^Y>M*scfZmwI_c4W#j`#${nxS2L>nE?+lW0^hw zyfm7TS(izJLF*8M$GZ9bVc^%k1_d2?VJqKqtMG(CUo_8FL7cCg&Qz(qkTN zq$&7FZ=;oT0WG9=5GK6~Kj|U@r1zM0pIJ)@I=N2HWoA84Z=Ry9Fzq4J(oB2Av<%Z8 zGcC)s9MkelD{Q}##IPB;`X;DeXyDERm#Z%CI5&yUDWg3FJ>gp=mzNJ$Lv?_;ApHX_g H`G6I{m diff --git a/tests/ut/data/dataset/golden/test_2ops_batch_repeat.npz b/tests/ut/data/dataset/golden/test_2ops_batch_repeat.npz index b4346fd796775cc864759d2729741b31615ea0d4..cba3a7fa01d24e67810ad5b4c89526c333049b92 100644 GIT binary patch literal 3602 zcmbVPd3;pW6`sjPGVBocZCK|5A&>+@SOWwIT_~#kF+>wbrH8_?>g-z5#we|8e-8cfa$U<(?DHy>F(LGPCjn z<}*6bFYC#Q13d!)mb^fIAgXo!w8BJdM=;PwNOoi+e-2i!Sy{V27}y@z6^XlTF&&v5 zj_j_96oeyAiTV~DcbQ%qZEkbHw{4ELy3kcwLAZEgL3mI2%YQ5SWBWUsoP`8kQXpsp+wwRy&whlc^MVK1}}5Jm$k~vRurUM z%DC9eNx8<89j$K2>sFV6HfzefTnbQdzn8bl3n|L13xeJ4b+4|juIj=^9#o$6dNf?> zcs*5J2=-V@bA4M=B3V@G^>V!4%JKR@`c?%xySk!|*N=jD_NNRy^C=V00hASW9d97K z22nPM!IXpN5bB2KP|C$~80E3caJY=HiAGAIQIaSuiAI}5W8gJb5{;8Y<0a7qNfhBk zZe32Q&Z4*n=ZTO>kjd&mE)4wXq#*VyP?SNL08w%u!vzt>zu&a->X zw|kY_y%yNL7N+(}cUm0x7CBx8;uk}fK$f0aL8Ykx1ulbmxhYjBpz9TdX*86LipJ1brlX)66pf?tP-2QEP{d_E4&H?{p4s;% zzwga3H`%_o_M=Y#IBeed-9 z-UWKM?R$^k_g>H!*uFdczP|$ctG4e8ZQt%`XR>sxEwq!+t(%AwUX^R$#%WTb^}}wNwym$ z+f9<~8{&O{ah0LLK6MbB>EM+el3Z9BZ+=1iGC-Eet-HcQWE!Wb-ddU za69A<$eob8&YZKm{W-e_=6mg&{UJ4H7|}n1zYlW19k!$Xuss0!m>sqUQ^SUkhrmA! zdBjfKqy9DXC(wVk!}gcdup#8H;Qt2syPdel{E2%U^gryx{WCRj2zdhhlaQzE*gfr! 
z-7}z{wPW|3KX#7yJPa>DjzeBlq-Y`%y`*RoP3GhHGU!(n70?tYuPQ2}sd7wT1OG3` zzZFfRBG_J6R7@ot@&@QP6-}oZP~K8hN;4(oZSe0vPADp)S+Jc{G@Isd$h)B5Q#6<6 zL3v-%d@7fa|A7Ah@?S*@Xd!GLDq2Jp9P$z9j}?=l@yiuX z-aa}{Fu#B}ZOAxuH%kw(@jJ=UJq7a{2}&Wi6$72h*^|bCW?;=NfSn4T) zQx|t9rWN)VS&y#-Y?KGsPxztIGtN(bcnzeYS8i zzZ&Ln~6qPRx{WHu!g~AfVB*o0M;?s!a@8c6+!rVZgx6{0q=A!1K#O82E3EQKz7;y zgrIsRc&CjFc&8`>-l>5B?-XNzo!q)Syo~tUZew#xG^yj@9qriLS2W za7FB9*2_PrJQ6l~6YC4IeSM4Qz2DrIvTqfAK+2Bhnn57_HsOaJI&`zilMsEBAJ-!K zn8UYjH1<}}$NT$uqDM~Na-8kRt3e=d8}p~sn`HfzdfO!~UGH|$)AjBU{giso2Z6jD z!l&!qDSEozU81M!-7R{$-aVrG^}23S?*-LRSWCf-{LC-<8HeUlpH6tAQ~RT?ma_RL PKp=>JGqE0(A9kMuq9Djg literal 3137 zcmbVPX;@WN6n&4GhYBh<;J^dLcThw`5X1o#1!c>SWobqDfajC~_W>4ZMT3zxXOj(P z**w@>+C1Al&!Y`CS*fMXMQfeMxwrVf{&e~FVeNJ9-s|l179UV8;Sq_h(39%gm6Nb< zdz{O~h;=2p{J~&tc2;9^yW5qZNxsNUJ{v0MS5z-_yH>eY`x*i*^+8{u*SDt1m+AE_ zZVI*r{f)Iv!G-|oQvZsU0Qi=r{^kI3Rgme;&GhDFWO|SH{?EtL5nkWewz9cBtG+21 z$XeO9qIDV0>u>ke1_Sj?jV-Ogw)$3&17XT5fTzwZcl=byj$%6*13>RN|Hx ziIB)nE2{hDhSv7xfX9kn5Qagk3al7$iM!K^oo9KJgfDP|U1P;nR#ujG;n9l}wOYOE z*40?ORgKk0t&iay;pq@}pvH<<5+>mQ35rSpeU(H>G(bNE^;m%ZN<0z=kf@}W^ae;$ z(nsO}1}I68z5oN2^ppMogL*0qHVVl`VTe%}Y7~YUh2cga#VB}1647X8Z?b1^25p(ygWa>Eqy>B{qRm{u zD*FPC0e!5wfcEYSK*@37S0mP#TR7gng%d!ZXl|jS`xa1g68Mu5rzm_#86ZwoGFrya z4{Jf6rX*9wBI#6;CF26*PY1sav0h2Gj0bUsk{p@9tusNNr6gDKkesb#qU7t6bHJaA zI8RA|Ov2vzN(yB%OE!SMK*}=Rv<< zT=1gff|tO*jCjSk;8oiNuYrEuxZn-P1#g0X3-Pvb!8^7K-Ua=hal!kJ3qAnt$B zf{$$%>;(OZalxmK3qAw?IbxS_!56j*z6AZ1alzM)3%(J*zsA}vby&ZZdaU0`1J>^) zfOU^7#=2LQVBIH6vHl>-u>L5^vF_Kk2XyUEy7p&X`-`sqRoDKeYk$|Zf9Tpjb?ske zJN9&S?ZmH0H0mlVI#9ONtCOG$zhBFWBkkO+^Z4CsyZ6I%o@DQb>pXem#?7Jn2%V?c z??>u9ZPT_zq5UYGr`z?>I`{3}vNNKq5gHKoUVXzyN{>fPn;&0D}mk z00t9810)l~01P3B1sDoonhzt%=EDitJcWSGy##EYO2Fo61Z+NnfXzn|uz5NGn~wr0 z$0c_F`$)36B4G0j0yZB_z~*BJ*gTVf&Bqe3c@_bik0W67Yyi`IJV`dsAzK($0hk#OTHAK&-u{{#L8p15JS diff --git a/tests/ut/data/dataset/golden/test_2ops_batch_shuffle.npz b/tests/ut/data/dataset/golden/test_2ops_batch_shuffle.npz index e3273425d396efc44d078aa58f9f0a6534b98528..54ff4435e016a073b000e27384c7f2786a15f41c 100644 GIT binary patch literal 2360 zcmbW3cUTlx6vbzEscTp4ZL#fB5d{>n1*|AXirBD>xa00{XgW=sN znx>ShrD*}vtQ?UUu5WT_?JcMZRZ?ABOXe1rEpTb8wKe{b(GWEKV}1U$rTz?`zq-zh zn1OI*of$F|o))NWFvuI02kH&RA}_<2JvhU+&iDT>Z?h*DZd_I0lo_lujm%YzwUHY3 z8)))YnntiL+z>GvgAuRP>?&_KWM`!d+6L*K?5dKUO){ZeCJMNa0{2Fl6e|ryn(7U& zOs;VAn5B6#1sYtNWNNwe3V149q-$lXl9H05C@<>=@*}c!)n;9`5fxtMgLSo)4K?9N zR<3NT%XUJS?Wqn(3FxvTQUiuAI}xNIfuJ)I73f0H6>frV@F>uophuC`9F3CogiEnr zWWC`f=u;eBuSj1KACgGYksL6Lilo^7Ss~d^m;IR@Kn)aVj}ELKB+wC^4F2UO7J_F_ zWiVL4i&hE`As#A_fJ6ov0!c_#co=b}*b;mCS~FaP!v);%sPqV?vjo!6nL)Nd7j#uP zhj^qwYqVi7N}w&;8AtS(9g?}a9L=IUY78}2z=w1ej}z#J{^~Qv6HgErfPoB-6Bq

uzdB414D0^QJ^ zxg`QU&{Hj`Bwi}e3%wZx1p1&aml#L+EqA1>(q)j9At6J026ut&J~!1$?CSAM|{4(6ij8XRbUzlttW8-`GwR)0@E>r zxr+sg0JY>2;%x#mQOw{{ffAIeC!x#Bn7Ev}g4*sF`O3JFcMxCY82RcJBkv@?hPu}A z$#rp`Tu*$15$He{A1Jr}ZPQgR*DR`LaN1Q2mv}Fn&BY&KF!kM`z<1_aZ@zc)C?R933 zr+bEpXQ}6?=LO0zi|_@3a?Dou`9&7VZL?$Um<^$dQG4L3z&OdU?CQ% zC2tVFDXa>r z@1s5u$io=sJ{1^?acaqD#Gea{#{>pn2porewN=ZcR@H<9W|RCl@ogH``cozwLXLTKgJC#B57cm01RtJI(W7Uv^j=yP8?e Y&YZ74Wk^&nRNe-cHk!ZpTQ9(W0sU#&2><{9 literal 1914 zcmbW2*;5o(6vlfP)=>c!TySAPaSjNGsDQXIiqgs_izo;x;xN+)vi2~T5HVsBLL{4s z*~nr{%)XdCG5a2~{|&z9A*od5nS6bFdZ@B0FS%9c%sIbv&)0Y9>S=^KK0~udj@FWT zq4)1ZO|vXT%g_Sha7Rh;z|g2mOIOgEXcet-vA(&kvCXBO(oXwA#&9s~tMvND8hnLb zU)Nwb5)KS>42DC7g=+$R!v^_qcVNh1E)|8|vO;fZexdh__kRx0xH~v7(mym>92^WA z#r-3Fkshu$FzV?D8^OVW;YfHS81a~zJ8HtFS zG!vtfLy^%T!(%43#&M$!6=pItxF*e%CetHux4KBj%sKV-^|dn`%AqP^&h0$kZqAc- zGgXj|`9`~$CRl)l4AOwdEXO!p1et<3xEU-G zWFV8lV!`N$;xa)YikOrOim}eZ6~vnbOR$tlr63E-EWCwyt6(`+Fxe(p ziEImRC$6IT1%B)x*df@7YOA!9xLU9ayP50~)Ic{ZzMH&;rVI99FTox`Ent=Q64wg$ zp^ga`T#p9TvX8uuR`1+GL+lnBiT69V&=kD|b{`-=NNaXB-V)pRA>vkN<89H6v*j@P z5!zAb_S$2&cZ|5hxxM4j+ha?ByptAm4lEQqFoU?uIj|Ga17k}!c@OR6&G*AcM6W7L$%yRK0&)%P=fWWO$th} z!D_jO_`F~vHZhqJl%ZU;+)I8R?S4T8HnaACpb}fGmIsM12)1GylZOP`QKecQMo*yK zyoi(ZkHDaR6kYU>AxQr?LiA4{KtGL6`X|v#{}lS@pT-ILXV6W5N%!aJSH1igFnLz@ z`~JFe!%i;i{>)hAIo+Q%J$>1(Jg@r`VwEepKkbhRuU&aTzy9aU%ndtvQLp+ZHti+d zUl0>`S=VQ$y`uZGV*;=0{@hE~j@tsSA(@MM9Vzs0AfEnBB+y?)9Q|8x)4yG-jnB+5 zcn2=}cj2LbuWsfnf8IwT{Rc>*|4?atq_nOnt&f$~Cray6rS(}YpF7UUpDV2|l-8Hl zch^$;N~wLV)V@(_-zv55l-l=7?FXfHU8(&z`xR4huJ^RMkJ`eOqb@;%2_B%vdaGgyqG^G diff --git a/tests/ut/data/dataset/golden/test_2ops_repeat_batch.npz b/tests/ut/data/dataset/golden/test_2ops_repeat_batch.npz index 7a9a70861ca0a3cebc41e2adb5e0ed0dbc220eb7..40b0489a598a133146812ce35d4251497ae6eb05 100644 GIT binary patch literal 3614 zcmbVPXLyvy5x!GHIy#8nj(Q6Sfg}*3N+3X>K?Tu7`4FeOPe>q6+SLhL25iMK#wI;Z zut{ue$LYms?sj@{OR-a3Qb_C+_g)e^k>8oUJ;^+e|Mxek1>xi2_y2@?Gh>PF!<{{a zu@2)G9`0^Wwqd=|o=}tVV;zaEr0I?&Lpor+LW#Jw>L3M9=#0u>v(7xJvl?`^q9El` z#^pLEEX)LBOoKI1HFBHQBRMeAl{=X1Mhsw#Cr^7MSV|?h1WRB z1~Hy;@SZ?}@SaGycu%4{c9{&9DHH-RwWjY94AWfHutc4XsBoJBx0w=p7D9V^wnUvH zQ6q@z`;9rNg%-s%oaaL3LFTJdxiIk4OF={`P?SNL01FglQ8qxKq8u6ouu$RT@_e90 zm|3JKM1!FeD;h#W0ZJ4Nqu~IH6^)>g0812&qR~8~6f>48%BL|<$`p;IaRAHC&%WGc zUtzPuXC-E?a!Xikm*DAgm%YMeUt_Zm*lBTGuk~~#=C6aShio`MUzN>=G#g>wx7_m8LPh4|J2FFinTDU(pPj$#fKSv!Yov8%j*k9E$kN$HDuM z7DbBYf;gaP9?fS{E9f>w1+)OlK}ChMkm*C9+s{9x!|78>z}(?ZsWW{_e4~szrLNQ| zAzBiAH{^&rsteLbbs^{;cT^Xp@5xcn$J{9$PoL7opf7Q!)SEt~4}iYZoze$Wr-U0Y zuqaCx>&rZS0-l#cPWE4Sl*tuRLsv>=o|5wUkgd!Q!}TLlLm!nI`j}Ma$E7kqVJq{K zaJfn<^HZmC`KtjB7wD(omEp8xxY}0fHJEZn(tJj$^s|!TTFG#otl0|=BqJ6TdWW3&Q zy13txxZk!*{SF+zE5UD(rQRxWZ)_Nxw+nq+f#mmAgs5 zPTeGAc?kT&kl(oL^xO1x`W@)syX*7^cbz=_NAQn89(9%XSi15a2mL2kd4G15hcthI z;jfT$kiWU|`+K_lo&f!%E5CoF$`8~23I1P@e=AxD_AH4#tgQ=SQ+Pb%6>HNIzpf@`S` z6QO3X>X}SCWZ5BGeClZn9C8FV&{p;yWbL`aZKLg=@&xaoof0l&9R}NBh}arw7yO0_ z-c5Tr&M<2qF5F)7K#dT*kD4UTNb4}l4x@SKuUu}0CSUxy8$ual{l*F>jnRw~%)OBv z##@I8c9oU2M`#%?!b|p_-Z2K1(=h zt7f)f?yM4Mj`fJxLD^xh_(-=g^90+5W9D0XfpF4x%mTsEd9bcR>#)!ci^L{<$P@{- zt;iHxdx>z;kj!Gish(t(Scg(OEEStHCsQWa_9wH<+LsF_UCOKwER725S!o?s*~tP2 zdgR5M&aoYF4G6^T5`JDfZj$w?{`GgWK43pbSRb&T3uM0A58_@30&#nU@88!&qPu;0 z=BViX`#C0h|9*~(?)HP?PMV8BH5b-XFe5+nz5dc+bE&5n-q@-8gI7=4{BIx-#AhkS IhviS+JN$Op+5i9m literal 3149 zcmbW4cT`kI6vlVyEGVd8!NP*#4v2^ff>=ONP_7iQVL@DAtq8njF(H~riXobqW|}F+ z#Pni%VtO^E_g+meCeakr6Y_m;ci!f3PX5W`+&RDd&6|7Y&Rov!R!L}Bg3bS=*ml?o zx9^X!*=W(W1e@FIt;^18Ztbw!;!Tk+a+1%EiUs9Wi|w`zwvDbvPg{f6Rp4}Os&r*K 
zT`OC>KCipEuEpEvpJ}?M=Ql*sr_8QRnqEv^2N*yzLD>hhg)-Twv6H6bXpGT(?2o7lUI=xW2EB(%m3b(0ZOQBhIWjmI3M&}YQf zZ>=?Ysam7A+7`t#!qX{sPpuKBBt$|1@rs!MeUwBbjYJ1XX+HA{V5=4K^~)P%l6j>c`W2{y1?=N z0wL!>&Q&r{2Em!2BuNH)KzWe!l?;)ga3(4lCdrfwAWu@_loU9Vm88mW%2OauRWd@- z;1ntuDK5%Ikf$k8k`8COk_;I|xft>cC7CiB&P*j)GKO*qo$ zXsnt8xm=%3g>^QSkgN3B%(c#@8uC1SHuJ5sSpa#VKAT0>+0;N@tj}hNbv8>OFVkmJ zYn{z<$aSz4J!^1VHPmCeLD$e2r~yTIps$3j(uJ(H3RwertuAC;pb#u+g1#QstgC6U zs%eF6=xV%yYOtgYx)0W_OWI(SbPVKUbx9q8lCb1B=o?|1^qn7X-T4WSH|sm^4BR=E zoCy6S*vb0hPq8k33*=Mv#difR9!pMxz7@7jNw$oIaJrIlGM?9a2ITEZawHednMx)| zo`?Eb(9edQqa#N@mDRIF~9Zky3NXWzcuQE>|*3W@B!*k}?r4xdQT)O6Ev8oIOe^q|#h+74)lN z*C?ryxtP0FNwv)5lItK}uVlU~fOCVAg|f(8awGJcU^gqNk;RzXt7M5R<&s+<->PJp z)WW$<$#SVPm)s8h4%nSNd+jc(*Y3vjJ-XNS1$*sY==Z_y*S+?D)oTwzen|J)!@*vA z1p1?}$8@hfZuQy|koW6edotK-2cSO%ds_F}GghxX3;8+SYtILJ?FHyB!d}w7_OjJ$ zuRwlP_u6a0UV9z-8?ZNZuf1jU+S`!d(Y^L=u-D#${yyvj-D@9Oy><}tN4nQO4))q7 z&_9KJrhDyktJl7O{H5--uY$eywYc52#v!T4_>DAR{8kz$ePUHew} z=Oau%%~~I6`mQ5;4*KV#Oh3bl7frv^+N{?>Ke%~s4E=xYKh|6y+P#FY<8bx5JOaeoVCNNYMfm~~Fy2%G;}sGZuZX~S(+G?=oxlo; z39MiSffdXIY>VQTOFS&2#1v2mAD&(*UuEuD1m>PiVD2&ka|?mF=Mb2?oDk{u_`DUM l`m8Fkha`ml*DnBnHZvpM#y_h5Jy0d#5ftqB%Ok{c_y^B@Z;${0 diff --git a/tests/ut/data/dataset/golden/test_2ops_repeat_shuffle.npz b/tests/ut/data/dataset/golden/test_2ops_repeat_shuffle.npz index b0ab3fb798b1bbce1bb8e99589a3ea397913c7e1..2ae358d16bf263f32c8c3a2d9d72a21174201851 100644 GIT binary patch literal 4042 zcmbVPcYIXG5x!GHI*KTwH&L~K5J&+&KdaHtHWa2@$~f=m zq)g+}uku0t7M@9X76ZJ<^RxIj|p$xpoQYPNxCx;IP!0_PSZrtnjRYt$ltGyQOAL*oe1N5f#?m-| zQbQAHB0!m;0vZocZZ72_3P1f62C>XggoXnwcaAGus1-wPE1m617ipE7-)c9BHEvpK zUF>ykHdSsC>kW;jF^IWAL3-t7aergf--MNKM%&`-wyFz^s0O{w1>LUm0bK){T;v@( zJJ7YD>kLhz$q?!dO`)kCPy^_lZk3I?ZqU0x?>01zrbF0cXa>z>>%E}&8JbJ;AT$}8 zPen}c2YtX$F)fC0&`=32Vfqm0sKHzoKsan@AuVFM8T65%`HX4Opl*S>)ukt{nFH;C zZga~z8Z4_FbcajAu^ibU8X3DT(>CM0vHG1h0{k z;M#Mmyv%=X)PEgT@p|$028sDbiT@_?`DXF|7K!+bczi3LguH;aiR-u5aJ@r(hs^Dr zGWK2K=iN^A9#OqloIERm-Y0I~FCjkQ9DY#ze@NVZSZ4f@L1$Na-2Z6Q{}^WbakNjM zebRkN&jsJzr$9gNHo&J-8vx(wXTX0J?Q?G9eLmQDUjY3@w|l>o+PxU^W$<4?`>NZ0 zUki5M*Fk^7?d)%+b~c853;egyzGG-9m4f)Lp)xAxP4zv{-#4_3mP7b~p$b~zG5zE zB~gA0{Sf5E!2cQT6+=z5AG*I7IzR_` z$X`MK&Cnr=LioF(!_+K8{sI1d%lVwWUF0{aB_3mVS@ScVuM^Y$kQ$&Iyp4$aKUMs#xC+p&W6$hYHEUm%>E@OHdlIppDIf*MTJZj$KalCqNp z^MwktDXO0;oLs4PnqaX!;98`PExPVG@!rUvEOEfbv_ z&33t9J)LcZ>Q@LSSEsEMEVn1}v{DULX}4N*@>9pI5zIeIV76BE>x7fvJGM$N|I%TD z^=hy|yN#lgs^6L?^{;cM8^ewvDRaC7e{X-7Pp(*mjQ^?A30c=%idN|LAoUYw@F>%kqG5 zTDJ!UyShE322t$}i*7w_K<1hSZ=_9}#v`ha3AdTHfNBxEm8vDrt!fb0&J&&bX%n3C zb5!;1!l|DQ!RjY{Keu!wVz$p7gEDH5>!?n2YR}<~FYY|L>h~9zS?5ocnQGa)cT=eqOFQ|Mc%hfLp45Z|{M8ELL zl_ykxH_Okb_4kN=N}#`2v>r8>GfV= z`I`AZBKq|D9u+ygp2tL<7v$&hL9*v1?Nu->B@GnLD9HR@f4`BN)Wcslz9{!+yn(X$ QPg^LA&kFQsAhE{7bme(oSw+%yxIE(@Avti4Buma_uMz{mbnAF4U%6_8VRn+k;PS) z@9r2BiEx!fN+T)XZ<<%$-q{n446!0N(#h+tx*fF*d!mt(kyDA*bXSX?SRPNDu1}Q3 z6NftdjGt<6>hN3BlsBf@y3)|Qj-)!%uuVl-e8HlbW$`od|NDyd=C`zWpXlr+0%i`f%C5q%z|TXug>A z2AZTd$l$z^bkZAaq(BM*LyQzjKfq8U17sjzn2|im2MjkdNCpE+jSQ3FfDuMU$|%4{ zBc(C|Fv?siX3KH)N)*CqBQfa@7^4S|)llPd_m0=S6ExC9&2N$>5!bXPYwRhS%~VZd znvo$g6fvhO%s{OPHB4{={4*iX(h#$Q=#a}G&(S#LK`xNzLY`-2w2Xl<-^f@Qmj*3> zyijXe6jTX$G2|si#>)g4ON~sFN!(old6|)EG9AWpBQqpHc?IN^M#^Lkj8#U;WiI8_ zkSmQCnF(W!ky$dE@>r!RlO7PF75wr*Z)0`_iF3*xz_E6oYY1iaE)$)d{BFo zay@E>+@c+AbsbGZKBTQX>{@pO@=+sknT(DcGcrY{vZKc#x2b%>$?cFkusYS{IhPN) zOT%WeVR2G7^pjZEXt~$Aa(f_Or#(BB?HO)44gCz(^;+{;S934q8?>J{X8Va-Zi0R@ z);S~dWIlxRMi$6ImU02|Ek+i}Vi*^VERm&Y>X)Ej#=2FX^=SO|lujHyNoB;VpMV zzQ@QGsfBT`kvgfjx7-K)eyj(K?3O+7J!oXF?BgvDL4LE5{gQ<779$6w$=>o-=x@V% 
z*hqtHh3^p~jk1lmJPP?SBim&MjK_`alwJ0gw?lsd>m9i>{Z2R2?}Gc?I*#wjjw4Qa zFZB0eyelmQdpMw5rtk3A+ zJmm)GX~@s$;Cwb5oX zVNKtG{#~r^=`?)bO~Vf$|4^slN8vR582V4Jp4Io~r|$jv8RVbq`}2$N{rM&IUt#@P z$MiRDOn(dccRHreg=6}A=zqZaqrME!yO-ew$bZt8;m_gA@E7QR#d=X^?r(19{to#c zI&=RFXYOCn|Bdw@o$UX*$^IYYzU*ZC5&W;~hTD(QO8R-hN51vhk@X8K2UF`82L2+e z+1d5`SG2|i-hw?@{clYW28>!b?P?+*z41Fg1RHb5F=d81VGHSh-q8-@fQ zL#;13XPD*eIl}{gsnvpWMp)K!M#7SGD(R0Rv;amES^;AShXG>=&46))BY^RQV}J>S zHo!!}alj-(3J@n80!$|Gz$pZVn5uiG>7MBXo;-uVd=dl(H3X(HlfYQB2#h|Pz@TM> zqkuUI<$&rnBdVW><0}jtbFE$I?0ME6{M@;$f4;RRIeP)^$`{@Kh1Ngi-X{+R=Pj~! z;)REv4eZ6VtDo=d%i2qN_Xqx^w4Vw671lr2`IlKc%xAf^!+cld+`lsC{#Div z^IL80u>MNgSJkt|`ontH(!MJHb=Du&S7q(6p7qwA?)tMK$KD8A3efm!LLOifAs?`r zPz0zUL;*tR2iQU=0n`F&`_6*uD8&HvfSO2eUmv)EVj*BFfrmB{7-rj5+qUbr9Rvp0 zNnl#L2#n1y*ESpcu*inoONm+RBQW-ULNOqzaDc#rng|Shkib+@1lHF~V2l<5!?zN6 zavH#|@vQI=MMgPH5#!PeK0=Wd9wqSDV*~~|e$}=%-FAY&5bXpe)@E|S%&i30 veOmzOjDI_*xx7K5d8PTU^jkGZ-WGt}_%}v>xi&~)5rruJEXC)e_Sfr69xWP< diff --git a/tests/ut/data/dataset/golden/test_2ops_shuffle_batch.npz b/tests/ut/data/dataset/golden/test_2ops_shuffle_batch.npz index 579431fcc905586d5c5abb38ca9a17a71e1f0b61..2939b29786bf0cb7acfc381d7afe1c27b473d762 100644 GIT binary patch literal 2360 zcmbW3cXSk06vk(>sY{2@+fw%dA&>+@XeQJ^K2iuRlp$<(ClCmmyvYWQfW!g_cCleW z6zq!Fv3IfeUJ!dh1q*gvzc*z8&+#AUoNssU@80{~o%imW*)n(&(>43*qb2X$H(__G zrYWUq=~~bYF@TTMMc{EIhMwwVHlLS0SMZyM|?1YA+P4$LP zrc@;GHcRtlDl~XD%CvIn6Yy4eNY}^~B_$~$8cmIHlg8R}F?}5Ic(?rH?77q&`Q!!E1iN%y zPK*~jiFmR=7j$LZ6oGE&&S`ZymAsIeW-o#+r`z{WR}nD;dY~tb83Mh~+e#@WFQG~W z`eOinWdZ{cP@~NxE*HqaP#UuYGBM0bnN2>2nrknIF6YIUtAcpG;5>tvvOr)khFB>J z$rn+&fFFJ6TP)BQ{nRX##7mA{L0tyz4Px#w{gyjIR_QXtj4)+b{Z?)F+fssMR=?HQ z{kEK7h1GAhcE7D8h*ed{HZQB~>gkYH+nILTH4sFtwrf1Hv#R%w?){u>xk{JE zGxP+i$*ScW$oPE`lYD&sV=^;Q`h>@qeIoNkq| zsfbPAe02Bcgym*kp23JSsk5lFsdL=rJvY9*=MkUp?(G(5Z@KIjkY7k$~~h9isPTq}@`95v!~ z#McXqz(^W52;^cE_ng0Pq~j*)X6hFAaO`pphc0g=-tC@?+njU3wA;z=pzajN!)SuL z1jb;j8u4!8dj!T|JdHg9$06UI>RvkTqwc335SWJP40uqW2vB(s5kD+21I08R5hy{a zRpF!Lk5P{cRA4@RPY5i)LY49)@lyhepwrkZuo#v0R8Q0K4D~GaoO?{3caDiJUm$+b zJ#a4_IdCtNze2q#P==Wt^EH8T%u-XmPW*0IsCNYlFo6N@ z2~5N!mG?gJ2Lh8Zg~mRCsVKDW>O=DV)JOk5ksmuJQkS35{i*wa`OJC1FgboYs%j!Z zvq^rg%P$!ECG`~yKEuC8NIldK5PX9$!M9kdz;^@(QAKbF%M|#Y;0G)x_z|@V{6ugV z)dW9dg#y12{E8Za-w;&iBF%X#RM*x8qw;rM{=tYpslN&fIu`Eo)6L+&3k#AQgM-9` zZfbnp6&AEPu-=X^J(k_uu?^i!AYhzV@%+Q?NM@oH*LT~VC2C~GCE0Pw77rZSv0riB zOi^~>!C1_)8JB9urCD*^wr<^S$N4OKuwyf>g&o(@;sfIIwo>-q_;`j9HCq!^WtPE{ jkna7jKRcYnxtbZx%G|#`Wk^ylR4#)@8^!Mf)|>1fXa(1+ literal 1914 zcmbW2*;5o(6vlfP)^Pz9TySAPaSjNGsDQXIiqgs_Frp}^h{MbvC|eJM2@xYEAw;rk zHezB-%)XdCF=k)Q{x{@352jL;XY$=U(?gY2dC9FhXU_SZKHt54Z}+qh&csY3I&zJR z*U#VlJJ~Rlr5TxqKOAl^E$JH=b{H8NstZ+7jSCIU^}bezand;D35Es(VNaFYGt%fO za(g=a!;!GRuf0DU3@Kde?->k{4|e$nLd>PI$X#CKE-NT-7)2+QXqhf8SsvJQRqyEF*fAt1lR>S`HXvRzi)V!*Y&WiA`3Lz=2dG zBVpW1j$Is#3=f1{R!U0(Z?v(}N`(Q(xRusqxdhG@2kD44qoJXpZi+)YR7b3t9p~Dt zS<+_B7AmHN+N^XzB9a(n2xmxPFh?*Gvlz@3B*4jFo?tf88Dt9PVjhDm!F()WFkj%p z3_K+3>L{msyYUaQ8);)Iqzb-x+E$^yp(vEy_}q=9!No~yq>sRAShr`At=Nub#*2227w#7OsWKVSg!C!;!U^i z$mZw{v%H1nYMR&X+#2uPMqFbrc6)5GY}r9xOEc{)-5KA~UBq?vin&5{R%#Ku$?Iti z_69b_H_%7C$KI)?*iNx!FZn)Nv!E1f2=)uguvV!ZAZ`(?!+IvIf^t;oSq_pPq8+vm zwJmThRCfExJ7@txB{mQQ1y$InW(g5@3N~RglVgG{sMfP|k$2OM3wB}` zYbOMCfNJR>?iK7tJ(E5`0~&QpKluR7611R|wXonI4yl$w;)vid+L#Opj-Xw)oFu=E zcDsGh!|{W@gZPyFw2Z`_7IvQ|zms-G;KLq*Q9%>-s#(qw-zC_GW+rzF_Tzw_ zl&xoBromtn$3+&50*3nAQKpFS}xT^DbRG8Wnia^m_if@<%jz$uw_HO~sOz zO>g$(`Ej+^%p21^uOpF*dIJgcZ^B9c7Lw_&!a@HwQt0178vVQV zQ)l?|9+K$ahl~CLq|$#_XN*owG5APpeXO;vX{}GR)^)A*sn+^TYkjW%XqDO*TJ1}% 
z_LWxqx{m7~W$7EO_N`X?POE*d)qc=wH?-Q1TJ5KaRQ2c0p(KCSlD}xlU$x|K)Bmsj qP{jJZqr?YCLZcY!!}Q0~ha~;UbQlh!f`#G~RsI)MT|yK9 diff --git a/tests/ut/data/dataset/golden/test_2ops_shuffle_repeat.npz b/tests/ut/data/dataset/golden/test_2ops_shuffle_repeat.npz index 5c39c0e64b87b77844a10c529ff628368029b4bb..6e2486024fde4b75c934940e56abeff6d9cd7955 100644 GIT binary patch literal 4042 zcmbVPXLwY{72Q=sT8b#5H&J&0A&>+@RM7>xs34jsi&*WhKp-T&(F$7zj9dVNyK%#X z*e1O=J#l)E(|dEO(~FbXDNaw6GiTnsHTu5%$=h$@_1a z@#=Gx>kC66(+WaGp}5v9i%UDZ`@*4-B#og_n(I>iuDZs(;n0cD$!IdwlhDx>#nDp@ z(URh5TbEAjcxOwOPNodKIo{Ee0^f5a-kpN1s!EE>=am$nF8=?oNPl*sv-fy+UumLC zr%I3acBI>pZ@e$kqEm^k&YraHO{61U$XZ1@lUD16C3M!y+7NE_vd?=tOkPkU#&tkN41^9ohW8!0*Qm^VtYa33w%xQ~%M+>0cP`&b!*`#34UeSF=( z8FUjQ7x#%0!F`hC<371I)ITr)Foj;GI&m5;r_)6-!^~iSnG7(?x|>bkbLeg^-9-^O z)tu*NTbA^YQDig^o^V-2(cMLW=Vs#FfmK1;)A+bV5n7%z%(o zGE-(5It99I@aiA-7c1r+fx6u-!qH$6j)CqNJnDEbsuSuimsxj^nb~UE&AKO;bsBW9 zTmKWm`dF?CJFS1$4Hh*w`D$2dr#=O^I7O$tbH!$WK>GMtW`DV`IEsXdyJ-*ev3C#rF#$?{k0PkRc zch;Iphw5GQ^KK`;hnDZ9lV=#{eGKt_hWLPU_(A&r5FLJ)4nK0uyQ?DUeKh8M4B37h z?GtF9bkFWn!L$1`=x5yq_>8{+u)Is>8l6q<2kUxR{Guq2anq@C^e^Ihe z_M0Jp1^qWA2P6jJ?@A6z3y1sz{6EqDij zu2Hr_lSAAjI}lXoT74e5ow5tm5aQjk$3)lp)}X+45$e{;I%tLxua^zRY?###C$~{H zff_-)S!(E~&>D=i-6+##s?eNcdNl3L(FkFTwJRdWlT?o-HixM(7-tQ}+in7NJRtQ% zVsoIvY?9SaCdZRhPa!tvtTC8s4W`*{y6FbxRZM$7uNl^ECOMneEMk|}Y-=#bc5|uY zDWapq_8`&9>gSQ;nWX0v^H@S&3#>tj?G{qUeW^={?bg(btbQ>$ZctrD?C()sZVf7I zw}d(#UwSFAJ;8LP)h{E*qf=KA^YlbM%dNo*+pVOI=d@l$Y>#Ta+UnPkOd!{cuXHRAOX;6KoX$KfWrXY2DAcr1{?v< z1{?+GF`xq=Z9p49uK~vZP8iS*aGe2hfa?t~A^IG1g8^pLNdt`MQwErPP8(p-xX}QU z#u)=ltbPMb^qUMYv2HfN#Ja_RR(#hN;KVimLfSgI;%R*==$Jl>J09FQbTw zi~32IFF#@RcT=7p$oEj56ZpT^$j@fR-$#8W-%Eq^_fwzo{{ZDoeh*U4%>N`9DTElmFvJ4%RD`O6x0NT1y)xoK=+lzy5w>ZT!nTZagUW VXS`8z&7ZbV7{6ucPxBA?e*qr9PAC8X literal 3831 zcmbW4cU)9g7RP58Kp4AXZ)2T{A}SUT3wD$vMHKsp0}M6<-eWK!Y9z5mlbB+f>50h} zcav;M+uqmhy(C+b>?T`NOxbL*mHdA9&ApHC`Q)EGKUcozckaFC-1E+bnJNjT=LEcG zd?3``ySSxaAi$9w$O*(BgIDTik{OdfW`&y@Jg*a!z;P#Sd9cn2opjQxoeU*G z$&%qGoXphJhGa);BJA|qoQ7jo7CBiGkl;xtyV?mW32hESJ>>MRsHiCG!lVC_lBCn8 z?o`a_t71++h5fPrA zu_u^pCYsojOzg=f^(i=O)Tqi^fK+lJPcpC55#;$wCdp(N#Y(0~l=1?|3zg)_bQp`2 zr7_VHna7RH<(k`*i&zWyvfvmv#tLY$XiXJx7k8( zha5A7-eC(}3wfuhQQX$34syMzXoIb20&=6N(Jot~-H`W~8tt_;+6TGGyu|(XB{oBD z!Duxr9J|7W+-AZiQ(;lgcIXE%E;Gly+#a_B@)f3L2U9h}l0(oBV_a#@e8ir)6Y^E2 zo>!;pi6uv&AHz7VWTwo5a6(Ce%;xJp3Hcf&b7U@zQ%dGZVS@T;=+|OgXWsSo_FbQW ze1mxpH>TbLmfQsWW{k5+iex^7Ta*;b0`7Jz5#QYGskd`d~RtY?%@L;j4C4N?Q+ zvr0C~CLQH*=ucogsbq;Ph3_dP%Vas1JPrACN><297@t?NN=o&TFF^ky#+Q2DoG;s( z^A)(iYFh1UsaC@-Ux)q;jBlF8`Ic>*Z$rLl8s|HH<9rwT_b|TSGujVqv>(F#BNOe% zezc!J|0%}LOa*^#EBFh@zcdy6m0!VMLw^S2S6}1^TNPubInm$zF!nA-`cR z!<+tPcnkVJFy1yj_fOk%??8Un^xVJvp8GfS|6u&rboT#jXa65^SE{q!0RHOQ=5~X$ zVs4uEBcy%0|J-!VUJJSzoURfe=AZFI%pL6cbF`*E2e?Bt^Ai9k4230$c+4F}s0R!u zGyp~rb^%5b>Hwn%y8)vKdjVqzO@Oh4M!-12K0t)92QZ!x2TUL^#6+V^B5>7Y0(YK5 zU_Ma-(@+GakxO8#sRTx!MqsQw0%J`lFx7lO>5?I(k4Er;1@;VWXIOitw)?+)JmsII z?QzyFpk4B|oj+UqC*1q=PH(?C+K#?{{sqsTOS|+^S69lOr~M<(ox9)j7ixQ|wTrYp z+3r7|_CdFBYkgTPZP%Z3KqgPGHCw zfhq4GaHCoR1Meho$2fuKt0Qo7J%QmH2;4aV;Kz8Lu#qCO+(lrx-2{f)!)a)sPVioe z479JC*hGC*#i(gpq@Ss=H!(@*T7_*bWm{;k!I4K{V_G)?<@@P-bF?zVm zaRMWrATZC91g3Wlf$5zByf25-^f0??3C!U-0&}>Y5R501?io;Bew75%azY>UUDZof ar;FM69b>+^R!K%Cg&_VF;^zhZ?e#qtcNoe5 diff --git a/tests/ut/data/dataset/testImageNetData2/dataDistributionAll.json b/tests/ut/data/dataset/testImageNetData2/dataDistributionAll.json deleted file mode 100644 index 3ebc4c989c8..00000000000 --- a/tests/ut/data/dataset/testImageNetData2/dataDistributionAll.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "deviceNum":4, - 
"deviceId": 2, - "shardConfig":"ALL", - "shuffle":"ON", - "seed": 0, - "epoch": 2 -} diff --git a/tests/ut/data/dataset/testImageNetData2/dataDistributionRandom.json b/tests/ut/data/dataset/testImageNetData2/dataDistributionRandom.json deleted file mode 100644 index a0f468f91db..00000000000 --- a/tests/ut/data/dataset/testImageNetData2/dataDistributionRandom.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "deviceNum":4, - "deviceId": 2, - "shardConfig":"RANDOM", - "shuffle":"ON", - "seed": 0, - "epoch": 1 -} diff --git a/tests/ut/data/dataset/testImageNetData2/dataDistributionUnique.json b/tests/ut/data/dataset/testImageNetData2/dataDistributionUnique.json deleted file mode 100644 index a4eeddd9ae9..00000000000 --- a/tests/ut/data/dataset/testImageNetData2/dataDistributionUnique.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "deviceNum":4, - "deviceId": 2, - "shardConfig":"UNIQUE", - "shuffle":"ON", - "seed": 0, - "epoch": 3 -} diff --git a/tests/ut/data/dataset/testPK/distribution.json b/tests/ut/data/dataset/testPK/distribution.json deleted file mode 100644 index 33f869f653b..00000000000 --- a/tests/ut/data/dataset/testPK/distribution.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "deviceNum":1, - "deviceId": 0, - "shardConfig":"RANDOM", - "shuffle":"OFF", - "seed": 0 -} diff --git a/tests/ut/python/dataset/test_2ops.py b/tests/ut/python/dataset/test_2ops.py index ef60a42e272..cf781d6dfdd 100644 --- a/tests/ut/python/dataset/test_2ops.py +++ b/tests/ut/python/dataset/test_2ops.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -from util import save_and_check - import mindspore.dataset as ds from mindspore import log as logger +from util import save_and_check_dict DATA_DIR = ["../data/dataset/testTFTestAllTypes/test.data"] SCHEMA_DIR = "../data/dataset/testTFTestAllTypes/datasetSchema.json" -COLUMNS = ["col_1d", "col_2d", "col_3d", "col_binary", "col_float", - "col_sint16", "col_sint32", "col_sint64"] GENERATE_GOLDEN = False @@ -33,9 +30,6 @@ def test_2ops_repeat_shuffle(): repeat_count = 2 buffer_size = 5 seed = 0 - parameters = {"params": {'repeat_count': repeat_count, - 'buffer_size': buffer_size, - 'seed': seed}} # apply dataset operations data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False) @@ -44,7 +38,7 @@ def test_2ops_repeat_shuffle(): data1 = data1.shuffle(buffer_size=buffer_size) filename = "test_2ops_repeat_shuffle.npz" - save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN) + save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN) def test_2ops_shuffle_repeat(): @@ -56,10 +50,6 @@ def test_2ops_shuffle_repeat(): repeat_count = 2 buffer_size = 5 seed = 0 - parameters = {"params": {'repeat_count': repeat_count, - 'buffer_size': buffer_size, - 'reshuffle_each_iteration': False, - 'seed': seed}} # apply dataset operations data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False) @@ -68,7 +58,7 @@ def test_2ops_shuffle_repeat(): data1 = data1.repeat(repeat_count) filename = "test_2ops_shuffle_repeat.npz" - save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN) + save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN) def test_2ops_repeat_batch(): @@ -79,8 +69,6 @@ def test_2ops_repeat_batch(): # define parameters repeat_count = 2 batch_size = 5 - parameters = {"params": {'repeat_count': repeat_count, - 'batch_size': batch_size}} # apply dataset operations 
     data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
@@ -88,7 +76,7 @@
     data1 = data1.batch(batch_size, drop_remainder=True)
 
     filename = "test_2ops_repeat_batch.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_2ops_batch_repeat():
@@ -99,8 +87,6 @@
     # define parameters
     repeat_count = 2
     batch_size = 5
-    parameters = {"params": {'repeat_count': repeat_count,
-                             'batch_size': batch_size}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
@@ -108,7 +94,7 @@
     data1 = data1.repeat(repeat_count)
 
     filename = "test_2ops_batch_repeat.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_2ops_batch_shuffle():
@@ -120,9 +106,6 @@
     buffer_size = 5
     seed = 0
     batch_size = 2
-    parameters = {"params": {'buffer_size': buffer_size,
-                             'seed': seed,
-                             'batch_size': batch_size}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
@@ -131,7 +114,7 @@
     data1 = data1.shuffle(buffer_size=buffer_size)
 
     filename = "test_2ops_batch_shuffle.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_2ops_shuffle_batch():
@@ -143,9 +126,6 @@
     buffer_size = 5
     seed = 0
     batch_size = 2
-    parameters = {"params": {'buffer_size': buffer_size,
-                             'seed': seed,
-                             'batch_size': batch_size}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
@@ -154,7 +134,7 @@
     data1 = data1.batch(batch_size, drop_remainder=True)
 
     filename = "test_2ops_shuffle_batch.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 if __name__ == '__main__':
diff --git a/tests/ut/python/dataset/test_batch.py b/tests/ut/python/dataset/test_batch.py
index 9b9baeec33e..1220d98344f 100644
--- a/tests/ut/python/dataset/test_batch.py
+++ b/tests/ut/python/dataset/test_batch.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 import mindspore.dataset as ds
 from mindspore import log as logger
-from util import save_and_check
+from util import save_and_check_dict
 
 # Note: Number of rows in test.data dataset: 12
 DATA_DIR = ["../data/dataset/testTFTestAllTypes/test.data"]
@@ -29,8 +29,6 @@ def test_batch_01():
     # define parameters
     batch_size = 2
     drop_remainder = True
-    parameters = {"params": {'batch_size': batch_size,
-                             'drop_remainder': drop_remainder}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -38,7 +36,7 @@
     assert sum([1 for _ in data1]) == 6
 
     filename = "batch_01_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_02():
@@ -49,8 +47,6 @@
     # define parameters
     batch_size = 5
     drop_remainder = True
-    parameters = {"params": {'batch_size': batch_size,
-                             'drop_remainder': drop_remainder}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -58,7 +54,7 @@
     assert sum([1 for _ in data1]) == 2
 
     filename = "batch_02_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_03():
@@ -69,8 +65,6 @@
     # define parameters
     batch_size = 3
     drop_remainder = False
-    parameters = {"params": {'batch_size': batch_size,
-                             'drop_remainder': drop_remainder}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -78,7 +72,7 @@
     assert sum([1 for _ in data1]) == 4
 
     filename = "batch_03_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_04():
@@ -89,8 +83,6 @@
     # define parameters
     batch_size = 7
     drop_remainder = False
-    parameters = {"params": {'batch_size': batch_size,
-                             'drop_remainder': drop_remainder}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -98,7 +90,7 @@
     assert sum([1 for _ in data1]) == 2
 
     filename = "batch_04_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_05():
@@ -108,7 +100,6 @@
     logger.info("test_batch_05")
     # define parameters
     batch_size = 1
-    parameters = {"params": {'batch_size': batch_size}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -116,7 +107,7 @@
     assert sum([1 for _ in data1]) == 12
 
     filename = "batch_05_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_06():
@@ -127,8 +118,6 @@
     # define parameters
     batch_size = 12
     drop_remainder = False
-    parameters = {"params": {'batch_size': batch_size,
-                             'drop_remainder': drop_remainder}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -136,7 +125,7 @@
     assert sum([1 for _ in data1]) == 1
 
     filename = "batch_06_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_07():
@@ -148,9 +137,6 @@
     batch_size = 4
     drop_remainder = False
     num_parallel_workers = 2
-    parameters = {"params": {'batch_size': batch_size,
-                             'drop_remainder': drop_remainder,
-                             'num_parallel_workers': num_parallel_workers}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -159,7 +145,7 @@
     assert sum([1 for _ in data1]) == 3
 
     filename = "batch_07_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_08():
@@ -170,8 +156,6 @@
     # define parameters
     batch_size = 6
     num_parallel_workers = 1
-    parameters = {"params": {'batch_size': batch_size,
-                             'num_parallel_workers': num_parallel_workers}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -179,7 +163,7 @@
     assert sum([1 for _ in data1]) == 2
 
     filename = "batch_08_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_09():
@@ -190,8 +174,6 @@
     # define parameters
     batch_size = 13
     drop_remainder = False
-    parameters = {"params": {'batch_size': batch_size,
-                             'drop_remainder': drop_remainder}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -199,7 +181,7 @@
     assert sum([1 for _ in data1]) == 1
 
     filename = "batch_09_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_10():
@@ -210,8 +192,6 @@
     # define parameters
     batch_size = 99
     drop_remainder = True
-    parameters = {"params": {'batch_size': batch_size,
-                             'drop_remainder': drop_remainder}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -219,7 +199,7 @@
     assert sum([1 for _ in data1]) == 0
 
     filename = "batch_10_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_11():
@@ -229,7 +209,6 @@
     logger.info("test_batch_11")
     # define parameters
     batch_size = 1
-    parameters = {"params": {'batch_size': batch_size}}
 
     # apply dataset operations
     # Use schema file with 1 row
@@ -239,7 +218,7 @@
     assert sum([1 for _ in data1]) == 1
 
     filename = "batch_11_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_12():
@@ -249,7 +228,6 @@
     logger.info("test_batch_12")
     # define parameters
     batch_size = True
-    parameters = {"params": {'batch_size': batch_size}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -257,7 +235,7 @@
     assert sum([1 for _ in data1]) == 12
 
     filename = "batch_12_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_batch_exception_01():
diff --git a/tests/ut/python/dataset/test_datasets_clue.py b/tests/ut/python/dataset/test_datasets_clue.py
index e1959acb426..0d8a60f5d11 100644
--- a/tests/ut/python/dataset/test_datasets_clue.py
+++ b/tests/ut/python/dataset/test_datasets_clue.py
@@ -356,9 +356,13 @@ def test_clue_to_device():
 
 if __name__ == "__main__":
     test_clue()
+    test_clue_num_shards()
+    test_clue_num_samples()
+    test_textline_dataset_get_datasetsize()
     test_clue_afqmc()
     test_clue_cmnli()
     test_clue_csl()
     test_clue_iflytek()
     test_clue_tnews()
     test_clue_wsc()
+    test_clue_to_device()
diff --git a/tests/ut/python/dataset/test_generator.py b/tests/ut/python/dataset/test_datasets_generator.py
similarity index 92%
rename from tests/ut/python/dataset/test_generator.py
rename to tests/ut/python/dataset/test_datasets_generator.py
index 926b84a7f44..b7061cf0f32 100644
--- a/tests/ut/python/dataset/test_generator.py
+++ b/tests/ut/python/dataset/test_datasets_generator.py
@@ -26,7 +26,7 @@ def generator_1d():
         yield (np.array([i]),)
 
 
-def test_case_0():
+def test_generator_0():
     """
     Test 1D Generator
     """
@@ -48,7 +48,7 @@ def generator_md():
         yield (np.array([[i, i + 1], [i + 2, i + 3]]),)
 
 
-def test_case_1():
+def test_generator_1():
     """
     Test MD Generator
     """
@@ -70,7 +70,7 @@ def generator_mc(maxid=64):
         yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]]))
 
 
-def test_case_2():
+def test_generator_2():
     """
     Test multi column generator
     """
@@ -88,7 +88,7 @@ def test_case_2():
         i = i + 1
 
 
-def test_case_3():
+def test_generator_3():
     """
     Test 1D Generator + repeat(4)
     """
@@ -108,7 +108,7 @@ def test_case_3():
             i = 0
 
 
-def test_case_4():
+def test_generator_4():
     """
     Test fixed size 1D Generator + batch
     """
@@ -146,7 +146,7 @@ def type_tester(t):
         i = i + 4
 
 
-def test_case_5():
+def test_generator_5():
     """
     Test 1D Generator on different data type
     """
@@ -173,7 +173,7 @@ def type_tester_with_type_check(t, c):
         i = i + 4
 
 
-def test_case_6():
+def test_generator_6():
     """
     Test 1D Generator on different data type with type check
     """
@@ -208,7 +208,7 @@ def type_tester_with_type_check_2c(t, c):
         i = i + 4
 
 
-def test_case_7():
+def test_generator_7():
     """
     Test 2 column Generator on different data type with type check
     """
@@ -223,7 +223,7 @@ def test_case_7():
        type_tester_with_type_check_2c(np_types[i], [None, de_types[i]])
 
 
-def test_case_8():
+def test_generator_8():
     """
     Test multi column generator with few mapops
     """
@@ -249,7 +249,7 @@ def test_case_8():
         i = i + 1
 
 
-def test_case_9():
+def test_generator_9():
     """
     Test map column order when len(input_columns) == len(output_columns).
     """
@@ -280,7 +280,7 @@ def test_case_9():
         i = i + 1
 
 
-def test_case_10():
+def test_generator_10():
     """
     Test map column order when len(input_columns) != len(output_columns).
     """
@@ -303,7 +303,7 @@ def test_case_10():
         i = i + 1
 
 
-def test_case_11():
+def test_generator_11():
     """
     Test map column order when len(input_columns) != len(output_columns).
     """
@@ -327,7 +327,7 @@ def test_case_11():
         i = i + 1
 
 
-def test_case_12():
+def test_generator_12():
     """
     Test map column order when input_columns and output_columns are None.
     """
@@ -361,7 +361,7 @@ def test_case_12():
         i = i + 1
 
 
-def test_case_13():
+def test_generator_13():
     """
     Test map column order when input_columns is None.
     """
@@ -391,7 +391,7 @@ def test_case_13():
         i = i + 1
 
 
-def test_case_14():
+def test_generator_14():
     """
     Test 1D Generator MP + CPP sampler
     """
@@ -408,7 +408,7 @@ def test_case_14():
             i = 0
 
 
-def test_case_15():
+def test_generator_15():
     """
     Test 1D Generator MP + Python sampler
     """
@@ -426,7 +426,7 @@ def test_case_15():
             i = 0
 
 
-def test_case_16():
+def test_generator_16():
     """
     Test multi column generator Mp + CPP sampler
     """
@@ -445,7 +445,7 @@ def test_case_16():
         i = i + 1
 
 
-def test_case_17():
+def test_generator_17():
     """
     Test multi column generator Mp + Python sampler
     """
@@ -465,7 +465,7 @@ def test_case_17():
         i = i + 1
 
 
-def test_case_error_1():
+def test_generator_error_1():
     def generator_np():
         for i in range(64):
             yield (np.array([{i}]),)
@@ -477,7 +477,7 @@ def test_case_error_1():
     assert "Invalid data type" in str(info.value)
 
 
-def test_case_error_2():
+def test_generator_error_2():
     def generator_np():
         for i in range(64):
             yield ({i},)
@@ -489,7 +489,7 @@ def test_case_error_2():
     assert "Generator should return a tuple of numpy arrays" in str(info.value)
 
 
-def test_case_error_3():
+def test_generator_error_3():
     with pytest.raises(ValueError) as info:
         # apply dataset operations
         data1 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
@@ -501,7 +501,7 @@ def test_case_error_3():
     assert "When (len(input_columns) != len(output_columns)), columns_order must be specified." in str(info.value)
 
 
-def test_case_error_4():
+def test_generator_error_4():
     with pytest.raises(RuntimeError) as info:
         # apply dataset operations
         data1 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
@@ -513,7 +513,7 @@ def test_case_error_4():
     assert "Unexpected error. Result of a tensorOp doesn't match output column names" in str(info.value)
 
 
-def test_sequential_sampler():
+def test_generator_sequential_sampler():
     source = [(np.array([x]),) for x in range(64)]
     ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler())
     i = 0
@@ -523,14 +523,14 @@ def test_sequential_sampler():
         i = i + 1
 
 
-def test_random_sampler():
+def test_generator_random_sampler():
     source = [(np.array([x]),) for x in range(64)]
     ds1 = ds.GeneratorDataset(source, ["data"], shuffle=True)
     for _ in ds1.create_dict_iterator():  # each data is a dictionary
         pass
 
 
-def test_distributed_sampler():
+def test_generator_distributed_sampler():
     source = [(np.array([x]),) for x in range(64)]
     for sid in range(8):
         ds1 = ds.GeneratorDataset(source, ["data"], shuffle=False, num_shards=8, shard_id=sid)
@@ -541,7 +541,7 @@ def test_distributed_sampler():
             i = i + 8
 
 
-def test_num_samples():
+def test_generator_num_samples():
     source = [(np.array([x]),) for x in range(64)]
     num_samples = 32
     ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler(num_samples=num_samples))
@@ -564,7 +564,7 @@ def test_num_samples():
     assert count == num_samples
 
 
-def test_num_samples_underflow():
+def test_generator_num_samples_underflow():
     source = [(np.array([x]),) for x in range(64)]
     num_samples = 256
     ds2 = ds.GeneratorDataset(source, ["data"], sampler=[i for i in range(64)], num_samples=num_samples)
@@ -600,7 +600,7 @@ def type_tester_with_type_check_2c_schema(t, c):
         i = i + 4
 
 
-def test_schema():
+def test_generator_schema():
     """
     Test 2 column Generator on different data type with type check with schema input
     """
@@ -615,9 +615,9 @@ def test_schema():
         type_tester_with_type_check_2c_schema(np_types[i], [de_types[i], de_types[i]])
 
 
-def manual_test_keyborad_interrupt():
+def manual_test_generator_keyboard_interrupt():
     """
-    Test keyborad_interrupt
+    Test keyboard_interrupt
     """
     logger.info("Test 1D Generator MP : 0 - 63")
 
@@ -635,31 +635,31 @@ def manual_test_keyborad_interrupt():
 
 
 if __name__ == "__main__":
-    test_case_0()
-    test_case_1()
-    test_case_2()
-    test_case_3()
-    test_case_4()
-    test_case_5()
-    test_case_6()
-    test_case_7()
-    test_case_8()
-    test_case_9()
-    test_case_10()
-    test_case_11()
-    test_case_12()
-    test_case_13()
-    test_case_14()
-    test_case_15()
-    test_case_16()
-    test_case_17()
-    test_case_error_1()
-    test_case_error_2()
-    test_case_error_3()
-    test_case_error_4()
-    test_sequential_sampler()
-    test_distributed_sampler()
-    test_random_sampler()
-    test_num_samples()
-    test_num_samples_underflow()
-    test_schema()
+    test_generator_0()
+    test_generator_1()
+    test_generator_2()
+    test_generator_3()
+    test_generator_4()
+    test_generator_5()
+    test_generator_6()
+    test_generator_7()
+    test_generator_8()
+    test_generator_9()
+    test_generator_10()
+    test_generator_11()
+    test_generator_12()
+    test_generator_13()
+    test_generator_14()
+    test_generator_15()
+    test_generator_16()
+    test_generator_17()
+    test_generator_error_1()
+    test_generator_error_2()
+    test_generator_error_3()
+    test_generator_error_4()
+    test_generator_sequential_sampler()
+    test_generator_distributed_sampler()
+    test_generator_random_sampler()
+    test_generator_num_samples()
+    test_generator_num_samples_underflow()
+    test_generator_schema()
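
The renamed generator tests above all exercise the same GeneratorDataset contract: the source yields a tuple holding one NumPy array per declared column, and the dataset is constructed with a matching list of column names. The following standalone sketch distills that pattern from the tests in this patch; it is illustrative only and assumes the dataset API used throughout this series:

    import numpy as np

    import mindspore.dataset as ds


    def generator_1d():
        # One row per yield; the tuple carries one NumPy array per column.
        for i in range(64):
            yield (np.array([i]),)


    # The column-name list must match the number of arrays yielded per row.
    data1 = ds.GeneratorDataset(generator_1d, ["data"])

    i = 0
    for item in data1.create_dict_iterator():  # each item is a dict keyed by column name
        golden = np.array([i])
        assert np.array_equal(item["data"], golden)
        i = i + 1
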
diff --git a/tests/ut/python/dataset/test_tfreader_op.py b/tests/ut/python/dataset/test_datasets_tfrecord.py
similarity index 100%
rename from tests/ut/python/dataset/test_tfreader_op.py
rename to tests/ut/python/dataset/test_datasets_tfrecord.py
diff --git a/tests/ut/python/dataset/test_iterator.py b/tests/ut/python/dataset/test_iterator.py
index af5a66e89e7..6413e79612b 100644
--- a/tests/ut/python/dataset/test_iterator.py
+++ b/tests/ut/python/dataset/test_iterator.py
@@ -33,7 +33,7 @@ def check(project_columns):
     assert all([np.array_equal(d1, d2) for d1, d2 in zip(data_actual, data_expected)])
 
 
-def test_case_iterator():
+def test_iterator_create_tuple():
     """
     Test creating tuple iterator
     """
@@ -95,7 +95,9 @@ class MyDict(dict):
 
 
 def test_tree_copy():
-    # Testing copying the tree with a pyfunc that cannot be pickled
+    """
+    Testing copying the tree with a pyfunc that cannot be pickled
+    """
 
     data = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=COLUMNS)
     data1 = data.map(operations=[MyDict()])
@@ -110,4 +112,6 @@ def test_tree_copy():
 
 
 if __name__ == '__main__':
+    test_iterator_create_tuple()
+    test_iterator_weak_ref()
     test_tree_copy()
diff --git a/tests/ut/python/dataset/test_shuffle.py b/tests/ut/python/dataset/test_shuffle.py
index 460c491ca1b..6da7a1c8856 100644
--- a/tests/ut/python/dataset/test_shuffle.py
+++ b/tests/ut/python/dataset/test_shuffle.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 # ==============================================================================
 import numpy as np
-from util import save_and_check
-
 import mindspore.dataset as ds
 from mindspore import log as logger
+from util import save_and_check_dict
 
 # Note: Number of rows in test.data dataset: 12
 DATA_DIR = ["../data/dataset/testTFTestAllTypes/test.data"]
@@ -31,7 +30,6 @@ def test_shuffle_01():
     # define parameters
     buffer_size = 5
     seed = 1
-    parameters = {"params": {'buffer_size': buffer_size, "seed": seed}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -39,7 +37,7 @@
     data1 = data1.shuffle(buffer_size=buffer_size)
 
     filename = "shuffle_01_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_shuffle_02():
@@ -50,7 +48,6 @@
     # define parameters
     buffer_size = 12
     seed = 1
-    parameters = {"params": {'buffer_size': buffer_size, "seed": seed}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -58,7 +55,7 @@
     data1 = data1.shuffle(buffer_size=buffer_size)
 
     filename = "shuffle_02_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_shuffle_03():
@@ -69,7 +66,6 @@
     # define parameters
     buffer_size = 2
     seed = 1
-    parameters = {"params": {'buffer_size': buffer_size, "seed": seed}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -77,7 +73,7 @@
     data1 = data1.shuffle(buffer_size)
 
     filename = "shuffle_03_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_shuffle_04():
@@ -88,7 +84,6 @@
     # define parameters
     buffer_size = 2
     seed = 1
-    parameters = {"params": {'buffer_size': buffer_size, "seed": seed}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, num_samples=2)
@@ -96,7 +91,7 @@
     data1 = data1.shuffle(buffer_size=buffer_size)
 
     filename = "shuffle_04_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_shuffle_05():
@@ -107,7 +102,6 @@
     # define parameters
     buffer_size = 13
     seed = 1
-    parameters = {"params": {'buffer_size': buffer_size, "seed": seed}}
 
     # apply dataset operations
     data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
@@ -115,7 +109,7 @@
     data1 = data1.shuffle(buffer_size=buffer_size)
 
     filename = "shuffle_05_result.npz"
-    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
+    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
 
 
 def test_shuffle_06():
diff --git a/tests/ut/python/dataset/util.py b/tests/ut/python/dataset/util.py
index 11c57354065..649f638b496 100644
--- a/tests/ut/python/dataset/util.py
+++ b/tests/ut/python/dataset/util.py
@@ -24,9 +24,6 @@ import numpy as np
 import mindspore.dataset as ds
 from mindspore import log as logger
 
-# These are the column names defined in the testTFTestAllTypes dataset
-COLUMNS = ["col_1d", "col_2d", "col_3d", "col_binary", "col_float",
-           "col_sint16", "col_sint32", "col_sint64"]
 # These are list of plot title in different visualize modes
 PLOT_TITLE_DICT = {
     1: ["Original image", "Transformed image"],
@@ -82,39 +79,6 @@ def _save_json(filename, parameters, result_dict):
         fout.write(jsbeautifier.beautify(json.dumps(out_dict), options))
 
 
-def save_and_check(data, parameters, filename, generate_golden=False):
-    """
-    Save the dataset dictionary and compare (as numpy array) with golden file.
-    Use create_dict_iterator to access the dataset.
-    Note: save_and_check() is deprecated; use save_and_check_dict().
-    """
-    num_iter = 0
-    result_dict = {}
-    for column_name in COLUMNS:
-        result_dict[column_name] = []
-
-    for item in data.create_dict_iterator():  # each data is a dictionary
-        for data_key in list(item.keys()):
-            if data_key not in result_dict:
-                result_dict[data_key] = []
-            result_dict[data_key].append(item[data_key].tolist())
-        num_iter += 1
-
-    logger.info("Number of data in data1: {}".format(num_iter))
-
-    cur_dir = os.path.dirname(os.path.realpath(__file__))
-    golden_ref_dir = os.path.join(cur_dir, "../../data/dataset", 'golden', filename)
-    if generate_golden:
-        # Save as the golden result
-        _save_golden(cur_dir, golden_ref_dir, result_dict)
-
-    _compare_to_golden(golden_ref_dir, result_dict)
-
-    if SAVE_JSON:
-        # Save result to a json file for inspection
-        _save_json(filename, parameters, result_dict)
-
-
 def save_and_check_dict(data, filename, generate_golden=False):
     """
     Save the dataset dictionary and compare (as dictionary) with golden file.
@@ -203,6 +167,29 @@ def save_and_check_tuple(data, parameters, filename, generate_golden=False):
         _save_json(filename, parameters, result_dict)
 
 
+def config_get_set_seed(seed_new):
+    """
+    Get and return the original configuration seed value.
+    Set the new configuration seed value.
+    """
+    seed_original = ds.config.get_seed()
+    ds.config.set_seed(seed_new)
+    logger.info("seed: original = {} new = {} ".format(seed_original, seed_new))
+    return seed_original
+
+
+def config_get_set_num_parallel_workers(num_parallel_workers_new):
+    """
+    Get and return the original configuration num_parallel_workers value.
+    Set the new configuration num_parallel_workers value.
+    """
+    num_parallel_workers_original = ds.config.get_num_parallel_workers()
+    ds.config.set_num_parallel_workers(num_parallel_workers_new)
+    logger.info("num_parallel_workers: original = {} new = {} ".format(num_parallel_workers_original,
+                                                                       num_parallel_workers_new))
+    return num_parallel_workers_original
+
+
 def diff_mse(in1, in2):
     mse = (np.square(in1.astype(float) / 255 - in2.astype(float) / 255)).mean()
     return mse * 100
@@ -265,29 +252,6 @@ def visualize_image(image_original, image_de, mse=None, image_lib=None):
     plt.show()
 
 
-def config_get_set_seed(seed_new):
-    """
-    Get and return the original configuration seed value.
-    Set the new configuration seed value.
-    """
-    seed_original = ds.config.get_seed()
-    ds.config.set_seed(seed_new)
-    logger.info("seed: original = {} new = {} ".format(seed_original, seed_new))
-    return seed_original
-
-
-def config_get_set_num_parallel_workers(num_parallel_workers_new):
-    """
-    Get and return the original configuration num_parallel_workers value.
-    Set the new configuration num_parallel_workers value.
-    """
-    num_parallel_workers_original = ds.config.get_num_parallel_workers()
-    ds.config.set_num_parallel_workers(num_parallel_workers_new)
-    logger.info("num_parallel_workers: original = {} new = {} ".format(num_parallel_workers_original,
-                                                                       num_parallel_workers_new))
-    return num_parallel_workers_original
-
-
 def visualize_with_bounding_boxes(orig, aug, annot_name="annotation", plot_rows=3):
     """
     Take a list of un-augmented and augmented images with "annotation" bounding boxes
+ Set the new configuration num_parallel_workers value. + """ + num_parallel_workers_original = ds.config.get_num_parallel_workers() + ds.config.set_num_parallel_workers(num_parallel_workers_new) + logger.info("num_parallel_workers: original = {} new = {} ".format(num_parallel_workers_original, + num_parallel_workers_new)) + return num_parallel_workers_original + + def diff_mse(in1, in2): mse = (np.square(in1.astype(float) / 255 - in2.astype(float) / 255)).mean() return mse * 100 @@ -265,29 +252,6 @@ def visualize_image(image_original, image_de, mse=None, image_lib=None): plt.show() -def config_get_set_seed(seed_new): - """ - Get and return the original configuration seed value. - Set the new configuration seed value. - """ - seed_original = ds.config.get_seed() - ds.config.set_seed(seed_new) - logger.info("seed: original = {} new = {} ".format(seed_original, seed_new)) - return seed_original - - -def config_get_set_num_parallel_workers(num_parallel_workers_new): - """ - Get and return the original configuration num_parallel_workers value. - Set the new configuration num_parallel_workers value. - """ - num_parallel_workers_original = ds.config.get_num_parallel_workers() - ds.config.set_num_parallel_workers(num_parallel_workers_new) - logger.info("num_parallel_workers: original = {} new = {} ".format(num_parallel_workers_original, - num_parallel_workers_new)) - return num_parallel_workers_original - - def visualize_with_bounding_boxes(orig, aug, annot_name="annotation", plot_rows=3): """ Take a list of un-augmented and augmented images with "annotation" bounding boxes From 595767b4b5e33f37e6dbcad477731f35ea41f12a Mon Sep 17 00:00:00 2001 From: fary86 Date: Thu, 16 Jul 2020 04:26:44 +0800 Subject: [PATCH 12/68] Fix bug of mindspore dtype equal --- mindspore/core/ir/dtype_py.cc | 8 ++++++-- tests/ut/python/ir/test_dtype.py | 8 ++++++++ tests/ut/python/ops/test_tensor_slice.py | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/mindspore/core/ir/dtype_py.cc b/mindspore/core/ir/dtype_py.cc index 66bd8ba5f6f..b1e2151b6dd 100644 --- a/mindspore/core/ir/dtype_py.cc +++ b/mindspore/core/ir/dtype_py.cc @@ -36,8 +36,12 @@ REGISTER_PYBIND_DEFINE( (void)m_sub.def("str_to_type", &StringToType, "string to typeptr"); (void)py::class_>(m_sub, "Type") .def_readonly(PYTHON_DTYPE_FLAG, &mindspore::Type::parse_info_) - .def("__eq__", - [](const TypePtr &t1, const TypePtr &t2) { + .def("__eq__", + [](const TypePtr &t1, const py::object &other) { + if (!py::isinstance(other)) { + return false; + } + auto t2 = py::cast(other); if (t1 != nullptr && t2 != nullptr) { return *t1 == *t2; } diff --git a/tests/ut/python/ir/test_dtype.py b/tests/ut/python/ir/test_dtype.py index 1523a77ea39..49f834092e0 100644 --- a/tests/ut/python/ir/test_dtype.py +++ b/tests/ut/python/ir/test_dtype.py @@ -134,3 +134,11 @@ def test_dtype(): with pytest.raises(NotImplementedError): x = 1.5 dtype.get_py_obj_dtype(type(type(x))) + + +def test_type_equal(): + t1 = (dtype.int32, dtype.int32) + valid_types = [dtype.float16, dtype.float32] + assert t1 not in valid_types + assert dtype.int32 not in valid_types + assert dtype.float32 in valid_types diff --git a/tests/ut/python/ops/test_tensor_slice.py b/tests/ut/python/ops/test_tensor_slice.py index 66590945da7..ec8b9957a22 100644 --- a/tests/ut/python/ops/test_tensor_slice.py +++ b/tests/ut/python/ops/test_tensor_slice.py @@ -971,7 +971,7 @@ raise_error_set = [ Tensor(np.random.randint(7, size=(3, 4, 5)), mstype.int32)], }), ('TensorGetItemByMixedTensorsTypeError', { - 'block': 
(TensorGetItemByMixedTensorsTypeError(), {'exception': TypeError}), + 'block': (TensorGetItemByMixedTensorsTypeError(), {'exception': IndexError}), 'desc_inputs': [Tensor(np.arange(3 * 4 * 5 * 6 * 7 * 8 * 9).reshape((3, 4, 5, 6, 7, 8, 9)), mstype.int32), Tensor(np.random.randint(3, size=(3, 4, 5)), mstype.int32), Tensor(np.random.randint(4, size=(3, 4, 5)), mstype.int32)], From b48592ce79a799d55a6d21b7c41cd2a8bfc1c955 Mon Sep 17 00:00:00 2001 From: lvliang Date: Wed, 15 Jul 2020 14:28:32 +0800 Subject: [PATCH 13/68] enable-stream-sync-when-context-is-graph-but-run-in-pynative --- mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc index 1a87f3e6afe..1b09ef94b65 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc @@ -100,7 +100,7 @@ void AscendDeviceAddress::SyncStream() const { MS_LOG(INFO) << "Start!"; auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); - if (ms_context->execution_mode() != kPynativeMode) { + if (ms_context->execution_mode() != kPynativeMode && !ms_context->enable_pynative_infer()) { MS_LOG(INFO) << "Finish!"; return; } From cdadcb4a700dd07421222df4aaabc074476beae3 Mon Sep 17 00:00:00 2001 From: kingfo Date: Wed, 15 Jul 2020 18:13:03 +0800 Subject: [PATCH 14/68] replace unsafe function --- .../ccsrc/frontend/optimizer/irpass/arithmetic_simplify.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mindspore/ccsrc/frontend/optimizer/irpass/arithmetic_simplify.cc b/mindspore/ccsrc/frontend/optimizer/irpass/arithmetic_simplify.cc index 83f7fae5826..5c7af1eb597 100644 --- a/mindspore/ccsrc/frontend/optimizer/irpass/arithmetic_simplify.cc +++ b/mindspore/ccsrc/frontend/optimizer/irpass/arithmetic_simplify.cc @@ -153,7 +153,7 @@ AnfNodePtr TensorMultiplyBase::NewTensorFilledWithData(const AnfNodePtr &node, c char *data = reinterpret_cast(new_tensor_ptr->data_c()); if (x == nullptr) { - std::memset(data, 0, mem_size); + memset_s(data, mem_size, 0, mem_size); auto new_vnode = NewValueNode(new_tensor_ptr); new_vnode->set_abstract(new_tensor_ptr->ToAbstract()); return new_vnode; @@ -188,10 +188,11 @@ AnfNodePtr TensorMultiplyBase::NewTensorFilledWithData(const AnfNodePtr &node, c char *source_data = reinterpret_cast(GetPointerToTensorData(x)); if (x_tensor_ptr->DataSize() == 1) { for (int i = 0; i < new_tensor_ptr->ElementsNum(); i++) { - memcpy(data + i * GetTypeByte(tensor_type_ptr), source_data, GetTypeByte(tensor_type_ptr)); + memcpy_s(data + i * GetTypeByte(tensor_type_ptr), GetTypeByte(tensor_type_ptr), source_data, + GetTypeByte(tensor_type_ptr)); } } else { - memcpy(data, source_data, mem_size); + memcpy_s(data, mem_size, source_data, mem_size); } auto new_vnode = NewValueNode(new_tensor_ptr); new_vnode->set_abstract(new_tensor_ptr->ToAbstract()); From 7c1b44731ed3fe7f973c386b849fc3dd76d1cced Mon Sep 17 00:00:00 2001 From: wuyongkang Date: Thu, 16 Jul 2020 09:43:36 +0800 Subject: [PATCH 15/68] Optimization for map_data_.clear() --- mindspore/ccsrc/utils/ordered_map.h | 4 +++- mindspore/ccsrc/utils/ordered_set.h | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mindspore/ccsrc/utils/ordered_map.h b/mindspore/ccsrc/utils/ordered_map.h index 48aa36df31f..56ae281dcf1 100644 --- a/mindspore/ccsrc/utils/ordered_map.h 
+++ b/mindspore/ccsrc/utils/ordered_map.h @@ -88,7 +88,9 @@ class OrderedMap { } void clear() { - map_data_.clear(); + if (!map_data_.empty()) { + map_data_.clear(); + } sequential_data_.clear(); } diff --git a/mindspore/ccsrc/utils/ordered_set.h b/mindspore/ccsrc/utils/ordered_set.h index f393ce74f2e..8d2f7af15c3 100644 --- a/mindspore/ccsrc/utils/ordered_set.h +++ b/mindspore/ccsrc/utils/ordered_set.h @@ -127,7 +127,9 @@ class OrderedSet { // Clear the elements void clear() { - mapped_data_.clear(); + if (!mapped_data_.empty()) { + mapped_data_.clear(); + } ordered_data_.clear(); } From 236d65b7d31c1c653446a3d696e348237213f91b Mon Sep 17 00:00:00 2001 From: caifubi Date: Thu, 16 Jul 2020 10:00:05 +0800 Subject: [PATCH 16/68] Fix graph id changed in data dump release --- mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc | 7 ++++--- mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.h | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc index ca9a74022ac..e1e3ee61a7d 100644 --- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc +++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc @@ -71,6 +71,8 @@ void DataDumper::LoadDumpInfo() { } RtLoadDumpData(dump_info, &dev_load_mem_); load_flag_ = true; + // graph id may have changed in Unload + graph_id_ = kernel_graph_->graph_id(); MS_LOG(INFO) << "[DataDump] LoadDumpInfo end"; } @@ -135,11 +137,10 @@ void DataDumper::UnloadDumpInfo() { MS_LOG(WARNING) << "Load not success, no need to unload"; return; } - MS_EXCEPTION_IF_NULL(kernel_graph_); - MS_LOG(INFO) << "[DataDump] UnloadDumpInfo start. 
graphId:" << graph_id_; aicpu::dump::OpMappingInfo op_mapping_info; - op_mapping_info.set_model_id(kernel_graph_->graph_id()); + op_mapping_info.set_model_id(graph_id_); op_mapping_info.set_flag(kAicpuUnloadFlag); for (const auto &kernel_name : dump_kernel_names_) { diff --git a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.h b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.h index d99eb4db686..82cfd1919dc 100644 --- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.h +++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.h @@ -42,6 +42,7 @@ class DataDumper { : load_flag_(false), dev_load_mem_(nullptr), dev_unload_mem_(nullptr), + graph_id_(UINT32_MAX), kernel_graph_(kernel_graph), runtime_info_map_(runtime_info_map) {} ~DataDumper(); @@ -58,6 +59,7 @@ class DataDumper { bool load_flag_; void *dev_load_mem_; void *dev_unload_mem_; + uint32_t graph_id_; std::vector dump_kernel_names_; const session::KernelGraph *kernel_graph_; std::map> runtime_info_map_; From 89462e9c3b944f942b228867e373ae8188ad5851 Mon Sep 17 00:00:00 2001 From: Li Hongzhang Date: Tue, 14 Jul 2020 15:45:24 +0800 Subject: [PATCH 17/68] check disk space before writing and remove unused mode value --- mindspore/train/summary/_summary_writer.py | 15 +++-- mindspore/train/summary/_writer_pool.py | 69 ++++++++++++++-------- mindspore/train/summary/summary_record.py | 6 +- 3 files changed, 56 insertions(+), 34 deletions(-) diff --git a/mindspore/train/summary/_summary_writer.py b/mindspore/train/summary/_summary_writer.py index 36d020819a5..a5648fc94e2 100644 --- a/mindspore/train/summary/_summary_writer.py +++ b/mindspore/train/summary/_summary_writer.py @@ -15,6 +15,7 @@ """Writes events to disk in a logdir.""" import os import stat +from shutil import disk_usage from ..._c_expression import EventWriter_ from ._summary_adapter import package_init_event @@ -42,9 +43,11 @@ class BaseWriter: self.init_writer() return self._writer - def write(self, plugin, mode, data): + def write(self, plugin, data): """Write data to file.""" - raise NotImplementedError() + if self.writer and disk_usage(self._filepath).free < len(data) * 32: + raise RuntimeError('The disk space may soon be exhausted.') + self.writer.Write(data) def flush(self): """Flush the writer.""" @@ -64,16 +67,16 @@ class SummaryWriter(BaseWriter): """Write some metadata etc.""" self.writer.Write(package_init_event().SerializeToString()) - def write(self, plugin, mode, data): + def write(self, plugin, data): """Write data to file.""" if plugin in ('summary', 'graph'): - self.writer.Write(data) + super().write(plugin, data) class LineageWriter(BaseWriter): """LineageWriter for write lineage.""" - def write(self, plugin, mode, data): + def write(self, plugin, data): """Write data to file.""" if plugin in ('dataset_graph', 'train_lineage', 'eval_lineage', 'custom_lineage_data'): - self.writer.Write(data) + super().write(plugin, data) diff --git a/mindspore/train/summary/_writer_pool.py b/mindspore/train/summary/_writer_pool.py index d9cdfd3c8cd..d0cf998b30c 100644 --- a/mindspore/train/summary/_writer_pool.py +++ b/mindspore/train/summary/_writer_pool.py @@ -18,6 +18,8 @@ import time from collections import deque from multiprocessing import Pool, Process, Queue, cpu_count +import mindspore.log as logger + from ._lineage_adapter import serialize_to_lineage_event from ._summary_adapter import package_graph_event, package_summary_event from ._summary_writer import LineageWriter, SummaryWriter @@ -25,20 +27,18 @@ from ._summary_writer import 
LineageWriter, SummaryWriter def _pack_data(datadict, wall_time): """Pack data according to which plugin.""" - result = [] - summaries, step, mode = [], None, None + result, summaries, step = [], [], None for plugin, datalist in datadict.items(): for data in datalist: if plugin == 'graph': - result.append([plugin, data.get('mode'), package_graph_event(data.get('value')).SerializeToString()]) + result.append([plugin, package_graph_event(data.get('value')).SerializeToString()]) elif plugin in ('train_lineage', 'eval_lineage', 'custom_lineage_data', 'dataset_graph'): - result.append([plugin, data.get('mode'), serialize_to_lineage_event(plugin, data.get('value'))]) + result.append([plugin, serialize_to_lineage_event(plugin, data.get('value'))]) elif plugin in ('scalar', 'tensor', 'histogram', 'image'): summaries.append({'_type': plugin.title(), 'name': data.get('tag'), 'data': data.get('value')}) step = data.get('step') - mode = data.get('mode') if summaries: - result.append(['summary', mode, package_summary_event(summaries, step, wall_time).SerializeToString()]) + result.append(['summary', package_summary_event(summaries, step, wall_time).SerializeToString()]) return result @@ -54,46 +54,65 @@ class WriterPool(Process): def __init__(self, base_dir, **filedict) -> None: super().__init__() self._base_dir, self._filedict = base_dir, filedict - self._queue = Queue(cpu_count() * 2) + self._queue, self._writers_ = Queue(cpu_count() * 2), None self.start() def run(self): - writers = self._get_writers() - with Pool(min(cpu_count(), 32)) as pool: deq = deque() while True: while deq and deq[0].ready(): - for plugin, mode, data in deq.popleft().get(): - for writer in writers: - writer.write(plugin, mode, data) + for plugin, data in deq.popleft().get(): + self._write(plugin, data) - if not self._queue.empty(): + if not self._queue.empty() and self._writers: action, data = self._queue.get() if action == 'WRITE': deq.append(pool.apply_async(_pack_data, (data, time.time()))) elif action == 'FLUSH': - for writer in writers: - writer.flush() + self._flush() elif action == 'END': break for result in deq: - for plugin, mode, data in result.get(): - for writer in writers: - writer.write(plugin, mode, data) + for plugin, data in result.get(): + self._write(plugin, data) - for writer in writers: - writer.close() + self._close() - def _get_writers(self): - writers = [] + @property + def _writers(self): + """Get the writers in the subprocess.""" + if self._writers_ is not None: + return self._writers_ + self._writers_ = [] for plugin, filename in self._filedict.items(): filepath = os.path.join(self._base_dir, filename) if plugin == 'summary': - writers.append(SummaryWriter(filepath)) + self._writers_.append(SummaryWriter(filepath)) elif plugin == 'lineage': - writers.append(LineageWriter(filepath)) + self._writers_.append(LineageWriter(filepath)) - return writers + return self._writers_ + + def _write(self, plugin, data): + """Write the data in the subprocess.""" + for writer in self._writers[:]: + try: + writer.write(plugin, data) + except RuntimeError: + logger.warning(f'The disk space may soon be exhausted by this {type(writer).__name__}, ' + 'so the writer will be closed and no longer used for writing.') + self._writers.remove(writer) + writer.close() + + def _flush(self): + """Flush the writers in the subprocess.""" + for writer in self._writers: + writer.flush() + + def _close(self): + """Close the writers in the subprocess.""" + for writer in self._writers: + writer.close() def write(self, data) -> None: """ 
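Stripped of the writer class hierarchy, the disk guard introduced above reduces to the small pattern below; write_with_guard and safety_factor are illustrative names rather than part of the patch, and the factor of 32 mirrors the margin chosen in BaseWriter.write:

from shutil import disk_usage

def write_with_guard(filepath, raw_writer, data, safety_factor=32):
    # Refuse the write when the free space cannot hold a generous
    # multiple of the payload, so the process fails before the disk does.
    if disk_usage(filepath).free < len(data) * safety_factor:
        raise RuntimeError('The disk space may soon be exhausted.')
    raw_writer.Write(data)

Raising RuntimeError in particular is what lets WriterPool._write above catch the failure, drop only the affected writer, and keep the remaining writers alive.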
diff --git a/mindspore/train/summary/summary_record.py b/mindspore/train/summary/summary_record.py index 21c8c58d3bb..2bc605797fc 100644 --- a/mindspore/train/summary/summary_record.py +++ b/mindspore/train/summary/summary_record.py @@ -218,14 +218,14 @@ class SummaryRecord: if name in {item['tag'] for item in self._data_pool[plugin]}: entry = repr(f'{name}/{plugin}') logger.warning(f'{entry} has duplicate values. Only the newest one will be recorded.') - self._data_pool[plugin].append(dict(tag=name, mode=self._mode, value=np_value)) + self._data_pool[plugin].append(dict(tag=name, value=np_value)) elif plugin in ('train_lineage', 'eval_lineage', 'dataset_graph', 'custom_lineage_data'): _check_lineage_value(plugin, value) - self._data_pool[plugin].append(dict(mode=self._mode, value=value.SerializeToString())) + self._data_pool[plugin].append(dict(value=value.SerializeToString())) elif plugin == 'graph': package_graph_event(value) - self._data_pool[plugin].append(dict(mode=self._mode, value=value)) + self._data_pool[plugin].append(dict(value=value)) else: raise ValueError(f'No such plugin of {repr(plugin)}') From 0b930bd859bd58abce4df2bcf861ec0289430b27 Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Thu, 16 Jul 2020 09:42:52 +0800 Subject: [PATCH 18/68] ssim raise type error for unsupported dtypes --- mindspore/nn/layer/image.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mindspore/nn/layer/image.py b/mindspore/nn/layer/image.py index 63ae7a94ace..88ab386c1ae 100644 --- a/mindspore/nn/layer/image.py +++ b/mindspore/nn/layer/image.py @@ -99,6 +99,10 @@ def _check_input_filter_size(input_shape, param_name, filter_size, func_name): validator.check(param_name + " shape[2]", input_shape[2], "filter_size", filter_size, Rel.GE, func_name) validator.check(param_name + " shape[3]", input_shape[3], "filter_size", filter_size, Rel.GE, func_name) +@constexpr +def _check_input_dtype(input_dtype, param_name, allow_dtypes, cls_name): + validator.check_type_name(param_name, input_dtype, allow_dtypes, cls_name) + def _conv2d(in_channels, out_channels, kernel_size, weight, stride=1, padding=0): return Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, weight_init=weight, padding=padding, pad_mode="valid") @@ -211,6 +215,7 @@ class SSIM(Cell): self.concat = P.Concat(axis=1) def construct(self, img1, img2): + _check_input_dtype(F.dtype(img1), "img1", [mstype.float32, mstype.float16], self.cls_name) _check_input_filter_size(F.shape(img1), "img1", self.filter_size, self.cls_name) P.SameTypeShape()(img1, img2) max_val = _convert_img_dtype_to_float32(self.max_val, self.max_val) From 35c3a6370118fd18f285a290c78612368110c238 Mon Sep 17 00:00:00 2001 From: avakh Date: Tue, 14 Jul 2020 23:57:34 -0400 Subject: [PATCH 19/68] support cpp invert operation --- .../minddata/dataset/api/python_bindings.cc | 5 + .../dataset/kernels/image/CMakeLists.txt | 1 + .../dataset/kernels/image/invert_op.cc | 57 ++++++ .../dataset/kernels/image/invert_op.h | 44 +++++ .../minddata/dataset/kernels/tensor_op.h | 1 + .../dataset/transforms/vision/c_transforms.py | 7 + .../dataset/golden/invert_01_result_c.npz | Bin 0 -> 713 bytes ..._01_result.npz => invert_01_result_py.npz} | Bin tests/ut/python/dataset/test_invert.py | 180 ++++++++++++++++-- 9 files changed, 284 insertions(+), 11 deletions(-) create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/image/invert_op.cc create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/image/invert_op.h create mode 100644 
tests/ut/data/dataset/golden/invert_01_result_c.npz rename tests/ut/data/dataset/golden/{invert_01_result.npz => invert_01_result_py.npz} (100%) diff --git a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc index 173c1af2f22..90c33cd306e 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc @@ -54,6 +54,7 @@ #include "minddata/dataset/kernels/image/decode_op.h" #include "minddata/dataset/kernels/image/hwc_to_chw_op.h" #include "minddata/dataset/kernels/image/image_utils.h" +#include "minddata/dataset/kernels/image/invert_op.h" #include "minddata/dataset/kernels/image/normalize_op.h" #include "minddata/dataset/kernels/image/pad_op.h" #include "minddata/dataset/kernels/image/random_color_adjust_op.h" @@ -362,6 +363,10 @@ void bindTensorOps1(py::module *m) { .def(py::init(), py::arg("meanR"), py::arg("meanG"), py::arg("meanB"), py::arg("stdR"), py::arg("stdG"), py::arg("stdB")); + (void)py::class_>(*m, "InvertOp", + "Tensor operation to apply invert on RGB images.") + .def(py::init<>()); + (void)py::class_>( *m, "RescaleOp", "Tensor operation to rescale an image. Takes scale and shift.") .def(py::init(), py::arg("rescale"), py::arg("shift")); diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt index bfc27a920bd..402989af0de 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt @@ -6,6 +6,7 @@ add_library(kernels-image OBJECT decode_op.cc hwc_to_chw_op.cc image_utils.cc + invert_op.cc normalize_op.cc pad_op.cc random_color_adjust_op.cc diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/invert_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/invert_op.cc new file mode 100644 index 00000000000..44a7f1f5b49 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/invert_op.cc @@ -0,0 +1,57 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "minddata/dataset/kernels/image/invert_op.h" +#include "minddata/dataset/kernels/image/image_utils.h" +#include "minddata/dataset/core/cv_tensor.h" +#include "minddata/dataset/util/status.h" + +namespace mindspore { +namespace dataset { + +// only supports RGB images + +Status InvertOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) { + IO_CHECK(input, output); + + try { + std::shared_ptr<CVTensor> input_cv = CVTensor::AsCVTensor(input); + cv::Mat input_img = input_cv->mat(); + if (!input_cv->mat().data) { + RETURN_STATUS_UNEXPECTED("Could not convert to CV Tensor"); + } + + if (input_cv->Rank() != 3) { + RETURN_STATUS_UNEXPECTED("The shape is incorrect: rank != 3"); + } + int num_channels = input_cv->shape()[2]; + if (num_channels != 3) { + RETURN_STATUS_UNEXPECTED("The shape is incorrect: num of channels != 3"); + } + + auto output_cv = std::make_shared<CVTensor>(input_cv->shape(), input_cv->type()); + RETURN_UNEXPECTED_IF_NULL(output_cv); + + output_cv->mat() = cv::Scalar::all(255) - input_img; + *output = std::static_pointer_cast<Tensor>(output_cv); + } + + catch (const cv::Exception &e) { + RETURN_STATUS_UNEXPECTED("Error in invert"); + } + return Status::OK(); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/invert_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/invert_op.h new file mode 100644 index 00000000000..01a3ef34495 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/invert_op.h @@ -0,0 +1,44 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DATASET_KERNELS_IMAGE_INVERT_OP_H +#define DATASET_KERNELS_IMAGE_INVERT_OP_H + +#include <memory> +#include <string> + +#include "minddata/dataset/core/tensor.h" +#include "minddata/dataset/kernels/tensor_op.h" +#include "minddata/dataset/util/status.h" + +namespace mindspore { +namespace dataset { +class InvertOp : public TensorOp { + public: + InvertOp() {} + ~InvertOp() = default; + + // Description: A function that prints info about the node + void Print(std::ostream &out) const override { out << Name(); } + + Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override; + + std::string Name() const override { return kInvertOp; } +}; +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_KERNELS_IMAGE_INVERT_OP_H diff --git a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h index 00b4fa5efb3..27bcfed0077 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h +++ b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h @@ -92,6 +92,7 @@ constexpr char kDecodeOp[] = "DecodeOp"; constexpr char kCenterCropOp[] = "CenterCropOp"; constexpr char kCutOutOp[] = "CutOutOp"; constexpr char kHwcToChwOp[] = "HwcToChwOp"; +constexpr char kInvertOp[] = "InvertOp"; constexpr char kNormalizeOp[] = "NormalizeOp"; constexpr char kPadOp[] = "PadOp"; constexpr char kRandomColorAdjustOp[] = "RandomColorAdjustOp"; diff --git a/mindspore/dataset/transforms/vision/c_transforms.py b/mindspore/dataset/transforms/vision/c_transforms.py index 2de575d14d6..ca356dd79c3 100644 --- a/mindspore/dataset/transforms/vision/c_transforms.py +++ b/mindspore/dataset/transforms/vision/c_transforms.py @@ -71,6 +71,13 @@ def parse_padding(padding): return padding +class Invert(cde.InvertOp): + """ + Apply invert on input image in RGB mode. + It does not take any input arguments. + """ + + class Decode(cde.DecodeOp): """ Decode the input image in RGB mode. diff --git a/tests/ut/data/dataset/golden/invert_01_result_c.npz b/tests/ut/data/dataset/golden/invert_01_result_c.npz new file mode 100644 index 0000000000000000000000000000000000000000..0a819192d2ed61c125bd8079b95e94d43114f23d GIT binary patch literal 713 zcmWIWW@Zs#fB;1X$+_HYj!X;;Ak4`i!jM>06mOuHS5V2wAOIEwDFjJ^z+}Hr-+)L) zhBAg~^_0}&LuqFrRwFD=9FXt-J4j+6WU4|1Hj% z|OBvP34RP!?*5T!t7nJ@O@m3Q4J8muBWBrl%Gv#uh5U zU8)Q+L?x(D6>5kY*bv#{v1y$d5ibv~HvTO?SwFc@y`)eBZm}jvk5*8jHdK!ec0IZ! 
vg?dS;sU?N_NqPa^j7%cTxKbZ55TT#}6yS)o7~svy1`=QdLUSNp15S?sP!YdG literal 0 HcmV?d00001 diff --git a/tests/ut/data/dataset/golden/invert_01_result.npz b/tests/ut/data/dataset/golden/invert_01_result_py.npz similarity index 100% rename from tests/ut/data/dataset/golden/invert_01_result.npz rename to tests/ut/data/dataset/golden/invert_01_result_py.npz diff --git a/tests/ut/python/dataset/test_invert.py b/tests/ut/python/dataset/test_invert.py index f366553c6ef..4f70c5a7ee2 100644 --- a/tests/ut/python/dataset/test_invert.py +++ b/tests/ut/python/dataset/test_invert.py @@ -19,18 +19,20 @@ import numpy as np import mindspore.dataset.engine as de import mindspore.dataset.transforms.vision.py_transforms as F +import mindspore.dataset.transforms.vision.c_transforms as C from mindspore import log as logger -from util import visualize_list, save_and_check_md5 +from util import visualize_list, save_and_check_md5, diff_mse DATA_DIR = "../data/dataset/testImageNetData/train/" GENERATE_GOLDEN = False -def test_invert(plot=False): + +def test_invert_py(plot=False): """ - Test Invert + Test Invert python op """ - logger.info("Test Invert") + logger.info("Test Invert Python op") # Original Images ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) @@ -52,7 +54,7 @@ def test_invert(plot=False): np.transpose(image, (0, 2, 3, 1)), axis=0) - # Color Inverted Images + # Color Inverted Images ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) transforms_invert = F.ComposeOp([F.Decode(), @@ -83,11 +85,143 @@ def test_invert(plot=False): visualize_list(images_original, images_invert) -def test_invert_md5(): +def test_invert_c(plot=False): """ - Test Invert with md5 check + Test Invert Cpp op """ - logger.info("Test Invert with md5 check") + logger.info("Test Invert cpp op") + + # Original Images + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + + transforms_original = [C.Decode(), C.Resize(size=[224, 224])] + + ds_original = ds.map(input_columns="image", + operations=transforms_original) + + ds_original = ds_original.batch(512) + + for idx, (image, _) in enumerate(ds_original): + if idx == 0: + images_original = image + else: + images_original = np.append(images_original, + image, + axis=0) + + # Invert Images + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + + transform_invert = [C.Decode(), C.Resize(size=[224, 224]), + C.Invert()] + + ds_invert = ds.map(input_columns="image", + operations=transform_invert) + + ds_invert = ds_invert.batch(512) + + for idx, (image, _) in enumerate(ds_invert): + if idx == 0: + images_invert = image + else: + images_invert = np.append(images_invert, + image, + axis=0) + if plot: + visualize_list(images_original, images_invert) + + num_samples = images_original.shape[0] + mse = np.zeros(num_samples) + for i in range(num_samples): + mse[i] = diff_mse(images_invert[i], images_original[i]) + logger.info("MSE= {}".format(str(np.mean(mse)))) + + +def test_invert_py_c(plot=False): + """ + Test Invert Cpp op and python op + """ + logger.info("Test Invert cpp and python op") + + # Invert Images in cpp + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + ds = ds.map(input_columns=["image"], + operations=[C.Decode(), C.Resize((224, 224))]) + + ds_c_invert = ds.map(input_columns="image", + operations=C.Invert()) + + ds_c_invert = ds_c_invert.batch(512) + + for idx, (image, _) in enumerate(ds_c_invert): + if idx == 0: + images_c_invert = image + else: + images_c_invert = np.append(images_c_invert, + image, + 
axis=0) + + # invert images in python + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + ds = ds.map(input_columns=["image"], + operations=[C.Decode(), C.Resize((224, 224))]) + + transforms_p_invert = F.ComposeOp([lambda img: img.astype(np.uint8), + F.ToPIL(), + F.Invert(), + np.array]) + + ds_p_invert = ds.map(input_columns="image", + operations=transforms_p_invert()) + + ds_p_invert = ds_p_invert.batch(512) + + for idx, (image, _) in enumerate(ds_p_invert): + if idx == 0: + images_p_invert = image + else: + images_p_invert = np.append(images_p_invert, + image, + axis=0) + + num_samples = images_c_invert.shape[0] + mse = np.zeros(num_samples) + for i in range(num_samples): + mse[i] = diff_mse(images_p_invert[i], images_c_invert[i]) + logger.info("MSE= {}".format(str(np.mean(mse)))) + + if plot: + visualize_list(images_c_invert, images_p_invert, visualize_mode=2) + + +def test_invert_one_channel(): + """ + Test Invert cpp op with one channel image + """ + logger.info("Test Invert C Op With One Channel Images") + + c_op = C.Invert() + + try: + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + ds = ds.map(input_columns=["image"], + operations=[C.Decode(), + C.Resize((224, 224)), + lambda img: np.array(img[:, :, 0])]) + + ds.map(input_columns="image", + operations=c_op) + + except RuntimeError as e: + logger.info("Got an exception in DE: {}".format(str(e))) + assert "The shape" in str(e) + + +def test_invert_md5_py(): + """ + Test Invert python op with md5 check + """ + logger.info("Test Invert python op with md5 check") # Generate dataset ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) @@ -98,10 +232,34 @@ def test_invert_md5(): data = ds.map(input_columns="image", operations=transforms_invert()) # Compare with expected md5 from images - filename = "invert_01_result.npz" + filename = "invert_01_result_py.npz" + save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) + + +def test_invert_md5_c(): + """ + Test Invert cpp op with md5 check + """ + logger.info("Test Invert cpp op with md5 check") + + # Generate dataset + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + + transforms_invert = [C.Decode(), + C.Resize(size=[224, 224]), + C.Invert(), + F.ToTensor()] + + data = ds.map(input_columns="image", operations=transforms_invert) + # Compare with expected md5 from images + filename = "invert_01_result_c.npz" save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) if __name__ == "__main__": - test_invert(plot=True) - test_invert_md5() + test_invert_py(plot=False) + test_invert_c(plot=False) + test_invert_py_c(plot=False) + test_invert_one_channel() + test_invert_md5_py() + test_invert_md5_c() From 39f08eb7dd685d7b0dce05f23d1f81e4a8955240 Mon Sep 17 00:00:00 2001 From: Ziyan Date: Tue, 14 Jul 2020 20:44:16 +0800 Subject: [PATCH 20/68] enable optimizer parallel --- .../optimizer/pass/communication_op_fusion.cc | 5 +- mindspore/context.py | 6 +- mindspore/nn/optim/optimizer.py | 8 +- mindspore/nn/wrap/grad_reducer.py | 135 ++++++++---------- mindspore/parallel/_auto_parallel_context.py | 2 +- .../parallel/test_parallel_optimizer.py | 12 +- .../test_set_auto_parallel_context.py | 4 +- 7 files changed, 80 insertions(+), 92 deletions(-) diff --git a/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.cc b/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.cc index 3ba055880cd..eafb2bb59d2 100644 --- a/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.cc +++ 
b/mindspore/ccsrc/backend/optimizer/pass/communication_op_fusion.cc @@ -100,7 +100,10 @@ bool CommunicationOpFusion::GetSplitSegments(const CommunicationOpInfo &communic auto parallel_context = parallel::ParallelContext::GetInstance(); MS_EXCEPTION_IF_NULL(parallel_context); - const auto &split_indices = parallel_context->GetAllReduceFusionSplitIndices(group); + std::vector split_indices; + if (!parallel_context->enable_parallel_optimizer()) { + split_indices = parallel_context->GetAllReduceFusionSplitIndices(group); + } size_t segments = 0; if (split_indices.size() != 0) { diff --git a/mindspore/context.py b/mindspore/context.py index 0de6084caf5..551ec7b79a9 100644 --- a/mindspore/context.py +++ b/mindspore/context.py @@ -443,7 +443,7 @@ def _context(): @args_type_check(device_num=int, global_rank=int, mirror_mean=bool, cast_before_mirror=bool, parallel_mode=str, auto_parallel_search_mode=str, parameter_broadcast=bool, strategy_ckpt_load_file=str, - strategy_ckpt_save_file=str, full_batch=bool) + strategy_ckpt_save_file=str, full_batch=bool, enable_parallel_optimizer=bool) def set_auto_parallel_context(**kwargs): """ Set auto parallel context. @@ -487,6 +487,7 @@ def set_auto_parallel_context(**kwargs): strategy_ckpt_load_file (str): The path to load parallel strategy checkpoint. Default: '' strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: '' full_batch (bool): Whether to load the whole batch on each device. Default: False. + enable_parallel_optimizer (bool): This is a developing feature, which shards the weight update computation in + data parallel training to save time and memory. + Raises: ValueError: If input key is not attribute in auto parallel context. @@ -532,6 +535,7 @@ def reset_auto_parallel_context(): - parameter_broadcast: False. - strategy_ckpt_load_file: "". - strategy_ckpt_save_file: "". + - enable_parallel_optimizer: False. 
""" _reset_auto_parallel_context() diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py index cdf1565f349..4b2ca1aee38 100755 --- a/mindspore/nn/optim/optimizer.py +++ b/mindspore/nn/optim/optimizer.py @@ -28,8 +28,8 @@ from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel from mindspore import log as logger from mindspore.parallel._utils import _get_global_rank, _get_device_num, _get_parallel_mode -from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.train.parallel_utils import ParallelMode +from mindspore import context __all__ = ['Optimizer'] @@ -157,13 +157,12 @@ class Optimizer(Cell): self.param_length = len(self.parameters) self.map_ = C.Map() - use_parallel = auto_parallel_context().get_enable_parallel_optimizer() + use_parallel = context.get_auto_parallel_context("enable_parallel_optimizer") self.use_parallel = use_parallel if use_parallel: if self.cls_name not in ["Lamb", "AdamWeightDecayDynamicLR", "AdamWeightDecay"]: raise RuntimeError("Optimizer segmentation does not support optimizer {}".format(self.cls_name)) - if _get_parallel_mode() not in [ParallelMode.HYBRID_PARALLEL, ParallelMode.DATA_PARALLEL, - ParallelMode.AUTO_PARALLEL]: + if _get_parallel_mode() != ParallelMode.DATA_PARALLEL: raise RuntimeError("Optimizer segmentation does not support parallel mode {}".format (_get_parallel_mode())) self.dev_num = _get_device_num() @@ -175,6 +174,7 @@ class Optimizer(Cell): self.param_names = [] for param in self.parameters: self.param_names.append(param.name) + else: self.optim_filter = (True,) * self.param_length diff --git a/mindspore/nn/wrap/grad_reducer.py b/mindspore/nn/wrap/grad_reducer.py index 9354b42e55e..3d754977d45 100644 --- a/mindspore/nn/wrap/grad_reducer.py +++ b/mindspore/nn/wrap/grad_reducer.py @@ -13,107 +13,95 @@ # limitations under the License. 
# ============================================================================ """grad reducer cell for distributed training""" +from mindspore import context from mindspore.nn.cell import Cell from mindspore.communication.management import GlobalComm, get_group_size from mindspore.ops import functional as F, composite as C, operations as P -from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp, AllGather +from mindspore.ops.operations.comm_ops import AllReduce, AllGather +from mindspore.parallel._auto_parallel_context import auto_parallel_context import mindspore.common.dtype as mstype reduce_opt = C.MultitypeFuncGraph("reduce_opt") -_all_reduce = AllReduce() -_all_gather = None + +def _init_allreduce_operators(length): + """ initialize allreduce communication operators""" + is_parallel_optimizer = context.get_auto_parallel_context("enable_parallel_optimizer") + split_indices = auto_parallel_context().get_all_reduce_fusion_split_indices() + if is_parallel_optimizer and split_indices: + group = 1 + fusion = () + for i in range(length): + fusion = fusion + (group,) + if split_indices[group - 1] <= i + 1: + if group >= len(split_indices): + continue + group = group + 1 + index = tuple(range(1, length + 1)) + else: + fusion = (1,) * length + index = (0,) * length + opt_list = () + for i in range(length): + opt = AllReduce('sum', GlobalComm.WORLD_COMM_GROUP) + opt.add_prim_attr('fusion', fusion[i]) + opt.add_prim_attr('index', index[i]) + opt_list = opt_list + (opt,) + return opt_list -def _init_optimizer_communication(): - global _all_reduce - global _all_gather - - _all_reduce = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP) - _all_reduce.add_prim_attr('fusion', 1) - _all_gather = AllGather(GlobalComm.WORLD_COMM_GROUP) - - -@reduce_opt.register("Function", "Number", "Bool", "Tensor") -def _tensors_allreduce_mean(mul, degree, allreduce_filter, grad): - """ - Apply mean and allreduce on gradient. Allreduce is a communication operation used for distributed deep learning. - - Args: - mul (Primitive): Div operation. - degree (int): The mean coefficient. - allreduce_filter (bool): When it is true, allreduce would apply. - grad (Tensor): The gradient tensor before operation. - - Returns: - Tensor, the gradient tensor after operation. - """ - if allreduce_filter: - degree = F.scalar_cast(degree, F.dtype(grad)) - grad = _all_reduce(grad) - cast_op = P.Cast() - return mul(grad, cast_op(F.scalar_to_array(1.0/degree), F.dtype(grad))) - return grad - - -@reduce_opt.register("Function", "Number", "Bool", "Tuple") -def _tensors_allreduce_mean_with_sparse(mul, degree, allreduce_filter, grad): - """ - Apply mean and allgather on gradient instead of allreduce for sparse feature. - Allgather is a communication operation used for distributed deep learning. - - Args: - mul (Primitive): Div operation. - degree (int): The mean coefficient. - allreduce_filter (bool): When it is true, allgather would apply. - grad (Tuple): The indices, gradient tensor and tensor_shape before operation. - - Returns: - Tuple, include indices, the gradient tensor and tensor_shape after operation. 
- """ - if allreduce_filter: - indices = _all_gather(grad[0]) - degree = F.scalar_cast(degree, F.dtype(grad[1])) - dout = _all_gather(grad[1]) - cast_op = P.Cast() - dout = mul(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout))) - grad = (indices, dout, grad[2]) - return grad - - -@reduce_opt.register("Bool", "Tensor") -def _tensors_allreduce(allreduce_filter, grad): +@reduce_opt.register("Number", "Bool", "Function", "Bool", "Tensor", "Function") +def _tensors_allreduce(degree, mean, allgather, allreduce_filter, grad, allreduce): """ Apply allreduce on gradient. Args: + degree (int): The mean coefficient. + mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients. + allgather (Primitive): The communication operator for sparse gradients. allreduce_filter (bool): When it is true, allreduce would apply. grad (Tensor): The gradient tensor before operation. + allreduce (Primitive): The communication operator for gradients. Returns: Tensor, the gradient tensor after operation. """ if allreduce_filter: - return _all_reduce(grad) + grad = allreduce(grad) + if mean: + degree = F.scalar_cast(degree, F.dtype(grad)) + cast_op = P.Cast() + mul_op = P.Mul() + grad = mul_op(grad, cast_op(F.scalar_to_array(1.0/degree), F.dtype(grad))) + return grad return grad -@reduce_opt.register("Bool", "Tuple") -def _tensors_allreduce_with_sparse(allreduce_filter, grad): +@reduce_opt.register("Number", "Bool", "Function", "Bool", "Tuple", "Function") +def _tensors_allreduce_with_sparse(degree, mean, allgather, allreduce_filter, grad, allreduce): """ - Apply mean and allgather on gradient instead of allreduce for sparse feature. + Apply allgather on gradient instead of allreduce for sparse feature. Allgather is a communication operation used for distributed deep learning. Args: + degree (int): The mean coefficient. + mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients. + allgather (Primitive): The communication operator for sparse gradients. allreduce_filter (bool): When it is true, allgather would apply. - grad (Tuple): The indices, gradient tensor and tensor_shape before operation. + grad (tuple): The indices, gradient tensor and tensor_shape before operation. + allreduce (Primitive): The communication operator for gradients. Returns: Tuple, include indices, the gradient tensor and tensor_shape after operation. 
""" if allreduce_filter: - indices = _all_gather(grad[0]) - dout = _all_gather(grad[1]) + indices = allgather(grad[0]) + dout = allgather(grad[1]) + if mean: + degree = F.scalar_cast(degree, F.dtype(grad[1])) + cast_op = P.Cast() + mul_op = P.Mul() + dout = mul_op(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout))) grad = (indices, dout, grad[2]) return grad @@ -259,7 +247,6 @@ class DistributedGradReducer(Cell): def __init__(self, parameters, mean=True, degree=None): super(DistributedGradReducer, self).__init__(auto_prefix=False) self.map_ = C.Map() - self.mul = P.Mul() if degree is None: self.degree = get_group_size() else: @@ -268,7 +255,8 @@ class DistributedGradReducer(Cell): self.degree = degree self.mean = mean self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters) - _init_optimizer_communication() + self.opt_list = _init_allreduce_operators(len(parameters)) + self.allgather = AllGather(GlobalComm.WORLD_COMM_GROUP) def construct(self, grads): """ @@ -284,11 +272,8 @@ class DistributedGradReducer(Cell): """ datatypes = self.map_(F.partial(_get_datatype), grads) grads = self.map_(F.partial(_cast_datatype, mstype.float32), grads) - - if self.mean: - new_grad = self.map_(F.partial(reduce_opt, self.mul, self.degree), self.allreduce_filter, grads) - else: - new_grad = self.map_(F.partial(reduce_opt), self.allreduce_filter, grads) + new_grad = self.map_(F.partial(reduce_opt, self.degree, self.mean, self.allgather), + self.allreduce_filter, grads, self.opt_list) new_grad = self.map_(F.partial(_cast_datatype), datatypes, new_grad) return new_grad diff --git a/mindspore/parallel/_auto_parallel_context.py b/mindspore/parallel/_auto_parallel_context.py index 93fe2338557..3f6ce21cb94 100644 --- a/mindspore/parallel/_auto_parallel_context.py +++ b/mindspore/parallel/_auto_parallel_context.py @@ -513,7 +513,7 @@ def _set_auto_parallel_context(**kwargs): strategy_ckpt_load_file (str): The path to load parallel strategy checkpoint. Default: '' strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: '' full_batch (bool): Whether to load the whole batch on each device. Default: False. - enable_parallel_optimizer (bool): Enable using optimizer segmentation or noe. Default: False. + enable_parallel_optimizer (bool): Enable using optimizer segmentation or not. Default: False. Raises: ValueError: If input key is not attribute in auto parallel context. 
diff --git a/tests/ut/python/parallel/test_parallel_optimizer.py b/tests/ut/python/parallel/test_parallel_optimizer.py index 6663e34871e..ee9291fb98a 100644 --- a/tests/ut/python/parallel/test_parallel_optimizer.py +++ b/tests/ut/python/parallel/test_parallel_optimizer.py @@ -22,7 +22,6 @@ from mindspore.common.api import _executor from mindspore.nn import TrainOneStepCell, WithLossCell from mindspore.nn.optim import Adam, AdamWeightDecay, AdamWeightDecayDynamicLR, Lamb from mindspore.ops import operations as P -from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore import context @@ -54,8 +53,7 @@ class Net(nn.Cell): def test_AdamWeightDecayDynamicLR(): """ test_AdamWeightDecayDynamicLR """ - auto_parallel_context().set_enable_parallel_optimizer(True) - context.set_auto_parallel_context(parallel_mode="data_parallel", device_num=2) + context.set_auto_parallel_context(parallel_mode="data_parallel", device_num=2, enable_parallel_optimizer=True) inputs = Tensor(np.ones([32, 128]).astype(np.float32)) label = Tensor(np.zeros([32, 768]).astype(np.float32)) net = Net() @@ -70,8 +68,7 @@ def test_AdamWeightDecayDynamicLR(): def test_AdamWeightDecay(): """ test_AdamWeightDecayDynamicLR """ - auto_parallel_context().set_enable_parallel_optimizer(True) - context.set_auto_parallel_context(parallel_mode="data_parallel", device_num=2) + context.set_auto_parallel_context(parallel_mode="data_parallel", device_num=2, enable_parallel_optimizer=True) inputs = Tensor(np.ones([32, 128]).astype(np.float32)) label = Tensor(np.zeros([32, 768]).astype(np.float32)) net = Net() @@ -86,8 +83,7 @@ def test_AdamWeightDecay(): def test_lamb_compile(): """ test_Lamb_compile """ - auto_parallel_context().set_enable_parallel_optimizer(True) - context.set_auto_parallel_context(parallel_mode="auto_parallel", device_num=2) + context.set_auto_parallel_context(parallel_mode="data_parallel", device_num=2, enable_parallel_optimizer=True) inputs = Tensor(np.ones([32, 128]).astype(np.float32)) label = Tensor(np.zeros([32, 768]).astype(np.float32)) net = Net() @@ -102,7 +98,7 @@ def test_lamb_compile(): def test_edge_case(): """ test_edge_case """ - auto_parallel_context().set_enable_parallel_optimizer(True) + context.set_auto_parallel_context(enable_parallel_optimizer=True) net = Net() with pytest.raises(RuntimeError): context.set_auto_parallel_context(parallel_mode="stand_alone") diff --git a/tests/ut/python/parallel/test_set_auto_parallel_context.py b/tests/ut/python/parallel/test_set_auto_parallel_context.py index c476b0cebc3..19187cb262c 100644 --- a/tests/ut/python/parallel/test_set_auto_parallel_context.py +++ b/tests/ut/python/parallel/test_set_auto_parallel_context.py @@ -81,8 +81,8 @@ def test_set_auto_parallel_context(): with pytest.raises(ValueError): set_algo_parameters(tensor_slice_align_size=1025) - auto_parallel_context().set_enable_parallel_optimizer(True) - assert auto_parallel_context().get_enable_parallel_optimizer() is True + context.set_auto_parallel_context(enable_parallel_optimizer=True) + assert context.get_auto_parallel_context("enable_parallel_optimizer") assert not auto_parallel_context().get_all_reduce_fusion_split_indices() From 1feca960aaeff7f90df4940b8965bfc17fa3e62d Mon Sep 17 00:00:00 2001 From: peixu_ren Date: Wed, 15 Jul 2020 23:32:03 -0300 Subject: [PATCH 21/68] Rollback to Normal on D --- mindspore/nn/distribution/bernoulli.py | 5 +- mindspore/nn/distribution/normal.py | 5 +- mindspore/ops/composite/__init__.py | 2 - mindspore/ops/composite/random_ops.py | 63 
------------------- mindspore/ops/operations/__init__.py | 4 +- mindspore/ops/operations/random_ops.py | 86 +++++++++++++------------- tests/st/ops/gpu/test_normal.py | 56 ----------------- tests/ut/python/ops/test_ops.py | 4 +- 8 files changed, 52 insertions(+), 173 deletions(-) delete mode 100644 mindspore/ops/composite/random_ops.py delete mode 100644 tests/st/ops/gpu/test_normal.py diff --git a/mindspore/nn/distribution/bernoulli.py b/mindspore/nn/distribution/bernoulli.py index 9aa20d668fe..d0d8a5b08ab 100644 --- a/mindspore/nn/distribution/bernoulli.py +++ b/mindspore/nn/distribution/bernoulli.py @@ -14,7 +14,6 @@ # ============================================================================ """Bernoulli Distribution""" from mindspore.ops import operations as P -from mindspore.ops import composite as C from .distribution import Distribution from ._utils.utils import cast_to_tensor, check_prob from ...common import dtype as mstype @@ -54,7 +53,6 @@ class Bernoulli(Distribution): check_prob(self._probs) else: self._probs = probs - self.seed = seed # ops needed for the class self.log = P.Log() @@ -66,6 +64,7 @@ class Bernoulli(Distribution): self.const = P.ScalarToArray() self.less = P.Less() self.cast = P.Cast() + self.normal = P.Normal(seed=seed) self.erf = P.Erf() self.sqrt = P.Sqrt() @@ -160,7 +159,7 @@ class Bernoulli(Distribution): mean_zero = self.const(0.0) sd_one = self.const(1.0) sqrt_two = self.sqrt(self.const(2.0)) - sample_norm = C.normal(sample_shape, mean_zero, sd_one, self.seed) + sample_norm = self.normal(sample_shape, mean_zero, sd_one) sample_uniform = 0.5 * (1 + self.erf(self.realdiv(sample_norm, sqrt_two))) sample = self.less(sample_uniform, probs1) sample = self.cast(sample, self._dtype) diff --git a/mindspore/nn/distribution/normal.py b/mindspore/nn/distribution/normal.py index 61cec6d8106..344dbd2eeb4 100644 --- a/mindspore/nn/distribution/normal.py +++ b/mindspore/nn/distribution/normal.py @@ -15,7 +15,6 @@ """Normal Distribution""" import numpy as np from mindspore.ops import operations as P -from mindspore.ops import composite as C from .distribution import Distribution from ._utils.utils import convert_to_batch, check_greater_equal_zero from ...common import dtype as mstype @@ -61,7 +60,6 @@ class Normal(Distribution): else: self._mean_value = mean self._sd_value = sd - self.seed = seed #ops needed for the class self.exp = P.Exp() @@ -72,6 +70,7 @@ class Normal(Distribution): self.sqrt = P.Sqrt() self.realdiv = P.RealDiv() self.expm1 = P.Expm1() if get_context('device_target') == 'Ascend' else self._expm1_by_step + self.normal = P.Normal(seed=seed) self.shape = P.Shape() self.zeroslike = P.ZerosLike() self.const = P.ScalarToArray() @@ -164,7 +163,7 @@ class Normal(Distribution): sample_shape = shape + batch_shape mean_zero = self.const(0.0) sd_one = self.const(1.0) - sample_norm = C.normal(sample_shape, mean_zero, sd_one, self.seed) + sample_norm = self.normal(sample_shape, mean_zero, sd_one) sample = self.add(mean, self.mul(sample_norm, sd)) return sample return None diff --git a/mindspore/ops/composite/__init__.py b/mindspore/ops/composite/__init__.py index bb5e2960ff4..6db8d666a27 100644 --- a/mindspore/ops/composite/__init__.py +++ b/mindspore/ops/composite/__init__.py @@ -27,7 +27,6 @@ from .clip_ops import clip_by_value from .multitype_ops.add_impl import hyper_add from .multitype_ops.ones_like_impl import ones_like from .multitype_ops.zeros_like_impl import zeros_like -from .random_ops import normal __all__ = [ @@ -48,5 +47,4 @@ __all__ = [ 
'zeros_like', 'ones_like', 'zip_operation', - 'normal', 'clip_by_value',] diff --git a/mindspore/ops/composite/random_ops.py b/mindspore/ops/composite/random_ops.py deleted file mode 100644 index db338f5672d..00000000000 --- a/mindspore/ops/composite/random_ops.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -"""Operations for random number generatos.""" - -from mindspore.ops.primitive import constexpr -from .. import operations as P - -# set graph-level RNG seed -_GRAPH_SEED = 0 - -@constexpr -def set_seed(seed): - global _GRAPH_SEED - _GRAPH_SEED = seed - -@constexpr -def get_seed(): - return _GRAPH_SEED - - -def normal(shape, mean, stddev, seed): - """ - Generates random numbers according to the Normal (or Gaussian) random number distribution. - It is defined as: - - Args: - - **shape** (tuple) - The shape of random tensor to be generated. - - **mean** (Tensor) - The mean μ distribution parameter, which specifies the location of the peak. - With float32 data type. - - **stddev** (Tensor) - The deviation σ distribution parameter. With float32 data type. - - **seed** (int): Seed is used as entropy source for Random number engines generating pseudo-random numbers. - Default: 0. - - Returns: - Tensor. The shape should be the broadcasted shape of Input "shape" and shapes of mean and stddev. - The dtype is float32. 
- - Examples: - >>> shape = (4, 16) - >>> mean = Tensor(1.0, mstype.float32) - >>> stddev = Tensor(1.0, mstype.float32) - >>> output = C.normal(shape, mean, stddev, seed=5) - """ - set_seed(10) - seed1 = get_seed() - seed2 = seed - stdnormal = P.StandardNormal(seed1, seed2) - rnd = stdnormal(shape) - value = rnd * stddev + mean - return value diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index 14dbbb5ea0f..423ef89f928 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -55,7 +55,7 @@ from .math_ops import (Abs, ACos, Asin, Asinh, AddN, AccumulateNV2, AssignAdd, A Sin, Sqrt, Rsqrt, BesselI0e, BesselI1e, Square, Sub, TensorAdd, Sign, Round, SquareSumAll, Atan, Atanh, Cosh, Sinh, Eps) -from .random_ops import (RandomChoiceWithMask, StandardNormal) +from .random_ops import (RandomChoiceWithMask, Normal) from .nn_ops import (LSTM, SGD, Adam, SparseApplyAdam, SparseApplyLazyAdam, ApplyMomentum, BatchNorm, BiasAdd, Conv2D, DepthwiseConv2dNative, @@ -170,7 +170,7 @@ __all__ = [ 'HSigmoid', 'Tanh', 'RandomChoiceWithMask', - 'StandardNormal', + 'Normal', 'ResizeBilinear', 'ScalarSummary', 'ImageSummary', diff --git a/mindspore/ops/operations/random_ops.py b/mindspore/ops/operations/random_ops.py index bf212281ced..7a457d09981 100644 --- a/mindspore/ops/operations/random_ops.py +++ b/mindspore/ops/operations/random_ops.py @@ -21,48 +21,6 @@ from ...common import dtype as mstype from ..primitive import PrimitiveWithInfer, prim_attr_register -class StandardNormal(PrimitiveWithInfer): - r""" - Generates random numbers according to the standard Normal (or Gaussian) random number distribution. - - Args: - seed (int): Random seed. Default: 0. - seed2 (int): Random seed2. Default: 0. - - Inputs: - - **shape** (tuple) - The shape of random tensor to be generated. Only constant value is allowed. - - Outputs: - Tensor. The shape should be the broadcasted shape of Input "shape" and shapes of mean and stddev. - The dtype is float32. - - Examples: - >>> shape = (4, 16) - >>> stdnormal = P.StandardNormal(seed=2) - >>> output = stdnormal(shape) - """ - - @prim_attr_register - def __init__(self, seed=0, seed2=0): - """Init StandardNormal""" - self.init_prim_io_names(inputs=['shape'], outputs=['output']) - validator.check_value_type('seed', seed, [int], self.name) - validator.check_value_type('seed2', seed2, [int], self.name) - - def __infer__(self, shape): - shape_v = shape["value"] - if shape_v is None: - raise ValueError(f"For {self.name}, shape must be const.") - validator.check_value_type("shape", shape_v, [tuple], self.name) - for i, shape_i in enumerate(shape_v): - validator.check_integer("shape[%d]" % i, shape_i, 0, Rel.GT, self.name) - out = { - 'shape': shape_v, - 'dtype': mstype.float32, - 'value': None} - return out - - class RandomChoiceWithMask(PrimitiveWithInfer): """ Generates a random samply as index tensor with a mask tensor from a given tensor. @@ -106,3 +64,47 @@ class RandomChoiceWithMask(PrimitiveWithInfer): def infer_dtype(self, x_dtype): validator.check_tensor_type_same({'x': x_dtype}, [mstype.bool_], self.name) return (mstype.int32, mstype.bool_) + + +class Normal(PrimitiveWithInfer): + """ + Generates random samples from a normal(Gaussian) distribution. + + Args: + seed (int): Random seed. Default: 0. + + Inputs: + - **shape** (tuple[int]) - The shape of output tensor. Only constant value is allowed. + - **mean** (Tensor) - The mean of the distribution, with float32 data type. 
+ - **stddev** (Tensor) - The standard deviation of the distribution, with float32 data type. + + Outputs: + Tensor, with the given shape from the specific distribution and float32 data type. + + Examples: + >>> normal = P.Normal() + >>> mean = Tensor(0., mstype.float32) + >>> stddev = Tensor(1., mstype.float32) + >>> out = normal((32, 3, 3), mean, stddev) + """ + + @prim_attr_register + def __init__(self, seed=0): + """Init Normal""" + validator.check_value_type("seed", seed, [int], self.name) + + def __infer__(self, shape, mean, stddev): + shape_value = shape["value"] + if shape_value is None: + raise ValueError(f"For {self.name}, shape must be const.") + validator.check_value_type("shape", shape_value, [tuple], self.name) + for i, shape_i in enumerate(shape_value): + validator.check_integer("shape[%d]" % i, shape_i, 0, Rel.GE, self.name) + + validator.check_tensor_type_same({"mean": mean["dtype"]}, [mstype.float32], self.name) + validator.check_tensor_type_same({"stddev": stddev["dtype"]}, [mstype.float32], self.name) + + out = {"shape": shape_value, + "dtype": mstype.float32, + "value": None} + return out diff --git a/tests/st/ops/gpu/test_normal.py b/tests/st/ops/gpu/test_normal.py deleted file mode 100644 index 0c4866f6f0a..00000000000 --- a/tests/st/ops/gpu/test_normal.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
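Net effect of this patch for callers: sampling moves from the removed composite helper to the `P.Normal` primitive, and the seed moves from a per-call argument to an operator attribute. A minimal before/after sketch built from the examples in these diffs:

    from mindspore import Tensor
    from mindspore.common import dtype as mstype
    from mindspore.ops import operations as P

    mean = Tensor(1.0, mstype.float32)
    stddev = Tensor(1.0, mstype.float32)

    # Before this patch (removed above): seed passed on every call.
    # output = C.normal((4, 16), mean, stddev, seed=5)

    # After this patch: the seed is an attribute of the primitive instance.
    normal = P.Normal(seed=5)
    output = normal((4, 16), mean, stddev)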
-# ============================================================================ - -import numpy as np - -import mindspore.context as context -import mindspore.nn as nn -from mindspore import Tensor -from mindspore.common import dtype as mstype -from mindspore.ops import composite as C - -context.set_context(mode=context.GRAPH_MODE, device_target="GPU") - - -class Net(nn.Cell): - def __init__(self, shape, seed=0): - super(Net, self).__init__() - self.shape = shape - self.seed = seed - - def construct(self, mean, stddev): - return C.normal(self.shape, mean, stddev, self.seed) - - -def test_net_1D(): - seed = 10 - shape = (3, 2, 4) - mean = 1.0 - stddev = 1.0 - net = Net(shape, seed) - tmean, tstddev = Tensor(mean, mstype.float32), Tensor(stddev, mstype.float32) - output = net(tmean, tstddev) - assert output.shape == (3, 2, 4) - - -def test_net_ND(): - seed = 10 - shape = (3, 1, 2) - mean = np.array([[[1], [2]], [[3], [4]], [[5], [6]]]).astype(np.float32) - stddev = np.array([1.0]).astype(np.float32) - net = Net(shape, seed) - tmean, tstddev = Tensor(mean, mstype.float32), Tensor(stddev, mstype.float32) - output = net(tmean, tstddev) - assert output.shape == (3, 2, 2) diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py index 4817a192b31..022e969d31e 100755 --- a/tests/ut/python/ops/test_ops.py +++ b/tests/ut/python/ops/test_ops.py @@ -533,10 +533,10 @@ class NormalNet(nn.Cell): def __init__(self, shape=None, seed=0): super(NormalNet, self).__init__() self.shape = shape - self.seed = seed + self.normal = P.Normal(seed=seed) def construct(self, mean, stddev): - out = C.normal(self.shape, mean, stddev, self.seed) + out = self.normal(self.shape, mean, stddev) return out From b902fd49d6ca05893e7ae16fb750bf8a94d54407 Mon Sep 17 00:00:00 2001 From: dinghao Date: Thu, 16 Jul 2020 11:52:31 +0800 Subject: [PATCH 22/68] fix serving compile --- serving/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serving/CMakeLists.txt b/serving/CMakeLists.txt index 4529323fe14..fdd56fffc41 100644 --- a/serving/CMakeLists.txt +++ b/serving/CMakeLists.txt @@ -61,7 +61,7 @@ add_custom_command( # Include generated *.pb.h files include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/core" - "${PROJECT_SOURCE_DIR}/mindspore/ccsrc") + "${PROJECT_SOURCE_DIR}/mindspore/ccsrc" "${PROJECT_SOURCE_DIR}/mindspore/core") file(GLOB_RECURSE CORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "core/*.cc" "core/util/*.cc" "core/version_control/*.cc") From 68c78ab6bbc226e33d32a42192c814c668a2b2bc Mon Sep 17 00:00:00 2001 From: laiyongqiang Date: Sat, 11 Jul 2020 09:55:37 +0800 Subject: [PATCH 23/68] reuse communication op output's memory --- .../optimizer/mem_reuse/kernel_refcount.h | 7 +- .../backend/optimizer/mem_reuse/mem_reuse.cc | 31 ++++- .../mem_reuse/mem_reuse_allocator.cc | 109 +++++++++++++++++- .../optimizer/mem_reuse/mem_reuse_allocator.h | 18 ++- .../optimizer/mem_reuse/mem_reuse_checker.cc | 6 +- .../device/ascend/ascend_memory_manager.cc | 13 ++- .../ccsrc/runtime/device/kernel_runtime.cc | 11 +- .../ccsrc/runtime/device/kernel_runtime.h | 2 +- .../ccsrc/runtime/device/memory_manager.cc | 3 + .../ccsrc/runtime/device/memory_manager.h | 1 + .../mem_reuse/mem_reuse_allocator_test.cc | 2 +- .../pre_activate/mem_reuse/mem_reuse_test.cc | 1 - 12 files changed, 186 insertions(+), 18 deletions(-) diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/kernel_refcount.h 
b/mindspore/ccsrc/backend/optimizer/mem_reuse/kernel_refcount.h index 4b928d6565f..58f7ef36720 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/kernel_refcount.h +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/kernel_refcount.h @@ -25,7 +25,8 @@ namespace mindspore { namespace memreuse { enum RefCountType { kDynamicRefCount, kStaticRefCount }; -enum NodeType { NORMAL, SPECIAL }; +enum NodeType { COMMON_NODE, COMMUNICATION_NODE }; +enum KernelRefType { COMMON, REFNODE_OUTPUT, COMM_NOTREUSE, COMM_REUSE, SUMMARY }; static constexpr int kInitIndex = -1; class KernelRefCount { public: @@ -36,6 +37,7 @@ class KernelRefCount { size_t offset_; size_t size_; int index_; + KernelRefType type_; // remember to reset offset KernelRefCount() : stream_id_(0), @@ -44,6 +46,7 @@ class KernelRefCount { offset_(0), size_(0), index_(kInitIndex), + type_(COMMON), reftype_(kStaticRefCount) {} ~KernelRefCount() = default; void SetKernelRefCountInfo(int index, size_t size, RefCountType reftype); @@ -65,7 +68,7 @@ class KernelDef { KernelMap inputs_; KernelMap outputs_; KernelMap wk_space_; - NodeType dirty = NORMAL; + NodeType type_ = COMMON_NODE; KernelDef() = default; ~KernelDef() = default; void set_input_refs(const KernelRefCountPtrList &kernelRefPtrList) { input_refs_ = kernelRefPtrList; } diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse.cc b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse.cc index 263ceaec63b..8166a7bcc1c 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse.cc +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse.cc @@ -46,6 +46,8 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() { if (iter == kernel_output_refs_.end()) { auto output_sizes = kernel_mod->GetOutputSizeList(); KernelRefCountPtrList kernel_refs; + bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel_cnode); + size_t output_index = 0; for (auto size : output_sizes) { total_dy_size_ += size; // do not MallocDynamicMem just record this @@ -54,9 +56,20 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() { auto curr_stream_id = AnfAlgo::GetStreamId(kernel_cnode); kernel_ref->stream_id_ = curr_stream_id; kernel_ref->SetKernelRefCountInfo(index, size, kDynamicRefCount); + if (is_comm_op) { + kernel_ref->type_ = COMM_REUSE; + } else { + session::AnfWithOutIndex out_pair(kernel_cnode, output_index); + if (graph_->IsInRefOutputMap(out_pair)) { + kernel_ref->type_ = REFNODE_OUTPUT; + } else { + kernel_ref->type_ = COMMON; + } + } kernel_refs.push_back(kernel_ref); kernel_out_ref_num++; total_refs_list_.push_back(kernel_ref); + output_index++; } if (!kernel_refs.empty()) { kernel_output_refs_[key] = kernel_refs; @@ -155,9 +168,19 @@ void MemReuseUtil::SetInputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(kernel_def_ptr); auto key = kernel.get(); - for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { + bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel); + size_t input_tensor_num = AnfAlgo::GetInputTensorNum(kernel); + for (size_t i = 0; i < input_tensor_num; ++i) { auto ref_ptr = GetKernelInputRef(kernel, i); if (ref_ptr != nullptr) { + if (is_comm_op) { + if (input_tensor_num == 1) { + ref_ptr->type_ = COMM_REUSE; + } else { + ref_ptr->type_ = COMM_NOTREUSE; + } + } + if (ref_ptr->reftype() == kStaticRefCount) { continue; } else if (ref_ptr->reftype() == kDynamicRefCount) { @@ -258,6 +281,11 @@ void MemReuseUtil::SetKernelDefMap() { auto key = kernel.get(); 
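The classification rules added in `InitDynamicOutputKernelRef` and `SetInputMap` (plus the summary-node pass later in this diff) amount to a small decision table: every communication-op output, and a communication op's lone input, is tagged `COMM_REUSE`; a communication op with several inputs tags them `COMM_NOTREUSE`; outputs found in the graph's ref-output map become `REFNODE_OUTPUT`; summary outputs become `SUMMARY`; everything else stays `COMMON`. A Python paraphrase of that table (function names hypothetical, mirroring the C++ above):

    def classify_output(is_comm_op, is_ref_output, is_summary=False):
        # Mirrors InitDynamicOutputKernelRef; the SUMMARY tag is applied last
        # in the C++ and overrides, so it is checked first here.
        if is_summary:
            return "SUMMARY"
        if is_comm_op:
            return "COMM_REUSE"
        return "REFNODE_OUTPUT" if is_ref_output else "COMMON"

    def classify_comm_input(input_tensor_num):
        # Mirrors SetInputMap: only a lone input can reuse the communication block.
        return "COMM_REUSE" if input_tensor_num == 1 else "COMM_NOTREUSE"

    assert classify_output(is_comm_op=True, is_ref_output=False) == "COMM_REUSE"
    assert classify_output(is_comm_op=False, is_ref_output=True) == "REFNODE_OUTPUT"
    assert classify_comm_input(2) == "COMM_NOTREUSE"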
kernel_def_ptr->set_input_refs(kernel_def_ptr->inputs_[key]); kernel_def_ptr->set_output_refs(kernel_def_ptr->outputs_[key]); + if (AnfAlgo::IsCommunicationOp(kernel)) { + kernel_def_ptr->type_ = COMMUNICATION_NODE; + } else { + kernel_def_ptr->type_ = COMMON_NODE; + } kernel_def_ptr_list_.push_back(kernel_def_ptr); kernel_map_[key] = kernel_def_ptr; } @@ -337,6 +365,7 @@ void MemReuseUtil::SetSummaryNodesRefCount() { KernelRefCountPtr kernel_ref = kernel_output_refs_[node.get()][index]; kernel_ref->ref_count_ = kMaxRefCount; kernel_ref->ref_count_dynamic_use_ = kMaxRefCount; + kernel_ref->type_ = SUMMARY; total_summary_size += kernel_ref->size_; MS_LOG(INFO) << "Set summary node's ref count, node: " << node->fullname_with_scope() << " index: " << index; } else { diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc index 787d334a1a0..bb613c4db63 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc @@ -30,11 +30,11 @@ void BestFitMemReuse::InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr) { set_op_ptr_list(mem_reuse_util_ptr->kernel_def_ptr_list()); // check info Correctness for (auto &tensor : tensor_ptr_list_) { - tensor->size_ = AlignMemorySize(tensor->size_); + tensor->size_ = AlignCommonMemorySize(tensor->size_); } // align wk size to 512 && refcount == 1 for (auto &wk : wk_tensor_list_) { - wk->size_ = AlignMemorySize(wk->size_); + wk->size_ = AlignCommonMemorySize(wk->size_); wk->ref_count_ = 1; } #ifdef ENABLE_D @@ -123,11 +123,23 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr return false; } -void BestFitMemReuse::AssignNodeOutputOffset() { +void BestFitMemReuse::AssignCommonNodeOutputOffset() { + MS_EXCEPTION_IF_NULL(current_kernel_); for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { size_t index = GetTensorIndex(tensor_idx); auto tensor_desc = tensor_ptr_list_[index]; MS_EXCEPTION_IF_NULL(tensor_desc); + if (tensor_desc->type_ == REFNODE_OUTPUT) { + total_refoutput_size += tensor_desc->size_; + continue; + } else if (tensor_desc->type_ == COMM_NOTREUSE) { + total_comm_not_reuse_size += tensor_desc->size_; + } else if (tensor_desc->type_ == COMM_REUSE) { + // get align size for communication op's single input + tensor_desc->size_ = AlignCommunicationMemorySize(tensor_desc->size_); + total_comm_reuse_size += tensor_desc->size_; + } + auto reusable_membuf_map = GetReusableMembufMap(tensor_desc->size_); if (!reusable_membuf_map.empty()) { auto membuf_index = reusable_membuf_map.begin()->second; @@ -140,6 +152,86 @@ void BestFitMemReuse::AssignNodeOutputOffset() { MemReuseChecker::GetInstance().IsAddNewMembuf_ = true; #endif } + // skip left align border for communication op single input to used + if (tensor_desc->type_ == COMM_REUSE) { + tensor_desc->offset_ += kDefaultMemAlignSize; + } + } +} + +void BestFitMemReuse::AssignCommunicationNodeOutputOffset() { + size_t total_kernel_output_size = 0; + size_t output_num = 0; + // get all output size + MS_EXCEPTION_IF_NULL(current_kernel_); + for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { + size_t index = GetTensorIndex(tensor_idx); + auto tensor_desc = tensor_ptr_list_[index]; + MS_EXCEPTION_IF_NULL(tensor_desc); + if (tensor_desc->type_ == COMM_REUSE) { + total_comm_reuse_size += tensor_desc->size_; + total_comm_output_reuse_size += tensor_desc->size_; + 
total_kernel_output_size += tensor_desc->size_;
+    } else {
+      MS_LOG(ERROR) << "All communication op's outputs should be memory-reused, Kernel:"
+                    << current_kernel_->scope_full_name();
+      continue;
+    }
+    // count the reusable outputs so the last one can take the right align border below
+    output_num++;
+  }
+  total_kernel_output_size = AlignCommunicationMemorySize(total_kernel_output_size);
+
+  // add the left align border to the first output and the right align border to the last
+  // output, so the border memory is allocated together with the data
+  size_t output_index = 0;
+  for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
+    size_t index = GetTensorIndex(tensor_idx);
+    auto tensor_desc = tensor_ptr_list_[index];
+    MS_EXCEPTION_IF_NULL(tensor_desc);
+    if (output_index == 0 || output_index == output_num - 1) {
+      tensor_desc->size_ += kDefaultMemAlignSize;
+    }
+    output_index++;
+  }
+
+  auto reusable_membuf_map = GetReusableMembufMap(total_kernel_output_size);
+  if (!reusable_membuf_map.empty()) {
+    auto membuf_index = reusable_membuf_map.begin()->second;
+    output_index = 0;
+    for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
+      size_t index = GetTensorIndex(tensor_idx);
+      auto tensor_desc = tensor_ptr_list_[index];
+      MS_EXCEPTION_IF_NULL(tensor_desc);
+      ReuseExistMembuf(tensor_desc.get(), membuf_index + output_index, kDynamicMem);
+      // skip the left align border of the communication op's first output before it is used
+      if (output_index == 0) {
+        tensor_desc->offset_ += kDefaultMemAlignSize;
+      }
+      output_index++;
+    }
+  } else {
+    // no membuf can be reused, add new membufs after the membuf_ptr_list
+    output_index = 0;
+    for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
+      size_t index = GetTensorIndex(tensor_idx);
+      auto tensor_desc = tensor_ptr_list_[index];
+      MS_EXCEPTION_IF_NULL(tensor_desc);
+      AddNewMembufPtr(tensor_desc.get(), kDynamicMem);
+      // skip the left align border of the first output before it is used
+      if (output_index == 0) {
+        tensor_desc->offset_ += kDefaultMemAlignSize;
+      }
+      output_index++;
+#ifdef MEM_REUSE_DEBUG
+      MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
+#endif
+    }
+  }
+}
+
+void BestFitMemReuse::AssignNodeOutputOffset() {
+  if (current_kernel_->type_ == COMMUNICATION_NODE) {
+    AssignCommunicationNodeOutputOffset();
+  } else {
+    AssignCommonNodeOutputOffset();
   }
 }
 
@@ -307,11 +399,7 @@ void BestFitMemReuse::ReleaseMembuf(size_t tensor_index, int flag) {
   }
 }
 
-size_t BestFitMemReuse::AlignMemorySize(size_t size) const {
+size_t BestFitMemReuse::AlignCommonMemorySize(size_t size) const {
   // memory size 512 align
   return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
 }
+size_t BestFitMemReuse::AlignCommunicationMemorySize(size_t size) const {
+  // align the size to 512 and add the communication layout: left align border - data - right align border
+  return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
+         kDefaultMemAlignSize;
+}
+
 size_t BestFitMemReuse::GetAllocatedSize() {
   size_t AllocatedSize = kTotalSize;
   if (membuf_ptr_list_.empty()) {
@@ -400,6 +498,9 @@ void BestFitMemReuse::Reuse(const MemReuseUtil *mem_reuse_util_ptr) {
     ++op_num;
 #endif
   }
+  MS_LOG(INFO) << "Special Tensor total size: RefOutput: " << total_refoutput_size
+               << " CommReuse: " << total_comm_reuse_size << " CommOutputReuse: " << total_comm_output_reuse_size
+               << " CommNotReuse: " << total_comm_not_reuse_size;
 #ifdef MEM_REUSE_DEBUG
   MemReuseChecker::GetInstance().ExportMembufInfoIR();
   MemReuseChecker::GetInstance().ExportAddNewMmebufIR();
diff --git
a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.h b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.h index ef1cfd3e111..322c7b940cb 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.h +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.h @@ -74,6 +74,14 @@ class BestFitMemReuse { * Assign output tensor memory offset of current kernel */ void AssignNodeOutputOffset(); + /** + * Assign output tensor memory offset of common kernel + */ + void AssignCommonNodeOutputOffset(); + /** + * Assign output tensor memory offset of communication kernel + */ + void AssignCommunicationNodeOutputOffset(); /** * Update input tensor's status of current kernel, and the status of membuf used by current kernel */ @@ -110,8 +118,10 @@ class BestFitMemReuse { void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag); // Merge unused membuf void ReleaseMembuf(size_t tensor_index, int flag); - // Memory address alignment 512 - size_t AlignMemorySize(size_t size) const; + // Memory address alignment for common memory + size_t AlignCommonMemorySize(size_t size) const; + // Memory address alignment for communication used memory + size_t AlignCommunicationMemorySize(size_t size) const; int GetRealIndex(size_t index, int flag = kDynamicMem) const; size_t GetTensorIndex(int index) const; size_t GetWorkspaceIndex(int index) const; @@ -153,6 +163,10 @@ class BestFitMemReuse { // kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def std::map> kernel_front_map_; std::vector> stream_groups_; + size_t total_refoutput_size{0}; + size_t total_comm_reuse_size{0}; + size_t total_comm_output_reuse_size{0}; + size_t total_comm_not_reuse_size{0}; }; } // namespace memreuse } // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_checker.cc b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_checker.cc index b93bf42f9f6..eca595cead1 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_checker.cc +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_checker.cc @@ -170,12 +170,14 @@ void MemReuseChecker::CheckMemReuseIR(const KernelRefCountPtrList &total_refs_li ofs << "all_tensor_refs:\n"; ofs << "index:" << "\tsize:" - << "\trefcount:\n"; + << "\trefcount:" + << "\ttype:\n"; for (auto &ref : total_refs_list) { ofs << "%" << ref->index_ << "T" << "\t" << "#" << ref->size_ << "S" << "\t" << ref->ref_count_ << "C" + << "\t" << ref->type_ << "t" << "\n"; } ofs << "kernel_def exc_order:\n"; @@ -241,7 +243,7 @@ bool MemReuseChecker::CheckGraphOutputAssigned(const session::KernelGraph *graph void MemReuseChecker::ExportMemOpIr(const KernelDef *def, std::ofstream &ofs, int def_idx) { auto scope_name = def->scope_full_name(); std::string split_name = GetSplitName(scope_name); - ofs << "$" << def_idx << "\t" << split_name << "\t"; + ofs << "$" << def_idx << "\t" << split_name << "\t" << static_cast(def->type_) << "\t"; ofs << "inputs["; for (auto &in : def->inputs_) { for (auto &in_ref : in.second) { diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc index f9da0850c6c..52278585718 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_memory_manager.cc @@ -95,6 +95,12 @@ uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_me } else { align_size = GetCommonAlignSize(size); } + + auto 
device_mem_pool_offset = AscendMemoryPool::GetInstance().device_mem_pool_offset(); + MS_LOG(INFO) << "Malloc Memory: Static, total[" << device_mem_size_ << "] (dynamic[" << total_dynamic_size_ + << "] memory pool[" << device_mem_pool_offset << "])" + << " malloc [" << align_size << "]"; + if (communication_mem) { // create protect area [kMemAlignSize -- data -- kMemAlignSize] uint8_t *alloc_address = reinterpret_cast(AscendMemoryPool::GetInstance().AllocTensorMem(align_size)); @@ -111,12 +117,17 @@ uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_m } else { align_size = GetCommonAlignSize(size); } + + auto device_mem_pool_offset = AscendMemoryPool::GetInstance().device_mem_pool_offset(); + MS_LOG(INFO) << "Malloc Memory: Dynamic, total[" << device_mem_size_ << "] (dynamic[" << total_dynamic_size_ + << "] memory pool[" << device_mem_pool_offset << "])" + << " malloc [" << align_size << "]"; + if (dynamic_mem_offset_ < align_size) { MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "] (dynamic[" << total_dynamic_size_ << "]) malloc [" << align_size << "] failed!"; } auto new_offset = dynamic_mem_offset_ - align_size; - auto device_mem_pool_offset = AscendMemoryPool::GetInstance().device_mem_pool_offset(); if (new_offset <= device_mem_pool_offset) { MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "] (dynamic[" << total_dynamic_size_ << "] memory pool[" << device_mem_pool_offset << "])" diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc index d5fd00da5b0..6e2ff666974 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc @@ -398,7 +398,7 @@ void KernelRuntime::UpdateRefNodeOutputMem(const session::KernelGraph *graph) { } void KernelRuntime::AssignCommunicationNodeMem(int flag, const AnfNodePtr &node) { - AssignCommunicationNodeInputMem(node); + AssignCommunicationNodeInputMem(flag, node); AssignCommunicationNodeOutputMem(flag, node); } @@ -428,6 +428,11 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr total_size += mem_size; align_size_list.emplace_back(mem_size); } + + if (flag == kReuseDynamicMem) { + // reuse communication op's all outputs' memory + flag = kReuseDynamicCommMem; + } uint8_t *output_ptr = mem_manager_->MallocOutputMem(node, 0, flag, total_size); for (size_t j = 0; j < align_size_list.size(); ++j) { std::string output_format = AnfAlgo::GetOutputFormat(node, j); @@ -456,7 +461,7 @@ DeviceAddressPtr KernelRuntime::PreAssignCNodeMemory(const AnfNodePtr &anf_node, return address; } -void KernelRuntime::AssignCommunicationNodeInputMem(const AnfNodePtr &node) { +void KernelRuntime::AssignCommunicationNodeInputMem(int flag, const AnfNodePtr &node) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); MS_EXCEPTION_IF_NULL(node); @@ -477,7 +482,7 @@ void KernelRuntime::AssignCommunicationNodeInputMem(const AnfNodePtr &node) { total_size += mem_size; addr_size.emplace_back(address.get(), mem_size); } - uint8_t *input_ptr = mem_manager_->MallocOutputMem(node, 0, kDynamicMem, total_size); + uint8_t *input_ptr = mem_manager_->MallocOutputMem(node, 0, flag, total_size); for (const auto &iter : addr_size) { MS_EXCEPTION_IF_NULL(iter.first); iter.first->set_ptr(input_ptr); diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h index 8320355b82a..41cbd6f4e49 100644 --- 
a/mindspore/ccsrc/runtime/device/kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h @@ -88,7 +88,7 @@ class KernelRuntime { void UpdateRefNodeOutputMem(const session::KernelGraph *graph); void AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr &node); - void AssignCommunicationNodeInputMem(const AnfNodePtr &node); + void AssignCommunicationNodeInputMem(int flag, const AnfNodePtr &node); void AssignCommunicationNodeMem(int flag, const AnfNodePtr &node); #ifdef ENABLE_DUMP_E2E bool SetDumpConf(); diff --git a/mindspore/ccsrc/runtime/device/memory_manager.cc b/mindspore/ccsrc/runtime/device/memory_manager.cc index 563d5f0f501..0199f8ee188 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/memory_manager.cc @@ -57,6 +57,9 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, in } if (flag == kStaticMem) { ptr = MallocStaticMem(size, communication_mem); + } else if (flag == kReuseDynamicCommMem) { + MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr_); + ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index); } else { ptr = MallocDynamicMem(size, communication_mem); } diff --git a/mindspore/ccsrc/runtime/device/memory_manager.h b/mindspore/ccsrc/runtime/device/memory_manager.h index 3c6fb1b39a4..02210b651b0 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.h +++ b/mindspore/ccsrc/runtime/device/memory_manager.h @@ -25,6 +25,7 @@ namespace device { const int kStaticMem = 0; const int kDynamicMem = 1; const int kReuseDynamicMem = 2; +const int kReuseDynamicCommMem = 3; const int kGetAllOuts = -1; const uint64_t kMemAlignSize = 512; using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr; diff --git a/tests/ut/cpp/pre_activate/mem_reuse/mem_reuse_allocator_test.cc b/tests/ut/cpp/pre_activate/mem_reuse/mem_reuse_allocator_test.cc index 2a6904658e6..69fd649f8c8 100644 --- a/tests/ut/cpp/pre_activate/mem_reuse/mem_reuse_allocator_test.cc +++ b/tests/ut/cpp/pre_activate/mem_reuse/mem_reuse_allocator_test.cc @@ -146,7 +146,7 @@ TEST_F(TestMemReuseAllocator, mem_reuse_allocator_split_membuf) { TEST_F(TestMemReuseAllocator, mem_reuse_allocator_align) { auto best_fit_mem_reuse = std::make_shared(); - auto size = best_fit_mem_reuse->AlignMemorySize(510); + auto size = best_fit_mem_reuse->AlignCommonMemorySize(510); ASSERT_EQ(size, 1024); } } // namespace memreuse diff --git a/tests/ut/cpp/pre_activate/mem_reuse/mem_reuse_test.cc b/tests/ut/cpp/pre_activate/mem_reuse/mem_reuse_test.cc index 31ae923c0ad..d0aaa9ed2db 100644 --- a/tests/ut/cpp/pre_activate/mem_reuse/mem_reuse_test.cc +++ b/tests/ut/cpp/pre_activate/mem_reuse/mem_reuse_test.cc @@ -225,7 +225,6 @@ TEST_F(TestMemReuseWithPy, KernelRef) { ASSERT_EQ(kernel_ref_count_ptr->size_, 512); KernelDefPtr kernel_def_ptr = std::make_shared(); ASSERT_NE(kernel_def_ptr, nullptr); - ASSERT_EQ(kernel_def_ptr->dirty, false); MembufPtr membuf_ptr = std::make_shared(); ASSERT_NE(membuf_ptr, nullptr); } From 5c0962acfa36928324cefb856d17ef98773fd963 Mon Sep 17 00:00:00 2001 From: zhaoting Date: Wed, 15 Jul 2020 11:35:34 +0800 Subject: [PATCH 24/68] add gpu split and restructure gpu concat --- .../gpu/arrays/concatv2_gpu_kernel.h | 92 ++++++----- .../gpu/arrays/split_gpu_kernel.cc | 31 ++++ .../gpu/arrays/split_gpu_kernel.h | 153 ++++++++++++++++++ .../gpu/cuda_impl/concatv2_impl.cu | 117 +++++--------- .../gpu/cuda_impl/concatv2_impl.cuh | 11 +- .../gpu/cuda_impl/split_impl.cu | 50 ++++++ .../gpu/cuda_impl/split_impl.cuh | 24 +++ tests/st/ops/gpu/test_split.py 
| 58 +++++++ 8 files changed, 406 insertions(+), 130 deletions(-) create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.cc create mode 100644 mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h create mode 100755 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cu create mode 100755 mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh create mode 100644 tests/st/ops/gpu/test_split.py diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h index 15ccedcaeca..bae315d1c14 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h @@ -18,6 +18,7 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CONCATV2_GPU_KERNEL_H #include +#include #include "backend/kernel_compiler/gpu/gpu_kernel.h" #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" #include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh" @@ -27,40 +28,35 @@ namespace kernel { template class ConcatV2GpuFwdKernel : public GpuKernel { public: - ConcatV2GpuFwdKernel() : axis_(0), output_size_(0) {} + ConcatV2GpuFwdKernel() + : axis_(0), + input_num_(1), + output_size_(0), + all_size_before_axis_(1), + all_size_axis_(1), + inputs_host_(nullptr), + len_axis_(nullptr) {} ~ConcatV2GpuFwdKernel() override = default; const std::vector &GetInputSizeList() const override { return input_size_list_; } const std::vector &GetOutputSizeList() const override { return output_size_list_; } const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } - bool Launch(const std::vector &inputs, const std::vector &, + bool Launch(const std::vector &inputs, const std::vector &workspace, const std::vector &outputs, void *stream_ptr) override { - if (inputs.size() == 2) { - T *input_0 = GetDeviceAddress(inputs, 0); - T *input_1 = GetDeviceAddress(inputs, 1); - T *output = GetDeviceAddress(outputs, 0); - ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], input_0, input_1, output, - reinterpret_cast(stream_ptr)); - } - - if (inputs.size() == 3) { - T *input_0 = GetDeviceAddress(inputs, 0); - T *input_1 = GetDeviceAddress(inputs, 1); - T *input_2 = GetDeviceAddress(inputs, 2); - T *output = GetDeviceAddress(outputs, 0); - ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], input_0, input_1, input_2, output, - reinterpret_cast(stream_ptr)); - } - - if (inputs.size() == 4) { - T *input_0 = GetDeviceAddress(inputs, 0); - T *input_1 = GetDeviceAddress(inputs, 1); - T *input_2 = GetDeviceAddress(inputs, 2); - T *input_3 = GetDeviceAddress(inputs, 3); - T *output = GetDeviceAddress(outputs, 0); - ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], w_[3], input_0, input_1, input_2, input_3, output, - reinterpret_cast(stream_ptr)); + T *output = GetDeviceAddress(outputs, 0); + T **inputs_device = GetDeviceAddress(workspace, 0); + int *len_axis_device = GetDeviceAddress(workspace, 1); + for (size_t i = 0; i < inputs.size(); i++) { + inputs_host_[i] = GetDeviceAddress(inputs, i); } + CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_, + cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), + "ConcatV2 opt cudaMemcpyAsync inputs failed"); + CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_, + cudaMemcpyHostToDevice, 
reinterpret_cast(stream_ptr)), + "ConcatV2 opt cudaMemcpyAsync length on axis failed"); + ConcatKernel(output_size_, input_num_, all_size_before_axis_, all_size_axis_, len_axis_device, inputs_device, + output, reinterpret_cast(stream_ptr)); return true; } bool Init(const CNodePtr &kernel_node) override { @@ -74,25 +70,34 @@ class ConcatV2GpuFwdKernel : public GpuKernel { axis_ += SizeToInt(input_shape.size()); } - auto input_num = AnfAlgo::GetInputTensorNum(kernel_node); - for (size_t i = 0; i < input_num; i++) { - auto input_size = sizeof(T); + input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node)); + inputs_host_ = std::make_unique(input_num_); + len_axis_ = std::make_unique(input_num_); + for (int i = 0; i < input_num_; i++) { + int input_size = 1; auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i); for (size_t j = 0; j < input_shape.size(); j++) { input_size *= SizeToInt(input_shape[j]); - if (j >= IntToSize(axis_)) { - w_[i] *= SizeToInt(input_shape[j]); - } - input_size_list_.push_back(input_size); } + input_size_list_.push_back(IntToSize(input_size * sizeof(T))); + len_axis_[i] = SizeToInt(input_shape[axis_]); } + workspace_size_list_.push_back(sizeof(T *) * input_num_); + workspace_size_list_.push_back(sizeof(int) * input_num_); auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); - output_size_ = sizeof(T); - for (size_t i = 0; i < output_shape.size(); i++) { + output_size_ = 1; + for (int i = 0; i < SizeToInt(output_shape.size()); i++) { output_size_ *= output_shape[i]; + if (i > axis_) { + all_size_before_axis_ *= output_shape[i]; + all_size_axis_ *= output_shape[i]; + } + if (i == axis_) { + all_size_before_axis_ *= output_shape[i]; + } } - output_size_list_.push_back(output_size_); + output_size_list_.push_back(IntToSize(output_size_ * sizeof(T))); InitSizeLists(); return true; @@ -103,11 +108,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel { private: bool CheckParam(const CNodePtr &kernel_node) { - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - if (input_num < 2 || input_num > 4) { - MS_LOG(ERROR) << "Input number is " << input_num << ", but ConcatV2GpuFwdKernel needs inputs between 2 and 4."; - return false; - } size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); if (output_num != 1) { MS_LOG(ERROR) << "Output number is " << output_num << ", but ConcatV2GpuFwdKernel needs 1 output."; @@ -115,9 +115,13 @@ class ConcatV2GpuFwdKernel : public GpuKernel { } return true; } - int w_[4] = {1, 1, 1, 1}; int axis_; - size_t output_size_; + int input_num_; + int output_size_; + int all_size_before_axis_; + int all_size_axis_; + std::unique_ptr inputs_host_; + std::unique_ptr len_axis_; std::vector input_size_list_; std::vector output_size_list_; std::vector workspace_size_list_; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.cc new file mode 100644 index 00000000000..0101f650018 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.cc @@ -0,0 +1,31 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
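In the reworked `Init` above, `all_size_axis_` collects the product of the dimensions strictly after the concat axis, while `all_size_before_axis_` also multiplies in the axis dimension itself; together with `len_axis_` they let the kernel recover the axis coordinate of any flat element index. A plain-Python sketch of that bookkeeping (not the kernel itself):

    def axis_strides(shape, axis):
        # Mirrors the Init loops of ConcatV2GpuFwdKernel / SplitGpuFwdKernel.
        all_size_axis = 1         # product of dims strictly after `axis`
        all_size_before_axis = 1  # product of dims from `axis` onwards
        for i, dim in enumerate(shape):
            if i > axis:
                all_size_axis *= dim
                all_size_before_axis *= dim
            if i == axis:
                all_size_before_axis *= dim
        return all_size_before_axis, all_size_axis

    print(axis_strides((2, 6, 4, 4), axis=1))  # (96, 16), i.e. 6*4*4 and 4*4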
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h" + +namespace mindspore { +namespace kernel { +MS_REG_GPU_KERNEL_ONE( + Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + SplitGpuFwdKernel, float) +MS_REG_GPU_KERNEL_ONE(Split, + KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + SplitGpuFwdKernel, int) +MS_REG_GPU_KERNEL_ONE( + Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + SplitGpuFwdKernel, half) +} // namespace kernel +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h new file mode 100644 index 00000000000..b26c01ee106 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h @@ -0,0 +1,153 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H +#define MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H + +#include +#include +#include "backend/kernel_compiler/gpu/gpu_kernel.h" +#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" +#include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh" + +namespace mindspore { +namespace kernel { +template +class SplitGpuFwdKernel : public GpuKernel { + public: + SplitGpuFwdKernel() + : axis_(0), + output_num_(1), + input_size_(1), + axis_step_(1), + all_size_before_axis_(1), + all_size_axis_(1), + outputs_host_(nullptr) {} + ~SplitGpuFwdKernel() override = default; + const std::vector &GetInputSizeList() const override { return input_size_list_; } + const std::vector &GetOutputSizeList() const override { return output_size_list_; } + const std::vector &GetWorkspaceSizeList() const override { return workspace_size_list_; } + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) override { + T *input = GetDeviceAddress(inputs, 0); + T **outputs_device = GetDeviceAddress(workspace, 0); + for (size_t i = 0; i < outputs.size(); i++) { + outputs_host_[i] = GetDeviceAddress(outputs, i); + } + CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_, + cudaMemcpyHostToDevice, reinterpret_cast(stream_ptr)), + "Split opt cudaMemcpyAsync outputs failed"); + SplitKernel(input_size_, axis_step_, all_size_before_axis_, all_size_axis_, input, outputs_device, + reinterpret_cast(stream_ptr)); + return true; + } + + bool Init(const CNodePtr &kernel_node) override { + axis_ = GetAttr(kernel_node, "axis"); + if (axis_ < 0) { + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + axis_ += SizeToInt(input_shape.size()); + } + output_num_ = GetAttr(kernel_node, "output_num"); + + if (!CheckParam(kernel_node)) { + return false; + } + + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + input_size_ = 1; + all_size_before_axis_ = 1; + all_size_axis_ = 1; + + for (int i = 0; i < SizeToInt(input_shape.size()); i++) { + input_size_ *= input_shape[i]; + if (i > axis_) { + all_size_before_axis_ *= input_shape[i]; + all_size_axis_ *= input_shape[i]; + } + if (i == axis_) { + all_size_before_axis_ *= input_shape[i]; + } + } + input_size_list_.push_back(IntToSize(input_size_ * sizeof(T))); + axis_step_ = input_shape[axis_] / output_num_; + + for (int i = 0; i < output_num_; i++) { + size_t output_size = 1; + auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i); + for (size_t j = 0; j < output_shape.size(); j++) { + output_size *= output_shape[j]; + } + output_size_list_.push_back(output_size * sizeof(T)); + } + workspace_size_list_.push_back(sizeof(T *) * output_num_); + InitSizeLists(); + outputs_host_ = std::make_unique(output_num_); + return true; + } + + protected: + void InitSizeLists() override {} + + private: + bool CheckParam(const CNodePtr &kernel_node) { + auto input_num = AnfAlgo::GetInputTensorNum(kernel_node); + auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); + int dims = SizeToInt(input_shape.size()); + int output_num = SizeToInt(AnfAlgo::GetOutputTensorNum(kernel_node)); + + if (input_num != 1) { + MS_LOG(ERROR) << "Input number is " << input_num << ", but Split needs 1 input."; + return false; + } + if (dims == 0) { + MS_LOG(ERROR) << "Input dims is " << dims << ", scalar is not supported."; + return false; + } + if (axis_ < -dims || axis_ >= dims) 
{
+      MS_LOG(ERROR) << "Attr axis " << axis_ << " must be in " << -dims << "~" << dims;
+      return false;
+    }
+    if (output_num_ > SizeToInt(input_shape[axis_])) {
+      MS_LOG(ERROR) << "Attr output_num " << output_num_ << " must be less than or equal to " << input_shape[axis_];
+      return false;
+    }
+    if (input_shape[axis_] % output_num_ != 0) {
+      MS_LOG(ERROR) << "Attr output_num " << output_num_ << " must evenly divide the axis length " << input_shape[axis_];
+      return false;
+    }
+    if (output_num_ != output_num) {
+      MS_LOG(ERROR) << "Output num is " << output_num << ", but attr output_num is " << output_num_;
+      return false;
+    }
+    return true;
+  }
+  int axis_;
+  int output_num_;
+  int input_size_;
+  int axis_step_;
+  int all_size_before_axis_;
+  int all_size_axis_;
+  std::unique_ptr<T *[]> outputs_host_;
+  std::vector<size_t> input_size_list_;
+  std::vector<size_t> output_size_list_;
+  std::vector<size_t> workspace_size_list_;
+};
+}  // namespace kernel
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cu
index 147782591ae..c3a77d186d7 100755
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cu
@@ -19,90 +19,51 @@
 #include
 #include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
 template <typename T>
-__global__ void Concat(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
-    int n = pos / (w1 + w2);
-    int m = pos % (w1 + w2);
-    output[pos] = m >= w1 ? input_2[n * w2 + m - w1] : input_1[n * w1 + m];
+__global__ void Concat(const int size, const int input_num,
+                       const int all_size_before_axis, const int all_size_axis,
+                       int* len_axis, T** inputs, T* output) {
+  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
+    int num = pos % all_size_before_axis / all_size_axis;
+    int block = -1;
+    int axis_inc = 0;
+    int block_len = 0;
+    for (int i = 0; i < input_num; i++) {
+      if (axis_inc <= num) {
+        block++;
+        axis_inc += len_axis[i];
+      } else {
+        break;
+      }
+    }
+    block_len = len_axis[block];
+    axis_inc -= len_axis[block];
+    int block_pos = pos / all_size_before_axis * block_len * all_size_axis +
+                    (num - axis_inc) * all_size_axis + pos % all_size_axis;
+    output[pos] = inputs[block][block_pos];
   }
   return;
 }
 
 template <typename T>
-__global__ void Concat(const size_t size, const int w1, const int w2, const int w3,
-                       const T* input_1, const T* input_2, const T* input_3, T* output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
-    int n = pos / (w1 + w2 + w3);
-    int m = pos % (w1 + w2 + w3);
-    output[pos] = m < w1 ? input_1[n * w1 + m] :
-                  m < w1 + w2 ? input_2[n * w2 + m - w1] :
-                  input_3[n * w3 + m - w1 - w2];
-  }
-  return;
-}
-
-template <typename T>
-__global__ void Concat(const size_t size, const int w1, const int w2, const int w3, const int w4,
-                       const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
-    int n = pos / (w1 + w2 + w3 + w4);
-    int m = pos % (w1 + w2 + w3 + w4);
-    output[pos] = m < w1 ? input_1[n * w1 + m] :
-                  m < w1 + w2 ? input_2[n * w2 + m - w1]:
-                  m < w1 + w2 + w3 ?
input_3[n * w3 + m - w1 - w2]: - input_4[n * w4 + m - w1 - w2 - w3]; - } - return; -} - -template -void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output, - cudaStream_t cuda_stream) { - Concat<<>>(size, w1, w2, input_1, input_2, output); - return; -} - -template -void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, - const T* input_1, const T* input_2, const T* input_3, T* output, +void ConcatKernel(const int size, const int input_num, + const int all_size_before_axis, const int all_size_axis, + int* len_axis, T** inputs, T* output, cudaStream_t cuda_stream) { - Concat<<>>(size, w1, w2, w3, input_1, input_2, input_3, output); + Concat<<>>(size, input_num, + all_size_before_axis, all_size_axis, + len_axis, inputs, output); return; } -template -void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, - const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output, - cudaStream_t cuda_stream) { - Concat<<>>(size, w1, w2, w3, w4, input_1, - input_2, input_3, input_4, output); - return; -} - -template void ConcatKernel(const size_t size, const int w1, const int w2, const float* input_1, const float* input_2, - float* output, cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int w1, const int w2, const int* input_1, const int* input_2, - int* output, cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int w1, const int w2, const half* input_1, const half* input_2, - half* output, cudaStream_t cuda_stream); - -template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, - const float* input_1, const float* input_2, const float* input_3, - float* output, cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, - const int* input_1, const int* input_2, const int* input_3, - int* output, cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, - const half* input_1, const half* input_2, const half* input_3, - half* output, cudaStream_t cuda_stream); - -template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, - const float* input_1, const float* input_2, const float* input_3, const float* input_4, - float* output, cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, - const int* input_1, const int* input_2, const int* input_3, const int* input_4, - int* output, cudaStream_t cuda_stream); -template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, - const half* input_1, const half* input_2, const half* input_3, const half* input_4, - half* output, cudaStream_t cuda_stream); - +template void ConcatKernel(const int size, const int input_num, + const int all_size_before_axis, const int all_size_axis, + int* len_axis, float** inputs, float* output, + cudaStream_t cuda_stream); +template void ConcatKernel(const int size, const int input_num, + const int all_size_before_axis, const int all_size_axis, + int* len_axis, int** inputs, int* output, + cudaStream_t cuda_stream); +template void ConcatKernel(const int size, const int input_num, + const int all_size_before_axis, const int all_size_axis, + int* len_axis, half** inputs, half* output, + cudaStream_t cuda_stream); diff --git 
a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh index 7bd32c140fb..010e2977e23 100755 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh @@ -19,13 +19,8 @@ #include "runtime/device/gpu/cuda_common.h" template -void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output, - cudaStream_t cuda_stream); -template -void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, - const T* input_1, const T* input_2, const T* input_3, T* output, cudaStream_t cuda_stream); -template -void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, - const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output, +void ConcatKernel(const int size, const int input_num, + const int all_size_before_axis, const int all_size_axis, + int* len_axis, T** inputs, T* output, cudaStream_t cuda_stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CONCATV2IMPL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cu new file mode 100755 index 00000000000..a2422908601 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cu @@ -0,0 +1,50 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
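The single `Concat` kernel above replaces the hard-coded 2-, 3-, and 4-input variants: for each flat output index it recovers the concat-axis coordinate `num`, scans `len_axis` to find the owning input, then maps the position back into that input's flat index. A NumPy cross-check of the same index arithmetic (a sketch, not the device code):

    import numpy as np

    def concat_like_kernel(inputs, axis, out_shape):
        # Host-side mirror of the Concat CUDA kernel's index arithmetic.
        all_size_axis = int(np.prod(out_shape[axis + 1:]))
        all_size_before_axis = out_shape[axis] * all_size_axis
        len_axis = [a.shape[axis] for a in inputs]
        flats = [a.ravel() for a in inputs]
        out = np.empty(out_shape, dtype=inputs[0].dtype).ravel()
        for pos in range(out.size):
            num = pos % all_size_before_axis // all_size_axis  # concat-axis coordinate
            block, axis_inc = -1, 0
            for length in len_axis:                            # find the input owning `num`
                if axis_inc <= num:
                    block += 1
                    axis_inc += length
                else:
                    break
            axis_inc -= len_axis[block]
            block_pos = (pos // all_size_before_axis * len_axis[block] * all_size_axis
                         + (num - axis_inc) * all_size_axis + pos % all_size_axis)
            out[pos] = flats[block][block_pos]
        return out.reshape(out_shape)

    a = np.arange(24).reshape(2, 3, 4)
    b = np.arange(16).reshape(2, 2, 4)
    ref = np.concatenate([a, b], axis=1)
    assert (concat_like_kernel([a, b], axis=1, out_shape=ref.shape) == ref).all()

The scan over `len_axis` is what lifts the old 4-input ceiling: any number of inputs now shares one kernel at the cost of a short per-element loop.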
+ */ + +#include +#include +#include +#include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh" +template +__global__ void Split(const int size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const T* input, T** outputs) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + int num = pos % all_size_before_axis / all_size_axis; + int block = num / axis_step; + int block_pos = pos / all_size_before_axis * axis_step * all_size_axis + + num % axis_step * all_size_axis + pos % all_size_axis; + outputs[block][block_pos] = input[pos]; + } + return; +} + +template +void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) { + Split<<>>(size, axis_step, all_size_before_axis, + all_size_axis, input, outputs); + return; +} + +template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const float* input, float** outputs, + cudaStream_t cuda_stream); +template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const int* input, int** outputs, + cudaStream_t cuda_stream); +template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const half* input, half** outputs, + cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh new file mode 100755 index 00000000000..5306648da89 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh @@ -0,0 +1,24 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ +#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ + +#include "runtime/device/gpu/cuda_common.h" +template +void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, + const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream); +#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ diff --git a/tests/st/ops/gpu/test_split.py b/tests/st/ops/gpu/test_split.py new file mode 100644 index 00000000000..f9e3cfce2fd --- /dev/null +++ b/tests/st/ops/gpu/test_split.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
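`Split` above is the inverse mapping, and simpler: because every output takes an equal `axis_step` slice of the axis, the owning output is just `num // axis_step`. The same arithmetic mirrored in NumPy (a sketch; `np.split` is the reference, as in the new test below):

    import numpy as np

    def split_like_kernel(x, axis, output_num):
        # Host-side mirror of the Split CUDA kernel's index arithmetic.
        all_size_axis = int(np.prod(x.shape[axis + 1:]))
        all_size_before_axis = x.shape[axis] * all_size_axis
        axis_step = x.shape[axis] // output_num
        out_shape = list(x.shape)
        out_shape[axis] = axis_step
        outs = [np.empty(out_shape, dtype=x.dtype).ravel() for _ in range(output_num)]
        flat = x.ravel()
        for pos in range(flat.size):
            num = pos % all_size_before_axis // all_size_axis
            block = num // axis_step                           # owning output
            block_pos = (pos // all_size_before_axis * axis_step * all_size_axis
                         + num % axis_step * all_size_axis + pos % all_size_axis)
            outs[block][block_pos] = flat[pos]
        return [o.reshape(out_shape) for o in outs]

    x = np.random.randn(2, 6, 4, 4).astype(np.float32)
    for got, ref in zip(split_like_kernel(x, axis=1, output_num=3), np.split(x, 3, axis=1)):
        assert (got == ref).all()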
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import numpy as np +import pytest + +import mindspore.context as context +from mindspore import Tensor +import mindspore.nn as nn +from mindspore.ops import operations as P + + +class Net(nn.Cell): + def __init__(self, axis=0, out_nums=1): + super(Net, self).__init__() + self.split = P.Split(axis, out_nums) + + def construct(self, x): + return self.split(x) + + +context.set_context(mode=context.GRAPH_MODE, device_target="GPU") + + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_split(): + x = np.array([[[1, -1, 1], [2, -2, 2]], + [[3, -3, 3], [4, -4, 4]], + [[5, -5, 5], [6, -6, 6]]]).astype(np.float32) + + split_op = Net(0, 3) + outputs = split_op(Tensor(x)) + for i, out in enumerate(outputs): + assert (out.asnumpy() == x[i]).all() + + +def test_split_4d(): + x_np = np.random.randn(2, 6, 4, 4).astype(np.float32) + y = np.split(x_np, 3, axis=1) + + split_op = Net(1, 3) + outputs = split_op(Tensor(x_np)) + + for i, out in enumerate(outputs): + assert (out.asnumpy() == y[i]).all() From 50e2fda52d181021e460c9e89ef10f0daa031525 Mon Sep 17 00:00:00 2001 From: WilliamLian Date: Mon, 13 Jul 2020 19:45:49 +0800 Subject: [PATCH 25/68] refactor primitive ComputeFunction function --- .../ccsrc/backend/session/kernel_graph.cc | 2 +- .../pipeline/pynative/pynative_execute.cc | 12 ++++--- mindspore/ccsrc/utils/primitive_utils.cc | 24 +++++++++++++ mindspore/ccsrc/utils/primitive_utils.h | 5 +++ mindspore/ccsrc/vm/vmimpl.cc | 22 +++--------- mindspore/core/ir/primitive.h | 1 + mindspore/core/ir/primitive_py.cc | 36 ++++++++++++++----- mindspore/core/ir/primitive_py.h | 5 ++- tests/ut/cpp/operator/ops_test.cc | 3 +- tests/ut/cpp/parallel/step_parallel_test.cc | 3 +- tests/ut/cpp/vm/segment_runner_test.cc | 16 ++++----- 11 files changed, 85 insertions(+), 44 deletions(-) diff --git a/mindspore/ccsrc/backend/session/kernel_graph.cc b/mindspore/ccsrc/backend/session/kernel_graph.cc index 0bf447751bd..6fcd3b65a0b 100644 --- a/mindspore/ccsrc/backend/session/kernel_graph.cc +++ b/mindspore/ccsrc/backend/session/kernel_graph.cc @@ -307,7 +307,7 @@ CNodePtr KernelGraph::NewCNode(const std::vector &inputs) { if (inputs.size() == 1 || !feature_map_input_indexs.empty()) { kernel_info->SetFeatureMapFlag(true); } - if (AnfAlgo::IsRealCNodeKernel(cnode)) { + if (AnfAlgo::IsRealKernel(cnode)) { AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode); AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode); } diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc index 5e3add1b5fb..db41b2a0a86 100644 --- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc +++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc @@ -363,19 +363,21 @@ py::object RunOpInVM(const OpExecInfoPtr &op_exec_info, PynativeStatusCode *stat MS_LOG(INFO) << "RunOpInVM end"; return std::move(result); } - auto func = op_exec_info->py_primitive->GetComputeFunction(); - if (py::isinstance(func)) { - MS_LOG(ERROR) << "VM failed to get func"; + auto primitive = op_exec_info->py_primitive; + MS_EXCEPTION_IF_NULL(primitive); + auto result = primitive->RunPyComputeFunction(op_exec_info->op_inputs); + if (py::isinstance(result)) { + MS_LOG(ERROR) << "VM got the result none, please check 
whether the compute function is implemented";
+    *status = PYNATIVE_OP_NOT_IMPLEMENTED_ERR;
+    py::tuple err_ret(0);
+    return std::move(err_ret);
   }

   // execute op
-  py::tuple result = py::make_tuple(func(*op_exec_info->op_inputs));
+  py::tuple tuple_result = py::make_tuple(result);
   *status = PYNATIVE_SUCCESS;
   MS_LOG(INFO) << "RunOpInVM end";
-  return std::move(result);
+  return std::move(tuple_result);
 }

 bool RunOpConvertConstInputToAttr(const py::object &input_object, size_t input_index, const PrimitivePtr &op_prim,
diff --git a/mindspore/ccsrc/utils/primitive_utils.cc b/mindspore/ccsrc/utils/primitive_utils.cc
index 490e2517a9a..abd5cb1660d 100644
--- a/mindspore/ccsrc/utils/primitive_utils.cc
+++ b/mindspore/ccsrc/utils/primitive_utils.cc
@@ -15,6 +15,9 @@
  */

 #include "utils/primitive_utils.h"
+
+#include <memory>
+
 #include "pipeline/jit/parse/python_adapter.h"
 #include "utils/log_adapter.h"
 #include "common/utils.h"
@@ -43,4 +46,25 @@ py::function GetComputeFunction(std::string name) {
   py::object fn = mod.attr(common::SafeCStr(name));
   return fn;
 }
+
+py::tuple ConvertDatatoPyTuple(const VectorRef &args) {
+  auto py_args = py::tuple(args.size());
+  size_t i = 0;
+  for (auto &arg : args) {
+    py_args[i] = BaseRefToPyData(arg);
+    MS_LOG(DEBUG) << "arg:" << i << ":" << arg.ToString();
+    i++;
+  }
+  return py_args;
+}
+
+BaseRef RunComputeFunction(const PrimitivePtr &prim, const VectorRef &args) {
+  auto func = GetComputeFunction(prim->name());
+  if (py::isinstance<py::none>(func)) {
+    MS_LOG(EXCEPTION) << prim->name() << "'s compute function failed to run; please check whether it is implemented";
+  }
+  auto py_args = ConvertDatatoPyTuple(args);
+  py::object obj = func(*py_args);
+  return std::make_shared<PyObjectRef>(obj);
+}
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/utils/primitive_utils.h b/mindspore/ccsrc/utils/primitive_utils.h
index b7e2515aeae..0faeca9c47f 100644
--- a/mindspore/ccsrc/utils/primitive_utils.h
+++ b/mindspore/ccsrc/utils/primitive_utils.h
@@ -19,6 +19,7 @@
 #include <string>

 #include "pybind11/pybind11.h"
+#include "utils/base_ref.h"

 namespace py = pybind11;

@@ -28,6 +29,10 @@ py::function GetBpropFunctionByObj(py::object obj);
 py::function GetBpropFunction(std::string name);

 py::function GetComputeFunction(std::string name);
+
+BaseRef RunComputeFunction(const PrimitivePtr &prim, const VectorRef &args);
+
+py::tuple ConvertDatatoPyTuple(const VectorRef &args);
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_UTILS_PRIMITIVE_UTILS_H_
diff --git a/mindspore/ccsrc/vm/vmimpl.cc b/mindspore/ccsrc/vm/vmimpl.cc
index 2aebf8ad0d9..8ce65c3a269 100644
--- a/mindspore/ccsrc/vm/vmimpl.cc
+++ b/mindspore/ccsrc/vm/vmimpl.cc
@@ -440,25 +440,13 @@ VectorRef VM::RunGraph(const FuncGraphPtr &g, const VectorRef &args) {
 }

 BaseRef RunOperation(const PrimitivePtr &prim, const VectorRef &args) {
-  PrimitivePyPtr operation = dyn_cast<PrimitivePy>(prim);
-
   MS_LOG(DEBUG) << "operation start " << prim->name();
-  auto func = operation != nullptr ?
operation->GetComputeFunction() : GetComputeFunction(prim->name()); - if (py::isinstance(func)) { - MS_LOG(EXCEPTION) << prim->name() << " 's compute function is not implemented"; + MS_EXCEPTION_IF_NULL(prim); + auto result = prim->RunComputeFunction(args); + if (result.is_null()) { + return RunComputeFunction(prim, args); } - - py::tuple py_args = py::tuple(args.size()); - MS_LOG(DEBUG) << "input for operation:"; - size_t i = 0; - for (auto &arg : args) { - py_args[i] = BaseRefToPyData(arg); - MS_LOG(DEBUG) << "arg: " << i << ":"; - i++; - } - py::object obj = func(*py_args); - MS_LOG(DEBUG) << "result:" << py::str(obj); - return obj; + return result; } } // namespace compile diff --git a/mindspore/core/ir/primitive.h b/mindspore/core/ir/primitive.h index 5471b580637..a1784a85a39 100644 --- a/mindspore/core/ir/primitive.h +++ b/mindspore/core/ir/primitive.h @@ -83,6 +83,7 @@ class Primitive : public Named { void set_attr(const std::string &attrName, const ValuePtr &attr) { attrs_[attrName] = attr; } void EraseAttr(const std::string &attrName) { (void)attrs_.erase(attrName); } + virtual BaseRef RunComputeFunction(const VectorRef &args) const { return nullptr; } ValuePtr GetAttr(const std::string &attrName) const { auto iter = attrs_.find(attrName); diff --git a/mindspore/core/ir/primitive_py.cc b/mindspore/core/ir/primitive_py.cc index 1a97487ddc5..2a8f003623a 100644 --- a/mindspore/core/ir/primitive_py.cc +++ b/mindspore/core/ir/primitive_py.cc @@ -79,13 +79,7 @@ py::function PrimitivePy::GetBpropFunction() { } BaseRef PrimitivePy::RunHookFunction(const VectorRef &args) const { - auto py_args = py::tuple(args.size()); - size_t i = 0; - for (auto &arg : args) { - py_args[i] = BaseRefToPyData(arg); - MS_LOG(DEBUG) << "arg:" << i << ":"; - i++; - } + auto py_args = ConvertDatatoPyTuple(args); py::object obj; bool is_bprop = this->HasAttr(kBpropAttrName); if (is_bprop) { @@ -123,7 +117,7 @@ BaseRef PrimitivePy::RunHookFunction(const VectorRef &args) const { return std::make_shared(obj); } -py::function PrimitivePy::GetComputeFunction() { +py::function PrimitivePy::GetComputeFunction() const { static const char *const compute_func_name = "vm_impl"; if (py::hasattr(python_obj_, compute_func_name)) { @@ -176,6 +170,32 @@ void PrimitivePy::CopyHookFunction(const PrimitivePtr &primitive) { this->set_hook(primitive_py->hook()); } +BaseRef PrimitivePy::RunComputeFunction(const VectorRef &args) const { + auto py_args = ConvertDatatoPyTuple(args); + auto result = this->RunPyComputeFunction(py_args); + if (py::isinstance(result)) { + return std::make_shared(nullptr); + } + return std::make_shared(result); +} + +py::object PrimitivePy::RunPyComputeFunction(const py::tuple &py_args) const { + auto func = this->GetComputeFunction(); + if (py::isinstance(func)) { + return py::none(); + } + auto result = func(*py_args); + return result; +} + +bool PrimitivePy::HasComputeFunction() const { + auto func = GetComputeFunction(); + if (py::isinstance(func)) { + return false; + } + return true; +} + REGISTER_PYBIND_DEFINE(Primitive_, ([](const py::module *m) { (void)py::enum_(*m, "prim_type", py::arithmetic()) .value("unknown", PrimType::kPrimTypeUnknown) diff --git a/mindspore/core/ir/primitive_py.h b/mindspore/core/ir/primitive_py.h index 2dc45ac341e..8c576016fab 100644 --- a/mindspore/core/ir/primitive_py.h +++ b/mindspore/core/ir/primitive_py.h @@ -41,7 +41,6 @@ class PrimitivePy : public Primitive { ~PrimitivePy() override = default; MS_DECLARE_PARENT(PrimitivePy, Primitive); py::function GetBpropFunction(); 
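The lookup that `GetComputeFunction` performs is unchanged by this refactor; it is only made private and wrapped behind `RunPyComputeFunction`/`HasComputeFunction`: a `vm_impl` attribute on the Python primitive object wins, otherwise the helper in `primitive_utils.cc` resolves a function by primitive name, and a `py::none()` result signals "not implemented" so `RunOperation` can fall back. A rough Python rendering of that resolution order follows; the module path and the `prim.name` attribute are illustrative assumptions, not APIs confirmed by this patch.

```python
import importlib

def resolve_compute_fn(prim):
    """Sketch of the two-level vm_impl lookup described above (assumed names)."""
    # 1. Prefer a `vm_impl` attribute defined directly on the primitive object,
    #    mirroring py::hasattr(python_obj_, "vm_impl") in PrimitivePy.
    fn = getattr(prim, "vm_impl", None)
    if fn is not None:
        return fn
    # 2. Otherwise look the function up by primitive name, standing in for the
    #    GetComputeFunction(name) helper in primitive_utils.cc.
    mod = importlib.import_module("mindspore.ops.vm_impl_registry")  # assumed module path
    return getattr(mod, prim.name, None)  # None plays the role of py::none()
```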
- py::function GetComputeFunction(); void set_signatures( std::vector> @@ -57,11 +56,15 @@ class PrimitivePy : public Primitive { void set_hook(const py::function &hook) { hook_ = hook; } py::function hook() const { return hook_; } BaseRef RunHookFunction(const VectorRef &args) const override; + BaseRef RunComputeFunction(const VectorRef &args) const override; + py::object RunPyComputeFunction(const py::tuple &py_args) const; + bool HasComputeFunction() const; const bool parse_info_ = true; const py::object &GetPyObj() const { return python_obj_; } bool is_tuple_input_ = false; private: + py::function GetComputeFunction() const; py::object python_obj_; py::function hook_; std::vector signatures_; diff --git a/tests/ut/cpp/operator/ops_test.cc b/tests/ut/cpp/operator/ops_test.cc index 789b1cab252..20f4734bf06 100644 --- a/tests/ut/cpp/operator/ops_test.cc +++ b/tests/ut/cpp/operator/ops_test.cc @@ -454,8 +454,7 @@ TEST_F(TestOps, GetConv2DPrimPyTest) { ASSERT_TRUE(conv2d_ptr); if (nullptr != conv2d_ptr) { MS_LOG(INFO) << "Get PrimitivePyPtr: " << conv2d_ptr->name(); - auto func = conv2d_ptr->GetComputeFunction(); - if (py::isinstance(func)) { + if(!conv2d_ptr->HasComputeFunction()){ MS_LOG(EXCEPTION) << "" << conv2d_ptr->name() << "'s compute function is not implemented"; } diff --git a/tests/ut/cpp/parallel/step_parallel_test.cc b/tests/ut/cpp/parallel/step_parallel_test.cc index 5657db87906..383a0618052 100644 --- a/tests/ut/cpp/parallel/step_parallel_test.cc +++ b/tests/ut/cpp/parallel/step_parallel_test.cc @@ -294,8 +294,7 @@ TEST_F(TestStepParallel, CreatOpInstance) { ASSERT_TRUE(allreduce_ptr); if (nullptr != allreduce_ptr) { MS_LOG(INFO) << "Get PrimitivePyPtr: " << allreduce_ptr->name(); - auto func = allreduce_ptr->GetComputeFunction(); - if (py::isinstance(func)) { + if (!allreduce_ptr->HasComputeFunction()) { MS_LOG(EXCEPTION) << "" << allreduce_ptr->name() << "'s compute function is not implemented"; } diff --git a/tests/ut/cpp/vm/segment_runner_test.cc b/tests/ut/cpp/vm/segment_runner_test.cc index c83b1b3434d..60c027b0779 100644 --- a/tests/ut/cpp/vm/segment_runner_test.cc +++ b/tests/ut/cpp/vm/segment_runner_test.cc @@ -57,11 +57,11 @@ TEST_F(TestCompileSegmentRunner, test_MsVmConvert1) { std::vector todos(splits.size()); auto it = std::copy_if(std::begin(splits), std::end(splits), std::begin(todos), - [](const BaseRef& seg) -> bool { return utils::isa(seg); }); + [](const BaseRef &seg) -> bool { return utils::isa(seg); }); todos.resize(std::distance(todos.begin(), it)); ASSERT_EQ(todos.size(), 1); - AnfNodePtrList anf_list; + AnfNodePtrList anf_list; for (auto &item : utils::cast(todos[0])) { anf_list.push_back(utils::cast(item)); } @@ -81,11 +81,11 @@ TEST_F(TestCompileSegmentRunner, test_MsVmConvert2) { std::vector todos(splits.size()); auto it = std::copy_if(std::begin(splits), std::end(splits), std::begin(todos), - [](const BaseRef& seg) -> bool { return utils::isa(seg); }); + [](const BaseRef &seg) -> bool { return utils::isa(seg); }); todos.resize(std::distance(todos.begin(), it)); ASSERT_EQ(todos.size(), 1); - AnfNodePtrList anf_list; + AnfNodePtrList anf_list; for (auto &item : utils::cast(todos[0])) { anf_list.push_back(utils::cast(item)); } @@ -105,11 +105,11 @@ TEST_F(TestCompileSegmentRunner, test_if) { std::vector todos(splits.size()); auto it = std::copy_if(std::begin(splits), std::end(splits), std::begin(todos), - [](const BaseRef& seg) -> bool { return utils::isa(seg); }); + [](const BaseRef &seg) -> bool { return utils::isa(seg); }); 
todos.resize(std::distance(todos.begin(), it)); ASSERT_EQ(todos.size(), 1); - AnfNodePtrList anf_list; + AnfNodePtrList anf_list; for (auto &item : utils::cast(todos[0])) { anf_list.push_back(utils::cast(item)); } @@ -122,13 +122,13 @@ TEST_F(TestCompileSegmentRunner, test_if) { TEST_F(TestCompileSegmentRunner, test_RunOperation1) { VectorRef args({1}); - auto res = RunOperation(prim::kPrimIdentity, args); + auto res = RunOperation(std::make_shared(py::str(prim::kPrimIdentity->name()), py::none()), args); ASSERT_EQ(py::cast(BaseRefToPyData(res)), 1); } TEST_F(TestCompileSegmentRunner, test_RunOperation2) { VectorRef args({1, 2}); - auto res = RunOperation(prim::kPrimScalarGt, args); + auto res = RunOperation(std::make_shared(py::str(prim::kPrimScalarGt->name()), py::none()), args); ASSERT_EQ(py::cast(BaseRefToPyData(res)), false); } } // namespace compile From cb8b5dbd76c9622cf78644f7fd787aafd8be6eb6 Mon Sep 17 00:00:00 2001 From: caifubi Date: Thu, 16 Jul 2020 11:04:44 +0800 Subject: [PATCH 26/68] Remove build option -S --- build.sh | 14 ++------- cmake/options.cmake | 4 --- .../kernel_compiler/ascend_kernel_mod.h | 10 +------ mindspore/ccsrc/debug/CMakeLists.txt | 4 +-- mindspore/ccsrc/debug/data_dump_parser.cc | 2 +- .../device/ascend/ascend_kernel_runtime.cc | 29 ++++++------------- .../device/ascend/ascend_kernel_runtime.h | 4 --- .../runtime/device/ascend/dump/data_dumper.cc | 2 -- .../runtime/device/ascend/dump/data_dumper.h | 2 -- tests/ut/cpp/stub/tasksink/task_sink_stub.cc | 4 +++ 10 files changed, 18 insertions(+), 57 deletions(-) diff --git a/build.sh b/build.sh index cfa657ff3ed..428743f0ffb 100755 --- a/build.sh +++ b/build.sh @@ -24,7 +24,7 @@ usage() { echo "Usage:" echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\" - echo " [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\" + echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\" echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]" echo "" echo "Options:" @@ -48,7 +48,6 @@ usage() echo " -P Enable dump anf graph to file in ProtoBuffer format, default on" echo " -Q Enable dump memory, default off" echo " -D Enable dumping of function graph ir, default on" - echo " -S Enable async data dump, default off" echo " -z Compile dataset & mindrecord, default on" echo " -M Enable MPI and NCCL for GPU training, gpu default on" echo " -V Specify the minimum required cuda version, default CUDA 10.1" @@ -89,7 +88,6 @@ checkopts() ENABLE_TIMELINE="off" ENABLE_DUMP2PROTO="on" ENABLE_DUMPE2E="off" - ENABLE_DATA_DUMP="off" ENABLE_DUMP_IR="on" COMPILE_MINDDATA="on" ENABLE_MPI="off" @@ -104,7 +102,7 @@ checkopts() ENABLE_PYTHON="on" # Process the options - while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt + while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt do OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]') case "${opt}" in @@ -220,11 +218,6 @@ checkopts() ENABLE_DUMPE2E="$OPTARG" echo "enable dump end to end" ;; - S) - check_on_off $OPTARG S - ENABLE_DATA_DUMP="$OPTARG" - echo "enable data dump" - ;; D) check_on_off $OPTARG D ENABLE_DUMP_IR="$OPTARG" @@ -328,9 +321,6 @@ build_mindspore() if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON" fi - if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then - CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON" - fi CMAKE_ARGS="${CMAKE_ARGS} 
-DENABLE_DUMP_IR=${ENABLE_DUMP_IR}" CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}" if [[ "X$ENABLE_MPI" = "Xon" ]]; then diff --git a/cmake/options.cmake b/cmake/options.cmake index 2470c25a90c..b01c623377d 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -116,10 +116,6 @@ if(ENABLE_DUMP_E2E) add_compile_definitions(ENABLE_DUMP_E2E) endif() -if(ENABLE_DATA_DUMP) - add_compile_definitions(ENABLE_DATA_DUMP) -endif() - if(ENABLE_DEBUGGER) add_compile_definitions(ENABLE_DEBUGGER) endif() diff --git a/mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.h b/mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.h index c6398eda9e7..0b8bdd1b193 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.h +++ b/mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.h @@ -21,9 +21,7 @@ #include #include "framework/ge_runtime/task_info.h" #include "backend/kernel_compiler/kernel.h" -#ifdef ENABLE_DATA_DUMP #include "debug/data_dump_parser.h" -#endif using TaskInfoPtr = std::shared_ptr; namespace mindspore { @@ -34,13 +32,7 @@ class AscendKernelMod : public KernelMod { const std::vector &, uint32_t) = 0; uint32_t block_dim() { return block_dim_; } uint32_t stream_id() { return stream_id_; } - virtual bool NeedDump() { -#ifdef ENABLE_DATA_DUMP - return DataDumpParser::GetInstance().NeedDump(kernel_name_); -#else - return false; -#endif - } + virtual bool NeedDump() { return DataDumpParser::GetInstance().NeedDump(kernel_name_); } protected: uint32_t block_dim_{1}; diff --git a/mindspore/ccsrc/debug/CMakeLists.txt b/mindspore/ccsrc/debug/CMakeLists.txt index 37ffcceeaf5..8be5a0a834b 100644 --- a/mindspore/ccsrc/debug/CMakeLists.txt +++ b/mindspore/ccsrc/debug/CMakeLists.txt @@ -23,9 +23,7 @@ if (ENABLE_D) list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" ) - if (ENABLE_DATA_DUMP) - list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc") - endif(ENABLE_DATA_DUMP) + list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc") endif() if (ENABLE_DUMP_E2E) diff --git a/mindspore/ccsrc/debug/data_dump_parser.cc b/mindspore/ccsrc/debug/data_dump_parser.cc index 259ec388d32..55c66e055ba 100644 --- a/mindspore/ccsrc/debug/data_dump_parser.cc +++ b/mindspore/ccsrc/debug/data_dump_parser.cc @@ -35,7 +35,7 @@ void DataDumpParser::ResetParam() { bool DataDumpParser::DumpEnabled() const { auto enable_dump = std::getenv(kEnableDataDump); if (!enable_dump) { - MS_LOG(WARNING) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP"; + MS_LOG(INFO) << "[DataDump] enable dump is null. 
Please export ENABLE_DATA_DUMP"; return false; } diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc index 3ab3a52d42f..c1e3bff79fe 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc @@ -49,6 +49,10 @@ using mindspore::device::ascend::tasksink::TaskGenerator; using mindspore::kernel::tbe::TbeUtils; using std::vector; +constexpr uint32_t kTupleTaskId = 0; +constexpr uint32_t kTupleStreamId = 1; +constexpr uint32_t kTupleArgs = 2; + namespace mindspore { namespace device { namespace ascend { @@ -91,13 +95,11 @@ std::string GetRankId() { AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } void AscendKernelRuntime::ClearGraphModelMap() { -#ifdef ENABLE_DATA_DUMP for (auto &iter : graph_data_dumper_) { MS_LOG(INFO) << "[DataDump] Unload data dumper:" << iter.first; iter.second->UnloadDumpInfo(); } graph_data_dumper_.clear(); -#endif for (auto &iter : graph_model_map_) { MS_LOG(INFO) << "Ge UnloadModel " << iter.first; auto ret = ModelRunner::Instance().UnloadModel(iter.first); @@ -167,9 +169,7 @@ bool AscendKernelRuntime::Init() { } #endif -#ifdef ENABLE_DATA_DUMP DataDumpParser::GetInstance().ParseDumpConfig(); -#endif // Start up profiling before rtSetDevice ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); @@ -510,9 +510,8 @@ bool AscendKernelRuntime::LoadTask(const session::KernelGraph *graph) { ProfilingUtils::ReportProfilingData(task_ids, stream_ids, NOT_NULL(graph)); } -#ifdef ENABLE_DATA_DUMP LaunchDataDump(NOT_NULL(graph)); -#endif + if (!ModelRunner::Instance().LoadModelComplete(model_iter->first)) { MS_LOG(ERROR) << "Call ge runtime LoadModelComplete failed"; return false; @@ -520,7 +519,6 @@ bool AscendKernelRuntime::LoadTask(const session::KernelGraph *graph) { return true; } -#ifdef ENABLE_DATA_DUMP void AscendKernelRuntime::LaunchDataDump(NotNull graph) { if (!DataDumpParser::GetInstance().DumpEnabled()) { return; @@ -534,21 +532,12 @@ void AscendKernelRuntime::LaunchDataDump(NotNull g MS_LOG(WARNING) << "[DataDump] Insert graphId:" << graph->graph_id() << " data dumper failed"; } } -#endif void AscendKernelRuntime::DebugTaskIdName(GraphId graph_id) { - auto task_ids = ModelRunner::Instance().GetTaskIdList(graph_id); - auto graph_task_names = ProfilingUtils::graph_kernel_name(); - auto iter = graph_task_names.find(graph_id); - if (iter != graph_task_names.end()) { - const auto &task_names = iter->second; - if (task_ids.size() != task_names.size()) { - MS_LOG(WARNING) << "Task_ids and task_names size not match"; - return; - } - for (size_t i = 0; i < task_ids.size(); ++i) { - MS_LOG(INFO) << "Task_id:" << task_ids[i] << " task_name:" << task_names[i]; - } + auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(graph_id); + for (auto iter : runtime_info_map) { + MS_LOG(WARNING) << "Task name:" << iter.first << " task_id:" << std::get(*iter.second) + << " stream_id:" << std::get(*iter.second); } } diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h index 4f1663d4d5a..8cde6a01012 100644 --- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h @@ -24,10 +24,8 @@ #include "framework/ge_runtime/davinci_model.h" #include "runtime/device/kernel_runtime_manager.h" #include 
"backend/session/session_basic.h" -#ifdef ENABLE_DATA_DUMP #include "debug/data_dump_parser.h" #include "runtime/device/ascend/dump/data_dumper.h" -#endif using ge::model_runner::TaskInfo; using std::unordered_map; @@ -70,10 +68,8 @@ class AscendKernelRuntime : public KernelRuntime { bool initialized_{false}; unordered_map>> task_map_; unordered_map> graph_model_map_; -#ifdef ENABLE_DATA_DUMP void LaunchDataDump(NotNull graph); unordered_map> graph_data_dumper_; -#endif }; MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime); diff --git a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc index ca9a74022ac..19157f619cd 100644 --- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc +++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifdef ENABLE_DATA_DUMP #include "runtime/device/ascend/dump/data_dumper.h" #include @@ -279,4 +278,3 @@ void DumpKernelInput(const CNodePtr &kernel, void *args, NotNull #include #include @@ -65,5 +64,4 @@ class DataDumper { } // namespace ascend } // namespace device } // namespace mindspore -#endif #endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_DUMP_DATADUMP_H_ diff --git a/tests/ut/cpp/stub/tasksink/task_sink_stub.cc b/tests/ut/cpp/stub/tasksink/task_sink_stub.cc index 0b12a3862c1..967fbda4e55 100644 --- a/tests/ut/cpp/stub/tasksink/task_sink_stub.cc +++ b/tests/ut/cpp/stub/tasksink/task_sink_stub.cc @@ -15,6 +15,7 @@ */ #include "runtime/device/ascend/tasksink/task_generator.h" +#include "runtime/device/ascend/dump/data_dumper.h" namespace mindspore { namespace device { @@ -25,6 +26,9 @@ bool TaskGenerator::GenTasks(const std::vector &anf_node_list, std::ve return true; } } // namespace tasksink +void DataDumper::LoadDumpInfo() {} +void DataDumper::UnloadDumpInfo() {} +DataDumper::~DataDumper() {} } // namespace ascend } // namespace device } // namespace mindspore \ No newline at end of file From 220f090ce53511618cf5916c204727c89afe7575 Mon Sep 17 00:00:00 2001 From: leonwanghui Date: Thu, 16 Jul 2020 15:05:46 +0800 Subject: [PATCH 27/68] update README.md. Update Slack join link. --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 25abdd6fcba..f312343af9c 100644 --- a/README.md +++ b/README.md @@ -202,10 +202,10 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm ### Communication -- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/enQtOTcwMTIxMDI3NjM0LTNkMWM2MzI5NjIyZWU5ZWQ5M2EwMTQ5MWNiYzMxOGM4OWFhZjI4M2E5OGI2YTg3ODU1ODE2Njg1MThiNWI3YmQ) - Communication platform for developers. +- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/zt-dgk65rli-3ex4xvS4wHX7UDmsQmfu8w) - Communication platform for developers. 
- IRC channel at `#mindspore` (only for meeting minutes logging purpose) -- Video Conferencing: https://meet.jit.si -- Mailing-list: https://mailweb.mindspore.cn/postorius/lists +- Video Conferencing: TBD +- Mailing-list: ## Contributing From bc676fe250918b45b0c466a442a1ad744edbbb84 Mon Sep 17 00:00:00 2001 From: liyong Date: Mon, 13 Jul 2020 14:12:41 +0800 Subject: [PATCH 28/68] save op in minddataset --- .../ccsrc/minddata/dataset/api/de_pipeline.cc | 226 ++++++++++ .../ccsrc/minddata/dataset/api/de_pipeline.h | 16 + .../minddata/dataset/api/python_bindings.cc | 6 +- .../ccsrc/minddata/dataset/core/tensor.h | 10 +- .../engine/datasetops/source/mindrecord_op.cc | 2 +- .../mindrecord/include/common/shard_utils.h | 4 + .../mindrecord/include/shard_header.h | 4 + .../include/shard_index_generator.h | 2 + .../mindrecord/include/shard_writer.h | 7 + .../mindrecord/io/shard_index_generator.cc | 16 + .../minddata/mindrecord/io/shard_writer.cc | 52 +++ .../minddata/mindrecord/meta/shard_header.cc | 30 ++ mindspore/dataset/engine/datasets.py | 32 +- mindspore/dataset/engine/iterators.py | 23 ++ mindspore/dataset/engine/validators.py | 17 + tests/ut/python/dataset/test_save_op.py | 390 ++++++++++++++++++ 16 files changed, 828 insertions(+), 9 deletions(-) create mode 100644 tests/ut/python/dataset/test_save_op.py diff --git a/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc b/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc index c780d8f645b..0c4c6273a6f 100644 --- a/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc +++ b/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc @@ -42,11 +42,17 @@ #include "minddata/dataset/util/status.h" #include "minddata/mindrecord/include/shard_category.h" #include "minddata/mindrecord/include/shard_distributed_sample.h" +#include "minddata/mindrecord/include/shard_header.h" +#include "minddata/mindrecord/include/shard_index_generator.h" +#include "minddata/mindrecord/include/shard_sample.h" +#include "minddata/mindrecord/include/shard_shuffle.h" +#include "minddata/mindrecord/include/shard_writer.h" #include "pybind11/stl.h" #include "utils/log_adapter.h" namespace mindspore { namespace dataset { +using json = nlohmann::json; using pFunction = Status (DEPipeline::*)(const py::dict &, std::shared_ptr *, std::shared_ptr *); static std::unordered_map g_parse_op_func_ = { @@ -355,6 +361,226 @@ Status DEPipeline::ParseShuffleOp(const py::dict &args, std::shared_ptr &file_names, const std::string &file_type) { + Status s; + auto mr_header = std::make_shared(); + auto mr_writer = std::make_unique(); + std::vector blob_fields; + uint64_t mr_schema_id = 0; + if (mindrecord::SUCCESS != mindrecord::ShardWriter::initialize(&mr_writer, file_names)) { + RETURN_STATUS_UNEXPECTED("Error: failed to initialize ShardWriter."); + } + + TensorRow row; + std::unordered_map column_name_id_map = + iterator_->GetColumnNameMap(); // map of column name, id + bool first_loop = true; // build schema in first loop + do { + json row_raw_data; + std::map>> row_bin_data; + { + py::gil_scoped_release gil_release; + s = iterator_->FetchNextTensorRow(&row); + } + RETURN_IF_NOT_OK(s); + if (row.empty()) break; + if (first_loop) { + json mr_json; + std::vector index_fields; + s = FetchMetaFromTensorRow(column_name_id_map, row, &mr_json, &index_fields); + RETURN_IF_NOT_OK(s); + mindrecord::ShardHeader::initialize(&mr_header, mr_json, index_fields, blob_fields, mr_schema_id); + mr_writer->SetShardHeader(mr_header); + first_loop = false; + } + // construct data + if (!row.empty()) { // write data 
+ s = FetchDataFromTensorRow(row, column_name_id_map, &row_raw_data, &row_bin_data); + RETURN_IF_NOT_OK(s); + std::shared_ptr> output_bin_data; + mr_writer->MergeBlobData(blob_fields, row_bin_data, &output_bin_data); + std::map> raw_data; + raw_data.insert(std::pair>(mr_schema_id, std::vector{row_raw_data})); + std::vector> bin_data; + if (nullptr != output_bin_data) { + bin_data.emplace_back(*output_bin_data); + } + mr_writer->WriteRawData(raw_data, bin_data); + } + } while (!row.empty()); + mr_writer->Commit(); + mindrecord::ShardIndexGenerator::finalize(file_names); + return Status::OK(); +} + +Status DEPipeline::FetchDataFromTensorRow(const TensorRow &row, + const std::unordered_map &column_name_id_map, + json *row_raw_data, + std::map>> *row_bin_data) { + if (row_raw_data == nullptr) { + RETURN_STATUS_UNEXPECTED("error: row raw data is NULL."); + } + if (row_bin_data == nullptr) { + RETURN_STATUS_UNEXPECTED("error: row bin data is NULL."); + } + if (column_name_id_map.empty()) { + RETURN_STATUS_UNEXPECTED("Error: column not found"); + } + Status s; + for (auto &col : column_name_id_map) { + auto idx = col.second; + auto column_name = col.first; + auto &tensor = row[idx]; + auto column_type = tensor->type(); + + std::unique_ptr> data_ptr; + if (column_type == DataType::DE_INT8) { + std::unique_ptr data; + std::unique_ptr dummy; + s = TransfromTensor(tensor->GetBuffer(), tensor->shape(), tensor->Size(), &data, &data_ptr, &dummy, true); + RETURN_IF_NOT_OK(s); + if (data != nullptr) (*row_raw_data)[column_name] = std::move(*data); + } else if (column_type == DataType::DE_INT16) { + std::unique_ptr data; + std::unique_ptr dummy; + s = TransfromTensor(tensor->GetBuffer(), tensor->shape(), tensor->Size(), &data, &data_ptr, &dummy, true); + RETURN_IF_NOT_OK(s); + if (data != nullptr) (*row_raw_data)[column_name] = std::move(*data); + } else if (column_type == DataType::DE_UINT16) { + std::unique_ptr data; + std::unique_ptr dummy; + s = TransfromTensor(tensor->GetBuffer(), tensor->shape(), tensor->Size(), &data, &data_ptr, &dummy, true); + RETURN_IF_NOT_OK(s); + if (data != nullptr) (*row_raw_data)[column_name] = std::move(*data); + } else if (column_type == DataType::DE_UINT8) { + std::unique_ptr data, dummy; + s = TransfromTensor(tensor->GetBuffer(), tensor->shape(), tensor->Size(), &data, &data_ptr, &dummy); + RETURN_IF_NOT_OK(s); + if (data != nullptr) (*row_raw_data)[column_name] = std::move(*data); + } else if (column_type == DataType::DE_INT32) { + std::unique_ptr data, dummy; + s = TransfromTensor(tensor->GetBuffer(), tensor->shape(), tensor->Size(), &data, &data_ptr, &dummy); + RETURN_IF_NOT_OK(s); + if (data != nullptr) (*row_raw_data)[column_name] = std::move(*data); + } else if (column_type == DataType::DE_UINT32) { + std::unique_ptr data; + std::unique_ptr dummy; + s = TransfromTensor(tensor->GetBuffer(), tensor->shape(), tensor->Size(), &data, &data_ptr, &dummy, true); + RETURN_IF_NOT_OK(s); + if (data != nullptr) (*row_raw_data)[column_name] = std::move(*data); + } else if (column_type == DataType::DE_INT64) { + std::unique_ptr data, dummy; + s = TransfromTensor(tensor->GetBuffer(), tensor->shape(), tensor->Size(), &data, &data_ptr, &dummy); + RETURN_IF_NOT_OK(s); + if (data != nullptr) (*row_raw_data)[column_name] = std::move(*data); + } else if (column_type == DataType::DE_FLOAT32) { + std::unique_ptr data, dummy; + s = TransfromTensor(tensor->GetBuffer(), tensor->shape(), tensor->Size(), &data, &data_ptr, &dummy); + RETURN_IF_NOT_OK(s); + if (data != nullptr) 
(*row_raw_data)[column_name] = std::move(*data);
+    } else if (column_type == DataType::DE_FLOAT64) {
+      std::unique_ptr<double> data, dummy;
+      s = TransfromTensor(tensor->GetBuffer(), tensor->shape(), tensor->Size(), &data, &data_ptr, &dummy);
+      RETURN_IF_NOT_OK(s);
+      if (data != nullptr) (*row_raw_data)[column_name] = std::move(*data);
+    } else if (column_type == DataType::DE_STRING) {
+      auto buffer = tensor->GetStringsBuffer();
+      std::string ss(reinterpret_cast<const char *>(buffer));  // assume scalar string tensor
+      (*row_raw_data)[column_name] = std::move(ss);
+      continue;
+    } else {
+      RETURN_STATUS_UNEXPECTED("Got unexpected type when casting data.");
+    }
+    RETURN_IF_NOT_OK(s);
+    if (data_ptr != nullptr) {
+      (*row_bin_data)[column_name] = std::move(data_ptr);
+    }
+  }
+  return Status::OK();
+}
+
+template <typename T, typename S>
+Status DEPipeline::TransfromTensor(const unsigned char *src, const TensorShape &shape, const int64_t num_of_elements,
+                                   std::unique_ptr<T> *data, std::unique_ptr<std::vector<uint8_t>> *data_ptr,
+                                   std::unique_ptr<S> *s, bool need_convert) {
+  if (nullptr == src) {
+    RETURN_STATUS_UNEXPECTED("Error: buffer of Tensor is NULL.");
+  }
+  *data_ptr = std::make_unique<std::vector<uint8_t>>(num_of_elements * sizeof(T));
+  if (need_convert) {
+    auto tmp_ptr = std::make_unique<std::vector<uint8_t>>(num_of_elements * sizeof(S));
+    std::copy(src, src + sizeof(S) * num_of_elements, tmp_ptr->begin());
+    auto s_ptr = reinterpret_cast<S *>(&(*(tmp_ptr->begin())));
+    auto el = std::make_unique<T>();
+    for (uint32_t i = 0; i < num_of_elements; ++i) {
+      *el = *(s_ptr + i);
+      auto t_ptr = reinterpret_cast<uint8_t *>(el.get());
+      for (uint32_t j = 0; j < sizeof(T); ++j) {
+        *((*data_ptr)->begin() + i * sizeof(T) + j) = *(t_ptr + j);
+      }
+    }
+  } else {
+    std::copy(src, src + sizeof(T) * num_of_elements, (*data_ptr)->begin());
+  }
+  if (shape.empty()) {
+    *data = std::make_unique<T>();
+    auto t_ptr = reinterpret_cast<uint8_t *>((*data).get());
+    for (uint32_t i = 0; i < sizeof(T); ++i) {
+      *(t_ptr + i) = *((*data_ptr)->begin() + i);
+    }
+  }
+  return Status::OK();
+}
+
+Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map<std::string, int32_t> &column_name_id_map,
+                                          const TensorRow &row, json *schema, std::vector<std::string> *index_fields) {
+  if (schema == nullptr) {
+    RETURN_STATUS_UNEXPECTED("error: schema is NULL.");
+  }
+  if (index_fields == nullptr) {
+    RETURN_STATUS_UNEXPECTED("error: index fields is NULL.");
+  }
+  if (column_name_id_map.empty()) {
+    RETURN_STATUS_UNEXPECTED("Error: column not found.");
+  }
+  for (auto &col : column_name_id_map) {
+    auto idx = col.second;
+    auto column_name = col.first;
+    auto &tensor = row[idx];
+    auto column_type = tensor->type();
+    auto column_shape = tensor->shape();
+
+    std::string mr_type;
+    auto shapes = column_shape.AsVector();
+    std::vector<int> mr_shape(shapes.begin(), shapes.end());
+    std::string el = column_type.ToString();
+    if (mindrecord::kTypesMap.find(el) == mindrecord::kTypesMap.end()) {
+      std::string err_msg("Error: can not support data type: " + el);
+      RETURN_STATUS_UNEXPECTED(err_msg);
+    } else {
+      mr_type = mindrecord::kTypesMap.at(el);
+    }
+    if (mr_shape.empty()) {
+      if (mr_type == "bytes") {  // map bytes without shape to int32.
+        mr_type = "int32";
+      }
+      (*schema)[column_name] = {{"type", mr_type}};
+    } else {
+      if (mr_type == "string") {  // mindrecord can not support string with shape.
+        std::string err_msg("Error: mindrecord can not support multi-dimensional string tensor.");
+        RETURN_STATUS_UNEXPECTED(err_msg);
+      }
+      if (mr_type == "bytes") {  // ignore shape of bytes in mindrecord
+        (*schema)[column_name] = {{"type", mr_type}};
+      } else {
+        (*schema)[column_name] = {{"type", mr_type}, {"shape", mr_shape}};
+      }
+    }
+    if (mr_type == "bytes" || !mr_shape.empty()) continue;
+    index_fields->emplace_back(column_name);  // candidate of index fields
+  }
+  return Status::OK();
+}
 Status DEPipeline::BuildMindrecordSamplerChain(const py::handle &handle,
                                                std::vector<std::shared_ptr<mindrecord::ShardOperator>> *operators,
                                                int num_padded) {
diff --git a/mindspore/ccsrc/minddata/dataset/api/de_pipeline.h b/mindspore/ccsrc/minddata/dataset/api/de_pipeline.h
index 755e827ef2e..b3adb6ae9f5 100644
--- a/mindspore/ccsrc/minddata/dataset/api/de_pipeline.h
+++ b/mindspore/ccsrc/minddata/dataset/api/de_pipeline.h
@@ -17,6 +17,7 @@
 #define DATASET_API_DE_PIPELINE_H_

 #include <iostream>
+#include <map>
 #include <memory>
 #include <stack>
 #include <string>

 namespace py = pybind11;

 namespace mindspore {
 namespace dataset {
+using json = nlohmann::json;
 using DsOpPtr = std::shared_ptr<DatasetOp>;

 class CacheClient;
@@ -100,6 +102,8 @@ class DEPipeline {

   Status GetOutputTypes(py::list *output);

+  Status SaveDataset(const std::vector<std::string> &file_names, const std::string &file_type);
+
   int GetDatasetSize() const;

   int GetBatchSize() const;
@@ -110,6 +114,18 @@ class DEPipeline {
   Status ParseMindRecordOp(const py::dict &args, std::shared_ptr<DatasetOp> *top, std::shared_ptr<DatasetOp> *bottom);

+  template <typename T, typename S>
+  Status TransfromTensor(const unsigned char *src, const TensorShape &shape, const int64_t num_of_elements,
+                         std::unique_ptr<T> *data, std::unique_ptr<std::vector<uint8_t>> *data_ptr,
+                         std::unique_ptr<S> *s, bool need_convert = false);
+
+  Status FetchMetaFromTensorRow(const std::unordered_map<std::string, int32_t> &column_name_id_map,
+                                const TensorRow &row, json *schema, std::vector<std::string> *index_fields);
+
+  Status FetchDataFromTensorRow(const TensorRow &row,
+                                const std::unordered_map<std::string, int32_t> &column_name_id_map, json *row_raw_data,
+                                std::map<std::string, std::unique_ptr<std::vector<uint8_t>>> *row_bin_data);
+
   Status BuildMindrecordSamplerChain(const py::handle &handle,
                                      std::vector<std::shared_ptr<mindrecord::ShardOperator>> *operators,
                                      int num_padded);
diff --git a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc
index 173c1af2f22..b880c0cc460 100644
--- a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc
+++ b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc
@@ -184,7 +184,11 @@ void bindDEPipeline(py::module *m) {
     .def("GetDatasetSize", &DEPipeline::GetDatasetSize)
     .def("GetBatchSize", &DEPipeline::GetBatchSize)
     .def("GetNumClasses", &DEPipeline::GetNumClasses)
-    .def("GetRepeatCount", &DEPipeline::GetRepeatCount);
+    .def("GetRepeatCount", &DEPipeline::GetRepeatCount)
+    .def("SaveDataset", [](DEPipeline &de, const std::vector<std::string> &file_names, const std::string &file_type) {
+      THROW_IF_ERROR(de.SaveDataset(file_names, file_type));
+      return true;
+    });
 }
 void bindDatasetOps(py::module *m) {
   (void)py::class_<TFReaderOp, DatasetOp, std::shared_ptr<TFReaderOp>>(*m, "TFReaderOp")
diff --git a/mindspore/ccsrc/minddata/dataset/core/tensor.h b/mindspore/ccsrc/minddata/dataset/core/tensor.h
index b0b173e9c37..8707cbd7c06 100644
--- a/mindspore/ccsrc/minddata/dataset/core/tensor.h
+++ b/mindspore/ccsrc/minddata/dataset/core/tensor.h
@@ -312,6 +312,11 @@ class Tensor {
   // @return const unsigned char*
   const unsigned char *GetBuffer() const;

+  // Skips the offsets and returns the start of the buffer where the real strings are stored. The caller needs to check if the
+ // @return address of the first string of the tensor. + uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; } + // Getter of the type // @return DataType type() const { return type_; } @@ -643,11 +648,6 @@ class Tensor { // @return length of the string Status GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const; - // Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if the - // tensor's type is a string, otherwise undefined address would be returned. - // @return address of the first string of the tensor. - uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; } - // all access to shape_ should be via shape TensorShape shape_; // data type of tensor diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc index cf1493eb78e..0886f751424 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/source/mindrecord_op.cc @@ -215,7 +215,7 @@ void MindRecordOp::Print(std::ostream &out, bool show_all) const { // Call the super class for displaying any common detailed info ParallelOp::Print(out, show_all); // Then show any custom derived-internal stuff - out << "\n Dataset file : "; + out << "\nDataset file : "; for (auto &file : dataset_file_) { out << file << " "; } diff --git a/mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h b/mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h index bd1cda8a99c..6c3e4e9c6c6 100644 --- a/mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h +++ b/mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h @@ -137,6 +137,10 @@ const std::set kScalarFieldTypeSet = {"string", "int32", "int64", " // number field list const std::set kNumberFieldTypeSet = {"int32", "int64", "float32", "float64"}; +const std::unordered_map kTypesMap = { + {"bool", "int32"}, {"int8", "int32"}, {"uint8", "bytes"}, {"int16", "int32"}, + {"uint16", "int32"}, {"int32", "int32"}, {"uint32", "int64"}, {"int64", "int64"}, + {"float16", "float32"}, {"float32", "float32"}, {"float64", "float64"}, {"string", "string"}}; /// \brief split a string using a character /// \param[in] field target string /// \param[in] separator a character for spliting diff --git a/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h b/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h index 67169e8696a..008f37941da 100644 --- a/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h +++ b/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h @@ -124,6 +124,10 @@ class ShardHeader { MSRStatus FileToPages(const std::string dump_file_name); + static MSRStatus initialize(const std::shared_ptr *header_ptr, const json &schema, + const std::vector &index_fields, std::vector &blob_fields, + uint64_t &schema_id); + private: MSRStatus InitializeHeader(const std::vector &headers, bool load_dataset); diff --git a/mindspore/ccsrc/minddata/mindrecord/include/shard_index_generator.h b/mindspore/ccsrc/minddata/mindrecord/include/shard_index_generator.h index fb85d9adbcb..c05b8876e97 100644 --- a/mindspore/ccsrc/minddata/mindrecord/include/shard_index_generator.h +++ b/mindspore/ccsrc/minddata/mindrecord/include/shard_index_generator.h @@ -57,6 +57,8 @@ class ShardIndexGenerator { /// \brief create 
databases for indexes MSRStatus WriteToDatabase(); + static MSRStatus finalize(const std::vector file_names); + private: static int Callback(void *not_used, int argc, char **argv, char **az_col_name); diff --git a/mindspore/ccsrc/minddata/mindrecord/include/shard_writer.h b/mindspore/ccsrc/minddata/mindrecord/include/shard_writer.h index 833928773e6..67d4e471f8f 100644 --- a/mindspore/ccsrc/minddata/mindrecord/include/shard_writer.h +++ b/mindspore/ccsrc/minddata/mindrecord/include/shard_writer.h @@ -108,6 +108,13 @@ class ShardWriter { std::map> &blob_data, bool sign = true, bool parallel_writer = false); + MSRStatus MergeBlobData(const std::vector &blob_fields, + const std::map>> &row_bin_data, + std::shared_ptr> *output); + + static MSRStatus initialize(const std::unique_ptr *writer_ptr, + const std::vector &file_names); + private: /// \brief write shard header data to disk MSRStatus WriteShardHeader(); diff --git a/mindspore/ccsrc/minddata/mindrecord/io/shard_index_generator.cc b/mindspore/ccsrc/minddata/mindrecord/io/shard_index_generator.cc index f9b18a3bf02..5b102c3968f 100644 --- a/mindspore/ccsrc/minddata/mindrecord/io/shard_index_generator.cc +++ b/mindspore/ccsrc/minddata/mindrecord/io/shard_index_generator.cc @@ -622,5 +622,21 @@ void ShardIndexGenerator::DatabaseWriter() { shard_no = task_++; } } +MSRStatus ShardIndexGenerator::finalize(const std::vector file_names) { + if (file_names.empty()) { + MS_LOG(ERROR) << "Mindrecord files is empty."; + return FAILED; + } + ShardIndexGenerator sg{file_names[0]}; + if (SUCCESS != sg.Build()) { + MS_LOG(ERROR) << "Failed to build index generator."; + return FAILED; + } + if (SUCCESS != sg.WriteToDatabase()) { + MS_LOG(ERROR) << "Failed to write to database."; + return FAILED; + } + return SUCCESS; +} } // namespace mindrecord } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc b/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc index e85229cc34e..2f2aebf7f01 100644 --- a/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc +++ b/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc @@ -637,6 +637,42 @@ MSRStatus ShardWriter::WriteRawDataPreCheck(std::map *row_count = std::get<2>(v); return SUCCESS; } +MSRStatus ShardWriter::MergeBlobData(const std::vector &blob_fields, + const std::map>> &row_bin_data, + std::shared_ptr> *output) { + if (blob_fields.empty()) { + return SUCCESS; + } + if (blob_fields.size() == 1) { + auto &blob = row_bin_data.at(blob_fields[0]); + auto blob_size = blob->size(); + *output = std::make_shared>(blob_size); + std::copy(blob->begin(), blob->end(), (*output)->begin()); + } else { + size_t output_size = 0; + for (auto &field : blob_fields) { + output_size += row_bin_data.at(field)->size(); + } + output_size += blob_fields.size() * sizeof(uint64_t); + *output = std::make_shared>(output_size); + std::vector buf(sizeof(uint64_t), 0); + size_t idx = 0; + for (auto &field : blob_fields) { + auto &blob = row_bin_data.at(field); + uint64_t blob_size = blob->size(); + // big edian + for (size_t i = 0; i < buf.size(); ++i) { + buf[buf.size() - 1 - i] = std::numeric_limits::max() & blob_size; + blob_size >>= 8u; + } + std::copy(buf.begin(), buf.end(), (*output)->begin() + idx); + idx += buf.size(); + std::copy(blob->begin(), blob->end(), (*output)->begin() + idx); + idx += blob->size(); + } + } + return SUCCESS; +} MSRStatus ShardWriter::WriteRawData(std::map> &raw_data, std::vector> &blob_data, bool sign, bool parallel_writer) { @@ -1250,5 +1286,21 @@ void 
ShardWriter::SetLastBlobPage(const int &shard_id, std::shared_ptr &la last_blob_page = page.first; } } + +MSRStatus ShardWriter::initialize(const std::unique_ptr *writer_ptr, + const std::vector &file_names) { + if (nullptr == writer_ptr) { + MS_LOG(ERROR) << "ShardWriter pointer is NULL."; + return FAILED; + } + auto res = (*writer_ptr)->Open(file_names, false); + if (SUCCESS != res) { + MS_LOG(ERROR) << "Failed to open mindrecord files to writer."; + return FAILED; + } + (*writer_ptr)->SetHeaderSize(1 << 24); + (*writer_ptr)->SetPageSize(1 << 25); + return SUCCESS; +} } // namespace mindrecord } // namespace mindspore diff --git a/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc b/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc index 500037399b0..843b412a31c 100644 --- a/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc +++ b/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc @@ -721,5 +721,35 @@ MSRStatus ShardHeader::FileToPages(const std::string dump_file_name) { page_in_handle.close(); return SUCCESS; } + +MSRStatus ShardHeader::initialize(const std::shared_ptr *header_ptr, const json &schema, + const std::vector &index_fields, std::vector &blob_fields, + uint64_t &schema_id) { + if (nullptr == header_ptr) { + MS_LOG(ERROR) << "ShardHeader pointer is NULL."; + return FAILED; + } + auto schema_ptr = Schema::Build("mindrecord", schema); + if (nullptr == schema_ptr) { + MS_LOG(ERROR) << "Got unexpected error when building mindrecord schema."; + return FAILED; + } + schema_id = (*header_ptr)->AddSchema(schema_ptr); + // create index + std::vector> id_index_fields; + if (!index_fields.empty()) { + for (auto &el : index_fields) { + id_index_fields.emplace_back(schema_id, el); + } + if (SUCCESS != (*header_ptr)->AddIndexFields(id_index_fields)) { + MS_LOG(ERROR) << "Got unexpected error when adding mindrecord index."; + return FAILED; + } + } + + auto build_schema_ptr = (*header_ptr)->GetSchemas()[0]; + blob_fields = build_schema_ptr->GetBlobFields(); + return SUCCESS; +} } // namespace mindrecord } // namespace mindspore diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index 846e7e0a562..f3136cefae6 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -38,13 +38,13 @@ from mindspore._c_expression import typing from mindspore import log as logger from . 
import samplers
-from .iterators import DictIterator, TupleIterator, DummyIterator
+from .iterators import DictIterator, TupleIterator, DummyIterator, SaveOp
 from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
     check_rename, check_numpyslicesdataset, \
     check_take, check_project, check_imagefolderdatasetv2, check_mnist_cifar_dataset, check_manifestdataset, \
     check_tfrecorddataset, check_vocdataset, check_cocodataset, check_celebadataset, check_minddataset, \
     check_generatordataset, check_sync_wait, check_zip_dataset, check_add_column, check_textfiledataset, check_concat, \
-    check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_positive_int32
+    check_random_dataset, check_split, check_bucket_batch_by_length, check_cluedataset, check_positive_int32, check_save
 from ..core.datatypes import mstype_to_detype, mstypelist_to_detypelist

 try:
@@ -1044,6 +1044,34 @@ class Dataset:

         return TransferDataset(self, queue_name, device_id, device_type, num_batch)

+    @check_save
+    def save(self, file_name, num_files=1, file_type='mindrecord'):
+        """
+        Save the dynamic data processed by the dataset pipeline in a common dataset format.
+        Only the 'mindrecord' format is currently supported.
+
+        Note:
+            1. To save the samples in order, set the dataset's shuffle option to False and num_files to 1.
+            2. Before calling this function, do not use the batch or repeat operator, or data augmentation
+               operators with a random attribute in the map operator.
+            3. MindRecord does not support np.uint64, multi-dimensional np.uint8 (the dimensions are
+               dropped) or multi-dimensional strings.
+
+        Args:
+            file_name (str): Path to dataset file.
+            num_files (int, optional): Number of dataset files (default=1).
+            file_type (str, optional): Dataset format (default='mindrecord').
+
+        """
+
+        if num_files == 1:
+            file_names = [file_name]
+        else:
+            suffix = len(str(num_files - 1))
+            file_names = ["{}{}".format(file_name, str(x).rjust(suffix, '0'))
+                          for x in range(num_files)]
+
+        return SaveOp(self).save(file_names, file_type)
+
     def create_tuple_iterator(self, columns=None):
         """
         Create an Iterator over the dataset. The data retrieved will be a list of ndarray of data.
diff --git a/mindspore/dataset/engine/iterators.py b/mindspore/dataset/engine/iterators.py
index a2a23cbb44c..45da9718403 100644
--- a/mindspore/dataset/engine/iterators.py
+++ b/mindspore/dataset/engine/iterators.py
@@ -173,6 +173,7 @@ class Iterator:

     # Convert python node into C node and add to C layer execution tree in postorder traversal.
     def __convert_node_postorder(self, node):
+        self.check_node_type(node)
         op_type = self.__get_dataset_type(node)
         c_nodes = self.depipeline.AddNodeToTree(op_type, node.get_args())
@@ -224,6 +225,10 @@ class Iterator:
             self._index += 1
         return data

+    @abstractmethod
+    def check_node_type(self, node):
+        pass
+
     def get_output_shapes(self):
         return [t for t in self.depipeline.GetOutputShapes()]
@@ -245,11 +250,27 @@ class Iterator:

     def __deepcopy__(self, memo):
         return self

+
+class SaveOp(Iterator):
+    """
+    The derived class of Iterator for saving dataset pipeline output to files.
+    """
+    def get_next(self):
+        pass
+
+    def check_node_type(self, node):
+        if isinstance(node, (de.ShuffleDataset, de.RepeatDataset, de.BatchDataset)):
+            logger.warning("Used a shuffle, repeat or batch operator before the save operator.")
+
+    def save(self, file_names, file_type):
+        return self.depipeline.SaveDataset(file_names, file_type)
+

 class DictIterator(Iterator):
     """
     The derived class of Iterator with dict type.
""" + def check_node_type(self, node): + pass def __iter__(self): return self @@ -269,6 +290,8 @@ class TupleIterator(Iterator): """ The derived class of Iterator with list type. """ + def check_node_type(self, node): + pass def __init__(self, dataset, columns=None): if columns is not None: diff --git a/mindspore/dataset/engine/validators.py b/mindspore/dataset/engine/validators.py index 29904f1a9ef..c61630a0354 100644 --- a/mindspore/dataset/engine/validators.py +++ b/mindspore/dataset/engine/validators.py @@ -246,7 +246,24 @@ def check_celebadataset(method): return new_method +def check_save(method): + """A wrapper that wrap a parameter checker to the save op.""" + @wraps(method) + def new_method(self, *args, **kwargs): + _, param_dict = parse_user_args(method, *args, **kwargs) + + nreq_param_int = ['num_files'] + nreq_param_str = ['file_name', 'file_type'] + validate_dataset_param_value(nreq_param_int, param_dict, int) + if(param_dict.get('num_files') <= 0 or param_dict.get('num_files') > 1000): + raise ValueError("num_files should between {} and {}.".format(1, 1000)) + validate_dataset_param_value(nreq_param_str, param_dict, str) + if param_dict.get('file_type') != 'mindrecord': + raise ValueError("{} dataset format is not supported.".format(param_dict.get('file_type'))) + return method(self, *args, **kwargs) + + return new_method def check_minddataset(method): """A wrapper that wraps a parameter checker to the original Dataset(MindDataset).""" diff --git a/tests/ut/python/dataset/test_save_op.py b/tests/ut/python/dataset/test_save_op.py new file mode 100644 index 00000000000..2ed326276b3 --- /dev/null +++ b/tests/ut/python/dataset/test_save_op.py @@ -0,0 +1,390 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +This is the test module for saveOp. 
+""" +import os +import mindspore.dataset as ds +from mindspore import log as logger +from mindspore.mindrecord import FileWriter +import numpy as np +import pytest + +CV_FILE_NAME1 = "../data/mindrecord/testMindDataSet/temp.mindrecord" +CV_FILE_NAME2 = "../data/mindrecord/testMindDataSet/auto.mindrecord" + +FILES_NUM = 1 +num_readers = 1 + + +@pytest.fixture(name="add_and_remove_cv_file") +def fixture_remove(): + """add/remove cv file""" + if os.path.exists("{}".format(CV_FILE_NAME1)): + os.remove("{}".format(CV_FILE_NAME1)) + if os.path.exists("{}.db".format(CV_FILE_NAME1)): + os.remove("{}.db".format(CV_FILE_NAME1)) + + if os.path.exists("{}".format(CV_FILE_NAME2)): + os.remove("{}".format(CV_FILE_NAME2)) + if os.path.exists("{}.db".format(CV_FILE_NAME2)): + os.remove("{}.db".format(CV_FILE_NAME2)) + yield "yield_cv_data" + if os.path.exists("{}".format(CV_FILE_NAME1)): + os.remove("{}".format(CV_FILE_NAME1)) + if os.path.exists("{}.db".format(CV_FILE_NAME1)): + os.remove("{}.db".format(CV_FILE_NAME1)) + + if os.path.exists("{}".format(CV_FILE_NAME2)): + os.remove("{}".format(CV_FILE_NAME2)) + if os.path.exists("{}.db".format(CV_FILE_NAME2)): + os.remove("{}.db".format(CV_FILE_NAME2)) + + +def test_case_00(add_and_remove_cv_file): # only bin data + data = [{"image1": bytes("image1 bytes abc", encoding='UTF-8'), + "image2": bytes("image1 bytes def", encoding='UTF-8'), + "image3": bytes("image1 bytes ghi", encoding='UTF-8'), + "image4": bytes("image1 bytes jkl", encoding='UTF-8'), + "image5": bytes("image1 bytes mno", encoding='UTF-8')}, + {"image1": bytes("image2 bytes abc", encoding='UTF-8'), + "image2": bytes("image2 bytes def", encoding='UTF-8'), + "image3": bytes("image2 bytes ghi", encoding='UTF-8'), + "image4": bytes("image2 bytes jkl", encoding='UTF-8'), + "image5": bytes("image2 bytes mno", encoding='UTF-8')}, + {"image1": bytes("image3 bytes abc", encoding='UTF-8'), + "image2": bytes("image3 bytes def", encoding='UTF-8'), + "image3": bytes("image3 bytes ghi", encoding='UTF-8'), + "image4": bytes("image3 bytes jkl", encoding='UTF-8'), + "image5": bytes("image3 bytes mno", encoding='UTF-8')}, + {"image1": bytes("image5 bytes abc", encoding='UTF-8'), + "image2": bytes("image5 bytes def", encoding='UTF-8'), + "image3": bytes("image5 bytes ghi", encoding='UTF-8'), + "image4": bytes("image5 bytes jkl", encoding='UTF-8'), + "image5": bytes("image5 bytes mno", encoding='UTF-8')}, + {"image1": bytes("image6 bytes abc", encoding='UTF-8'), + "image2": bytes("image6 bytes def", encoding='UTF-8'), + "image3": bytes("image6 bytes ghi", encoding='UTF-8'), + "image4": bytes("image6 bytes jkl", encoding='UTF-8'), + "image5": bytes("image6 bytes mno", encoding='UTF-8')}] + schema = { + "image1": {"type": "bytes"}, + "image2": {"type": "bytes"}, + "image3": {"type": "bytes"}, + "image4": {"type": "bytes"}, + "image5": {"type": "bytes"}} + writer = FileWriter(CV_FILE_NAME1, FILES_NUM) + writer.add_schema(schema, "schema") + writer.write_raw_data(data) + writer.commit() + + d1 = ds.MindDataset(CV_FILE_NAME1, None, num_readers, shuffle=False) + d1.save(CV_FILE_NAME2, FILES_NUM) + data_value_to_list = [] + + for item in data: + new_data = {} + new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) + new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) + new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8) + new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8) + new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8) + 
data_value_to_list.append(new_data) + + d2 = ds.MindDataset(dataset_file=CV_FILE_NAME2, + num_parallel_workers=num_readers, + shuffle=False) + assert d2.get_dataset_size() == 5 + num_iter = 0 + for item in d2.create_dict_iterator(): + assert len(item) == 5 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 5 + + +def test_case_01(add_and_remove_cv_file): # only raw data + data = [{"file_name": "001.jpg", "label": 43}, + {"file_name": "002.jpg", "label": 91}, + {"file_name": "003.jpg", "label": 61}, + {"file_name": "004.jpg", "label": 29}, + {"file_name": "005.jpg", "label": 78}, + {"file_name": "006.jpg", "label": 37}] + schema = {"file_name": {"type": "string"}, + "label": {"type": "int32"} + } + + writer = FileWriter(CV_FILE_NAME1, FILES_NUM) + writer.add_schema(schema, "schema") + writer.write_raw_data(data) + writer.commit() + + d1 = ds.MindDataset(CV_FILE_NAME1, None, num_readers, shuffle=False) + d1.save(CV_FILE_NAME2, FILES_NUM) + + data_value_to_list = [] + for item in data: + new_data = {} + new_data['file_name'] = np.asarray(item["file_name"], dtype='S') + new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) + data_value_to_list.append(new_data) + + d2 = ds.MindDataset(dataset_file=CV_FILE_NAME2, + num_parallel_workers=num_readers, + shuffle=False) + assert d2.get_dataset_size() == 6 + num_iter = 0 + for item in d2.create_dict_iterator(): + logger.info(item) + assert len(item) == 2 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + +def test_case_02(add_and_remove_cv_file): # muti-bytes + data = [{"file_name": "001.jpg", "label": 43, + "float32_array": np.array([1.2, 2.78, 3.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 50.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12345, + "float64": 1987654321.123456785, + "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int32), + "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "image1": bytes("image1 bytes abc", encoding='UTF-8'), + "image2": bytes("image1 bytes def", encoding='UTF-8'), + "image3": bytes("image1 bytes ghi", encoding='UTF-8'), + "image4": bytes("image1 bytes jkl", encoding='UTF-8'), + "image5": bytes("image1 bytes mno", encoding='UTF-8')}, + {"file_name": "002.jpg", "label": 91, + "float32_array": np.array([1.2, 2.78, 4.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 60.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12445, + "float64": 1987654321.123456786, + "source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int32), + "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "image1": bytes("image2 bytes abc", encoding='UTF-8'), + "image2": bytes("image2 bytes def", encoding='UTF-8'), + "image3": bytes("image2 bytes ghi", encoding='UTF-8'), + "image4": bytes("image2 bytes jkl", encoding='UTF-8'), + "image5": bytes("image2 bytes mno", encoding='UTF-8')}, + {"file_name": "003.jpg", "label": 61, + "float32_array": np.array([1.2, 2.78, 5.1234, 4.9871, 5.12341], dtype=np.float32), 
+ "float64_array": np.array([48.1234556789, 49.3251241431, 70.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12545, + "float64": 1987654321.123456787, + "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int32), + "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "image1": bytes("image3 bytes abc", encoding='UTF-8'), + "image2": bytes("image3 bytes def", encoding='UTF-8'), + "image3": bytes("image3 bytes ghi", encoding='UTF-8'), + "image4": bytes("image3 bytes jkl", encoding='UTF-8'), + "image5": bytes("image3 bytes mno", encoding='UTF-8')}, + {"file_name": "004.jpg", "label": 29, + "float32_array": np.array([1.2, 2.78, 6.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 80.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12645, + "float64": 1987654321.123456788, + "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int32), + "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "image1": bytes("image4 bytes abc", encoding='UTF-8'), + "image2": bytes("image4 bytes def", encoding='UTF-8'), + "image3": bytes("image4 bytes ghi", encoding='UTF-8'), + "image4": bytes("image4 bytes jkl", encoding='UTF-8'), + "image5": bytes("image4 bytes mno", encoding='UTF-8')}, + {"file_name": "005.jpg", "label": 78, + "float32_array": np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12745, + "float64": 1987654321.123456789, + "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int32), + "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "image1": bytes("image5 bytes abc", encoding='UTF-8'), + "image2": bytes("image5 bytes def", encoding='UTF-8'), + "image3": bytes("image5 bytes ghi", encoding='UTF-8'), + "image4": bytes("image5 bytes jkl", encoding='UTF-8'), + "image5": bytes("image5 bytes mno", encoding='UTF-8')}, + {"file_name": "006.jpg", "label": 37, + "float32_array": np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12745, + "float64": 1987654321.123456789, + "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int32), + "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "image1": bytes("image6 bytes abc", encoding='UTF-8'), + "image2": bytes("image6 bytes def", encoding='UTF-8'), + "image3": bytes("image6 bytes ghi", encoding='UTF-8'), + "image4": bytes("image6 bytes jkl", encoding='UTF-8'), + "image5": bytes("image6 bytes mno", encoding='UTF-8')} + ] + schema = {"file_name": {"type": "string"}, + "float32_array": {"type": "float32", "shape": [-1]}, + "float64_array": {"type": "float64", "shape": [-1]}, + "float32": {"type": "float32"}, + "float64": {"type": "float64"}, + "source_sos_ids": {"type": "int32", "shape": [-1]}, + "source_sos_mask": {"type": "int64", "shape": [-1]}, + "image1": {"type": "bytes"}, + "image2": {"type": "bytes"}, + "image3": {"type": "bytes"}, + "label": {"type": "int32"}, + "image4": {"type": "bytes"}, + "image5": {"type": "bytes"}} + writer = FileWriter(CV_FILE_NAME1, FILES_NUM) + writer.add_schema(schema, "schema") + writer.write_raw_data(data) + writer.commit() + + d1 = 
ds.MindDataset(CV_FILE_NAME1, None, num_readers, shuffle=False)
+    d1.save(CV_FILE_NAME2, FILES_NUM)
+    data_value_to_list = []
+
+    for item in data:
+        new_data = {}
+        new_data['file_name'] = np.asarray(item["file_name"], dtype='S')
+        new_data['float32_array'] = item["float32_array"]
+        new_data['float64_array'] = item["float64_array"]
+        new_data['float32'] = item["float32"]
+        new_data['float64'] = item["float64"]
+        new_data['source_sos_ids'] = item["source_sos_ids"]
+        new_data['source_sos_mask'] = item["source_sos_mask"]
+        new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32)
+        new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
+        new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
+        new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8)
+        new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8)
+        new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8)
+        data_value_to_list.append(new_data)
+
+    d2 = ds.MindDataset(dataset_file=CV_FILE_NAME2,
+                        num_parallel_workers=num_readers,
+                        shuffle=False)
+    assert d2.get_dataset_size() == 6
+    num_iter = 0
+    for item in d2.create_dict_iterator():
+        assert len(item) == 13
+        for field in item:
+            if isinstance(item[field], np.ndarray):
+                if item[field].dtype == np.float32:
+                    assert (item[field] ==
+                            np.array(data_value_to_list[num_iter][field], np.float32)).all()
+                else:
+                    assert (item[field] ==
+                            data_value_to_list[num_iter][field]).all()
+            else:
+                assert item[field] == data_value_to_list[num_iter][field]
+        num_iter += 1
+    assert num_iter == 6
+
+
+def generator_1d():
+    for i in range(10):
+        yield (np.array([i]),)
+
+
+def test_case_03(add_and_remove_cv_file):
+
+    # apply dataset operations
+    d1 = ds.GeneratorDataset(generator_1d, ["data"], shuffle=False)
+
+    d1.save(CV_FILE_NAME2)
+
+    d2 = ds.MindDataset(dataset_file=CV_FILE_NAME2,
+                        num_parallel_workers=num_readers,
+                        shuffle=False)
+
+    i = 0
+    for item in d2.create_dict_iterator():  # each data is a dictionary
+        golden = np.array([i])
+        assert np.array_equal(item["data"], golden)
+        i = i + 1
+
+
+def generator_with_type(t):
+    for i in range(64):
+        yield (np.array([i], dtype=t),)
+
+
+def type_tester(t):
+    logger.info("Test with Type {}".format(t.__name__))
+
+    # apply dataset operations
+    data1 = ds.GeneratorDataset((lambda: generator_with_type(t)), ["data"], shuffle=False)
+
+    data1 = data1.batch(4)
+
+    data1 = data1.repeat(3)
+
+    data1.save(CV_FILE_NAME2)
+
+    d2 = ds.MindDataset(dataset_file=CV_FILE_NAME2,
+                        num_parallel_workers=num_readers,
+                        shuffle=False)
+
+    i = 0
+    num_repeat = 0
+    for item in d2.create_dict_iterator():  # each data is a dictionary
+        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
+        logger.info(item)
+        assert np.array_equal(item["data"], golden)
+        i = i + 4
+        if i == 64:
+            i = 0
+            num_repeat += 1
+    assert num_repeat == 3
+    if os.path.exists("{}".format(CV_FILE_NAME2)):
+        os.remove("{}".format(CV_FILE_NAME2))
+    if os.path.exists("{}.db".format(CV_FILE_NAME2)):
+        os.remove("{}.db".format(CV_FILE_NAME2))
+
+
+def test_case_04():
+    # uint8 will drop shape as mindrecord stores uint8 as bytes
+    types = [np.int8, np.int16, np.int32, np.int64,
+             np.uint16, np.uint32, np.float32, np.float64]
+
+    for t in types:
+        type_tester(t)
+
+
+def test_case_05(add_and_remove_cv_file):
+
+    d1 = ds.GeneratorDataset(generator_1d, ["data"], shuffle=False)
+
+    with pytest.raises(Exception, match="num_files should between 1 and 1000."):
+        d1.save(CV_FILE_NAME2, 0)
+
+
+def 
test_case_06(add_and_remove_cv_file):
+
+    d1 = ds.GeneratorDataset(generator_1d, ["data"], shuffle=False)
+
+    with pytest.raises(Exception, match="tfrecord dataset format is not supported."):
+        d1.save(CV_FILE_NAME2, 1, "tfrecord")

From 162b3aefcf8d8c734e8956d2f16403a14ca7e52c Mon Sep 17 00:00:00 2001
From: jjfeing 
Date: Thu, 16 Jul 2020 15:43:15 +0800
Subject: [PATCH 29/68] add tbe build info with full name

---
 mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc
index 73642b291a4..7e920136572 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_build.cc
@@ -43,6 +43,7 @@ constexpr auto kJInputs = "inputs";
 constexpr auto kJOutputs = "outputs";
 constexpr auto kJAttrs = "attrs";
 constexpr auto kJKernelName = "kernel_name";
+constexpr auto kJFullName = "full_name";
 constexpr auto kJOpInfo = "op_info";
 constexpr auto kJDtype = "dtype";
 constexpr auto kJtype = "type";
@@ -125,6 +126,7 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr<AnfNode> &anf_node,
+  (*kernel_json)[kJFullName] = anf_node->fullname_with_scope();
   if (creater_type_ == SINGLE_BUILD) {
     TbeUtils::SaveJsonInfo(json_name_, json_info_);
   }

From 4c4c08c726e7ca7aeb338ddf178e47242945beac Mon Sep 17 00:00:00 2001
From: WilliamLian 
Date: Wed, 15 Jul 2020 15:10:49 +0800
Subject: [PATCH 30/68] refactor primitive GetObj function

---
 .../ccsrc/pipeline/jit/static_analysis/prim.cc    |  8 +-------
 .../jit/static_analysis/program_specialize.cc     | 12 +-----------
 .../ccsrc/pipeline/pynative/pynative_execute.cc   |  9 ++++-----
 mindspore/core/ir/primitive.h                     |  1 +
 mindspore/core/ir/primitive_py.cc                 | 15 +++++++++++++++
 mindspore/core/ir/primitive_py.h                  |  4 ++++
 6 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc b/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc
index 99e613395cc..9f3011d1187 100644
--- a/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc
+++ b/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc
@@ -523,14 +523,8 @@ EvalResultPtr PythonPrimEvaluator::EvalPrim(const AnalysisEnginePtr &, const Abs
     return iter->second;
   }
   auto py_args = PreparePyInputs(prim_py_, args);
-
-  auto pyobj = prim_py_->GetPyObj();
-  if (pyobj == nullptr) {
-    MS_LOG(EXCEPTION) << "[" << prim_py_->ToString() << "]: pyobj is empty";
-  }
-  auto infer_fuc = pyobj.attr("__infer__");
   prim_py_->BeginRecordAddAttr();
-  py::dict output = infer_fuc(*py_args);
+  py::dict output = prim_py_->RunInfer(py_args);
   prim_py_->EndRecordAddAttr();
   auto added_attrs = prim_py_->evaluate_added_attrs();
   MS_LOG(DEBUG) << "Output type is " << (std::string)py::str(output);
diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/program_specialize.cc b/mindspore/ccsrc/pipeline/jit/static_analysis/program_specialize.cc
index ad39190dc3c..fe5871fe5e4 100644
--- a/mindspore/ccsrc/pipeline/jit/static_analysis/program_specialize.cc
+++ b/mindspore/ccsrc/pipeline/jit/static_analysis/program_specialize.cc
@@ -654,17 +654,7 @@ static PrimitivePtr BuildPrimtiveValueWithAttributes(const PrimitivePtr &prim, c
     }
   }
   if (!is_attr_same) {
-    if (prim->isa()) {
-      PrimitivePyPtr prim_py = prim->cast();
-      auto clone_fn = prim_py->GetPyObj().attr("_clone");
-      py::object new_obj = clone_fn();
-      auto cloned_prim = new_obj.cast();
-      for (auto &item : *attrs) {
-        cloned_prim->AddAttr(item.first, item.second);
- } - return cloned_prim; - } - auto cloned_prim = std::make_shared(*prim); + auto cloned_prim = prim->Clone(); for (auto &item : *attrs) { cloned_prim->AddAttr(item.first, item.second); } diff --git a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc index db41b2a0a86..fd5c8f1965b 100644 --- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc +++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc @@ -280,8 +280,8 @@ void PynativeInfer(const PrimitivePyPtr &prim, const py::list &py_args, OpExecIn AbstractBasePtrList args_spec_list; for (size_t i = 0; i < size; i++) { ValuePtr input_value = PyAttrValue(py_args[i]); - args_spec_list.emplace_back(abstract::FromValueInside( - input_value, !py::hasattr(prim->GetPyObj(), "const_value") && input_value->isa())); + args_spec_list.emplace_back( + abstract::FromValueInside(input_value, !prim->ObjHasAttr("const_value") && input_value->isa())); } AbstractBasePtr infer_res = EvalOnePrim(prim, args_spec_list)->abstract(); op_exec_info->abstract = infer_res; @@ -296,8 +296,7 @@ OpExecInfoPtr GenerateOpExecInfo(const py::args &args, py::list *const out_args) MS_EXCEPTION_IF_NULL(op_exec_info); op_exec_info->op_name = py::cast(args[PY_NAME]); auto prim = py::cast(args[PY_PRIM]); - auto pyobj = prim->GetPyObj(); - if (pyobj == nullptr) { + if (!prim->HasPyObj()) { MS_LOG(EXCEPTION) << "pyobj is empty"; } @@ -708,7 +707,7 @@ py::tuple RunOpInner(const py::args &args) { value_ret[0] = output["value"]; return value_ret; } - if (py::hasattr(op_exec_info->py_primitive->GetPyObj(), "const_value")) { + if (op_exec_info->py_primitive->ObjHasAttr("const_value")) { py::tuple value_ret(1); value_ret[0] = ""; return value_ret; diff --git a/mindspore/core/ir/primitive.h b/mindspore/core/ir/primitive.h index a1784a85a39..c00af419503 100644 --- a/mindspore/core/ir/primitive.h +++ b/mindspore/core/ir/primitive.h @@ -100,6 +100,7 @@ class Primitive : public Named { return !(iter == attrs_.cend()); } void set_prim_type(const PrimType t) { prim_type_ = t; } + virtual PrimitivePtr Clone() { return std::make_shared(*this); } void set_instance_name(const std::string s) { instance_name_ = s; } bool HasPyEvaluator() const { return prim_type_ == kPrimTypePyInferShape || prim_type_ == kPrimTypeUserCustom; } bool HasPyInferTensor() const { return prim_type_ == kPrimTypePyInferTensor; } diff --git a/mindspore/core/ir/primitive_py.cc b/mindspore/core/ir/primitive_py.cc index 2a8f003623a..15a19b703a4 100644 --- a/mindspore/core/ir/primitive_py.cc +++ b/mindspore/core/ir/primitive_py.cc @@ -196,6 +196,21 @@ bool PrimitivePy::HasComputeFunction() const { return true; } +PrimitivePtr PrimitivePy::Clone() { + auto clone_fn = python_obj_.attr("_clone"); + py::object new_obj = clone_fn(); + auto cloned_prim = new_obj.cast(); + return cloned_prim; +} + +py::dict PrimitivePy::RunInfer(const py::tuple &args) { + if (!HasPyObj()) { + MS_LOG(EXCEPTION) << "[" << this->ToString() << "]: pyobj is empty"; + } + auto infer_fuc = python_obj_.attr("__infer__"); + return infer_fuc(*args); +} + REGISTER_PYBIND_DEFINE(Primitive_, ([](const py::module *m) { (void)py::enum_(*m, "prim_type", py::arithmetic()) .value("unknown", PrimType::kPrimTypeUnknown) diff --git a/mindspore/core/ir/primitive_py.h b/mindspore/core/ir/primitive_py.h index 8c576016fab..01af9c530f4 100644 --- a/mindspore/core/ir/primitive_py.h +++ b/mindspore/core/ir/primitive_py.h @@ -61,6 +61,10 @@ class PrimitivePy : public Primitive { bool HasComputeFunction() const; const 
bool parse_info_ = true;
   const py::object &GetPyObj() const { return python_obj_; }
+  py::dict RunInfer(const py::tuple &args);
+  bool ObjHasAttr(const char *attr_name) { return py::hasattr(python_obj_, attr_name); }
+  bool HasPyObj() { return python_obj_ != nullptr; }
+  PrimitivePtr Clone() override;
   bool is_tuple_input_ = false;

  private:

From f64352534b6b2d86dc726d1056a432247e04ca9d Mon Sep 17 00:00:00 2001
From: hanjun996 
Date: Thu, 16 Jul 2020 16:32:31 +0800
Subject: [PATCH 31/68] update readme

---
 model_zoo/wide_and_deep/README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/model_zoo/wide_and_deep/README.md b/model_zoo/wide_and_deep/README.md
index 000e6a53353..b91c39ab304 100644
--- a/model_zoo/wide_and_deep/README.md
+++ b/model_zoo/wide_and_deep/README.md
@@ -4,6 +4,17 @@ This is an implementation of WideDeep as described in the [Wide & Deep Learning
 
 WideDeep model jointly trained wide linear models and deep neural network, which combined the benefits of memorization and generalization for recommender systems.
 
+## Requirements
+
+- Install [MindSpore](https://www.mindspore.cn/install/en).
+
+- Download the dataset and convert it to MindRecord format with the following command:
+```
+python src/preprocess_data.py
+```
+Arguments:
+  * `--data_path`: Dataset storage path (Default: ./criteo_data/).
+
 ## Dataset
 
 The Criteo datasets are used for model training and evaluation.

From 27cde14b36460f3acd18b337a7028bf953a8f704 Mon Sep 17 00:00:00 2001
From: looop5 
Date: Thu, 16 Jul 2020 16:32:48 +0800
Subject: [PATCH 32/68] update submodule akg to the latest commit id

---
 akg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/akg b/akg
index df57a6cf945..f60af9df422 160000
--- a/akg
+++ b/akg
@@ -1 +1 @@
-Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
+Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625

From 177e18f3f46fe3fe5b14c149c23a16dd44a25dd0 Mon Sep 17 00:00:00 2001
From: simson <526422051@qq.com>
Date: Wed, 15 Jul 2020 16:45:48 +0800
Subject: [PATCH 33/68] modify the limit of loss scale

---
 mindspore/nn/optim/optimizer.py                     | 9 +++++----
 tests/ut/python/parallel/test_loss_and_optimizer.py | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py
index cdf1565f349..4364e8c5a26 100755
--- a/mindspore/nn/optim/optimizer.py
+++ b/mindspore/nn/optim/optimizer.py
@@ -79,9 +79,10 @@ class Optimizer(Cell):
             the order will be followed in optimizer.
             There are no other keys in the `dict` and the parameters which in the value of 'order_params' should be in
             one of group parameters.
-        weight_decay (float): A floating point value for the weight decay. It should be equal to or greater than 0.
+        weight_decay (float): A floating point value for the weight decay. It should be in the range
+            [0.0, 1.0]. If the type of `weight_decay` input is int, it will be converted to float. Default: 0.0.
-        loss_scale (float): A floating point value for the loss scale. It should be greater than 0. If the
+        loss_scale (float): A floating point value for the loss scale. It should be greater than or equal to 1.0. If
            the type of `loss_scale` input is int, it will be converted to float. Default: 1.0.
Raises:

@@ -103,12 +104,12 @@
         if isinstance(loss_scale, int):
             loss_scale = float(loss_scale)
         validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name)
-        validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, self.cls_name)
+        validator.check_number_range("loss_scale", loss_scale, 1.0, float("inf"), Rel.INC_LEFT, self.cls_name)
 
         if isinstance(weight_decay, int):
             weight_decay = float(weight_decay)
         validator.check_value_type("weight_decay", weight_decay, [float], self.cls_name)
-        validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name)
+        validator.check_number_range("weight_decay", weight_decay, 0.0, 1.0, Rel.INC_BOTH, self.cls_name)
 
         self.is_group = False
         self.is_group_lr = False
diff --git a/tests/ut/python/parallel/test_loss_and_optimizer.py b/tests/ut/python/parallel/test_loss_and_optimizer.py
index 91be7682abd..615f058dc33 100644
--- a/tests/ut/python/parallel/test_loss_and_optimizer.py
+++ b/tests/ut/python/parallel/test_loss_and_optimizer.py
@@ -98,7 +98,7 @@ def test_momentum_with_loss_scale():
 
     net = Net(strategy1, strategy2, weight)
 
-    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9, loss_scale=0.5)
+    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9, loss_scale=1.0)
 
     net_with_loss = NetWithLoss(net, strategy3)
 
@@ -169,7 +169,7 @@ def test_momentum_with_loss_scale_and_dynamic_lr():
 
     net = Net(strategy1, strategy2, weight)
     lr = Tensor(np.ones([6]), dtype=ms.float32)
-    optimizer = Momentum(net.trainable_params(), learning_rate=lr, momentum=0.9, loss_scale=0.5)
+    optimizer = Momentum(net.trainable_params(), learning_rate=lr, momentum=0.9, loss_scale=1.0)
 
     net_with_loss = NetWithLoss(net, strategy3)

From d45abc5f54737b9294a771dc10d4152676c512aa Mon Sep 17 00:00:00 2001
From: d00455729 
Date: Tue, 14 Jul 2020 20:47:38 +0800
Subject: [PATCH 34/68] Asynchronous save checkpoint

---
 mindspore/train/callback/_checkpoint.py | 18 ++++---
 mindspore/train/serialization.py        | 67 +++++++++++++++++--------
 2 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/mindspore/train/callback/_checkpoint.py b/mindspore/train/callback/_checkpoint.py
index e0048ad7130..a9389fd395e 100644
--- a/mindspore/train/callback/_checkpoint.py
+++ b/mindspore/train/callback/_checkpoint.py
@@ -15,7 +15,6 @@
 """Checkpoint related classes and functions."""
 
 import os
-import shutil
 import stat
 import time
 
@@ -86,6 +85,7 @@ class CheckpointConfig:
             Can't be used with keep_checkpoint_max at the same time.
         integrated_save (bool): Whether to intergrated save in automatic model parallel scene. Default: True.
             Integrated save function is only supported in automatic parallel scene, not supported in manual parallel.
+        async_save (bool): Whether to save the checkpoint file asynchronously. Default: False.
 
     Raises:
         ValueError: If the input_param is None or 0.
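
The new flag is consumed through `CheckpointConfig` and forwarded to `_exec_save_checkpoint` by `ModelCheckpoint`, as the hunks below show. A minimal usage sketch of the option (illustrative only; it assumes the usual `Model.train` callback flow, which is not part of this patch):

```
# Hypothetical usage of the async_save flag added by this patch.
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint

config = CheckpointConfig(save_checkpoint_steps=100,
                          keep_checkpoint_max=5,
                          async_save=True)  # write the checkpoint on a background thread
ckpt_cb = ModelCheckpoint(prefix="wide_and_deep", directory="./ckpt", config=config)
# model.train(epoch_size, train_dataset, callbacks=[ckpt_cb])
```
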
@@ -100,7 +100,8 @@
                  save_checkpoint_seconds=0,
                  keep_checkpoint_max=5,
                  keep_checkpoint_per_n_minutes=0,
-                 integrated_save=True):
+                 integrated_save=True,
+                 async_save=False):
 
         if not save_checkpoint_steps and not save_checkpoint_seconds and \
                 not keep_checkpoint_max and not keep_checkpoint_per_n_minutes:
@@ -129,6 +130,7 @@
             self._keep_checkpoint_max = 1
 
         self._integrated_save = check_bool(integrated_save)
+        self._async_save = check_bool(async_save)
 
     @property
     def save_checkpoint_steps(self):
@@ -155,6 +157,11 @@
         """Get the value of _integrated_save."""
         return self._integrated_save
 
+    @property
+    def async_save(self):
+        """Get the value of _async_save."""
+        return self._async_save
+
     def get_checkpoint_policy(self):
         """Get the policy of checkpoint."""
         checkpoint_policy = {'save_checkpoint_steps': self._save_checkpoint_steps,
@@ -282,8 +289,6 @@ class ModelCheckpoint(Callback):
             global _save_dir
             _save_dir = self._directory
             cur_file = os.path.join(self._directory, cur_ckpoint_file)
-            tmp_ckpt_file_name_for_cur_process = str(os.getpid()) + "-" + 'parameters.ckpt'
-            gen_file = os.path.join(_save_dir, tmp_ckpt_file_name_for_cur_process)
             self._last_time_for_keep = time.time()
             self._last_triggered_step = cb_params.cur_step_num
 
@@ -291,10 +296,9 @@
                 set_cur_net(cb_params.train_network)
                 cb_params.train_network.exec_checkpoint_graph()
 
-            _exec_save_checkpoint(cb_params.train_network, gen_file, self._config.integrated_save)
+            _exec_save_checkpoint(cb_params.train_network, cur_file, self._config.integrated_save,
+                                  self._config.async_save)
 
-            if os.path.exists(gen_file):
-                shutil.move(gen_file, cur_file)
             self._latest_ckpt_file_name = cur_file
 
     @property
diff --git a/mindspore/train/serialization.py b/mindspore/train/serialization.py
index 3812698419c..6a4fc36dbd1 100644
--- a/mindspore/train/serialization.py
+++ b/mindspore/train/serialization.py
@@ -15,6 +15,7 @@
 """Model and parameters serialization."""
 import os
 import stat
+from threading import Thread, Lock
 import numpy as np
 
 import mindspore.nn as nn
@@ -40,6 +41,7 @@ tensor_to_np_type = {"Int8": np.int8, "Uint8": np.uint8, "Int16": np.int16, "Uin
                      "Int32": np.int32, "Uint32": np.uint32, "Int64": np.int64, "Uint64": np.uint64,
                      "Float16": np.float16, "Float32": np.float32, "Float64": np.float64, "Bool": np.bool_}
 
+_ckpt_mutex = Lock()
 
 def _special_process_par(par, new_par):
     """
@@ -101,7 +103,29 @@ def _update_param(param, new_param):
         param.set_parameter_data(type(param.data)(new_param.data))
 
 
-def save_checkpoint(parameter_list, ckpt_file_name):
+def _exec_save(ckpt_file_name, data_list):
+    """Execute the process of saving checkpoint data to a file."""
+    checkpoint_list = Checkpoint()
+
+    try:
+        with _ckpt_mutex:
+            for name, value in data_list.items():
+                param_value = checkpoint_list.value.add()
+                param_value.tag = name
+                param_tensor = param_value.tensor
+                param_tensor.dims.extend(value[0])
+                param_tensor.tensor_type = value[1]
+                param_tensor.tensor_content = value[2].tostring()
+
+            with open(ckpt_file_name, "wb") as f:
+                f.write(checkpoint_list.SerializeToString())
+            os.chmod(ckpt_file_name, stat.S_IRUSR)
+
+    except BaseException as e:
+        logger.error("Failed to save the checkpoint file %s.", ckpt_file_name)
+        raise RuntimeError(e.__str__())
+
+def save_checkpoint(parameter_list, ckpt_file_name, async_save=False):
     """
     Saves checkpoint info to a specified file.
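
The rewritten `save_checkpoint` below separates snapshotting from serialization: parameter data is first copied into a plain `data_list` under `_ckpt_mutex`, and `_exec_save` then writes it either inline or on a worker thread. A language-level sketch of that pattern (illustrative only, not the MindSpore API):

```
# Snapshot-then-write pattern, as a standalone sketch.
from threading import Thread, Lock

_lock = Lock()

def _write(path, snapshot):
    with _lock:  # serialize concurrent writers, like _ckpt_mutex does
        with open(path, "wb") as f:
            f.write(snapshot)

def save(path, data, async_save=False):
    snapshot = bytes(data)  # copy first, so the caller may keep mutating `data`
    if async_save:
        Thread(target=_write, args=(path, snapshot)).start()
    else:
        _write(path, snapshot)

save("demo.bin", bytearray(b"\x00\x01"), async_save=True)
```
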
@@ -109,37 +133,37 @@
     Args:
         parameter_list (list): Parameters list, each element is a dict
             like {"name":xx, "type":xx, "shape":xx, "data":xx}.
         ckpt_file_name (str): Checkpoint file name.
+        async_save (bool): Whether to save the checkpoint file asynchronously. Default: False.
 
     Raises:
         RuntimeError: Failed to save the Checkpoint file.
     """
     logger.info("Execute save checkpoint process.")
-    checkpoint_list = Checkpoint()
 
-    try:
+    data_list = {}
+    with _ckpt_mutex:
         for param in parameter_list:
-            param_value = checkpoint_list.value.add()
-            param_value.tag = param["name"]
-            param_tensor = param_value.tensor
+            key = param["name"]
+            data_list[key] = []
             if isinstance(param["data"], Parameter):
                 param["data"].init_data()
-            param_data = param["data"].asnumpy().reshape(-1)
-            param_tensor.tensor_content = param_data.tostring()
-            param_tensor.tensor_type = str(param["data"].dtype)
-
+            dims = []
             if param['data'].shape == ():
-                param_tensor.dims.append(0)
+                dims.append(0)
             else:
                 for dim in param['data'].shape:
-                    param_tensor.dims.append(dim)
+                    dims.append(dim)
+            data_list[key].append(dims)
+            tensor_type = str(param["data"].dtype)
+            data_list[key].append(tensor_type)
+            data = param["data"].asnumpy().reshape(-1)
+            data_list[key].append(data)
 
-        with open(ckpt_file_name, "wb") as f:
-            f.write(checkpoint_list.SerializeToString())
-        os.chmod(ckpt_file_name, stat.S_IRUSR)
-
-    except BaseException as e:
-        logger.error("Failed to save the checkpoint file %s.", ckpt_file_name)
-        raise RuntimeError(e.__str__())
+    if async_save:
+        thr = Thread(target=_exec_save, args=(ckpt_file_name, data_list))
+        thr.start()
+    else:
+        _exec_save(ckpt_file_name, data_list)
 
     logger.info("Save checkpoint process finish.")
 
@@ -305,7 +329,7 @@ def _save_graph(network, file_name):
         os.chmod(file_name, stat.S_IRUSR)
 
 
-def _exec_save_checkpoint(train_network, ckpt_file_name, integrated_save=True):
+def _exec_save_checkpoint(train_network, ckpt_file_name, integrated_save=True, async_save=False):
     """
     Saves checkpoint for 'ms' backend.
 
     Args:
         train_network (Network): The train network for training.
         ckpt_file_name (str): The name of checkpoint file.
         integrated_save (bool): Whether to integrated save in automatic model parallel scene.
+        async_save (bool): Whether to save the checkpoint file asynchronously. Default: False.
""" param_dict = {} @@ -336,7 +361,7 @@ def _exec_save_checkpoint(train_network, ckpt_file_name, integrated_save=True): each_param["data"] = param_data param_list.append(each_param) - save_checkpoint(param_list, ckpt_file_name) + save_checkpoint(param_list, ckpt_file_name, async_save) def _get_merged_param_data(net, param_name, param_data): From 3d4d434facae3013278c045faf48f1868c98c7ca Mon Sep 17 00:00:00 2001 From: zhoufeng Date: Thu, 16 Jul 2020 16:34:52 +0800 Subject: [PATCH 35/68] fix assign-node-wipe bug Signed-off-by: zhoufeng --- .../backend/session/ascend_control_parser.cc | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/mindspore/ccsrc/backend/session/ascend_control_parser.cc b/mindspore/ccsrc/backend/session/ascend_control_parser.cc index 274b355679a..4c6c7ab9cf4 100644 --- a/mindspore/ccsrc/backend/session/ascend_control_parser.cc +++ b/mindspore/ccsrc/backend/session/ascend_control_parser.cc @@ -18,9 +18,12 @@ #include #include #include +#include #include "backend/session/anf_runtime_algorithm.h" #include "utils/union_find_set.h" #include "runtime/device/ascend/ascend_label_assign.h" +#include "utils/context/ms_context.h" +#include "debug/anf_ir_dump.h" static constexpr size_t kCNodePrim = 0; static constexpr size_t kCNodeCallArg = 1; @@ -248,10 +251,14 @@ void AscendControlParser::EraseParameter(NotNull root_graph, } MS_LOG(INFO) << "Erase " << assign_node->DebugString(5); EraseNodeFromExecOrder(assign_node, NOT_NULL(&exec_order)); - - auto source = AnfAlgo::VisitKernelWithReturnType(assign_node->input(kCNodeAssignSource), 0).first; - parameter_count.AddReadCount(source, -1); + auto source = assign_node->input(kCNodeAssignSource); + MS_EXCEPTION_IF_NULL(source); + auto visit_source = AnfAlgo::VisitKernelWithReturnType(source, 0).first; parameter_count.AddWriteCount(para, -1); + parameter_count.AddReadCount(para, -1); + if (visit_source->isa()) { + parameter_count.AddReadCount(visit_source, read - 1); + } for (auto &node : all_nodes) { for (size_t i = 0; i < node->size(); ++i) { if (node->input(i) == para) { @@ -260,8 +267,6 @@ void AscendControlParser::EraseParameter(NotNull root_graph, } } } - parameter_count.AddReadCount(source, 1); - parameter_count.AddReadCount(para, -1); } root_graph->set_execution_order(exec_order); } @@ -318,6 +323,17 @@ void AscendControlParser::ExecutorValidate(NotNull root_graph) { (void)RecurseGraph(root_graph, NOT_NULL(&memo)); EraseParameter(root_graph, memo); EraseLabel(root_graph); + + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + auto save_graphs_path = context_ptr->save_graphs_path(); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + if (context_ptr->save_graphs_flag()) { + std::string file_path = save_graphs_path + "/after_erase_label_and_parameter.ir"; + DumpIR(file_path, root_graph.get()); + } } std::vector>> AscendControlParser::ParseCallNode( From 140174182d190d2e6c2a12a1416d6495403b0713 Mon Sep 17 00:00:00 2001 From: VectorSL Date: Wed, 15 Jul 2020 16:28:45 +0800 Subject: [PATCH 36/68] gpu add fusion: replace momentum cast --- .../gpu/cuda_impl/momentum_impl.cu | 27 +++++--- .../gpu/cuda_impl/momentum_impl.cuh | 4 +- .../kernel_compiler/gpu/gpu_kernel_factory.h | 6 ++ .../gpu/nn/fused_batch_norm_gpu_kernel.cc | 32 +++++----- .../gpu/nn/fused_batch_norm_gpu_kernel.h | 12 ++-- .../gpu/nn/fused_batchnorm_grad_gpu_kernel.cc | 10 +-- .../gpu/nn/fused_batchnorm_grad_gpu_kernel.h | 10 +-- .../gpu/nn/momentum_gpu_kernel.cc | 63 +++++++++++-------- 
.../gpu/nn/momentum_gpu_kernel.h | 6 +- .../gpu/replace_momentum_cast_fusion.cc | 63 +++++++++++++++++++ .../gpu/replace_momentum_cast_fusion.h | 46 ++++++++++++++ .../ccsrc/backend/session/gpu_session.cc | 10 +++ 12 files changed, 218 insertions(+), 71 deletions(-) create mode 100644 mindspore/ccsrc/backend/optimizer/gpu/replace_momentum_cast_fusion.cc create mode 100644 mindspore/ccsrc/backend/optimizer/gpu/replace_momentum_cast_fusion.h diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cu index 5a1c9eb6871..03a4ccb6178 100755 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cu +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cu @@ -15,9 +15,9 @@ */ #include "momentum_impl.cuh" -template +template __global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate, - const T *gradient, const S *momentum) { + const G *gradient, const S *momentum) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) { accumulation[i] = momentum[0] * accumulation[i] + gradient[i]; variable[i] -= learning_rate[0] * accumulation[i]; @@ -34,19 +34,32 @@ __global__ void MomentumUpdateVariableKernel(const size_t size, half *variable, } return; } -template -void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient, +template <> +__global__ void MomentumUpdateVariableKernel(const size_t size, float *variable, float *accumulation, + const float *learning_rate, const half *gradient, + const float *momentum) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) { + accumulation[i] = momentum[0] * accumulation[i] + __half2float(gradient[i]); + variable[i] -= learning_rate[0] * accumulation[i]; + } + return; +} +template +void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient, const S *momentum, cudaStream_t cuda_stream) { MomentumUpdateVariableKernel<<>>(size, variable, accumulation, learning_rate, gradient, momentum); return; } -template void MomentumUpdateVariable(const size_t size, float *variable, float *accumulation, +template void MomentumUpdateVariable(const size_t size, float *variable, float *accumulation, const float *learning_rate, const float *gradient, const float *momentum, cudaStream_t cuda_stream); -template void MomentumUpdateVariable(const size_t size, half *variable, half *accumulation, +template void MomentumUpdateVariable(const size_t size, half *variable, half *accumulation, const half *learning_rate, const half *gradient, const half *momentum, cudaStream_t cuda_stream); -template void MomentumUpdateVariable(const size_t size, half *variable, half *accumulation, +template void MomentumUpdateVariable(const size_t size, half *variable, half *accumulation, + const float *learning_rate, const half *gradient, + const float *momentum, cudaStream_t cuda_stream); +template void MomentumUpdateVariable(const size_t size, float *variable, float *accumulation, const float *learning_rate, const half *gradient, const float *momentum, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh index 62708663ad3..e5a22e47913 100755 --- 
a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh @@ -18,8 +18,8 @@ #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ #include "runtime/device/gpu/cuda_common.h" -template -void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient, +template +void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient, const S *momentum, cudaStream_t cuda_stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h index 8834fa0f1a6..7af82442030 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h @@ -88,6 +88,12 @@ class GpuKernelRegister { static_assert(std::is_base_of>::value, " must be base of GpuKernel"); \ static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR, \ []() { return new OPCLASS(); }); + +// register of mixed accuracy kernels which use template and maintain three typename +#define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \ + static_assert(std::is_base_of>::value, " must be base of GpuKernel"); \ + static const GpuKernelRegister g_##OPNAME##_##T##_##S##_##G##_gpu_kernel_reg( \ + #OPNAME, ATTR, []() { return new OPCLASS(); }); } // namespace kernel } // namespace mindspore #endif // MINDSPORE_CCSRC_KERNEL_GPU_GPUKERNELFACTORY_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.cc index 2ce39b63a02..ddd9c2f8d05 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.cc @@ -34,15 +34,15 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNorm, MS_REG_GPU_KERNEL_ONE(FusedBatchNorm, KernelAttr() .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16), + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), FusedBatchNormGpuKernel, half) MS_REG_GPU_KERNEL_ONE(BatchNorm, KernelAttr() @@ -60,15 +60,15 @@ MS_REG_GPU_KERNEL_ONE(BatchNorm, MS_REG_GPU_KERNEL_ONE(BatchNorm, KernelAttr() .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16), + .AddOutputAttr(kNumberTypeFloat32) + 
.AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), FusedBatchNormGpuKernel, half) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h index 774428dc409..fcf21b02142 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batch_norm_gpu_kernel.h @@ -56,17 +56,17 @@ class FusedBatchNormGpuKernel : public GpuKernel { return true; } auto x = GetDeviceAddress(inputs, 0); - auto scale = GetDeviceAddress(inputs, 1); - auto bias = GetDeviceAddress(inputs, 2); - auto runing_mean = GetDeviceAddress(inputs, 3); - auto runnig_variance = GetDeviceAddress(inputs, 4); + auto scale = GetDeviceAddress(inputs, 1); + auto bias = GetDeviceAddress(inputs, 2); + auto runing_mean = GetDeviceAddress(inputs, 3); + auto runnig_variance = GetDeviceAddress(inputs, 4); auto y = GetDeviceAddress(outputs, 0); const float alpha = 1; const float beta = 0; if (is_train_) { - auto save_mean = GetDeviceAddress(outputs, 3); - auto save_variance = GetDeviceAddress(outputs, 4); + auto save_mean = GetDeviceAddress(outputs, 3); + auto save_variance = GetDeviceAddress(outputs, 4); CHECK_CUDNN_RET_WITH_EXCEPT( cudnnBatchNormalizationForwardTraining(handle_, mode_, &alpha, &beta, x_desc_, x, y_desc_, y, scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean, diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.cc index 546e034f6bf..7cd993d0f0d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.cc @@ -33,12 +33,12 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNormGrad, KernelAttr() .AddInputAttr(kNumberTypeFloat16) .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16), + .AddOutputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), FusedBatchNormGradGpuKernel, half) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h index a2d0d741b13..e2da67b85ef 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/fused_batchnorm_grad_gpu_kernel.h @@ -55,12 +55,12 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { } auto dy = GetDeviceAddress(inputs, 0); auto x = GetDeviceAddress(inputs, 1); - auto scale = GetDeviceAddress(inputs, 2); - auto save_mean = GetDeviceAddress(inputs, 3); - auto save_variance = GetDeviceAddress(inputs, 4); + auto scale = GetDeviceAddress(inputs, 2); + auto save_mean = GetDeviceAddress(inputs, 3); + auto save_variance = GetDeviceAddress(inputs, 4); auto dx = GetDeviceAddress(outputs, 0); - auto bn_scale = GetDeviceAddress(outputs, 1); - auto bn_bias = GetDeviceAddress(outputs, 2); + 
auto bn_scale = GetDeviceAddress(outputs, 1); + auto bn_bias = GetDeviceAddress(outputs, 2); const float alpha_data_diff = 1; const float beta_data_diff = 0; diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.cc index 99ae2affe8c..96411e9bbf9 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.cc @@ -18,32 +18,41 @@ namespace mindspore { namespace kernel { -MS_REG_GPU_KERNEL_TWO(ApplyMomentum, - KernelAttr() - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat32), - MomentumGpuKernel, float, float) -MS_REG_GPU_KERNEL_TWO(ApplyMomentum, - KernelAttr() - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddOutputAttr(kNumberTypeFloat16), - MomentumGpuKernel, half, half) -MS_REG_GPU_KERNEL_TWO(ApplyMomentum, - KernelAttr() - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat32) - .AddInputAttr(kNumberTypeFloat16) - .AddInputAttr(kNumberTypeFloat32) - .AddOutputAttr(kNumberTypeFloat16), - MomentumGpuKernel, half, float) +MS_REG_GPU_KERNEL_THREE(ApplyMomentum, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + MomentumGpuKernel, float, float, float) +MS_REG_GPU_KERNEL_THREE(ApplyMomentum, + KernelAttr() + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddOutputAttr(kNumberTypeFloat16), + MomentumGpuKernel, half, half, half) +MS_REG_GPU_KERNEL_THREE(ApplyMomentum, + KernelAttr() + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat16), + MomentumGpuKernel, half, float, half) +MS_REG_GPU_KERNEL_THREE(ApplyMomentum, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat16) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeFloat32), + MomentumGpuKernel, float, float, half) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.h index 32d3fbb079b..02b897f679e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.h @@ -23,7 +23,7 @@ #include "backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh" namespace mindspore { namespace kernel { -template +template class MomentumGpuKernel : public GpuKernel { public: MomentumGpuKernel() @@ -38,7 +38,7 @@ class MomentumGpuKernel : public GpuKernel { T *variable = GetDeviceAddress(inputs, 0); T *accumulation = GetDeviceAddress(inputs, 1); S *learning_rate = GetDeviceAddress(inputs, 2); - T *gradient 
= GetDeviceAddress(inputs, 3); + G *gradient = GetDeviceAddress(inputs, 3); S *momentum = GetDeviceAddress(inputs, 4); MomentumUpdateVariable(inputs[0]->size / sizeof(T), variable, accumulation, learning_rate, gradient, momentum, reinterpret_cast(stream_ptr)); @@ -54,7 +54,7 @@ class MomentumGpuKernel : public GpuKernel { variable_size_ = sizeof(T); accumulation_size_ = sizeof(T); learning_rate_size_ = sizeof(S); - gradient_size_ = sizeof(T); + gradient_size_ = sizeof(G); momentum_size_ = sizeof(S); auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_momentum_cast_fusion.cc b/mindspore/ccsrc/backend/optimizer/gpu/replace_momentum_cast_fusion.cc new file mode 100644 index 00000000000..864bb026af3 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_momentum_cast_fusion.cc @@ -0,0 +1,63 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h" + +#include +#include +#include + +#include "backend/session/anf_runtime_algorithm.h" +#include "ir/primitive.h" +#include "utils/utils.h" +#include "backend/optimizer/common/helper.h" + +namespace mindspore { +namespace opt { +const BaseRef ReplaceMomentumCastFusion::DefinePattern() const { + VectorRef grad_cast = VectorRef({prim::kPrimCast, grad_}); + VectorRef momentum = VectorRef({prim::kPrimApplyMomentum, var_, acc_, lr_, grad_cast, mom_}); + return momentum; +} + +const AnfNodePtr ReplaceMomentumCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, + const EquivPtr &equiv) const { + MS_EXCEPTION_IF_NULL(graph); + MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_NULL(equiv); + + auto grad_cast = AnfAlgo::GetInputNode(utils::cast(node), 3); + auto grad = AnfAlgo::GetInputNode(utils::cast(grad_cast), 0); + MS_EXCEPTION_IF_NULL(grad_cast); + MS_EXCEPTION_IF_NULL(grad); + + auto manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + manager->Replace(utils::cast(grad_cast), utils::cast(grad)); + std::vector outputs_type; + std::vector> outputs_shape; + auto output_num = AnfAlgo::GetOutputTensorNum(node); + for (size_t i = 0; i < output_num; i++) { + outputs_type.push_back(AnfAlgo::GetOutputInferDataType(node, i)); + outputs_shape.push_back(AnfAlgo::GetOutputInferShape(node, i)); + } + outputs_type[3] = AnfAlgo::GetPrevNodeOutputInferDataType(grad_cast, 0); + + AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, node.get()); + + return node; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_momentum_cast_fusion.h b/mindspore/ccsrc/backend/optimizer/gpu/replace_momentum_cast_fusion.h new file mode 100644 index 00000000000..f67033dcbee --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_momentum_cast_fusion.h @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_ + +#include +#include "backend/optimizer/common/optimizer.h" + +namespace mindspore { +namespace opt { +class ReplaceMomentumCastFusion : public PatternProcessPass { + public: + explicit ReplaceMomentumCastFusion(bool multigraph = true) : PatternProcessPass("replace_momentum_cast", multigraph) { + var_ = std::make_shared(); + acc_ = std::make_shared(); + lr_ = std::make_shared(); + grad_ = std::make_shared(); + mom_ = std::make_shared(); + } + ~ReplaceMomentumCastFusion() override = default; + const BaseRef DefinePattern() const override; + const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; + + private: + VarPtr var_; + VarPtr acc_; + VarPtr lr_; + VarPtr grad_; + VarPtr mom_; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_ diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 14e30c1a443..e289cffa56b 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -25,6 +25,11 @@ #include "backend/optimizer/pass/getitem_tuple.h" #include "backend/optimizer/gpu/adam_weight_decay_fusion.h" #include "backend/optimizer/gpu/adam_fusion.h" +#include "backend/optimizer/gpu/replace_bn_cast_fusion.h" +#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h" +#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h" +#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h" +#include "backend/optimizer/gpu/replace_addn_fusion.h" #include "runtime/device/kernel_runtime_manager.h" #include "predict/predict.h" #include "common/utils.h" @@ -59,6 +64,11 @@ void GPUSession::Optimize(const std::shared_ptr &kernel_graph) { auto pm = std::make_shared(); pm->AddPass(std::make_shared()); pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); optimizer->AddPassManager(pm); (void)optimizer->Optimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); From 6d5c580a61b92e368342dbffc547798b6b2df5b5 Mon Sep 17 00:00:00 2001 From: buxue Date: Thu, 16 Jul 2020 19:44:24 +0800 Subject: [PATCH 37/68] change shape and dtype of tensor from interface to attr --- mindspore/nn/graph_kernels/graph_kernels.py | 2 +- mindspore/nn/layer/quant.py | 6 +++--- model_zoo/gat/src/gat.py | 6 +++--- tests/st/pynative/test_ops.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mindspore/nn/graph_kernels/graph_kernels.py b/mindspore/nn/graph_kernels/graph_kernels.py index 21cc4f87109..21a4c38ac5b 100644 --- a/mindspore/nn/graph_kernels/graph_kernels.py +++ b/mindspore/nn/graph_kernels/graph_kernels.py @@ -1020,7 +1020,7 @@ class LayerNorm(Cell): 
Examples: >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32) - >>> shape1 = x.shape()[1:] + >>> shape1 = x.shape[1:] >>> m = G.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1) >>> m(x) """ diff --git a/mindspore/nn/layer/quant.py b/mindspore/nn/layer/quant.py index 63cdedbfe94..2f4f2032904 100644 --- a/mindspore/nn/layer/quant.py +++ b/mindspore/nn/layer/quant.py @@ -746,8 +746,8 @@ class DenseQuant(Cell): self.has_bias = check_bool(has_bias) if isinstance(weight_init, Tensor): - if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \ - weight_init.shape()[1] != in_channels: + if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \ + weight_init.shape[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter(initializer( @@ -755,7 +755,7 @@ class DenseQuant(Cell): if self.has_bias: if isinstance(bias_init, Tensor): - if bias_init.dim() != 1 or bias_init.shape()[0] != out_channels: + if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer( diff --git a/model_zoo/gat/src/gat.py b/model_zoo/gat/src/gat.py index 3cb3cc11066..ff0c964e9b7 100644 --- a/model_zoo/gat/src/gat.py +++ b/model_zoo/gat/src/gat.py @@ -77,15 +77,15 @@ class GNNFeatureTransform(nn.Cell): self.has_bias = check_bool(has_bias) if isinstance(weight_init, Tensor): - if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \ - weight_init.shape()[1] != in_channels: + if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \ + weight_init.shape[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") if self.has_bias: if isinstance(bias_init, Tensor): - if bias_init.dim() != 1 or bias_init.shape()[0] != out_channels: + if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") diff --git a/tests/st/pynative/test_ops.py b/tests/st/pynative/test_ops.py index 3cec24fb10c..c43e626be5e 100644 --- a/tests/st/pynative/test_ops.py +++ b/tests/st/pynative/test_ops.py @@ -28,4 +28,4 @@ def test_cast(): type_dst = ms.float32 cast = P.Cast() result = cast(input_x, type_dst) - assert result.dtype() == type_dst + assert result.dtype == type_dst From c1eba79b830a2bbeea39ee1bf675cc4b905979dc Mon Sep 17 00:00:00 2001 From: lirongzhen1 Date: Mon, 6 Jul 2020 17:16:31 +0800 Subject: [PATCH 38/68] set reshape redistribution strategy attribute to no redistribution --- .../parallel/graph_util/get_parallel_info.cc | 9 ++- .../frontend/parallel/ops_info/matmul_info.cc | 15 +++++ .../frontend/parallel/ops_info/matmul_info.h | 1 + .../frontend/parallel/ops_info/ops_utils.h | 2 + .../parallel/ops_info/reshape_info.cc | 48 ++++++++++++--- .../frontend/parallel/ops_info/reshape_info.h | 1 + .../ccsrc/frontend/parallel/step_parallel.cc | 16 ++++- .../tensor_layout/construct_operator.cc | 13 +++++ .../tensor_layout/construct_operator.h | 1 + .../parallel/tensor_layout/tensor_layout.h | 10 ++++ mindspore/common/parameter.py | 4 +- mindspore/parallel/_tensor.py | 38 +++++++++++- mindspore/train/serialization.py | 5 +- .../parallel/test_get_parameter_layout.py | 4 +- .../test_reshape_skip_redistribution.py | 58 +++++++++++++++++++ 15 files changed, 205 insertions(+), 20 deletions(-) create mode 100644 tests/ut/python/parallel/test_reshape_skip_redistribution.py 
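
Patch 37 above migrates `Tensor.shape` and `Tensor.dtype` from methods to attributes throughout the tree. A before/after sketch of the call-site change (hypothetical snippet, not taken from the patch):

```
import numpy as np
import mindspore as ms
from mindspore import Tensor

x = Tensor(np.ones([2, 3]), ms.float32)
# old interface style: x.shape(), x.dtype()
# new attribute style:
assert x.shape == (2, 3)
assert x.dtype == ms.float32
```
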
diff --git a/mindspore/ccsrc/frontend/parallel/graph_util/get_parallel_info.cc b/mindspore/ccsrc/frontend/parallel/graph_util/get_parallel_info.cc index 21298697f44..effbdc17c7f 100644 --- a/mindspore/ccsrc/frontend/parallel/graph_util/get_parallel_info.cc +++ b/mindspore/ccsrc/frontend/parallel/graph_util/get_parallel_info.cc @@ -44,7 +44,14 @@ py::dict GetParameterLayout(const FuncGraphPtr &graph) { auto device_arrangement = tensor_layout->device_arrangement().array(); auto tensor_map = tensor_layout->tensor_map().array(); auto slice_shape = tensor_layout->slice_shape().array(); - std::vector> layout = {device_arrangement, tensor_map, slice_shape}; + int32_t _field_size = tensor_layout->get_field_size(); + std::vector field_size; + if (_field_size != 0) { + field_size.push_back(_field_size); + } else { + field_size = {0}; + } + std::vector> layout = {device_arrangement, tensor_map, slice_shape, field_size}; dict[py::str(name)] = layout; MS_LOG(INFO) << "GetParameterLayout name = " << name << ", layout " << tensor_layout->ToString(); } diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc index 60a3d60b392..b2ff493a13a 100644 --- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc +++ b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.cc @@ -105,6 +105,17 @@ Status MatMulBase::GetAttrs() { } } + auto field_size_iter = attrs_.find(FIELD_SIZE); + if (field_size_iter != attrs_.end()) { + MS_EXCEPTION_IF_NULL(field_size_iter->second); + if (field_size_iter->second->isa()) { + field_size_ = field_size_iter->second->cast()->value(); + } else { + MS_LOG(ERROR) << name_ << " : The value of field_size is not int."; + return FAILED; + } + } + // infer inputs dimension size if ((inputs_shape_.size() != MATMUL_INPUTS_SIZE) || (outputs_shape_.size() != MATMUL_OUTPUTS_SIZE)) { MS_LOG(ERROR) << name_ << " : Inputs shape size or outputs shape size is wrong."; @@ -346,6 +357,10 @@ Status MatMulBase::InferTensorLayout(TensorLayouts *inputs_layout, TensorLayouts return FAILED; } + if (field_size_ != 0) { + mat_b_layout.set_field_size(field_size_); + } + inputs_layout->push_back(mat_a_layout); inputs_layout->push_back(mat_b_layout); outputs_layout->push_back(output_layout); diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.h b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.h index d4e144c2b64..16f75abafce 100644 --- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.h +++ b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_info.h @@ -62,6 +62,7 @@ class MatMulBase : public OperatorInfo { bool transpose_a_ = false; bool transpose_b_ = false; bool forward_reduce_scatter_ = false; + int32_t field_size_ = 0; size_t mat_a_dimension_ = 0; size_t mat_b_dimension_ = 0; }; diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h b/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h index 79dfb56693b..732d25f06b7 100644 --- a/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h +++ b/mindspore/ccsrc/frontend/parallel/ops_info/ops_utils.h @@ -100,6 +100,7 @@ constexpr char CONCAT_DIM[] = "concat_dim"; constexpr char FORWARD[] = "forward"; constexpr char BACKWARD[] = "backward"; constexpr char REDISTRIBUTION[] = "redistribution"; +constexpr char SKIP_REDISTRIBUTION[] = "skip_redistribution"; constexpr char REPLACE[] = "replace"; constexpr char CONNSYMBOL[] = "/"; constexpr char INSTANCE_NAME[] = "instance_name"; @@ -131,6 +132,7 @@ constexpr char FORWARD_OP[] = "forward_op"; 
constexpr char REDISTRIBUTION_OP[] = "redistribution_op"; constexpr char DARA_PARALLEL[] = "data_parallel"; constexpr char FORWARD_REDUCE_SCATTER[] = "forward_reduce_scatter"; +constexpr char FIELD_SIZE[] = "field_size"; constexpr char OPTIMIZER_SUB_STRING[] = "optimizer"; constexpr char DEVICE[] = "Device"; diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.cc index fb62c1d02c0..cc37da4b1e9 100644 --- a/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.cc +++ b/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.cc @@ -18,6 +18,7 @@ #include #include +#include #include "frontend/parallel/device_manager.h" #include "frontend/parallel/device_matrix.h" @@ -145,17 +146,23 @@ Status ReshapeInfo::ComputeReplaceOp() { MS_LOG(DEBUG) << name_ << ": input " << input_layout_.ToString(); MS_LOG(DEBUG) << name_ << ": output " << output_layout_.ToString(); MS_LOG(DEBUG) << name_ << ": dev_list " << dev_list.size(); - RedistributionOpListPtr redistribution_oplist_ptr = tensor_redistribution.InferTensorRedistributionOperatorList(); - if (redistribution_oplist_ptr == nullptr) { - if (is_generating_costs_) { - MS_LOG(DEBUG) << name_ << "InferTensorRedistribution failed."; - } else { - MS_LOG(ERROR) << name_ << "InferTensorRedistribution failed."; + if (is_skip_) { + ConstructOperator constructor; + replace_op_ = constructor.SkipRedisReshapeOP(output_layout_.slice_shape().array()); + replace_op_info_.clear(); + } else { + RedistributionOpListPtr redistribution_oplist_ptr = tensor_redistribution.InferTensorRedistributionOperatorList(); + if (redistribution_oplist_ptr == nullptr) { + if (is_generating_costs_) { + MS_LOG(DEBUG) << name_ << "InferTensorRedistribution failed."; + } else { + MS_LOG(ERROR) << name_ << "InferTensorRedistribution failed."; + } + return FAILED; } - return FAILED; + replace_op_ = redistribution_oplist_ptr->first; + replace_op_info_ = redistribution_oplist_ptr->second; } - replace_op_ = redistribution_oplist_ptr->first; - replace_op_info_ = redistribution_oplist_ptr->second; MS_LOG(DEBUG) << name_ << ": replace op size = " << replace_op_.size(); return SUCCESS; } @@ -255,6 +262,19 @@ Status ReshapeInfo::InferTensorLayout(TensorLayouts *inputs_layout, TensorLayout } Status ReshapeInfo::InferTensorInfo() { + // skip reshape infer if skip_redistribution is true + if (is_skip_) { + TensorLayout layout; + Shape shape; + Shape slice_shape; + layout.set_skip_redistribution(true); + TensorInfo tensor_info_in(layout, shape, slice_shape); + inputs_tensor_info_.push_back(tensor_info_in); + outputs_tensor_info_.push_back(tensor_info_in); + MS_LOG(DEBUG) << name() << "skip redistribution reshape InferTensorInfo"; + return SUCCESS; + } + Shapes inputs_slice_shape, outputs_slice_shape; Strategys inputs_strategy = strategy_->GetInputDim(); Strategys outputs_strategy = GetOutputsStrategy(); @@ -316,6 +336,16 @@ Status ReshapeInfo::InferDefaultLayout(const Shape &shape, TensorLayout *const l } Status ReshapeInfo::Init(const StrategyPtr &strategy) { + auto reshape_skip_redis_iter = attrs_.find(SKIP_REDISTRIBUTION); + if (reshape_skip_redis_iter != attrs_.end()) { + MS_EXCEPTION_IF_NULL(reshape_skip_redis_iter->second); + if (!reshape_skip_redis_iter->second->isa()) { + MS_LOG(ERROR) << name_ << ": skip_redistribution is not a bool."; + return FAILED; + } + is_skip_ = reshape_skip_redis_iter->second->cast()->value(); + } + ResetQueueMember(); device_number(strategy); if (strategy) { diff --git 
a/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.h b/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.h index 2463b440f81..c9c28602cc7 100644 --- a/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.h +++ b/mindspore/ccsrc/frontend/parallel/ops_info/reshape_info.h @@ -98,6 +98,7 @@ class ReshapeInfo : public OperatorInfo { bool input_layout_set_flag_; bool output_layout_set_flag_; bool is_generating_costs_; + bool is_skip_ = false; std::string pre_operator_name_; std::string next_operator_name_; }; diff --git a/mindspore/ccsrc/frontend/parallel/step_parallel.cc b/mindspore/ccsrc/frontend/parallel/step_parallel.cc index 6b9cfd9d370..20eaf329cf2 100644 --- a/mindspore/ccsrc/frontend/parallel/step_parallel.cc +++ b/mindspore/ccsrc/frontend/parallel/step_parallel.cc @@ -302,16 +302,26 @@ void Redistribution(const std::pair &node_pair, const OperatorI MS_LOG(DEBUG) << "Redistribution: middle_node " << middle_node->ToString() << " next_node " << next_node->ToString(); // extract tensor layout in and out if (distribute_operator->outputs_tensor_info().empty()) { - MS_LOG(EXCEPTION) << "Failure:pre_node's tensorinfo_in is empty"; + MS_LOG(WARNING) << "pre_node's tensorinfo_in is empty, operator name is " << distribute_operator->name(); + return; } if (IntToSize(index - 1) >= next_distribute_operator->inputs_tensor_info().size()) { - MS_LOG(EXCEPTION) << "The index is out of range, the index is " << index - 1 << ", the vector size is " - << next_distribute_operator->inputs_tensor_info().size(); + MS_LOG(WARNING) << "The index is out of range, the index is " << index - 1 << ", the vector size is " + << next_distribute_operator->inputs_tensor_info().size() << "next operator name is " + << next_distribute_operator->name(); + return; } TensorInfo tensorinfo_out = next_distribute_operator->inputs_tensor_info()[IntToSize(index - 1)]; TensorLayout tensorlayout_out = tensorinfo_out.tensor_layout(); TensorLayout tensorlayout_in = GetTensorInLayout(middle_node, middle_prim, distribute_operator); + + if (tensorlayout_in.skip_redistribution() || tensorlayout_out.skip_redistribution()) { + MS_LOG(INFO) << "skip the reshape redistribution, operator name is" << distribute_operator->name() + << "next distribute operator, operator name is" << next_distribute_operator->name(); + return; + } + if (tensor_redistribution.Init(tensorlayout_in, tensorlayout_out, dev_list) == FAILED) { MS_LOG(ERROR) << "Redistribution: middle_prim " << middle_prim->name() << " next_prim : " << next_prim_name; MS_LOG(ERROR) << "Redistribution: middle_node " << middle_node->ToString() << " next_node " diff --git a/mindspore/ccsrc/frontend/parallel/tensor_layout/construct_operator.cc b/mindspore/ccsrc/frontend/parallel/tensor_layout/construct_operator.cc index 9395d3df89a..feb81a36ae7 100644 --- a/mindspore/ccsrc/frontend/parallel/tensor_layout/construct_operator.cc +++ b/mindspore/ccsrc/frontend/parallel/tensor_layout/construct_operator.cc @@ -28,6 +28,19 @@ Status ConstructOperator::Init(const RankList &dev_list, const Shape &dev_matrix return Status::SUCCESS; } +// skip redistribution for reshape operator +OperatorVector ConstructOperator::SkipRedisReshapeOP(Shape shape) { + OperatorAttrs attrs; + ValuePtr param_value = MakeValue(shape); + Attr param = std::make_pair(SHAPE, param_value); + OperatorParams params = {std::make_pair(param, 2)}; + OperatorArgs args = std::make_pair(attrs, params); + Operator op = std::make_pair(RESHAPE, args); + OperatorVector opvector; + opvector.push_back(op); + return opvector; +} + 
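[Editor's note] SkipRedisReshapeOP above packages the sliced output shape into a single Reshape operator; the trailing 2 in the OperatorParams pair marks the shape as the operator's second input. ReshapeInfo::ComputeReplaceOp installs this operator directly when is_skip_ is set, so no redistribution list is inferred. A rough sketch of the slice arithmetic, with numbers assumed from the new unit test rather than shown by the patch:

    # Hypothetical numbers: 8 devices, GatherV2 strategy ((1, 8), (1, 1)).
    full_shape = [64, 4096]          # reshape output before sharding
    dev_split = [1, 8]               # assumed slicing of the output dimension
    slice_shape = [d // s for d, s in zip(full_shape, dev_split)]
    assert slice_shape == [64, 512]  # SkipRedisReshapeOP emits Reshape(slice_shape)
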
Status ConstructOperator::ReshapeOP(Shape shape) { int32_t prod = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); int32_t prod_expect = std::accumulate(tensor_shape_.begin(), tensor_shape_.end(), 1, std::multiplies()); diff --git a/mindspore/ccsrc/frontend/parallel/tensor_layout/construct_operator.h b/mindspore/ccsrc/frontend/parallel/tensor_layout/construct_operator.h index b06d70af364..cef2b3aa420 100644 --- a/mindspore/ccsrc/frontend/parallel/tensor_layout/construct_operator.h +++ b/mindspore/ccsrc/frontend/parallel/tensor_layout/construct_operator.h @@ -35,6 +35,7 @@ class ConstructOperator { ConstructOperator() : dev_size_(0) {} ~ConstructOperator() = default; Status Init(const RankList &dev_list, const Shape &dev_matrix_shape); + OperatorVector SkipRedisReshapeOP(Shape shape); Status ReshapeOP(Shape shape); Status StridedSliceOP(Args args); Status AllGatherOP(int32_t dev_dim); diff --git a/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_layout.h b/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_layout.h index a9fdc9610c8..fc891d6d9fb 100644 --- a/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_layout.h +++ b/mindspore/ccsrc/frontend/parallel/tensor_layout/tensor_layout.h @@ -41,6 +41,14 @@ class TensorLayout { Status InitFromVector(const std::vector &device_arrangement, const std::vector &tensor_map, const std::vector &tensor_shape); + bool skip_redistribution() const { return skip_redistribution_; } + + void set_skip_redistribution(bool flag) { skip_redistribution_ = flag; } + + int32_t get_field_size() const { return field_size_; } + + void set_field_size(int32_t field_size) { field_size_ = field_size; } + Arrangement device_arrangement() const { return device_arrangement_; } Map tensor_map() const { return tensor_map_; } @@ -92,6 +100,8 @@ class TensorLayout { Arrangement device_arrangement_; Map tensor_map_; Arrangement tensor_shape_; + bool skip_redistribution_ = false; + int32_t field_size_ = 0; }; } // namespace parallel } // namespace mindspore diff --git a/mindspore/common/parameter.py b/mindspore/common/parameter.py index 1605ee4bc55..9405e7b2602 100644 --- a/mindspore/common/parameter.py +++ b/mindspore/common/parameter.py @@ -247,8 +247,8 @@ class Parameter: if not isinstance(layout, list): raise TypeError("The layout should be list! layout is {}." .format(layout)) - if len(layout) != 3: - raise ValueError("The length of layout must be 3! layout is {}." + if len(layout) < 3: + raise ValueError("The length of layout must be larger than 3! layout is {}." .format(layout)) slice_index = int(_get_slice_index(layout[0], layout[1])) self.default_input = self.init_mode.to_tensor(slice_index, layout[2]) diff --git a/mindspore/parallel/_tensor.py b/mindspore/parallel/_tensor.py index fca8b889201..598046f66a6 100644 --- a/mindspore/parallel/_tensor.py +++ b/mindspore/parallel/_tensor.py @@ -229,8 +229,8 @@ def _load_tensor_by_layout(tensor, layout): """ if not isinstance(layout, list): raise TypeError("The layout should be list! layout is {}".format(layout)) - if len(layout) != 3: - raise ValueError("The length of layout must be 3! layout is {}".format(layout)) + if len(layout) < 3: + raise ValueError("The length of layout must be larger than 3! 
layout is {}".format(layout)) dev_mat = layout[0] tensor_map = layout[1] if tensor.size() == 1: @@ -290,3 +290,37 @@ def _reshape_param_data(param_data, dev_mat, tensor_map): tensor_slices_new = tensor_slices_new_inner return Tensor(tensor_slices_new[0]) + +def _reshape_param_data_with_weight(param_data, dev_mat, field_size): + """ + Combine param slice by the device matrix, used in model parallel scenario. + + Args: + param_data (Tensor): The tensor to be reshaped and rearrangement, + generated from all the device from AllGatherParamNet. + dev_mat (list): The device matrix of devices. + Returns: + Tensor, the combined tensor which with the whole data value. + + Examples: + >>> param_data = _allgather_param_net(param_data) + >>> dev_mat = [2, 2] + >>> field_size = [39] + >>> tensor = _reshape_param_data_with_weight(param_data, dev_mat, field_size) + """ + device_count = 1 + for dim in dev_mat: + device_count *= dim + + tensor_slices = np.split(param_data.asnumpy(), device_count, axis=0) + tensor_slices_col = [] + for i in range(len(tensor_slices[0][0])): + tensor_slices_new = np.array(tensor_slices[0][:, i]).reshape(field_size[0], -1) + for j in range(1, device_count): + tensor_slices_new = np.concatenate((tensor_slices_new,\ + np.array(tensor_slices[j][:, i]).reshape(field_size[0], -1)), axis=1) + tensor_slices_col.append(tensor_slices_new) + new_tensor = np.array(tensor_slices_col[0]).reshape(-1, 1) + for i in range(1, len(tensor_slices_col)): + new_tensor = np.concatenate((new_tensor, np.array(tensor_slices_col[i]).reshape(-1, 1)), axis=1) + return Tensor(new_tensor) diff --git a/mindspore/train/serialization.py b/mindspore/train/serialization.py index 3812698419c..c3f5d5c1f90 100644 --- a/mindspore/train/serialization.py +++ b/mindspore/train/serialization.py @@ -359,14 +359,17 @@ def _get_merged_param_data(net, param_name, param_data): dev_mat = layout[0] tensor_map = layout[1] + field_size = layout[3] from mindspore.parallel._cell_wrapper import get_allgather_cell - from mindspore.parallel._tensor import _reshape_param_data + from mindspore.parallel._tensor import _reshape_param_data, _reshape_param_data_with_weight # while any dim is not equal to -1, means param is splited and needs to be merged for dim in tensor_map: if dim != -1: allgather_net = get_allgather_cell() param_data = allgather_net(param_data) + if field_size[0]: + return _reshape_param_data_with_weight(param_data, dev_mat, field_size) return _reshape_param_data(param_data, dev_mat, tensor_map) return param_data diff --git a/tests/ut/python/parallel/test_get_parameter_layout.py b/tests/ut/python/parallel/test_get_parameter_layout.py index a34ee94840a..23649b5f0c6 100644 --- a/tests/ut/python/parallel/test_get_parameter_layout.py +++ b/tests/ut/python/parallel/test_get_parameter_layout.py @@ -49,8 +49,8 @@ def test_get_parameter_layout(): net.set_auto_parallel() exe = me._executor exe.compile(net, x, phase='train', auto_parallel_mode=True) - x_layout = [[2, 4], [1, -1], [16, 32]] # device_arrangement = [2, 4], tensor_map = [1, -1] - weight_layout = [[2, 4], [0, -1], [16, 32]] # device_arrangement = [2, 4], tensor_map = [0, -1] + x_layout = [[2, 4], [1, -1], [16, 32], [0]] # device_arrangement = [2, 4], tensor_map = [1, -1] + weight_layout = [[2, 4], [0, -1], [16, 32], [0]] # device_arrangement = [2, 4], tensor_map = [0, -1] expect_dict = {'x': x_layout, 'w1': weight_layout} # to be resovled: static local variable count_p is used in step_parallel.cc, it needs to be reset between each ut assert net.parameter_layout_dict == 
expect_dict diff --git a/tests/ut/python/parallel/test_reshape_skip_redistribution.py b/tests/ut/python/parallel/test_reshape_skip_redistribution.py new file mode 100644 index 00000000000..cbaf20d1132 --- /dev/null +++ b/tests/ut/python/parallel/test_reshape_skip_redistribution.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import mindspore as ms +from mindspore import context, Tensor, Parameter +from mindspore.common.api import _executor +from mindspore.nn import Cell, TrainOneStepCell, Momentum +from mindspore.ops import operations as P + + +class Net(Cell): + def __init__(self, matmul_weight, strategy1=None): + super().__init__() + self.gatherv2 = P.GatherV2().set_strategy(strategy1) + self.reshape = P.Reshape().add_prim_attr("skip_redistribution", True) + self.matmul = P.MatMul(transpose_b=False) + self.index = Tensor(np.ones([64, 64]), dtype=ms.int32) + self.matmul_weight = Parameter(matmul_weight, "w1") + self.axis = 0 + + def construct(self, x, b): + out = self.gatherv2(x, self.index, self.axis) + out = self.reshape(out, (64, -1)) + out = self.matmul(out, self.matmul_weight) + return out + + +_w1 = Tensor(np.ones([4096, 32]), dtype=ms.float32) +_x = Tensor(np.ones([64, 64]), dtype=ms.float32) +_b = Tensor(np.ones([128, 64, 32]), dtype=ms.float32) + +def compile_net(net): + context.set_context(save_graphs=True) + optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9) + train_net = TrainOneStepCell(net, optimizer) + train_net.set_auto_parallel() + _executor.compile(train_net, _x, _b) + context.reset_auto_parallel_context() + + +def test_reshape_skip_redistribution(): + context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0) + strategy1 = ((1, 8), (1, 1)) + net = Net(_w1, strategy1) + compile_net(net) From 52022c80132b9bf3331fcbcda01ea0241c385d4a Mon Sep 17 00:00:00 2001 From: ZPaC Date: Mon, 13 Jul 2020 22:06:54 +0800 Subject: [PATCH 39/68] Enable to train in parameter server mode --- cmake/external_libs/glog.cmake | 2 +- cmake/options.cmake | 4 + .../backend/kernel_compiler/CMakeLists.txt | 19 +-- .../cpu/embedding_look_up_cpu_kernel.h | 2 +- .../cpu/ps/embedding_look_up_proxy_kernel.cc | 6 +- .../cpu/ps/embedding_look_up_ps_kernel.cc | 2 +- .../kernel_compiler/cpu/ps/push_kernel.cc | 8 ++ .../kernel_compiler/cpu/ps/push_kernel.h | 2 +- .../cpu/ps/sparse_apply_adam_ps_kernel.cc | 2 +- .../cpu/ps/sparse_apply_ftrl_ps_kernel.cc | 2 +- .../optimizer/pass/replace_node_by_proxy.cc | 1 - .../ccsrc/backend/session/ascend_session.cc | 10 ++ .../ccsrc/backend/session/cpu_session.cc | 29 ++++ mindspore/ccsrc/backend/session/cpu_session.h | 1 + .../ccsrc/backend/session/gpu_session.cc | 8 ++ .../ccsrc/backend/session/session_basic.cc | 92 ++++++++++++ .../ccsrc/backend/session/session_basic.h | 6 +- .../ccsrc/frontend/parallel/CMakeLists.txt | 9 +- .../frontend/parallel/ps/optimizer_info.cc | 15 +- .../frontend/parallel/ps/optimizer_info.h | 
5 +- .../parallel/ps/optimizer_info_builder.cc | 25 ++-- .../parallel/ps/optimizer_info_builder.h | 2 +- .../frontend/parallel/ps/parameter_server.h | 39 +++-- mindspore/ccsrc/frontend/parallel/ps/worker.h | 11 +- .../ccsrc/frontend/parallel/ps/worker_proxy.h | 134 ++++++++---------- .../ccsrc/minddata/dataset/CMakeLists.txt | 6 + mindspore/ccsrc/pipeline/jit/action.cc | 46 +++++- mindspore/ccsrc/pipeline/jit/action.h | 5 + mindspore/ccsrc/pipeline/jit/pipeline.cc | 26 ++++ 29 files changed, 377 insertions(+), 142 deletions(-) diff --git a/cmake/external_libs/glog.cmake b/cmake/external_libs/glog.cmake index d7942a4efd7..f372c8e3c2f 100644 --- a/cmake/external_libs/glog.cmake +++ b/cmake/external_libs/glog.cmake @@ -1,4 +1,4 @@ -set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS}") +set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2") mindspore_add_pkg(glog VER 0.4.0 diff --git a/cmake/options.cmake b/cmake/options.cmake index 2470c25a90c..84ac3f611db 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -123,3 +123,7 @@ endif() if(ENABLE_DEBUGGER) add_compile_definitions(ENABLE_DEBUGGER) endif() + +if(ENABLE_TESTCASES) + add_compile_definitions(ENABLE_TESTCASES) +endif() \ No newline at end of file diff --git a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt index b412d83d116..3201cec92e6 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt +++ b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt @@ -26,14 +26,6 @@ if (ENABLE_CPU) "cpu/*.cc" ) - list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc" - "cpu/ps/pull_kernel.cc" - "cpu/ps/embedding_look_up_ps_kernel.cc" - "cpu/ps/embedding_look_up_proxy_kernel.cc" - "cpu/ps/apply_momentum_ps_kernel.cc" - "cpu/ps/sparse_apply_adam_ps_kernel.cc" - "cpu/ps/sparse_apply_ftrl_ps_kernel.cc") - if (NOT ENABLE_MPI) list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc") list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc") @@ -41,6 +33,17 @@ if (ENABLE_CPU) endif () endif () +if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ENABLE_GE) + list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/apply_momentum_ps_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_proxy_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_ps_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pserver_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pull_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_adam_ps_kernel.cc") + list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_ftrl_ps_kernel.cc") +endif() + if (ENABLE_GPU) file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu/*.cu" diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.h index 6c61ee346c4..5cced70cdeb 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.h @@ -46,7 +46,7 @@ class EmbeddingLookUpCPUKernel : public CPUKernel { bool Launch(const std::vector &inputs, const std::vector &workspace, const std::vector &outputs) override; - private: + protected: void LookUpTable(const std::vector &inputs, size_t dim0, size_t dim1, size_t dim2, float **output_addr); void CheckParam(const CNodePtr 
&kernel_node); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/embedding_look_up_proxy_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/embedding_look_up_proxy_kernel.cc index 59ab65014be..2d986ff26a4 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/embedding_look_up_proxy_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/embedding_look_up_proxy_kernel.cc @@ -53,15 +53,15 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector &i size_t output_size = outputs[0]->size; size_t size = input_size / sizeof(float); - ::ps::SArray lookup_ids(size, 0); + ::ps::SArray lookup_ids(size, 0); ::ps::SArray lengths{size}; - ::ps::SArray lookup_result; + ::ps::SArray lookup_result(output_size / sizeof(float), 0); auto ret = memcpy_s(lookup_ids.data(), input_size, indices_addr, input_size); if (ret != EOK) { MS_LOG(EXCEPTION) << "Lookup id memcpy failed."; } - parallel::ps::Worker::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, lookup_result, + parallel::ps::Worker::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, &lookup_result, parallel::ps::kEmbeddingLookupCmd); auto ret2 = memcpy_s(output_addr, output_size, lookup_result.data(), output_size); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/embedding_look_up_ps_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/embedding_look_up_ps_kernel.cc index bcb3ca8ae8a..13a84c34b8d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/embedding_look_up_ps_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/embedding_look_up_ps_kernel.cc @@ -50,7 +50,7 @@ void EmbeddingLookUpPSKernel::InitKernel( split_num_ = pserver_num_; // input shape should be sharded after computing offset_; - Shard(input_shape_, axis_); + Shard(&input_shape_, axis_); size_t output_size = std::accumulate(output_shape_.begin(), output_shape_.end(), sizeof(float), std::multiplies()); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/push_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/push_kernel.cc index 96c1f15bda7..2322d4ee3a0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/push_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/push_kernel.cc @@ -34,5 +34,13 @@ MS_REG_CPU_KERNEL_T(Push, MS_REG_CPU_KERNEL_T( Push, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt64), PushKernel, float); + +MS_REG_CPU_KERNEL_T(Push, + KernelAttr() + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddInputAttr(kNumberTypeFloat32) + .AddOutputAttr(kNumberTypeUInt64), + PushKernel, float); } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/push_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/push_kernel.h index 938792f3bfd..d5876bd4611 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/push_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/push_kernel.h @@ -43,7 +43,7 @@ class PushKernel : public CPUKernel { sizes.push_back(SizeToInt(input->size) / sizeof(T)); } parallel::ps::Worker::GetInstance().Push(keys, addrs, sizes); - memcpy(outputs[0]->addr, &key_, sizeof(size_t)); + memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t)); return true; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.cc index c7283954f89..fa91f459472 100644 --- 
a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.cc @@ -75,7 +75,7 @@ void SparseApplyAdamPSKernel::ReInit(const std::shared_ptr &inputs) { const auto &indices_addr = inputs[10]; - indices_size_ = indices_addr->size; + indices_size_ = indices_addr->size / sizeof(int); workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float); workspace_size_list_[1] = indices_size_ * sizeof(int); } diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.cc index 0392bd5a696..93cd38c11b5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.cc @@ -64,7 +64,7 @@ void SparseApplyFtrlPSKernel::ReInit(const std::shared_ptr &inputs) { const auto &indices_addr = inputs[4]; - indices_size_ = indices_addr->size; + indices_size_ = indices_addr->size / sizeof(int); workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float); workspace_size_list_[1] = indices_size_ * sizeof(int); } diff --git a/mindspore/ccsrc/backend/optimizer/pass/replace_node_by_proxy.cc b/mindspore/ccsrc/backend/optimizer/pass/replace_node_by_proxy.cc index cd34464cda8..53faa131b1f 100644 --- a/mindspore/ccsrc/backend/optimizer/pass/replace_node_by_proxy.cc +++ b/mindspore/ccsrc/backend/optimizer/pass/replace_node_by_proxy.cc @@ -71,7 +71,6 @@ bool ReplaceNodeByProxy::Run(const FuncGraphPtr &func_graph) { AbstractBasePtrList abstract_list; AnfAlgo::CopyNodeAttr(kAttrPsKey, cnode, proxy_node); - AnfAlgo::CopyNodeAttr("reduce_scatter_flag", cnode, proxy_node); AnfAlgo::CopyNodeAttr("offset", cnode, proxy_node); abstract_list.push_back(cnode->abstract()); auto abstract_tuple = std::make_shared(abstract_list); diff --git a/mindspore/ccsrc/backend/session/ascend_session.cc b/mindspore/ccsrc/backend/session/ascend_session.cc index 75bc4e2d058..766647d8a3a 100644 --- a/mindspore/ccsrc/backend/session/ascend_session.cc +++ b/mindspore/ccsrc/backend/session/ascend_session.cc @@ -353,6 +353,10 @@ GraphId AscendSession::CompileGraph(NotNull func_graph) { RootGraphExecutorValidate(NOT_NULL(root_graph)); // adjust kernel AdjustKernel(root_graph); +#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) + // Assign parameter keys. 
+ AssignParamKey(root_graph); +#endif // assign stream AssignStream(NOT_NULL(root_graph)); // insert profiling point @@ -511,6 +515,12 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector &kernel_graph) { + auto optimizer = std::make_shared(); + auto pm = std::make_shared(); + std::string pass_name = "replace_node_by_proxy"; + pass_name.append(std::to_string(graph_sum_)); + pm->AddPass(std::make_shared(pass_name)); + optimizer->AddPassManager(pm); + (void)optimizer->Optimize(kernel_graph); + kernel_graph->SetExecOrderByDefault(); +} + GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { auto graph_id = graph_sum_; auto graph = ConstructKernelGraph(lst, outputs); MS_EXCEPTION_IF_NULL(graph); MS_LOG(INFO) << "Set kernel info"; SetKernelInfo(graph.get()); +#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) + AssignParamKey(graph); + if (parallel::ps::Util::IsRoleOfWorker()) { + Optimize(graph); + } +#endif predictmodel::StepConvertGraph(graph); MS_LOG(INFO) << "Build kernel"; BuildKernel(graph.get()); @@ -66,6 +89,12 @@ GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList void CPUSession::RunGraph(const GraphId &graph_id, const std::vector &inputs, VectorRef *outputs) { auto &kernel_graph = graphs_[graph_id]; MS_EXCEPTION_IF_NULL(kernel_graph); +#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) + // Initialize parameter server + if (!ps_init_) { + InitPSParamAndOptim(kernel_graph, inputs); + } +#endif MS_LOG(INFO) << "Bind input output address"; std::vector need_sync_outputs; runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs); diff --git a/mindspore/ccsrc/backend/session/cpu_session.h b/mindspore/ccsrc/backend/session/cpu_session.h index b0dbd1cc2bb..8a94c21828a 100644 --- a/mindspore/ccsrc/backend/session/cpu_session.h +++ b/mindspore/ccsrc/backend/session/cpu_session.h @@ -37,6 +37,7 @@ class CPUSession : public SessionBasic { protected: ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) override; + void Optimize(const std::shared_ptr &kernel_graph); private: void SetKernelInfo(const KernelGraph *kernel_graph); diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 14e30c1a443..c8644a5916d 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -167,6 +167,10 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList Optimize(graph); // Select kernel build info SelectKernel(graph); +#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) + // Assign parameter keys. 
+ AssignParamKey(graph); +#endif // Convert kernel Graph to model predictmodel::StepConvertGraph(graph); // Start gpu kernel runtime @@ -204,6 +208,10 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector &node_list) { + MS_EXCEPTION_IF_NULL(push_node); + for (auto &node : node_list) { + if (node != nullptr && node->isa()) { + for (auto input : node->cast()->inputs()) { + if (push_node == AnfAlgo::VisitKernel(input, 0).first) { + if (AnfAlgo::GetCNodeName(node) != kPullOpName) { + MS_LOG(EXCEPTION) << "The edge between Push and Pull node is invalid."; + } + return node; + } + } + } + } + return nullptr; +} + +#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) +void SessionBasic::AssignParamKey(const KernelGraphPtr &kernel_graph) { + if (!parallel::ps::Util::IsRoleOfWorker()) { + MS_LOG(INFO) << "Not parameter server mode."; + return; + } + MS_EXCEPTION_IF_NULL(kernel_graph); + std::vector node_list = TopoSort(kernel_graph->get_return()); + for (auto &node : node_list) { + if (node != nullptr && node->isa()) { + // Assign key for forward kernel EmbeddingLookup. + // The key will be assigned to embedding table ande Push kernel as well. + if (AnfAlgo::GetCNodeName(node) == kEmbeddingLookupOpName) { + size_t embedding_table_idx = 0; + auto embedding_table = AnfAlgo::GetInputNode(node->cast(), embedding_table_idx); + size_t key = parallel::ps::Worker::GetInstance().SetParamKey(embedding_table->fullname_with_scope()); + AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node); + } else if (AnfAlgo::GetCNodeName(node) == kPushOpName) { + auto pull_node = FindPullNode(node, node_list); + if (!pull_node) { + MS_LOG(EXCEPTION) << "Assigning parameter key failed: can't find Pull node of the Push node."; + } + + // Second input of Pull node is the trainable parameter. 
+ size_t parameter_index = 1; + auto parameter_node = AnfAlgo::GetInputNode(pull_node->cast(), parameter_index); + size_t key = parallel::ps::Worker::GetInstance().SetParamKey(parameter_node->fullname_with_scope()); + AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node); + AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), pull_node); + + std::string optimizer_name = AnfAlgo::GetNodeAttr(node, kAttrOptimizerType); + parallel::ps::Worker::GetInstance().SetKeyOptimId(key, optimizer_name); + } + } + } +} + +void SessionBasic::InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, + const std::vector &inputs_const) { + if (!parallel::ps::Util::IsRoleOfWorker()) { + return; + } + std::vector inputs(inputs_const); + size_t input_ctrl_size = 1; + MS_EXCEPTION_IF_NULL(kernel_graph); + if (kernel_graph->input_ctrl_tensors()) { + input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs); + } + auto input_nodes = kernel_graph->inputs(); + if ((inputs.size() + input_ctrl_size) - 1 != input_nodes.size()) { + MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size() + << ", input_ctrl_size:" << input_ctrl_size; + } + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + for (size_t i = 0; i < inputs.size(); ++i) { + auto tensor = inputs[i]; + MS_EXCEPTION_IF_NULL(tensor); + auto input_node = input_nodes[i]; + MS_EXCEPTION_IF_NULL(input_node); + if (input_node->isa() && AnfAlgo::OutputAddrExist(input_node, 0)) { + auto pk_node = input_node->cast(); + mindspore::parallel::ps::Worker::GetInstance().InitPSParamAndOptim( + pk_node->fullname_with_scope(), tensor->data_c(), LongToSize(tensor->data().nbytes())); + } + } + ps_init_ = true; +} +#endif } // namespace session } // namespace mindspore diff --git a/mindspore/ccsrc/backend/session/session_basic.h b/mindspore/ccsrc/backend/session/session_basic.h index c662e3978bd..a8ef0a7e1e3 100755 --- a/mindspore/ccsrc/backend/session/session_basic.h +++ b/mindspore/ccsrc/backend/session/session_basic.h @@ -51,7 +51,7 @@ using OpRunInfoPtr = std::shared_ptr; class SessionBasic { public: - SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0) { + SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0), ps_init_(false) { #ifdef ENABLE_DEBUGGER debugger_ = nullptr; #endif @@ -104,6 +104,8 @@ class SessionBasic { virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; } virtual void SetActive(GraphId, GraphId) {} virtual void GetSummaryNodes(KernelGraph *graph); + void AssignParamKey(const KernelGraphPtr &kernel_graph); + void InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, const std::vector &inputs_const); #ifdef ENABLE_DEBUGGER // set debugger @@ -140,6 +142,7 @@ class SessionBasic { AnfNodePtr CreateNewParameterFromCNode(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph); void AddParameterToGraphInputs(const std::vector ¶meters, KernelGraph *graph); void InitInternalOutputParameter(const AnfNodePtr &out_node, const AnfNodePtr ¶meter); + AnfNodePtr FindPullNode(const AnfNodePtr &push_node, const std::vector &node_list); std::unordered_map> graphs_; std::unordered_map> run_op_graphs_; @@ -148,6 +151,7 @@ class SessionBasic { CallBackFunc summary_callback_; static GraphId graph_sum_; uint32_t device_id_; + bool ps_init_; #ifdef ENABLE_DEBUGGER std::shared_ptr debugger_; #endif diff --git a/mindspore/ccsrc/frontend/parallel/CMakeLists.txt b/mindspore/ccsrc/frontend/parallel/CMakeLists.txt index 
d2a099cf415..0f667791467 100644 --- a/mindspore/ccsrc/frontend/parallel/CMakeLists.txt +++ b/mindspore/ccsrc/frontend/parallel/CMakeLists.txt @@ -1,5 +1,12 @@ file(GLOB_RECURSE _PARALLEL_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") -list(REMOVE_ITEM _PARALLEL_SRC_FILES "ps/util.cc" "ps/scheduler.cc" "ps/optimizer_info.cc" "ps/optimizer_info_builder.cc") + +if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ENABLE_GE) + list(REMOVE_ITEM _PARALLEL_SRC_FILES "ps/optimizer_info_builder.cc") + list(REMOVE_ITEM _PARALLEL_SRC_FILES "ps/optimizer_info.cc") + list(REMOVE_ITEM _PARALLEL_SRC_FILES "ps/scheduler.cc") + list(REMOVE_ITEM _PARALLEL_SRC_FILES "ps/util.cc") +endif() + if (ENABLE_DUMP_PROTO) list(REMOVE_ITEM _PARALLEL_SRC_FILES "parallel/strategy_checkpoint/parallel_strategy_checkpoint.cc") endif () diff --git a/mindspore/ccsrc/frontend/parallel/ps/optimizer_info.cc b/mindspore/ccsrc/frontend/parallel/ps/optimizer_info.cc index e16c713e3c7..cbfa5829837 100644 --- a/mindspore/ccsrc/frontend/parallel/ps/optimizer_info.cc +++ b/mindspore/ccsrc/frontend/parallel/ps/optimizer_info.cc @@ -118,11 +118,13 @@ const AddressPtr &MomentumOptimInfo::gradient() { return inputs_[3]; } const AddressPtr &MomentumOptimInfo::indices() { return inputs_[3]; } +size_t MomentumOptimInfo::grad_index() { return 1; } + SparseAdamOptimInfo::SparseAdamOptimInfo(const AddressPtr &weight, const AddressPtr &m, const AddressPtr &v, const AddressPtr &beta1_power, const AddressPtr &beta2_power, const AddressPtr &learning_rate, const AddressPtr &beta1, const AddressPtr &beta2, const AddressPtr &epsilon, const AddressPtr &grad, - const AddressPtr &indices, size_t grads_offset, size_t indices_offset) { + const AddressPtr &indices) { inputs_.push_back(weight); inputs_.push_back(m); inputs_.push_back(v); @@ -134,8 +136,8 @@ SparseAdamOptimInfo::SparseAdamOptimInfo(const AddressPtr &weight, const Address inputs_.push_back(epsilon); inputs_.push_back(grad); inputs_.push_back(indices); - grads_offset_ = grads_offset; - indices_offset_ = indices_offset; + grads_offset_ = 0; + indices_offset_ = 0; } void SparseAdamOptimInfo::Update(const Values &values, const Lengths &lens) { @@ -159,15 +161,14 @@ size_t SparseAdamOptimInfo::grad_index() { return 6; } size_t SparseAdamOptimInfo::indices_index() { return 7; } SparseFtrlOptimInfo::SparseFtrlOptimInfo(const AddressPtr &weight, const AddressPtr &accum, const AddressPtr &linear, - const AddressPtr &grad, const AddressPtr &indices, size_t grads_offset, - size_t indices_offset) { + const AddressPtr &grad, const AddressPtr &indices) { inputs_.push_back(weight); inputs_.push_back(accum); inputs_.push_back(linear); inputs_.push_back(grad); inputs_.push_back(indices); - grads_offset_ = grads_offset; - indices_offset_ = indices_offset; + grads_offset_ = 0; + indices_offset_ = 0; } const AddressPtr &SparseFtrlOptimInfo::gradient() { return inputs_[3]; } diff --git a/mindspore/ccsrc/frontend/parallel/ps/optimizer_info.h b/mindspore/ccsrc/frontend/parallel/ps/optimizer_info.h index bb9a64acdb3..ada020a95a9 100644 --- a/mindspore/ccsrc/frontend/parallel/ps/optimizer_info.h +++ b/mindspore/ccsrc/frontend/parallel/ps/optimizer_info.h @@ -81,6 +81,7 @@ class MomentumOptimInfo : public DenseOptimInfo { const AddressPtr &gradient(); const AddressPtr &indices(); + size_t grad_index() override; }; class SparseAdamOptimInfo : public SparseOptimInfo { @@ -88,7 +89,7 @@ class SparseAdamOptimInfo : public SparseOptimInfo { SparseAdamOptimInfo(const AddressPtr &weight, const AddressPtr &m, const AddressPtr 
&v, const AddressPtr &beta1_power, const AddressPtr &beta2_power, const AddressPtr &learning_rate, const AddressPtr &beta1, const AddressPtr &beta2, const AddressPtr &epsilon, const AddressPtr &grad, - const AddressPtr &indices, size_t grads_offset, size_t indices_offset); + const AddressPtr &indices); ~SparseAdamOptimInfo() override = default; void Update(const Values &values, const Lengths &lens) override; @@ -102,7 +103,7 @@ class SparseAdamOptimInfo : public SparseOptimInfo { class SparseFtrlOptimInfo : public SparseOptimInfo { public: SparseFtrlOptimInfo(const AddressPtr &weight, const AddressPtr &accum, const AddressPtr &linear, - const AddressPtr &grad, const AddressPtr &indices, size_t grads_offset, size_t indices_offset); + const AddressPtr &grad, const AddressPtr &indices); ~SparseFtrlOptimInfo() override = default; const AddressPtr &gradient(); diff --git a/mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.cc b/mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.cc index 159a50793e1..7b6686ea869 100644 --- a/mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.cc +++ b/mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.cc @@ -48,20 +48,25 @@ OptimizerInfo *MomentumOptimInfoBuilder::BuildInputs(const WeightPtr &weight, co size_t worker_num) { AddressPtr weight_addr = std::make_shared(); weight_addr->addr = weight->data(); - weight_addr->size = weight->size(); + weight_addr->size = weight->size() * sizeof(float); void *data_ptr = values.data(); + void *copy_data_ptr = new float[values.size()]; + auto ret = memcpy_s(copy_data_ptr, values.size() * sizeof(float), data_ptr, values.size() * sizeof(float)); + if (ret != 0) { + MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")"; + } AddressPtr accumulate = std::make_shared(); accumulate->addr = new float[weight->size()]; - accumulate->size = weight->size(); + accumulate->size = weight->size() * sizeof(float); AddressPtr learning_rate = std::make_shared(); - learning_rate->addr = data_ptr; - learning_rate->size = lens[0]; + learning_rate->addr = copy_data_ptr; + learning_rate->size = lens[0] * sizeof(float); AddressPtr gradient = std::make_shared(); gradient->addr = reinterpret_cast(learning_rate->addr) + lens[0]; - gradient->size = lens[1]; + gradient->size = lens[1] * sizeof(float); AddressPtr momentum = std::make_shared(); momentum->addr = reinterpret_cast(gradient->addr) + lens[1]; - momentum->size = lens[2]; + momentum->size = lens[2] * sizeof(float); return new MomentumOptimInfo(weight_addr, accumulate, learning_rate, gradient, momentum); } @@ -131,10 +136,10 @@ OptimizerInfo *SparseAdamOptimInfoBuilder::BuildInputs(const WeightPtr &weight, if (ret3 != 0) { MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret3 << ")"; } - indices->size = lens[7] * sizeof(float); + indices->size = lens[7] * sizeof(int); return new SparseAdamOptimInfo(weight_addr, m, v, beta1_power, beta2_power, learning_rate, beta1, beta2, epsilon, - grad, indices, total_grad_size, total_indice_size); + grad, indices); } OptimizerInfo *SparseFtrlOptimInfoBuilder::BuildInputs(const WeightPtr &weight, const Keys &keys, const Values &values, @@ -175,9 +180,9 @@ OptimizerInfo *SparseFtrlOptimInfoBuilder::BuildInputs(const WeightPtr &weight, if (ret2 != 0) { MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret2 << ")"; } - indices->size = lens[1] * sizeof(float); + indices->size = lens[1] * sizeof(int); - return new SparseFtrlOptimInfo(weight_addr, accum, linear, grad, indices, total_grad_size, total_indice_size); + return 
new SparseFtrlOptimInfo(weight_addr, accum, linear, grad, indices); } } // namespace ps } // namespace parallel diff --git a/mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.h b/mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.h index c5aae32921b..5a12799775e 100644 --- a/mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.h +++ b/mindspore/ccsrc/frontend/parallel/ps/optimizer_info_builder.h @@ -19,7 +19,7 @@ #include #include #include "backend/kernel_compiler/kernel.h" -#include "backend/kernel_compiler/ps/pserver_kernel.h" +#include "backend/kernel_compiler/cpu/ps/pserver_kernel.h" #include "frontend/parallel/ps/optimizer_info.h" namespace mindspore { diff --git a/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h b/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h index 1afb4c9fa65..56c9e34879e 100755 --- a/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h +++ b/mindspore/ccsrc/frontend/parallel/ps/parameter_server.h @@ -40,12 +40,12 @@ #include "runtime/device/cpu/kernel_select_cpu.h" #include "utils/context/ms_context.h" #include "backend/kernel_compiler/kernel.h" -#include "backend/kernel_compiler/ps/pserver_kernel.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" -#include "backend/kernel_compiler/ps/sparse_apply_adam_ps_kernel.h" -#include "backend/kernel_compiler/ps/sparse_apply_ftrl_ps_kernel.h" -#include "backend/kernel_compiler/ps/apply_momentum_ps_kernel.h" -#include "backend/kernel_compiler/ps/embedding_look_up_ps_kernel.h" +#include "backend/kernel_compiler/cpu/ps/pserver_kernel.h" +#include "backend/kernel_compiler/cpu/ps/sparse_apply_adam_ps_kernel.h" +#include "backend/kernel_compiler/cpu/ps/sparse_apply_ftrl_ps_kernel.h" +#include "backend/kernel_compiler/cpu/ps/apply_momentum_ps_kernel.h" +#include "backend/kernel_compiler/cpu/ps/embedding_look_up_ps_kernel.h" namespace mindspore { namespace parallel { @@ -118,7 +118,7 @@ class ParameterServer { std::shared_ptr kernel_graph_; std::shared_ptr sess_; - std::unordered_map> optimizers_; + std::unordered_map> optimizers_; std::unordered_map optim_inputs_shape_; std::unordered_map> optim_infos_; std::unordered_map> optim_info_builders_; @@ -249,10 +249,10 @@ template void ParameterServer::ServerHandler::HandleEmbeddingLookup(const ::ps::KVMeta &req_meta, const ::ps::KVPairs &req_data, ::ps::KVPairs *res) { const Key &key = req_data.keys[0]; - ps_->DoEmbeddingLookup(key, req_data.vals, res); for (size_t i = 0; i < req_data.vals.size(); i++) { - res->keys->push_back(req_data.vals[i]); + res->keys.push_back(req_data.vals[i]); } + ps_->DoEmbeddingLookup(key, req_data.vals, res); } template @@ -288,7 +288,7 @@ void ParameterServer::InitOptimInfoBuilders() { template void ParameterServer::InitWeightKeyToOptims(const Key &key, const int &optim_id) { - if (weight_key_to_optims_.count(key) > 0 || Util::optimizer_name(key) == "") { + if (weight_key_to_optims_.count(key) > 0 || Util::optimizer_name(optim_id) == "") { return; } weight_key_to_optims_[key] = Util::optimizer_name(optim_id); @@ -314,22 +314,22 @@ void ParameterServer::InitOptimInputsShape(const Keys &keys, const Values &va } if (weight_key_to_optims_.count(key) > 0) { const std::string &optim_name = weight_key_to_optims_[key]; - if (optimizers_.count(optim_name) == 0 && optim_inputs_shape_.count(key) > 0) { + if (optimizers_.count(key) == 0 && optim_inputs_shape_.count(key) > 0) { if (optim_name == kSparseAdam) { std::shared_ptr optimizer = std::make_shared(rank_id_, pserver_num_); 
optimizer->InitKernel(optim_inputs_shape_[key]); - optimizers_[optim_name] = optimizer; + optimizers_[key] = optimizer; } else if (optim_name == kApplyMomentum) { std::shared_ptr optimizer = std::make_shared(rank_id_, pserver_num_); optimizer->InitKernel(optim_inputs_shape_[key]); - optimizers_[optim_name] = optimizer; + optimizers_[key] = optimizer; } else if (optim_name == kSparseFtrl) { std::shared_ptr optimizer = std::make_shared(rank_id_, pserver_num_); optimizer->InitKernel(optim_inputs_shape_[key]); - optimizers_[optim_name] = optimizer; + optimizers_[key] = optimizer; } } } @@ -382,8 +382,7 @@ void ParameterServer::UpdateWeights() { std::shared_ptr optimizer = nullptr; if (weight_key_to_optims_.count(key) > 0) { - const std::string &optim_name = weight_key_to_optims_[key]; - optimizer = optimizers_[optim_name]; + optimizer = optimizers_[key]; } MS_EXCEPTION_IF_NULL(optimizer); @@ -391,8 +390,6 @@ void ParameterServer::UpdateWeights() { if (optim_info == nullptr) { continue; } - const WeightPtr &weight = weights_[key]; - optim_info->UpdateWeight(weight); const std::vector &inputs = optim_info->inputs(); const std::vector &workspaces = optim_info->workspaces(); const std::vector &outputs = optim_info->outputs(); @@ -416,7 +413,7 @@ void ParameterServer::AccumGrad(const Keys &keys, const Values &values, const // Create or update the optimizer info if (optim_info == nullptr) { const std::shared_ptr &builder = optim_info_builders_[weight_key_to_optims_[key]]; - std::shared_ptr pserver_kernel = optimizers_[weight_key_to_optims_[key]]; + std::shared_ptr pserver_kernel = optimizers_[key]; if (pserver_kernel == nullptr) { MS_LOG(EXCEPTION) << "no optimizer found for key " << key << " optim name " << weight_key_to_optims_[key]; } @@ -427,10 +424,8 @@ void ParameterServer::AccumGrad(const Keys &keys, const Values &values, const optim_infos_[key] = optim_info; } else { optim_info->Update(values, lengths); + optim_info->Accumulate(values, lengths); } - MS_EXCEPTION_IF_NULL(optim_info); - - optim_info->Accumulate(values, lengths); grads_accum_counter_[key] += 1; if (grads_accum_counter_[key] == worker_num_) { @@ -499,7 +494,7 @@ void ParameterServer::DoEmbeddingLookup(Key key, const LookupIds &lookup_ids, table_lookup_op->Execute(inputs, workspaces, outputs); res->vals = *addr; - res->lens.push_back(res.vals.size()); + res->lens.push_back(res->vals.size()); } template diff --git a/mindspore/ccsrc/frontend/parallel/ps/worker.h b/mindspore/ccsrc/frontend/parallel/ps/worker.h index 9ecbc28fc51..13cfef4d9f8 100644 --- a/mindspore/ccsrc/frontend/parallel/ps/worker.h +++ b/mindspore/ccsrc/frontend/parallel/ps/worker.h @@ -48,7 +48,7 @@ class Worker { void AddEmbeddingTable(const ::ps::Key &key, const size_t &row_count); void InitPSEmbeddingTable(const std::vector &keys, std::vector shapes, const std::vector &sizes); void InitPSParamAndOptim(const std::string ¶m_name, void *param_data, size_t param_size); - void DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, + void DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, const ::ps::SArray &lens, ::ps::SArray *lookup_result, int cmd); private: @@ -98,7 +98,8 @@ void Worker::Push(const std::vector &keys, std::vector add ::ps::SArray total_buffer(total_size, 0); size_t offset = 0; for (size_t i = 0; i < sizes.size(); i++) { - memcpy(total_buffer.data() + offset / sizeof(T), addrs[i], sizes[i] * sizeof(T)); + memcpy_s(total_buffer.data() + offset / sizeof(T), sizes[i] * sizeof(T), 
reinterpret_cast(addrs[i]), + sizes[i] * sizeof(T)); offset += sizes[i] * sizeof(T); } kv_worker_->PushData(::ps::SArray<::ps::Key>(keys), total_buffer, ::ps::SArray(sizes)); @@ -108,13 +109,13 @@ template void Worker::Pull(const size_t key, void *dev_addr, const size_t size) { ::ps::SArray variables(size / sizeof(T), 0); kv_worker_->Wait(kv_worker_->ZPull({key}, &variables)); - memcpy(dev_addr, variables.data(), size); + memcpy_s(dev_addr, size, variables.data(), size); } template -void Worker::DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, +void Worker::DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, const ::ps::SArray &lens, ::ps::SArray *lookup_result, int cmd) { - kv_worker_->EmbeddingLookup(keys, lookup_ids, lens, &lookup_result, cmd); + kv_worker_->EmbeddingLookup(keys, lookup_ids, lens, lookup_result, cmd); } template diff --git a/mindspore/ccsrc/frontend/parallel/ps/worker_proxy.h b/mindspore/ccsrc/frontend/parallel/ps/worker_proxy.h index a0f58d39a4a..6d68419383c 100644 --- a/mindspore/ccsrc/frontend/parallel/ps/worker_proxy.h +++ b/mindspore/ccsrc/frontend/parallel/ps/worker_proxy.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "ps/ps.h" #include "frontend/parallel/ps/util.h" @@ -34,24 +35,23 @@ class WorkerProxy : public ::ps::KVWorker { using Worker = ::ps::KVWorker; using Callback = std::function; using SlicedKVs = std::vector>>; - using Slicer = - std::function &send, const std::vector<::ps::Range> &ranges, SlicedKVs *sliced)>; + using Slicer = std::function &send, const std::vector<::ps::Range> &ranges, + SlicedKVs *sliced)>; using ::ps::SimpleApp::obj_; explicit WorkerProxy(int app_id, int customer_id, int lookup_customer_id) : Worker(app_id, customer_id) { - using _1 = std::placeholders::_1; - using _2 = std::placeholders::_2; - using _3 = std::placeholders::_3; + using std::placeholders::_1; + using std::placeholders::_2; + using std::placeholders::_3; + using std::placeholders::_4; lookup_customer_ = std::unique_ptr<::ps::Customer>( new ::ps::Customer(app_id, lookup_customer_id, std::bind(&WorkerProxy::ProcessLookupResult, this, _1))); - lookup_slicer_ = std::bind(&WorkerProxy::LookupIdSlicer, this, _1, _2, _3); - init_embedding_slicer_ = std::bind(&WorkerProxy::EmbeddingTableInitSlicer, this, _1, _2, _3); - push_slicer_ = std::bind(&WorkerProxy::PushSlicer, this, _1, _2, _3); - broadcast_slicer_ = std::bind(&WorkerProxy::BroadcastSlicer, this, _1, _2, _3); + lookup_slicer_ = std::bind(&WorkerProxy::LookupIdSlicer, this, _1, _2, _3, _4); + broadcast_slicer_ = std::bind(&WorkerProxy::BroadcastSlicer, this, _1, _2, _3, _4); } ~WorkerProxy() override = default; void AddEmbeddingTable(const ::ps::Key &key, const size_t &row_count); - void EmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, + void EmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, const ::ps::SArray &lens, ::ps::SArray *outs, int cmd = 0, const Callback &cb = nullptr, int priority = 0); int InitEmbeddingTable(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &vals, @@ -61,15 +61,11 @@ class WorkerProxy : public ::ps::KVWorker { private: template - int AddLookupCB(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, C *vals, int cmd, + int AddLookupCB(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, C *vals, int cmd, const Callback &cb); - void LookupIdSlicer(const ::ps::KVPairs &send, const 
std::vector<::ps::Range> &, + void LookupIdSlicer(int timestamp, const ::ps::KVPairs &send, const std::vector<::ps::Range> &, std::vector>> *sliced); - void EmbeddingTableInitSlicer(const ::ps::KVPairs &send, const std::vector<::ps::Range> &, - std::vector>> *sliced); - void PushSlicer(const ::ps::KVPairs &send, const std::vector<::ps::Range> &, - std::vector>> *sliced); - void BroadcastSlicer(const ::ps::KVPairs &send, const std::vector<::ps::Range> &, + void BroadcastSlicer(int timestamp, const ::ps::KVPairs &send, const std::vector<::ps::Range> &, std::vector>> *sliced); void ProcessLookupResult(const ::ps::Message &msg); void Send(::ps::Customer *customer, int timestamp, bool push, bool pull, int cmd, const ::ps::KVPairs &kvs, @@ -80,10 +76,9 @@ class WorkerProxy : public ::ps::KVWorker { std::unordered_map>> lookup_results_; std::mutex mutex_; Slicer lookup_slicer_; - Slicer init_embedding_slicer_; - Slicer push_slicer_; Slicer broadcast_slicer_; std::unordered_map lookup_callbacks_; + std::unordered_map expected_result_count_; }; template @@ -108,17 +103,21 @@ void WorkerProxy::AddEmbeddingTable(const ::ps::Key &key, const size_t &row_c } template -void WorkerProxy::EmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, +void WorkerProxy::EmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, const ::ps::SArray &lens, ::ps::SArray *outs, int cmd, const Callback &cb, int priority) { int ts = AddLookupCB(keys, lookup_ids, outs, cmd, cb); ::ps::KVPairs kvs; kvs.keys = keys; - kvs.vals = lookup_ids; - kvs.lens = lens; + kvs.lens = lookup_ids; kvs.priority = priority; - Send(lookup_customer_.get(), ts, true, true, cmd, kvs, broadcast_slicer_); + expected_result_count_[ts] = 0; + Send(lookup_customer_.get(), ts, true, true, cmd, kvs, lookup_slicer_); + int server_num = ::ps::NumServers(); + int expect_rt_count = expected_result_count_[ts]; + lookup_customer_->AddResponse(ts, server_num - expect_rt_count); lookup_customer_->WaitRequest(ts); + expected_result_count_.erase(ts); } template @@ -130,7 +129,7 @@ int WorkerProxy::InitEmbeddingTable(const ::ps::SArray<::ps::Key> &keys, cons kvs.vals = vals; kvs.lens = lens; kvs.priority = priority; - Send(obj_, ts, true, false, kInitEmbeddingsCmd, kvs, init_embedding_slicer_); + Send(obj_, ts, true, false, kInitEmbeddingsCmd, kvs, broadcast_slicer_); return ts; } @@ -143,13 +142,13 @@ void WorkerProxy::PushData(const ::ps::SArray<::ps::Key> &keys, const ::ps::S kvs.vals = vals; kvs.lens = lens; kvs.priority = priority; - Send(obj_, ts, true, false, cmd, kvs, push_slicer_); + Send(obj_, ts, true, false, cmd, kvs, broadcast_slicer_); obj_->WaitRequest(ts); } template template -int WorkerProxy::AddLookupCB(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, +int WorkerProxy::AddLookupCB(const ::ps::SArray<::ps::Key> &keys, const ::ps::SArray &lookup_ids, C *lookup_result, int cmd, const Callback &cb) { int ts = lookup_customer_->NewRequest(::ps::kServerGroup); const auto &callback = [this, ts, keys, lookup_ids, lookup_result, cb]() mutable { @@ -158,20 +157,30 @@ int WorkerProxy::AddLookupCB(const ::ps::SArray<::ps::Key> &keys, const ::ps: mutex_.unlock(); size_t total_len = 0; - const auto &s = kvs[0]; - for (size_t i = 0; i < s.lens.size(); i++) { - total_len += s.lens[i]; - } - lookup_result->resize(total_len, 0); - T *result_addr = lookup_result->data(); - + std::unordered_map>> id_addr_map; for (const auto &s : kvs) { - size_t offset = 0; - for (size_t i = 0; i < 
s.vals.size(); i++) { - result_addr[offset++] += s.vals[i]; + int offset = 0; + int len = s.vals.size() / s.keys.size(); + for (size_t i = 0; i < s.keys.size(); i++) { + const Key &key = s.keys[i]; + T *addr = s.vals.data() + offset; + offset += len; + total_len += len; + id_addr_map[key] = std::make_shared>(std::make_pair(addr, len)); } } + T *result_addr = lookup_result->data(); + int offset = 0; + for (size_t i = 0; i < lookup_ids.size(); i++) { + auto &pair = id_addr_map[static_cast(lookup_ids[i])]; + auto ret = memcpy_s(result_addr + offset, pair->second, pair->first, pair->second); + if (ret != 0) { + MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")"; + } + offset += pair->second; + } + mutex_.lock(); lookup_results_.erase(ts); mutex_.unlock(); @@ -182,31 +191,30 @@ int WorkerProxy::AddLookupCB(const ::ps::SArray<::ps::Key> &keys, const ::ps: } template -void WorkerProxy::LookupIdSlicer(const ::ps::KVPairs &send, const std::vector<::ps::Range> &, +void WorkerProxy::LookupIdSlicer(int timestamp, const ::ps::KVPairs &send, const std::vector<::ps::Range> &, std::vector>> *sliced) { - int *data = send.lens.data(); - size_t size = send.lens.size(); - std::vector lookup_ids(data, data + size); - std::sort(lookup_ids.begin(), lookup_ids.end()); + int *lookup_ids = send.lens.data(); + size_t id_size = send.lens.size(); const Key &key = send.keys[0]; const std::vector<::ps::Range> &ranges = *(embedding_table_ranges_[key]); sliced->resize(ranges.size()); - size_t index = 0; for (size_t i = 0; i < ranges.size(); i++) { const ::ps::Range &range = ranges[i]; const auto &begin = range.begin(); const auto &end = range.end(); + std::unordered_set unique_ids; auto &kvs = sliced->at(i).second; - auto lookup_id = static_cast(lookup_ids[index]); - while (lookup_id >= begin && lookup_id <= end) { - kvs.vals.push_back(lookup_id); - if (++index >= lookup_ids.size()) { - break; + for (size_t j = 0; j < id_size; j++) { + auto lookup_id = static_cast(lookup_ids[j]); + if (lookup_id >= begin && lookup_id <= end) { + unique_ids.insert(lookup_id); } - lookup_id = static_cast(lookup_ids[index]); + } + for (const auto &lookup_id : unique_ids) { + kvs.vals.push_back(lookup_id); } kvs.keys.push_back(key); kvs.lens.push_back(kvs.vals.size()); @@ -215,35 +223,13 @@ void WorkerProxy::LookupIdSlicer(const ::ps::KVPairs &send, const std::vec sliced->at(i).first = false; } else { sliced->at(i).first = true; + expected_result_count_[timestamp] += 1; } } } template -void WorkerProxy::EmbeddingTableInitSlicer(const ::ps::KVPairs &send, const std::vector<::ps::Range> &, - std::vector>> *sliced) { - const Key &key = send.keys[0]; - const std::vector<::ps::Range> &ranges = *(embedding_table_ranges_[key]); - sliced->resize(ranges.size()); - for (size_t i = 0; i < ranges.size(); i++) { - sliced->at(i).first = true; - sliced->at(i).second = send; - } -} - -template -void WorkerProxy::PushSlicer(const ::ps::KVPairs &send, const std::vector<::ps::Range> &, - std::vector>> *sliced) { - auto server_num = ::ps::Postoffice::Get()->num_servers(); - sliced->resize(server_num); - for (int i = 0; i < server_num; i++) { - sliced->at(i).first = true; - sliced->at(i).second = send; - } -} - -template -void WorkerProxy::BroadcastSlicer(const ::ps::KVPairs &send, const std::vector<::ps::Range> &, +void WorkerProxy::BroadcastSlicer(int timestamp, const ::ps::KVPairs &send, const std::vector<::ps::Range> &, std::vector>> *sliced) { auto server_num = ::ps::Postoffice::Get()->num_servers(); sliced->resize(server_num); @@ -268,7 +254,7 @@ 
@@ -268,7 +254,7 @@ void WorkerProxy<T>::ProcessLookupResult(const ::ps::Message &msg) {
     lookup_results_[ts].push_back(kvs);
     mutex_.unlock();
   }
-  if (lookup_customer_->NumResponse(ts) == ::ps::Postoffice::Get()->num_servers() - 1) {
+  if (lookup_customer_->NumResponse(ts) == expected_result_count_[ts] - 1) {
     const auto &cb = lookup_callbacks_[ts];
     cb();
     lookup_callbacks_.erase(ts);
@@ -279,7 +265,7 @@ template <typename T>
 void WorkerProxy<T>::Send(::ps::Customer *customer, int timestamp, bool push, bool pull, int cmd,
                           const ::ps::KVPairs<T> &kvs, const Slicer &slicer) {
   SlicedKVs sliced;
-  slicer(kvs, ::ps::Postoffice::Get()->GetServerKeyRanges(), &sliced);
+  slicer(timestamp, kvs, ::ps::Postoffice::Get()->GetServerKeyRanges(), &sliced);
 
   for (size_t i = 0; i < sliced.size(); i++) {
     const auto &s = sliced[i];
diff --git a/mindspore/ccsrc/minddata/dataset/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/CMakeLists.txt
index df9729c4ee1..168a4eb7b35 100644
--- a/mindspore/ccsrc/minddata/dataset/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/CMakeLists.txt
@@ -146,6 +146,12 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
     target_link_libraries(_c_dataengine PRIVATE _c_mindrecord ${MINDRECORD_LINK_OBJECT} mindspore::sqlite)
 else()
     target_link_libraries(_c_dataengine PRIVATE _c_mindrecord)
+    if (NOT ENABLE_GE)
+        target_link_libraries(_c_dataengine PRIVATE mindspore::pslite mindspore::protobuf ${zeromq_DIRPATH}/zmq_install/lib/libzmq.a)
+        if (${ENABLE_IBVERBS} STREQUAL "ON")
+            target_link_libraries(_c_dataengine PRIVATE ibverbs rdmacm)
+        endif()
+    endif()
 endif()
 
 if (USE_GLOG)
diff --git a/mindspore/ccsrc/pipeline/jit/action.cc b/mindspore/ccsrc/pipeline/jit/action.cc
index 74eb9f3f9b5..409bd28a6da 100644
--- a/mindspore/ccsrc/pipeline/jit/action.cc
+++ b/mindspore/ccsrc/pipeline/jit/action.cc
@@ -40,6 +40,11 @@
 #include "vm/transform.h"
 #include "parse/python_adapter.h"
 #include "frontend/optimizer/py_pass_manager.h"
+#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
+#include "frontend/parallel/ps/parameter_server.h"
+#include "frontend/parallel/ps/scheduler.h"
+#include "frontend/parallel/ps/worker.h"
+#endif
 
 namespace mindspore {
 namespace pipeline {
@@ -374,6 +379,25 @@ bool ExecuteAction(const ResourcePtr &res) {
   return true;
 }
 
+#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
+bool StartPSWorkerAction(const ResourcePtr &res) {
+  parallel::ps::Worker::GetInstance().Run();
+  return true;
+}
+
+bool StartPSServerAction(const ResourcePtr &res) {
+  FuncGraphPtr func_graph = res->func_graph();
+  auto &ps = parallel::ps::ParameterServer::GetInstance();
+  ps.Run(func_graph);
+  return true;
+}
+
+bool StartPSSchedulerAction(const ResourcePtr &res) {
+  parallel::ps::Scheduler::GetInstance().Run();
+  return true;
+}
+#endif
+
 // The parallel primitive related valuenode might be partitioned so that its value changes by device,
 // that will result in a synchronization error due to different executing order.
// Here we temporarily avoid the problem by skipping valuenode merging used by parallel related primitive, @@ -481,7 +505,11 @@ std::vector VmPipeline() { actions.emplace_back(std::make_pair("py_opt", OptActionPyStub)); actions.emplace_back(std::make_pair("validate", ValidateAction)); - +#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) + if (parallel::ps::Util::IsRoleOfWorker()) { + actions.emplace_back(std::make_pair("worker", StartPSWorkerAction)); + } +#endif // compile the ANF graph actions.emplace_back(std::make_pair("task_emit", TaskEmitAction)); @@ -490,5 +518,21 @@ std::vector VmPipeline() { return actions; } + +#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) +std::vector PServerPipeline() { + auto actions = CommonPipeline(); + actions.emplace_back(std::make_pair("optimize", VmOptimizeAction)); + actions.emplace_back(std::make_pair("validate", ValidateAction)); + actions.emplace_back(std::make_pair("pserver", StartPSServerAction)); + return actions; +} + +std::vector PSchedulerPipeline() { + std::vector actions; + actions.emplace_back(std::make_pair("scheduler", StartPSSchedulerAction)); + return actions; +} +#endif } // namespace pipeline } // namespace mindspore diff --git a/mindspore/ccsrc/pipeline/jit/action.h b/mindspore/ccsrc/pipeline/jit/action.h index 0a1feab1c9f..03ea2450d95 100644 --- a/mindspore/ccsrc/pipeline/jit/action.h +++ b/mindspore/ccsrc/pipeline/jit/action.h @@ -38,9 +38,14 @@ bool VmOptimizeAction(const ResourcePtr &res); bool PynativeOptimizeAction(const ResourcePtr &res); bool TaskEmitAction(const ResourcePtr &res); bool ExecuteAction(const ResourcePtr &res); +bool StartPSWorkerAction(const ResourcePtr &res); +bool StartPSServerAction(const ResourcePtr &res); +bool StartPSSchedulerAction(const ResourcePtr &res); std::vector GePipeline(); std::vector VmPipeline(); +std::vector PServerPipeline(); +std::vector PSchedulerPipeline(); abstract::AnalysisResult AbstractAnalyze(const ResourcePtr &res, const FuncGraphPtr &func_graph, const abstract::AbstractBasePtrList &args_spec, bool clear = false); FuncGraphPtr ProgramSpecialize(const ResourcePtr &res, const FuncGraphPtr &func_graph, diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.cc b/mindspore/ccsrc/pipeline/jit/pipeline.cc index 05699793ff8..49bebfb3c42 100644 --- a/mindspore/ccsrc/pipeline/jit/pipeline.cc +++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc @@ -41,6 +41,11 @@ #include "pipeline/pynative/pynative_execute.h" #include "frontend/optimizer/py_pass_manager.h" +#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) +#include "frontend/parallel/ps/common.h" +#include "frontend/parallel/ps/util.h" +#endif + #if (ENABLE_GE || ENABLE_D) #include "pipeline/jit/pipeline_ge.h" #include "transform/graph_ir/convert.h" @@ -420,6 +425,26 @@ bool ExecutorPy::CompileInner(const py::object &obj, const py::tuple &args, cons use_vm = ChangeExportGeirUseVmFlag(use_vm, phase_s); std::string backend = MsContext::GetInstance()->backend_policy(); +#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) + if (mindspore::parallel::ps::Util::IsParamServerMode()) { + mindspore::parallel::ps::Util::SetInternalEnvVar(); + } + if (parallel::ps::Util::IsRoleOfPServer()) { + resource->results()[kBackend] = compile::CreateBackend(); + p_actions = PServerPipeline(); + } else if (parallel::ps::Util::IsRoleOfScheduler()) { + p_actions = PSchedulerPipeline(); + } else if (use_vm && backend != "ge") { + // Create backend and session + auto backend_ptr = compile::CreateBackend(); + // Connect session to debugger + backend_ptr->SetDebugger(); + 
resource->results()[kBackend] = backend_ptr; + p_actions = VmPipeline(); + } else { + p_actions = GePipeline(); + } +#else if (use_vm && backend != "ge") { // Create backend and session auto backend_ptr = compile::CreateBackend(); @@ -430,6 +455,7 @@ bool ExecutorPy::CompileInner(const py::object &obj, const py::tuple &args, cons } else { p_actions = GePipeline(); } +#endif std::shared_ptr pip = std::make_shared(resource, FilterActions(p_actions, phase_s)); From 20213ee4164738a9dcd29c850a1096081f8e2cef Mon Sep 17 00:00:00 2001 From: hexia Date: Thu, 16 Jul 2020 22:05:32 +0800 Subject: [PATCH 40/68] add serving option --- build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/build.sh b/build.sh index 428743f0ffb..63613a9e6bc 100755 --- a/build.sh +++ b/build.sh @@ -184,6 +184,7 @@ checkopts() elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == "Xascend" ]]; then ENABLE_D="on" ENABLE_CPU="on" + ENABLE_SERVING="on" elif [[ "X$OPTARG" == "Xcpu" ]]; then ENABLE_CPU="on" else From 5871ba8a053974f335961a3bf70e73b5ef28910a Mon Sep 17 00:00:00 2001 From: nhussain Date: Wed, 15 Jul 2020 11:06:13 -0400 Subject: [PATCH 41/68] more validator fixes --- mindspore/dataset/core/validator_helpers.py | 4 ++- mindspore/dataset/engine/validators.py | 6 ++-- mindspore/dataset/text/validators.py | 3 +- .../dataset/transforms/vision/validators.py | 13 ++++---- tests/ut/python/dataset/test_from_dataset.py | 2 +- .../dataset/test_random_crop_and_resize.py | 29 +++++++++++++++++- .../dataset/test_random_crop_with_bbox.py | 30 +++++++++++++++++-- .../python/dataset/test_resize_with_bbox.py | 15 +++++++++- .../ut/python/dataset/test_uniform_augment.py | 4 +-- 9 files changed, 87 insertions(+), 19 deletions(-) diff --git a/mindspore/dataset/core/validator_helpers.py b/mindspore/dataset/core/validator_helpers.py index 1ded33a9f98..9e5e8fc687d 100644 --- a/mindspore/dataset/core/validator_helpers.py +++ b/mindspore/dataset/core/validator_helpers.py @@ -189,8 +189,10 @@ def type_check_list(args, types, arg_names): Exception: when the type is not correct, otherwise nothing. 
""" type_check(args, (list, tuple,), arg_names) - if len(args) != len(arg_names): + if len(args) != len(arg_names) and not isinstance(arg_names, str): raise ValueError("List of arguments is not the same length as argument_names.") + if isinstance(arg_names, str): + arg_names = ["{0}[{1}]".format(arg_names, i) for i in range(len(args))] for arg, arg_name in zip(args, arg_names): type_check(arg, types, arg_name) diff --git a/mindspore/dataset/engine/validators.py b/mindspore/dataset/engine/validators.py index c61630a0354..23f24632a09 100644 --- a/mindspore/dataset/engine/validators.py +++ b/mindspore/dataset/engine/validators.py @@ -686,8 +686,7 @@ def check_concat(method): [ds], _ = parse_user_args(method, *args, **kwargs) type_check(ds, (list, datasets.Dataset), "datasets") if isinstance(ds, list): - dataset_names = ["dataset[{0}]".format(i) for i in range(len(ds)) if isinstance(ds, list)] - type_check_list(ds, (datasets.Dataset,), dataset_names) + type_check_list(ds, (datasets.Dataset,), "dataset") return method(self, *args, **kwargs) return new_method @@ -751,8 +750,7 @@ def check_add_column(method): if shape is not None: type_check(shape, (list,), "shape") - shape_names = ["shape[{0}]".format(i) for i in range(len(shape))] - type_check_list(shape, (int,), shape_names) + type_check_list(shape, (int,), "shape") return method(self, *args, **kwargs) diff --git a/mindspore/dataset/text/validators.py b/mindspore/dataset/text/validators.py index b0327f5609c..38ffe5c3828 100644 --- a/mindspore/dataset/text/validators.py +++ b/mindspore/dataset/text/validators.py @@ -297,8 +297,7 @@ def check_from_dataset(method): if columns is not None: if not isinstance(columns, list): columns = [columns] - col_names = ["col_{0}".format(i) for i in range(len(columns))] - type_check_list(columns, (str,), col_names) + type_check_list(columns, (str,), "col") if freq_range is not None: type_check(freq_range, (tuple,), "freq_range") diff --git a/mindspore/dataset/transforms/vision/validators.py b/mindspore/dataset/transforms/vision/validators.py index 0f2bc2ce2e3..2e2d13ddb6f 100644 --- a/mindspore/dataset/transforms/vision/validators.py +++ b/mindspore/dataset/transforms/vision/validators.py @@ -78,6 +78,8 @@ def check_fill_value(fill_value): def check_padding(padding): """Parsing the padding arguments and check if it is legal.""" type_check(padding, (tuple, list, numbers.Number), "padding") + if isinstance(padding, numbers.Number): + check_value(padding, (0, INT32_MAX), "padding") if isinstance(padding, (tuple, list)): if len(padding) not in (2, 4): raise ValueError("The size of the padding list or tuple should be 2 or 4.") @@ -163,10 +165,13 @@ def check_random_resize_crop(method): check_crop_size(size) if scale is not None: + type_check(scale, (tuple,), "scale") + type_check_list(scale, (float, int), "scale") check_range(scale, [0, FLOAT_MAX_INTEGER]) if ratio is not None: + type_check(ratio, (tuple,), "ratio") + type_check_list(ratio, (float, int), "ratio") check_range(ratio, [0, FLOAT_MAX_INTEGER]) - check_positive(ratio[0], "ratio[0]") if interpolation is not None: type_check(interpolation, (Inter,), "interpolation") if max_attempts is not None: @@ -450,8 +455,7 @@ def check_random_affine(method): if translate is not None: if type_check(translate, (list, tuple), "translate"): - translate_names = ["translate_{0}".format(i) for i in range(len(translate))] - type_check_list(translate, (int, float), translate_names) + type_check_list(translate, (int, float), "translate") if len(translate) != 2: raise 
TypeError("translate should be a list or tuple of length 2.") for i, t in enumerate(translate): @@ -508,8 +512,7 @@ def check_uniform_augment_cpp(method): if num_ops > len(operations): raise ValueError("num_ops is greater than operations list size") - tensor_ops = ["tensor_op_{0}".format(i) for i in range(len(operations))] - type_check_list(operations, (TensorOp,), tensor_ops) + type_check_list(operations, (TensorOp,), "tensor_ops") return method(self, *args, **kwargs) diff --git a/tests/ut/python/dataset/test_from_dataset.py b/tests/ut/python/dataset/test_from_dataset.py index 983052ea08f..7b6333ba656 100644 --- a/tests/ut/python/dataset/test_from_dataset.py +++ b/tests/ut/python/dataset/test_from_dataset.py @@ -134,7 +134,7 @@ def test_from_dataset_exceptions(): test_config("text", (), 1, "freq_range needs to be a tuple of 2 integers or an int and a None.") test_config("text", (2, 3), 1.2345, "Argument top_k with value 1.2345 is not of type (, )") - test_config(23, (2, 3), 1.2345, "Argument col_0 with value 23 is not of type (,)") + test_config(23, (2, 3), 1.2345, "Argument col[0] with value 23 is not of type (,)") test_config("text", (100, 1), 12, "frequency range [a,b] should be 0 <= a <= b (a,b are inclusive)") test_config("text", (2, 3), 0, "top_k must be greater than 0") test_config([123], (2, 3), -1, "top_k must be greater than 0") diff --git a/tests/ut/python/dataset/test_random_crop_and_resize.py b/tests/ut/python/dataset/test_random_crop_and_resize.py index 486d2cd5ed1..58ae31f4d27 100644 --- a/tests/ut/python/dataset/test_random_crop_and_resize.py +++ b/tests/ut/python/dataset/test_random_crop_and_resize.py @@ -332,11 +332,37 @@ def test_random_crop_and_resize_comp(plot=False): image_c_cropped.append(c_image) image_py_cropped.append(py_image) mse = diff_mse(c_image, py_image) - assert mse < 0.02 # rounding error + assert mse < 0.02 # rounding error if plot: visualize_list(image_c_cropped, image_py_cropped, visualize_mode=2) +def test_random_crop_and_resize_06(): + """ + Test RandomCropAndResize with c_transforms: invalid values for scale, + expected to raise ValueError + """ + logger.info("test_random_crop_and_resize_05_c") + + # Generate dataset + data = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False) + decode_op = c_vision.Decode() + try: + random_crop_and_resize_op = c_vision.RandomResizedCrop((256, 512), scale="", ratio=(1, 0.5)) + data = data.map(input_columns=["image"], operations=decode_op) + data.map(input_columns=["image"], operations=random_crop_and_resize_op) + except TypeError as e: + logger.info("Got an exception in DE: {}".format(str(e))) + assert "Argument scale with value \"\" is not of type (,)" in str(e) + + try: + random_crop_and_resize_op = c_vision.RandomResizedCrop((256, 512), scale=(1, "2"), ratio=(1, 0.5)) + data = data.map(input_columns=["image"], operations=decode_op) + data.map(input_columns=["image"], operations=random_crop_and_resize_op) + except TypeError as e: + logger.info("Got an exception in DE: {}".format(str(e))) + assert "Argument scale[1] with value 2 is not of type (, )." 
in str(e) + if __name__ == "__main__": test_random_crop_and_resize_op_c(True) test_random_crop_and_resize_op_py(True) @@ -347,4 +373,5 @@ if __name__ == "__main__": test_random_crop_and_resize_04_py() test_random_crop_and_resize_05_c() test_random_crop_and_resize_05_py() + test_random_crop_and_resize_06() test_random_crop_and_resize_comp(True) diff --git a/tests/ut/python/dataset/test_random_crop_with_bbox.py b/tests/ut/python/dataset/test_random_crop_with_bbox.py index b93c638f41d..69fb0d63209 100644 --- a/tests/ut/python/dataset/test_random_crop_with_bbox.py +++ b/tests/ut/python/dataset/test_random_crop_with_bbox.py @@ -178,13 +178,15 @@ def test_random_crop_with_bbox_op_edge_c(plot_vis=False): dataVoc1 = dataVoc1.map(input_columns=["image", "annotation"], output_columns=["image", "annotation"], columns_order=["image", "annotation"], - operations=[lambda img, bboxes: (img, np.array([[0, 0, img.shape[1], img.shape[0]]]).astype(bboxes.dtype))]) + operations=[lambda img, bboxes: ( + img, np.array([[0, 0, img.shape[1], img.shape[0]]]).astype(bboxes.dtype))]) # Test Op added to list of Operations here dataVoc2 = dataVoc2.map(input_columns=["image", "annotation"], output_columns=["image", "annotation"], columns_order=["image", "annotation"], - operations=[lambda img, bboxes: (img, np.array([[0, 0, img.shape[1], img.shape[0]]]).astype(bboxes.dtype)), test_op]) + operations=[lambda img, bboxes: ( + img, np.array([[0, 0, img.shape[1], img.shape[0]]]).astype(bboxes.dtype)), test_op]) unaugSamp, augSamp = [], [] @@ -239,6 +241,29 @@ def test_random_crop_with_bbox_op_bad_c(): check_bad_bbox(data_voc2, test_op, InvalidBBoxType.WrongShape, "4 features") +def test_random_crop_with_bbox_op_negative_padding(): + """ + Test RandomCropWithBBox Op on invalid constructor parameters, expected to raise ValueError + """ + logger.info("test_random_crop_with_bbox_op_invalid_c") + + dataVoc2 = ds.VOCDataset(DATA_DIR_VOC, task="Detection", mode="train", decode=True, shuffle=False) + + try: + test_op = c_vision.RandomCropWithBBox([512, 512], padding=-1) + + dataVoc2 = dataVoc2.map(input_columns=["image", "annotation"], + output_columns=["image", "annotation"], + columns_order=["image", "annotation"], + operations=[test_op]) + + for _ in dataVoc2.create_dict_iterator(): + break + except ValueError as err: + logger.info("Got an exception in DE: {}".format(str(err))) + assert "Input padding is not within the required interval of (0 to 2147483647)." 
in str(err) + + if __name__ == "__main__": test_random_crop_with_bbox_op_c(plot_vis=True) test_random_crop_with_bbox_op_coco_c(plot_vis=True) @@ -247,3 +272,4 @@ if __name__ == "__main__": test_random_crop_with_bbox_op_edge_c(plot_vis=True) test_random_crop_with_bbox_op_invalid_c() test_random_crop_with_bbox_op_bad_c() + test_random_crop_with_bbox_op_negative_padding() diff --git a/tests/ut/python/dataset/test_resize_with_bbox.py b/tests/ut/python/dataset/test_resize_with_bbox.py index 3bb731ee970..1dfe0cf987d 100644 --- a/tests/ut/python/dataset/test_resize_with_bbox.py +++ b/tests/ut/python/dataset/test_resize_with_bbox.py @@ -16,9 +16,10 @@ Testing the resize with bounding boxes op in DE """ import numpy as np +import pytest + import mindspore.dataset as ds import mindspore.dataset.transforms.vision.c_transforms as c_vision - from mindspore import log as logger from util import visualize_with_bounding_boxes, InvalidBBoxType, check_bad_bbox, \ save_and_check_md5 @@ -172,6 +173,18 @@ def test_resize_with_bbox_op_bad_c(): check_bad_bbox(data_voc2, test_op, InvalidBBoxType.WrongShape, "4 features") +def test_resize_with_bbox_op_params_outside_of_interpolation_dict(): + """ + Test passing in a invalid key for interpolation + """ + logger.info("test_resize_with_bbox_op_params_outside_of_interpolation_dict") + + size = (500, 500) + more_para = None + with pytest.raises(KeyError, match="None"): + c_vision.ResizeWithBBox(size, more_para) + + if __name__ == "__main__": test_resize_with_bbox_op_voc_c(plot_vis=False) test_resize_with_bbox_op_coco_c(plot_vis=False) diff --git a/tests/ut/python/dataset/test_uniform_augment.py b/tests/ut/python/dataset/test_uniform_augment.py index e5b66696eaf..c0047226830 100644 --- a/tests/ut/python/dataset/test_uniform_augment.py +++ b/tests/ut/python/dataset/test_uniform_augment.py @@ -166,10 +166,10 @@ def test_cpp_uniform_augment_exception_pyops(num_ops=2): F.Invert()] with pytest.raises(TypeError) as e: - _ = C.UniformAugment(operations=transforms_ua, num_ops=num_ops) + C.UniformAugment(operations=transforms_ua, num_ops=num_ops) logger.info("Got an exception in DE: {}".format(str(e))) - assert "Argument tensor_op_5 with value" \ + assert "Argument tensor_ops[5] with value" \ " ,)" in str(e.value) From c41fbd96b707d6f65f05b77ed7bf9563e133e5c4 Mon Sep 17 00:00:00 2001 From: Zirui Wu Date: Thu, 16 Jul 2020 09:47:05 -0400 Subject: [PATCH 42/68] fix minor comments follow up --- mindspore/ccsrc/minddata/dataset/kernels/compose_op.cc | 3 +-- .../dataset/kernels/image/random_select_subpolicy_op.cc | 2 +- .../ccsrc/minddata/dataset/kernels/random_apply_op.cc | 2 +- .../ccsrc/minddata/dataset/kernels/random_choice_op.cc | 2 +- .../ccsrc/minddata/dataset/text/kernels/lookup_op.cc | 8 ++++---- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/mindspore/ccsrc/minddata/dataset/kernels/compose_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/compose_op.cc index 35128d3e886..152d779f1ec 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/compose_op.cc +++ b/mindspore/ccsrc/minddata/dataset/kernels/compose_op.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,7 +18,6 @@ #include #include "minddata/dataset/core/tensor.h" -#include "minddata/dataset/kernels/py_func_op.h" #include "minddata/dataset/kernels/tensor_op.h" #include "minddata/dataset/util/status.h" diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.cc index 3a789ab3444..d01231f1f84 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.cc +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/random_select_subpolicy_op.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.cc index 783d5077ccb..40bd5510138 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.cc +++ b/mindspore/ccsrc/minddata/dataset/kernels/random_apply_op.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.cc index fc81e85741f..e54278fdd33 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.cc +++ b/mindspore/ccsrc/minddata/dataset/kernels/random_choice_op.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019 Huawei Technologies Co., Ltd + * Copyright 2020 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc b/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc index 02b75bc4f9b..d1b4ad24b86 100644 --- a/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc +++ b/mindspore/ccsrc/minddata/dataset/text/kernels/lookup_op.cc @@ -26,7 +26,7 @@ LookupOp::LookupOp(std::shared_ptr vocab, WordIdType default_id) Status LookupOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { IO_CHECK(input, output); RETURN_UNEXPECTED_IF_NULL(vocab_); - CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None String Tensor."); + CHECK_FAIL_RETURN_UNEXPECTED(input->type() == DataType::DE_STRING, "None string tensor received."); std::vector word_ids; word_ids.reserve(input->Size()); for (auto itr = input->begin(); itr != input->end(); itr++) { @@ -34,7 +34,7 @@ Status LookupOp::Compute(const std::shared_ptr &input, std::shared_ptrshape(), type_, @@ -42,8 +42,8 @@ Status LookupOp::Compute(const std::shared_ptr &input, std::shared_ptr &inputs, std::vector &outputs) { - CHECK_FAIL_RETURN_UNEXPECTED(inputs.size() == NumInput() && outputs.size() == NumOutput(), "size doesn't match"); - CHECK_FAIL_RETURN_UNEXPECTED(inputs[0] == DataType::DE_STRING, "None String tensor type"); + CHECK_FAIL_RETURN_UNEXPECTED(inputs.size() == NumInput() && outputs.size() == NumOutput(), "size doesn't match."); + CHECK_FAIL_RETURN_UNEXPECTED(inputs[0] == DataType::DE_STRING, "None String tensor type."); outputs[0] = type_; return Status::OK(); } From 1d66467d475b8a06c07adf1e48add557341ab33e Mon Sep 17 00:00:00 2001 From: jinyaohui Date: Thu, 16 Jul 2020 10:40:57 +0800 Subject: [PATCH 43/68] opt add ps logic --- mindspore/nn/optim/adam.py | 43 ++++++++++++++++++++------- mindspore/nn/optim/ftrl.py | 32 +++++++++++++++----- mindspore/nn/optim/momentum.py | 21 +++++++++---- mindspore/nn/optim/optimizer.py | 2 ++ mindspore/ops/operations/other_ops.py | 2 ++ 5 files changed, 76 insertions(+), 24 deletions(-) diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py index eb6e64074f0..39abec5664d 100755 --- a/mindspore/nn/optim/adam.py +++ b/mindspore/nn/optim/adam.py @@ -71,7 +71,6 @@ def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, grad next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2, op_square(gradient_fp32)) - update = next_m / (eps + op_sqrt(next_v)) if decay_flag: update = op_mul(weight_decay_tensor, param_fp32) + update @@ -110,26 +109,45 @@ def _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, po @_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tuple", - "Tensor", "Tensor", "Tensor") + "Tensor", "Tensor", "Tensor", "Bool") def _run_opt_with_sparse(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, - moment1, moment2): + moment1, moment2, ps_parameter): """Apply sparse adam optimizer to the weight parameter when the gradient is sparse.""" success = True - success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, - eps, gradient[1], gradient[0])) + if ps_parameter: + op_shape = P.Shape() + _ps_pull = P.Pull() + _ps_push = P.Push("Adam", [0, 1, 2]) + shapes = (op_shape(params), op_shape(moment1), op_shape(moment2), + op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1), + op_shape(beta2), op_shape(eps), op_shape(gradient[1]), op_shape(gradient[0])) + success = 
F.depend(success, _ps_pull(_ps_push((beta1_power, beta2_power, lr, beta1, beta2, + eps, gradient[1], gradient[0]), shapes), params)) + else: + success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, + eps, gradient[1], gradient[0])) return success @_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", - "Tensor", "Tensor", "Tensor") + "Tensor", "Tensor", "Tensor", "Bool") def _run_opt_with_one_number(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, - moment1, moment2): + moment1, moment2, ps_parameter): """Apply adam optimizer to the weight parameter using Tensor.""" success = True - success = F.depend(success, opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, - eps, gradient)) + if ps_parameter: + op_shape = P.Shape() + _ps_pull = P.Pull() + _ps_push = P.Push("Adam", [0, 1, 2]) + success = F.depend(success, _ps_pull(_ps_push((beta1_power, beta2_power, lr, beta1, beta2, eps, gradient), + (op_shape(params), op_shape(moment1), op_shape(moment2))), + params)) + else: + success = F.depend(success, opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, + eps, gradient)) return success + @_adam_push_pull_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tuple", "Tensor", "Tensor", "Tensor") def _run_push_pull_opt_with_sparse(push, pull, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, @@ -156,6 +174,7 @@ def _run_push_pull_opt_with_one_number(push, pull, beta1_power, beta2_power, bet (op_shape(params), op_shape(moment1), op_shape(moment2))), params)) return success + class Adam(Optimizer): r""" Updates gradients by Adaptive Moment Estimation (Adam) algorithm. @@ -293,13 +312,14 @@ class Adam(Optimizer): if self.is_group_lr: success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt, beta1_power, beta2_power, self.beta1, self.beta2, self.eps), - lr, gradients, params, moment1, moment2) + lr, gradients, params, moment1, moment2, self.ps_parameters) else: success = self.map_(F.partial(_adam_opt, self.opt, self.sparse_opt, beta1_power, beta2_power, self.beta1, self.beta2, self.eps, lr), - gradients, params, moment1, moment2) + gradients, params, moment1, moment2, self.ps_parameters) return success + class PSAdam(Optimizer): '''The same usage as Adam optimizer except the parameters are set PS mode.''' def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, use_locking=False, @@ -346,6 +366,7 @@ class PSAdam(Optimizer): gradients, params, moment1, moment2) return success + class AdamWeightDecay(Optimizer): """ Implements Adam algorithm weight decay fix. 
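The dispatch pattern these optimizer changes repeat: when a parameter is hosted on the parameter server, its gradient and hyper-parameters are pushed to the server (together with the shapes the server needs to size the update), and the refreshed weight is pulled back; otherwise the local fused kernel runs. A minimal pure-Python sketch of that branch — push, pull and sgd_update here are illustrative stand-ins, not the P.Push/P.Pull primitives:

import numpy as np

def sgd_update(weight, grad, lr):
    # non-PS branch: apply the step locally, standing in for the fused kernel
    weight -= lr * grad
    return weight

def apply_update(weight, grad, lr, ps_parameter, push, pull):
    """One parameter's step; PS-hosted parameters round-trip through push/pull."""
    if ps_parameter:
        # ship the gradient plus hyper-parameters, keyed by the shapes the server needs
        key = push((lr, grad), shapes=(np.shape(lr), grad.shape, weight.shape))
        return pull(key, weight)  # the weight comes back already updated
    return sgd_update(weight, grad, lr)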
diff --git a/mindspore/nn/optim/ftrl.py b/mindspore/nn/optim/ftrl.py index dd2ebddfa78..97e139f2634 100644 --- a/mindspore/nn/optim/ftrl.py +++ b/mindspore/nn/optim/ftrl.py @@ -26,22 +26,38 @@ _ftrl_push_pull_opt = C.MultitypeFuncGraph("ftrl_opt") @_ftrl_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tuple", "Tensor", - "Tensor") -def _tensor_run_opt_with_sparse(opt, spars_opt, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment): + "Tensor", "Bool") +def _tensor_run_opt_with_sparse(opt, spars_opt, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment, + ps_parameter): """Apply sparse ftrl optimizer to the weight parameter when the gradient is sparse.""" success = True - success = F.depend(success, spars_opt(weight, moment, linear, gradient[1], gradient[0])) + if ps_parameter: + op_shape = P.Shape() + _ps_pull = P.Pull() + _ps_push = P.Push("Ftrl", [0, 1, 2]) + shapes = (op_shape(weight), op_shape(moment), op_shape(linear), op_shape(gradient[1]), op_shape(gradient[0])) + success = F.depend(success, _ps_pull(_ps_push((gradient[1], gradient[0]), shapes), weight)) + else: + success = F.depend(success, spars_opt(weight, moment, linear, gradient[1], gradient[0])) return success @_ftrl_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", - "Tensor") -def _tensor_run_opt(opt, spars_opt, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment): + "Tensor", "Bool") +def _tensor_run_opt(opt, spars_opt, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment, ps_parameter): """Apply ftrl optimizer to the weight parameter.""" success = True - success = F.depend(success, opt(weight, moment, linear, gradient, learning_rate, l1, l2, lr_power)) + if ps_parameter: + op_shape = P.Shape() + _ps_pull = P.Pull() + _ps_push = P.Push("Ftrl", [0, 1, 2]) + success = F.depend(success, _ps_pull(_ps_push((gradient, learning_rate, l1, l2, lr_power), + (op_shape(weight), op_shape(moment), op_shape(linear))), weight)) + else: + success = F.depend(success, opt(weight, moment, linear, gradient, learning_rate, l1, l2, lr_power)) return success + @_ftrl_push_pull_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tuple", "Tensor", "Tensor") def _tensor_run_push_pull_opt_with_sparse(push, pull, learning_rate, l1, l2, lr_power, linear, gradient, @@ -63,6 +79,7 @@ def _tensor_run_push_pull_opt_with_one_number(push, pull, learning_rate, l1, l2, (op_shape(weight), op_shape(moment), op_shape(linear))), weight)) return success + def _check_param(initial_accum, lr_power, l1, l2, use_locking, weight_decay=0.0, prim_name=None): """Check param.""" validator.check_value_type("initial_accum", initial_accum, [float], prim_name) @@ -150,9 +167,10 @@ class FTRL(Optimizer): grads = self.scale_grad(grads) success = self.map_(F.partial(_ftrl_opt, self.opt, self.sparse_opt, lr, self.l1, self.l2, self.lr_power), - linear, grads, params, moments) + linear, grads, params, moments, self.ps_parameters) return success + class PSFTRL(Optimizer): def __init__(self, params, initial_accum=0.1, learning_rate=0.001, lr_power=-0.5, l1=0.0, l2=0.0, use_locking=False, loss_scale=1.0, weight_decay=0.0): diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py index 1e8ce855707..a823557defc 100755 --- a/mindspore/nn/optim/momentum.py +++ b/mindspore/nn/optim/momentum.py @@ -13,7 +13,7 @@ # limitations under the License. 
# ============================================================================ """momentum""" -from mindspore.ops import functional as F, composite as C +from mindspore.ops import functional as F, composite as C, operations as P from mindspore.ops import _selected_ops from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor @@ -25,11 +25,18 @@ from .optimizer import Optimizer _momentum_opt = C.MultitypeFuncGraph("momentum_opt") -@_momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor") -def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, moment): +@_momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool") +def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, moment, ps_parameter): """Apply momentum optimizer to the weight parameter using Tensor.""" success = True - success = F.depend(success, opt(weight, moment, learning_rate, gradient, momentum)) + if ps_parameter: + op_shape = P.Shape() + _ps_pull = P.Pull() + _ps_push = P.Push("Momentum", []) + shapes = (op_shape(learning_rate), op_shape(gradient), op_shape(momentum)) + success = F.depend(success, _ps_pull(_ps_push((learning_rate, gradient, momentum), shapes), weight)) + else: + success = F.depend(success, opt(weight, moment, learning_rate, gradient, momentum)) return success @@ -127,7 +134,9 @@ class Momentum(Optimizer): gradients = self.scale_grad(gradients) lr = self.get_lr() if self.is_group_lr: - success = self.hyper_map(F.partial(_momentum_opt, self.opt, self.momentum), lr, gradients, params, moments) + success = self.hyper_map(F.partial(_momentum_opt, self.opt, self.momentum), lr, gradients, params, moments, + self.ps_parameters) else: - success = self.hyper_map(F.partial(_momentum_opt, self.opt, self.momentum, lr), gradients, params, moments) + success = self.hyper_map(F.partial(_momentum_opt, self.opt, self.momentum, lr), gradients, params, moments, + self.ps_parameters) return success diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py index cdf1565f349..f106e3678c1 100755 --- a/mindspore/nn/optim/optimizer.py +++ b/mindspore/nn/optim/optimizer.py @@ -152,6 +152,8 @@ class Optimizer(Cell): self.weight_decay = weight_decay * loss_scale decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name self.decay_flags = tuple(decay_filter(x) for x in self.parameters) + ps_filter = lambda x: x.is_param_ps + self.ps_parameters = tuple(ps_filter(x) for x in self.parameters) self.reciprocal_scale = 1.0 / loss_scale self.exec_weight_decay = any(self.decay_flags) self.param_length = len(self.parameters) diff --git a/mindspore/ops/operations/other_ops.py b/mindspore/ops/operations/other_ops.py index a58403f8834..6555b03aa9a 100644 --- a/mindspore/ops/operations/other_ops.py +++ b/mindspore/ops/operations/other_ops.py @@ -511,6 +511,7 @@ class Push(PrimitiveWithInfer): @prim_attr_register def __init__(self, optim_type='ApplyMomentum', only_shape_indices=None): """init Push""" + self.add_prim_attr("primitive_target", "CPU") self.init_prim_io_names(inputs=['optim_inputs', 'optim_input_shapes'], outputs=['key']) def infer_shape(self, inputs, shapes): @@ -534,6 +535,7 @@ class Pull(PrimitiveWithInfer): @prim_attr_register def __init__(self): """init Pull""" + self.add_prim_attr("primitive_target", "CPU") self.init_prim_io_names(inputs=['key', 'weight'], outputs=['output']) def infer_shape(self, key_shape, weight_shape): From 5647889c0dfe7d71d3a4d6c7b6ce9825088c8714 Mon Sep 
17 00:00:00 2001 From: islam_amin Date: Sun, 12 Jul 2020 12:06:56 -0400 Subject: [PATCH 44/68] Added AutoContrast Op --- .../minddata/dataset/api/python_bindings.cc | 6 + .../dataset/kernels/image/CMakeLists.txt | 1 + .../dataset/kernels/image/auto_contrast_op.cc | 34 ++++ .../dataset/kernels/image/auto_contrast_op.h | 61 +++++++ .../dataset/kernels/image/image_utils.cc | 103 ++++++++++++ .../dataset/kernels/image/image_utils.h | 8 + .../minddata/dataset/kernels/tensor_op.h | 1 + .../dataset/transforms/vision/c_transforms.py | 20 ++- .../dataset/transforms/vision/validators.py | 21 +++ .../golden/autcontrast_01_result_py.npz | Bin 0 -> 607 bytes tests/ut/python/dataset/test_autocontrast.py | 159 +++++++++++++++++- 11 files changed, 408 insertions(+), 6 deletions(-) create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/image/auto_contrast_op.cc create mode 100644 mindspore/ccsrc/minddata/dataset/kernels/image/auto_contrast_op.h create mode 100644 tests/ut/data/dataset/golden/autcontrast_01_result_py.npz diff --git a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc index a20c5c80ce9..b5a6dc59e09 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc @@ -48,6 +48,7 @@ #include "minddata/dataset/kernels/data/slice_op.h" #include "minddata/dataset/kernels/data/to_float16_op.h" #include "minddata/dataset/kernels/data/type_cast_op.h" +#include "minddata/dataset/kernels/image/auto_contrast_op.h" #include "minddata/dataset/kernels/image/bounding_box_augment_op.h" #include "minddata/dataset/kernels/image/center_crop_op.h" #include "minddata/dataset/kernels/image/cut_out_op.h" @@ -362,6 +363,11 @@ void bindTensorOps1(py::module *m) { (void)py::class_>(*m, "TensorOp") .def("__deepcopy__", [](py::object &t, py::dict memo) { return t; }); + (void)py::class_>( + *m, "AutoContrastOp", "Tensor operation to apply autocontrast on an image.") + .def(py::init>(), py::arg("cutoff") = AutoContrastOp::kCutOff, + py::arg("ignore") = AutoContrastOp::kIgnore); + (void)py::class_>( *m, "NormalizeOp", "Tensor operation to normalize an image. Takes mean and std.") .def(py::init(), py::arg("meanR"), py::arg("meanG"), py::arg("meanB"), diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt b/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt index 402989af0de..743fc83c149 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/CMakeLists.txt @@ -1,6 +1,7 @@ file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) add_library(kernels-image OBJECT + auto_contrast_op.cc center_crop_op.cc cut_out_op.cc decode_op.cc diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/auto_contrast_op.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/auto_contrast_op.cc new file mode 100644 index 00000000000..417d16783c8 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/auto_contrast_op.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include "minddata/dataset/kernels/image/auto_contrast_op.h" +#include "minddata/dataset/kernels/image/image_utils.h" + +namespace mindspore { +namespace dataset { + +const float AutoContrastOp::kCutOff = 0.0; +const std::vector AutoContrastOp::kIgnore = {}; + +Status AutoContrastOp::Compute(const std::shared_ptr &input, std::shared_ptr *output) { + IO_CHECK(input, output); + return AutoContrast(input, output, cutoff_, ignore_); +} +} // namespace dataset +} // namespace mindspore diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/auto_contrast_op.h b/mindspore/ccsrc/minddata/dataset/kernels/image/auto_contrast_op.h new file mode 100644 index 00000000000..94b3b23df65 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/auto_contrast_op.h @@ -0,0 +1,61 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DATASET_KERNELS_IMAGE_AUTO_CONTRAST_OP_H_ +#define DATASET_KERNELS_IMAGE_AUTO_CONTRAST_OP_H_ + +#include +#include +#include + +#include "minddata/dataset/core/tensor.h" +#include "minddata/dataset/core/cv_tensor.h" +#include "minddata/dataset/kernels/tensor_op.h" +#include "minddata/dataset/util/status.h" + +namespace mindspore { +namespace dataset { +class AutoContrastOp : public TensorOp { + public: + /// Default cutoff to be used + static const float kCutOff; + /// Default ignore to be used + static const std::vector kIgnore; + + AutoContrastOp(const float &cutoff, const std::vector &ignore) : cutoff_(cutoff), ignore_(ignore) {} + + ~AutoContrastOp() override = default; + + /// Provide stream operator for displaying it + friend std::ostream &operator<<(std::ostream &out, const AutoContrastOp &so) { + so.Print(out); + return out; + } + + void Print(std::ostream &out) const override { out << Name(); } + + std::string Name() const override { return kAutoContrastOp; } + + Status Compute(const std::shared_ptr &input, std::shared_ptr *output) override; + + private: + float cutoff_; + std::vector ignore_; +}; +} // namespace dataset +} // namespace mindspore + +#endif // DATASET_KERNELS_IMAGE_AUTO_CONTRAST_OP_H_ diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc index ddbce3e23ac..dac076a5f43 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.cc @@ -585,6 +585,109 @@ Status AdjustContrast(const std::shared_ptr &input, std::shared_ptr &input, std::shared_ptr *output, const float &cutoff, + const std::vector &ignore) { + try { + std::shared_ptr input_cv = CVTensor::AsCVTensor(input); + if (!input_cv->mat().data) { + RETURN_STATUS_UNEXPECTED("Could not convert to CV Tensor"); + } + if (input_cv->Rank() != 3 && input_cv->Rank() != 2) { + RETURN_STATUS_UNEXPECTED("Shape not or "); + } + // Reshape to extend dimension if rank is 2 for algorithm to work. then reshape output to be of rank 2 like input + if (input_cv->Rank() == 2) { + RETURN_IF_NOT_OK(input_cv->ExpandDim(2)); + } + // Get number of channels and image matrix + std::size_t num_of_channels = input_cv->shape()[2]; + if (num_of_channels != 1 && num_of_channels != 3) { + RETURN_STATUS_UNEXPECTED("Number of channels is not 1 or 3."); + } + cv::Mat image = input_cv->mat(); + // Separate the image to channels + std::vector planes(num_of_channels); + cv::split(image, planes); + cv::Mat b_hist, g_hist, r_hist; + // Establish the number of bins and set variables for histogram + int32_t hist_size = 256; + int32_t channels = 0; + float range[] = {0, 256}; + const float *hist_range[] = {range}; + bool uniform = true, accumulate = false; + // Set up lookup table for LUT(Look up table algorithm) + std::vector table; + std::vector image_result; + for (std::size_t layer = 0; layer < planes.size(); layer++) { + // Reset lookup table + table = std::vector{}; + // Calculate Histogram for channel + cv::Mat hist; + cv::calcHist(&planes[layer], 1, &channels, cv::Mat(), hist, 1, &hist_size, hist_range, uniform, accumulate); + hist.convertTo(hist, CV_32SC1); + std::vector hist_vec; + hist.col(0).copyTo(hist_vec); + // Ignore values in ignore + for (const auto &item : ignore) hist_vec[item] = 0; + int32_t n = std::accumulate(hist_vec.begin(), hist_vec.end(), 0); + // Find pixel values that are in the low cutoff and high cutoff. 
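+      // cut = cutoff% of the non-ignored pixel count; the loop below consumes it
+      // from the dark tail of the histogram and the next loop from the bright
+      // tail, so the lo/hi scan further down lands on the surviving extremes.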
+ int32_t cut = static_cast((cutoff / 100.0) * n); + if (cut != 0) { + for (int32_t lo = 0; lo < 256 && cut > 0; lo++) { + if (cut > hist_vec[lo]) { + cut -= hist_vec[lo]; + hist_vec[lo] = 0; + } else { + hist_vec[lo] -= cut; + cut = 0; + } + } + cut = static_cast((cutoff / 100.0) * n); + for (int32_t hi = 255; hi >= 0 && cut > 0; hi--) { + if (cut > hist_vec[hi]) { + cut -= hist_vec[hi]; + hist_vec[hi] = 0; + } else { + hist_vec[hi] -= cut; + cut = 0; + } + } + } + int32_t lo = 0; + int32_t hi = 255; + for (; lo < 256 && !hist_vec[lo]; lo++) { + } + for (; hi >= 0 && !hist_vec[hi]; hi--) { + } + if (hi <= lo) { + for (int32_t i = 0; i < 256; i++) { + table.push_back(i); + } + } else { + float scale = 255.0 / (hi - lo); + float offset = -1 * lo * scale; + for (int32_t i = 0; i < 256; i++) { + int32_t ix = static_cast(i * scale + offset); + ix = std::max(ix, 0); + ix = std::min(ix, 255); + table.push_back(ix); + } + } + cv::Mat result_layer; + cv::LUT(planes[layer], table, result_layer); + image_result.push_back(result_layer); + } + cv::Mat result; + cv::merge(image_result, result); + std::shared_ptr output_cv = std::make_shared(result); + if (input_cv->Rank() == 2) output_cv->Squeeze(); + (*output) = std::static_pointer_cast(output_cv); + } catch (const cv::Exception &e) { + RETURN_STATUS_UNEXPECTED("Error in auto contrast"); + } + return Status::OK(); +} + Status AdjustSaturation(const std::shared_ptr &input, std::shared_ptr *output, const float &alpha) { try { std::shared_ptr input_cv = CVTensor::AsCVTensor(input); diff --git a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.h b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.h index f489c7367b9..c1426338954 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.h +++ b/mindspore/ccsrc/minddata/dataset/kernels/image/image_utils.h @@ -175,6 +175,14 @@ Status AdjustBrightness(const std::shared_ptr &input, std::shared_ptr &input, std::shared_ptr *output, const float &alpha); +// Returns image with contrast maximized. +// @param input: Tensor of shape // in RGB/Grayscale and any OpenCv compatible type, see CVTensor. +// @param cutoff: Cutoff percentage of how many pixels are to be removed (high pixels change to 255 and low change to 0) +// from the high and low ends of the histogram. +// @param ignore: Pixel values to be ignored in the algorithm. +Status AutoContrast(const std::shared_ptr &input, std::shared_ptr *output, const float &cutoff, + const std::vector &ignore); + // Returns image with adjusted saturation. // @param input: Tensor of shape in RGB order and any OpenCv compatible type, see CVTensor. // @param alpha: Alpha value to adjust saturation by. Should be a positive number. 
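For reference, the same algorithm reads compactly in NumPy. This is an approximate single-channel sketch, not the shipped implementation — in particular it trims both tails against the original histogram instead of re-trimming after the low pass:

import numpy as np

def autocontrast_plane(plane, cutoff=0.0, ignore=()):
    """Stretch a uint8 plane so its trimmed histogram spans [0, 255]."""
    hist = np.bincount(plane.ravel(), minlength=256).astype(np.int64)
    for v in ignore:
        hist[v] = 0                          # ignored values never drive lo/hi
    cut = int(cutoff / 100.0 * hist.sum())   # pixels to drop from each tail
    lo_ok = np.nonzero(hist.cumsum() > cut)[0]
    hi_ok = np.nonzero(hist[::-1].cumsum()[::-1] > cut)[0]
    if lo_ok.size == 0 or hi_ok.size == 0 or hi_ok[-1] <= lo_ok[0]:
        return plane.copy()                  # degenerate histogram: identity map
    lo, hi = int(lo_ok[0]), int(hi_ok[-1])
    scale = 255.0 / (hi - lo)
    table = np.clip((np.arange(256) - lo) * scale, 0, 255).astype(np.uint8)
    return table[plane]                      # apply the lookup table, as cv::LUT does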
diff --git a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h index 27bcfed0077..cae28fe6f31 100644 --- a/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h +++ b/mindspore/ccsrc/minddata/dataset/kernels/tensor_op.h @@ -87,6 +87,7 @@ namespace mindspore { namespace dataset { // image +constexpr char kAutoContrastOp[] = "AutoContrastOp"; constexpr char kBoundingBoxAugmentOp[] = "BoundingBoxAugmentOp"; constexpr char kDecodeOp[] = "DecodeOp"; constexpr char kCenterCropOp[] = "CenterCropOp"; diff --git a/mindspore/dataset/transforms/vision/c_transforms.py b/mindspore/dataset/transforms/vision/c_transforms.py index ca356dd79c3..0715ec8e190 100644 --- a/mindspore/dataset/transforms/vision/c_transforms.py +++ b/mindspore/dataset/transforms/vision/c_transforms.py @@ -47,7 +47,7 @@ from .utils import Inter, Border from .validators import check_prob, check_crop, check_resize_interpolation, check_random_resize_crop, \ check_normalize_c, check_random_crop, check_random_color_adjust, check_random_rotation, check_range, \ check_resize, check_rescale, check_pad, check_cutout, check_uniform_augment_cpp, check_bounding_box_augment_cpp, \ - check_random_select_subpolicy_op, FLOAT_MAX_INTEGER + check_random_select_subpolicy_op, check_auto_contrast, FLOAT_MAX_INTEGER DE_C_INTER_MODE = {Inter.NEAREST: cde.InterpolationMode.DE_INTER_NEAREST_NEIGHBOUR, Inter.LINEAR: cde.InterpolationMode.DE_INTER_LINEAR, @@ -71,6 +71,24 @@ def parse_padding(padding): return padding +class AutoContrast(cde.AutoContrastOp): + """ + Apply auto contrast on input image. + + Args: + cutoff (float, optional): Percent of pixels to cut off from the histogram (default=0.0). + ignore (int or sequence, optional): Pixel values to ignore (default=None). + """ + + @check_auto_contrast + def __init__(self, cutoff=0.0, ignore=None): + if ignore is None: + ignore = [] + if isinstance(ignore, int): + ignore = [ignore] + super().__init__(cutoff, ignore) + + class Invert(cde.InvertOp): """ Apply invert on input image in RGB mode. 
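Typical use of the new op from Python mirrors the other c_transforms; a small example with a placeholder dataset path:

import mindspore.dataset as ds
import mindspore.dataset.transforms.vision.c_transforms as C

# "path/to/images" stands in for any image-folder dataset on disk
data = ds.ImageFolderDatasetV2(dataset_dir="path/to/images", shuffle=False)
data = data.map(input_columns=["image"],
                operations=[C.Decode(),
                            C.AutoContrast(cutoff=1.0, ignore=[0, 255])])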
diff --git a/mindspore/dataset/transforms/vision/validators.py b/mindspore/dataset/transforms/vision/validators.py index 0f2bc2ce2e3..b4ac03488f6 100644 --- a/mindspore/dataset/transforms/vision/validators.py +++ b/mindspore/dataset/transforms/vision/validators.py @@ -530,6 +530,27 @@ def check_bounding_box_augment_cpp(method): return new_method +def check_auto_contrast(method): + """Wrapper method to check the parameters of AutoContrast ops (python and cpp).""" + + @wraps(method) + def new_method(self, *args, **kwargs): + [cutoff, ignore], _ = parse_user_args(method, *args, **kwargs) + type_check(cutoff, (int, float), "cutoff") + check_value(cutoff, [0, 100], "cutoff") + if ignore is not None: + type_check(ignore, (list, tuple, int), "ignore") + if isinstance(ignore, int): + check_value(ignore, [0, 255], "ignore") + if isinstance(ignore, (list, tuple)): + for item in ignore: + type_check(item, (int,), "item") + check_value(item, [0, 255], "ignore") + return method(self, *args, **kwargs) + + return new_method + + def check_uniform_augment_py(method): """Wrapper method to check the parameters of python UniformAugment op.""" diff --git a/tests/ut/data/dataset/golden/autcontrast_01_result_py.npz b/tests/ut/data/dataset/golden/autcontrast_01_result_py.npz new file mode 100644 index 0000000000000000000000000000000000000000..6408ebf25080fb5aa983e5a97409c3b897158283 GIT binary patch literal 607 zcmWIWW@Zs#fB;2?b#dl4*BKcYK$w$3gdwr0DBeIXub`5VK>#cWQV5a+fysWMz5$Vp z3}p<}>M5zk$wlf`3hFif>N*PQY57GZMTvRw`9&$IAYr$}oZ?iVcyUHzK`M~1VWgvA zq^YA&t3W>BYG6*zE6pva)Jx7UO4Z9P%_+$Qx;L?sE50Z-IX|zsq^LBxgsYGNqKYdo z1tMF>=*`et$mGnJRLI<3$P!e@s^QJ(&E(D0R>%fbno?3(kjhoa9>E0kroTlYhc|;a zV|yWIP$8FwH**BY|22i&etv#l|A7EZc-xm0@+5V}7V>IDuma_C5|dJM3i)CS`7;`Rmcznn(=3%@~3l0 zALNP1Rh*31wv#PXEGbkTfr;q?|RyL3fBM_Pc KX(3S9F#rIHd!A(g literal 0 HcmV?d00001 diff --git a/tests/ut/python/dataset/test_autocontrast.py b/tests/ut/python/dataset/test_autocontrast.py index d212994e6e7..fd390b54832 100644 --- a/tests/ut/python/dataset/test_autocontrast.py +++ b/tests/ut/python/dataset/test_autocontrast.py @@ -16,20 +16,22 @@ Testing AutoContrast op in DE """ import numpy as np - import mindspore.dataset.engine as de import mindspore.dataset.transforms.vision.py_transforms as F +import mindspore.dataset.transforms.vision.c_transforms as C from mindspore import log as logger -from util import visualize_list, diff_mse +from util import visualize_list, diff_mse, save_and_check_md5 DATA_DIR = "../data/dataset/testImageNetData/train/" +GENERATE_GOLDEN = False -def test_auto_contrast(plot=False): + +def test_auto_contrast_py(plot=False): """ Test AutoContrast """ - logger.info("Test AutoContrast") + logger.info("Test AutoContrast Python Op") # Original Images ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) @@ -78,9 +80,156 @@ def test_auto_contrast(plot=False): mse[i] = diff_mse(images_auto_contrast[i], images_original[i]) logger.info("MSE= {}".format(str(np.mean(mse)))) + # Compare with expected md5 from images + filename = "autcontrast_01_result_py.npz" + save_and_check_md5(ds_auto_contrast, filename, generate_golden=GENERATE_GOLDEN) + if plot: visualize_list(images_original, images_auto_contrast) +def test_auto_contrast_c(plot=False): + """ + Test AutoContrast C Op + """ + logger.info("Test AutoContrast C Op") + + # AutoContrast Images + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + ds = ds.map(input_columns=["image"], + operations=[C.Decode(), + C.Resize((224, 224))]) + python_op = F.AutoContrast() + c_op = 
C.AutoContrast() + transforms_op = F.ComposeOp([lambda img: F.ToPIL()(img.astype(np.uint8)), + python_op, + np.array])() + + ds_auto_contrast_py = ds.map(input_columns="image", + operations=transforms_op) + + ds_auto_contrast_py = ds_auto_contrast_py.batch(512) + + for idx, (image, _) in enumerate(ds_auto_contrast_py): + if idx == 0: + images_auto_contrast_py = image + else: + images_auto_contrast_py = np.append(images_auto_contrast_py, + image, + axis=0) + + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + ds = ds.map(input_columns=["image"], + operations=[C.Decode(), + C.Resize((224, 224))]) + + ds_auto_contrast_c = ds.map(input_columns="image", + operations=c_op) + + ds_auto_contrast_c = ds_auto_contrast_c.batch(512) + + for idx, (image, _) in enumerate(ds_auto_contrast_c): + if idx == 0: + images_auto_contrast_c = image + else: + images_auto_contrast_c = np.append(images_auto_contrast_c, + image, + axis=0) + + num_samples = images_auto_contrast_c.shape[0] + mse = np.zeros(num_samples) + for i in range(num_samples): + mse[i] = diff_mse(images_auto_contrast_c[i], images_auto_contrast_py[i]) + logger.info("MSE= {}".format(str(np.mean(mse)))) + np.testing.assert_equal(np.mean(mse), 0.0) + + if plot: + visualize_list(images_auto_contrast_c, images_auto_contrast_py, visualize_mode=2) + + +def test_auto_contrast_one_channel_c(plot=False): + """ + Test AutoContrast C op with one channel + """ + logger.info("Test AutoContrast C Op With One Channel Images") + + # AutoContrast Images + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + ds = ds.map(input_columns=["image"], + operations=[C.Decode(), + C.Resize((224, 224))]) + python_op = F.AutoContrast() + c_op = C.AutoContrast() + # not using F.ToTensor() since it converts to floats + transforms_op = F.ComposeOp([lambda img: (np.array(img)[:, :, 0]).astype(np.uint8), + F.ToPIL(), + python_op, + np.array])() + + ds_auto_contrast_py = ds.map(input_columns="image", + operations=transforms_op) + + ds_auto_contrast_py = ds_auto_contrast_py.batch(512) + + for idx, (image, _) in enumerate(ds_auto_contrast_py): + if idx == 0: + images_auto_contrast_py = image + else: + images_auto_contrast_py = np.append(images_auto_contrast_py, + image, + axis=0) + + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + ds = ds.map(input_columns=["image"], + operations=[C.Decode(), + C.Resize((224, 224)), + lambda img: np.array(img[:, :, 0])]) + + ds_auto_contrast_c = ds.map(input_columns="image", + operations=c_op) + + ds_auto_contrast_c = ds_auto_contrast_c.batch(512) + + for idx, (image, _) in enumerate(ds_auto_contrast_c): + if idx == 0: + images_auto_contrast_c = image + else: + images_auto_contrast_c = np.append(images_auto_contrast_c, + image, + axis=0) + + num_samples = images_auto_contrast_c.shape[0] + mse = np.zeros(num_samples) + for i in range(num_samples): + mse[i] = diff_mse(images_auto_contrast_c[i], images_auto_contrast_py[i]) + logger.info("MSE= {}".format(str(np.mean(mse)))) + np.testing.assert_equal(np.mean(mse), 0.0) + + if plot: + visualize_list(images_auto_contrast_c, images_auto_contrast_py, visualize_mode=2) + + +def test_auto_contrast_invalid_input_c(): + """ + Test AutoContrast C Op with invalid params + """ + logger.info("Test AutoContrast C Op with invalid params") + try: + ds = de.ImageFolderDatasetV2(dataset_dir=DATA_DIR, shuffle=False) + ds = ds.map(input_columns=["image"], + operations=[C.Decode(), + C.Resize((224, 224)), + lambda img: np.array(img[:, :, 0])]) + # invalid ignore + ds = 
ds.map(input_columns="image", + operations=C.AutoContrast(ignore=255.5)) + except TypeError as error: + logger.info("Got an exception in DE: {}".format(str(error))) + assert "Argument ignore with value 255.5 is not of type" in str(error) + + if __name__ == "__main__": - test_auto_contrast(plot=True) + test_auto_contrast_py(plot=True) + test_auto_contrast_c(plot=True) + test_auto_contrast_one_channel_c(plot=True) + test_auto_contrast_invalid_input_c() From ab37e87d7a9fae0326cfc90f8df6b3bf9e25e0b6 Mon Sep 17 00:00:00 2001 From: tinazhang Date: Wed, 15 Jul 2020 16:52:56 -0400 Subject: [PATCH 45/68] adding Mnist python ut coverage --- .../python/dataset/test_datasets_cifarop.py | 7 + .../ut/python/dataset/test_datasets_mnist.py | 238 ++++++++++++++++++ .../python/dataset/test_datasets_sharding.py | 2 +- 3 files changed, 246 insertions(+), 1 deletion(-) create mode 100644 tests/ut/python/dataset/test_datasets_mnist.py diff --git a/tests/ut/python/dataset/test_datasets_cifarop.py b/tests/ut/python/dataset/test_datasets_cifarop.py index d6d3029b53b..2b66f326657 100644 --- a/tests/ut/python/dataset/test_datasets_cifarop.py +++ b/tests/ut/python/dataset/test_datasets_cifarop.py @@ -87,6 +87,13 @@ def test_cifar10_basic(): """ logger.info("Test Cifar10Dataset Op") + # case 0: test loading the whole dataset + data0 = ds.Cifar10Dataset(DATA_DIR_10) + num_iter0 = 0 + for _ in data0.create_dict_iterator(): + num_iter0 += 1 + assert num_iter0 == 10000 + # case 1: test num_samples data1 = ds.Cifar10Dataset(DATA_DIR_10, num_samples=100) num_iter1 = 0 diff --git a/tests/ut/python/dataset/test_datasets_mnist.py b/tests/ut/python/dataset/test_datasets_mnist.py new file mode 100644 index 00000000000..dfd6f7c6fc6 --- /dev/null +++ b/tests/ut/python/dataset/test_datasets_mnist.py @@ -0,0 +1,238 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +""" +Test Mnist dataset operators +""" +import os +import pytest +import numpy as np +import matplotlib.pyplot as plt +import mindspore.dataset as ds +from mindspore import log as logger + +DATA_DIR = "../data/dataset/testMnistData" + + +def load_mnist(path): + """ + load Mnist data + """ + labels_path = os.path.join(path, 't10k-labels-idx1-ubyte') + images_path = os.path.join(path, 't10k-images-idx3-ubyte') + with open(labels_path, 'rb') as lbpath: + lbpath.read(8) + labels = np.fromfile(lbpath, dtype=np.uint8) + with open(images_path, 'rb') as imgpath: + imgpath.read(16) + images = np.fromfile(imgpath, dtype=np.uint8) + images = images.reshape(-1, 28, 28, 1) + images[images > 0] = 255 # Perform binarization to maintain consistency with our API + return images, labels + + +def visualize_dataset(images, labels): + """ + Helper function to visualize the dataset samples + """ + num_samples = len(images) + for i in range(num_samples): + plt.subplot(1, num_samples, i + 1) + plt.imshow(images[i].squeeze(), cmap=plt.cm.gray) + plt.title(labels[i]) + plt.show() + + +def test_mnist_content_check(): + """ + Validate MnistDataset image readings + """ + logger.info("Test MnistDataset Op with content check") + data1 = ds.MnistDataset(DATA_DIR, num_samples=100, shuffle=False) + images, labels = load_mnist(DATA_DIR) + num_iter = 0 + # in this example, each dictionary has keys "image" and "label" + image_list, label_list = [], [] + for i, data in enumerate(data1.create_dict_iterator()): + image_list.append(data["image"]) + label_list.append("label {}".format(data["label"])) + np.testing.assert_array_equal(data["image"], images[i]) + np.testing.assert_array_equal(data["label"], labels[i]) + num_iter += 1 + assert num_iter == 100 + + +def test_mnist_basic(): + """ + Validate MnistDataset + """ + logger.info("Test MnistDataset Op") + + # case 1: test loading whole dataset + data1 = ds.MnistDataset(DATA_DIR) + num_iter1 = 0 + for _ in data1.create_dict_iterator(): + num_iter1 += 1 + assert num_iter1 == 10000 + + # case 2: test num_samples + data2 = ds.MnistDataset(DATA_DIR, num_samples=500) + num_iter2 = 0 + for _ in data2.create_dict_iterator(): + num_iter2 += 1 + assert num_iter2 == 500 + + # case 3: test repeat + data3 = ds.MnistDataset(DATA_DIR, num_samples=200) + data3 = data3.repeat(5) + num_iter3 = 0 + for _ in data3.create_dict_iterator(): + num_iter3 += 1 + assert num_iter3 == 1000 + + # case 4: test batch with drop_remainder=False + data4 = ds.MnistDataset(DATA_DIR, num_samples=100) + assert data4.get_dataset_size() == 100 + assert data4.get_batch_size() == 1 + data4 = data4.batch(batch_size=7) # drop_remainder is default to be False + assert data4.get_dataset_size() == 15 + assert data4.get_batch_size() == 7 + num_iter4 = 0 + for _ in data4.create_dict_iterator(): + num_iter4 += 1 + assert num_iter4 == 15 + + # case 5: test batch with drop_remainder=True + data5 = ds.MnistDataset(DATA_DIR, num_samples=100) + assert data5.get_dataset_size() == 100 + assert data5.get_batch_size() == 1 + data5 = data5.batch(batch_size=7, drop_remainder=True) # the rest of incomplete batch will be dropped + assert data5.get_dataset_size() == 14 + assert data5.get_batch_size() == 7 + num_iter5 = 0 + for _ in data5.create_dict_iterator(): + num_iter5 += 1 + assert num_iter5 == 14 + + +def test_mnist_pk_sampler(): + """ + Test MnistDataset with PKSampler + """ + logger.info("Test MnistDataset Op with PKSampler") + golden = [0, 0, 0, 1, 1, 
1, 2, 2, 2, 3, 3, 3, 4, 4, 4, + 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9] + sampler = ds.PKSampler(3) + data = ds.MnistDataset(DATA_DIR, sampler=sampler) + num_iter = 0 + label_list = [] + for item in data.create_dict_iterator(): + label_list.append(item["label"]) + num_iter += 1 + np.testing.assert_array_equal(golden, label_list) + assert num_iter == 30 + + +def test_mnist_sequential_sampler(): + """ + Test MnistDataset with SequentialSampler + """ + logger.info("Test MnistDataset Op with SequentialSampler") + num_samples = 50 + sampler = ds.SequentialSampler(num_samples=num_samples) + data1 = ds.MnistDataset(DATA_DIR, sampler=sampler) + data2 = ds.MnistDataset(DATA_DIR, shuffle=False, num_samples=num_samples) + label_list1, label_list2 = [], [] + num_iter = 0 + for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()): + label_list1.append(item1["label"]) + label_list2.append(item2["label"]) + num_iter += 1 + np.testing.assert_array_equal(label_list1, label_list2) + assert num_iter == num_samples + + +def test_mnist_exception(): + """ + Test error cases for MnistDataset + """ + logger.info("Test error cases for MnistDataset") + error_msg_1 = "sampler and shuffle cannot be specified at the same time" + with pytest.raises(RuntimeError, match=error_msg_1): + ds.MnistDataset(DATA_DIR, shuffle=False, sampler=ds.PKSampler(3)) + + error_msg_2 = "sampler and sharding cannot be specified at the same time" + with pytest.raises(RuntimeError, match=error_msg_2): + ds.MnistDataset(DATA_DIR, sampler=ds.PKSampler(3), num_shards=2, shard_id=0) + + error_msg_3 = "num_shards is specified and currently requires shard_id as well" + with pytest.raises(RuntimeError, match=error_msg_3): + ds.MnistDataset(DATA_DIR, num_shards=10) + + error_msg_4 = "shard_id is specified but num_shards is not" + with pytest.raises(RuntimeError, match=error_msg_4): + ds.MnistDataset(DATA_DIR, shard_id=0) + + error_msg_5 = "Input shard_id is not within the required interval" + with pytest.raises(ValueError, match=error_msg_5): + ds.MnistDataset(DATA_DIR, num_shards=5, shard_id=-1) + with pytest.raises(ValueError, match=error_msg_5): + ds.MnistDataset(DATA_DIR, num_shards=5, shard_id=5) + with pytest.raises(ValueError, match=error_msg_5): + ds.MnistDataset(DATA_DIR, num_shards=2, shard_id=5) + + error_msg_6 = "num_parallel_workers exceeds" + with pytest.raises(ValueError, match=error_msg_6): + ds.MnistDataset(DATA_DIR, shuffle=False, num_parallel_workers=0) + with pytest.raises(ValueError, match=error_msg_6): + ds.MnistDataset(DATA_DIR, shuffle=False, num_parallel_workers=65) + with pytest.raises(ValueError, match=error_msg_6): + ds.MnistDataset(DATA_DIR, shuffle=False, num_parallel_workers=-2) + + error_msg_7 = "Argument shard_id" + with pytest.raises(TypeError, match=error_msg_7): + ds.MnistDataset(DATA_DIR, num_shards=2, shard_id="0") + + +def test_mnist_visualize(plot=False): + """ + Visualize MnistDataset results + """ + logger.info("Test MnistDataset visualization") + + data1 = ds.MnistDataset(DATA_DIR, num_samples=10, shuffle=False) + num_iter = 0 + image_list, label_list = [], [] + for item in data1.create_dict_iterator(): + image = item["image"] + label = item["label"] + image_list.append(image) + label_list.append("label {}".format(label)) + assert isinstance(image, np.ndarray) + assert image.shape == (28, 28, 1) + assert image.dtype == np.uint8 + assert label.dtype == np.uint32 + num_iter += 1 + assert num_iter == 10 + if plot: + visualize_dataset(image_list, label_list) + + +if __name__ == 
'__main__': + test_mnist_content_check() + test_mnist_basic() + test_mnist_pk_sampler() + test_mnist_sequential_sampler() + test_mnist_exception() + test_mnist_visualize(plot=True) diff --git a/tests/ut/python/dataset/test_datasets_sharding.py b/tests/ut/python/dataset/test_datasets_sharding.py index 94c39fb34c7..ce6a30077fc 100644 --- a/tests/ut/python/dataset/test_datasets_sharding.py +++ b/tests/ut/python/dataset/test_datasets_sharding.py @@ -200,7 +200,7 @@ def test_cifar10_shardings(print_res=False): logger.info("labels of dataset: {}".format(res)) return res - # 60000 rows in total. CIFAR reads everything in memory which would make each test case very slow + # 10000 rows in total. CIFAR reads everything in memory which would make each test case very slow # therefore, only 2 test cases for now. assert sharding_config(10000, 9999, 7, False, 1) == [9] assert sharding_config(10000, 0, 4, False, 3) == [0, 0, 0] From c984b0fc8808a531facc4032a6c84b75c2109707 Mon Sep 17 00:00:00 2001 From: nhussain Date: Thu, 16 Jul 2020 13:26:24 -0400 Subject: [PATCH 46/68] fix size tuple --- .../dataset/transforms/vision/c_transforms.py | 2 - .../data/dataset/golden/resize_01_result.npz | Bin 0 -> 644 bytes .../data/dataset/golden/resize_02_result.npz | Bin 0 -> 644 bytes tests/ut/python/dataset/test_resize.py | 117 ++++++++++++++++++ 4 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 tests/ut/data/dataset/golden/resize_01_result.npz create mode 100644 tests/ut/data/dataset/golden/resize_02_result.npz create mode 100644 tests/ut/python/dataset/test_resize.py diff --git a/mindspore/dataset/transforms/vision/c_transforms.py b/mindspore/dataset/transforms/vision/c_transforms.py index 2de575d14d6..b4245cc2149 100644 --- a/mindspore/dataset/transforms/vision/c_transforms.py +++ b/mindspore/dataset/transforms/vision/c_transforms.py @@ -319,8 +319,6 @@ class Resize(cde.ResizeOp): @check_resize_interpolation def __init__(self, size, interpolation=Inter.LINEAR): - if isinstance(size, int): - size = (size, size) self.size = size self.interpolation = interpolation interpoltn = DE_C_INTER_MODE[interpolation] diff --git a/tests/ut/data/dataset/golden/resize_01_result.npz b/tests/ut/data/dataset/golden/resize_01_result.npz new file mode 100644 index 0000000000000000000000000000000000000000..b3a52243a4d660ff8dfde41256c43e0c479a7e56 GIT binary patch literal 644 zcmWIWW@Zs#fB;2?J%>!y{xC8yfG{V62t#5~QM`d(UO^=zg8*0%q!1(t0+anheFGvH z8Oj){)l*W7lZ(`?6x3_{)pZoq)AEZ-iW2kU^NUhaLBei{ImM|!@#2icf>a=1!%#=T zNK;3lR)KuL)xeybSDIT;sh6Bzl&Y6onp2VqbZ=rMSA0=wa(-TMNl|HX30ENlL={(F z3PiS$(VL;Qkja@bsgSw7kR_;)Rl}PR2$|Xn*`P{ON-7IdxeD1Mn1J5&w!y{xC8yfG{V62t#5~QM`d(UO^=zg8*0%q!1(t0+anheFGvH z8Oj){)l*W7lZ(`?6x3_{)pZoq)AEZ-iW2kU^NUhaLBei{ImM|!@#2icf>a=1!%#=T zNK;3lR)KuL)xeybSDIT;sh6Bzl&Y6onp2VqbZ=rMSA0=wa(-TMNl|HX30ENlL={(F z3PiS$(VL;Qkja@bsgSw7kR_;)Rl}PR2$|Xn*`P{ON-7IdxeD1Mn1J5&w Date: Thu, 16 Jul 2020 20:35:36 +0800 Subject: [PATCH 47/68] cast support more types --- .../akg/akg_kernel_attrs_process.cc | 11 +- mindspore/ops/_op_impl/akg/gpu/cast.py | 33 ++- tests/st/ops/gpu/test_cast_op.py | 272 ++++++++++++++++++ 3 files changed, 305 insertions(+), 11 deletions(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_attrs_process.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_attrs_process.cc index 73fdb5c11b5..e4d0a6c00a3 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_attrs_process.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_attrs_process.cc @@ -18,6 +18,7 @@ 
#include 
 #include "backend/session/anf_runtime_algorithm.h"
 #include "backend/optimizer/common/helper.h"
+#include "backend/kernel_compiler/common_utils.h"
 
 namespace mindspore {
 namespace kernel {
@@ -75,15 +76,7 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) {
   std::string dst_type;
   TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0);
-  if (output_type == kFloat32->type_id()) {
-    dst_type = "float32";
-  } else if (output_type == kFloat16->type_id()) {
-    dst_type = "float16";
-  } else if (output_type == kInt32->type_id()) {
-    dst_type = "int32";
-  } else {
-    MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString();
-  }
+  dst_type = TypeId2String(output_type);
   AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node);
 }
 
diff --git a/mindspore/ops/_op_impl/akg/gpu/cast.py b/mindspore/ops/_op_impl/akg/gpu/cast.py
index c8aef249cd9..3c9ffa89740 100644
--- a/mindspore/ops/_op_impl/akg/gpu/cast.py
+++ b/mindspore/ops/_op_impl/akg/gpu/cast.py
@@ -21,10 +21,39 @@ cast_op_info = AkgGpuRegOp("Cast") \
     .output(0, "output") \
     .attr("dst_type", "required", "str") \
     .dtype_format(DataType.F16_Default, DataType.F32_Default) \
-    .dtype_format(DataType.F32_Default, DataType.F16_Default) \
-    .dtype_format(DataType.F32_Default, DataType.I32_Default) \
+    .dtype_format(DataType.F16_Default, DataType.I32_Default) \
+    .dtype_format(DataType.F16_Default, DataType.F64_Default) \
+    .dtype_format(DataType.I32_Default, DataType.F16_Default) \
     .dtype_format(DataType.I32_Default, DataType.F32_Default) \
+    .dtype_format(DataType.I32_Default, DataType.I8_Default) \
+    .dtype_format(DataType.I32_Default, DataType.U8_Default) \
+    .dtype_format(DataType.I32_Default, DataType.BOOL_Default) \
+    .dtype_format(DataType.I8_Default, DataType.F64_Default) \
+    .dtype_format(DataType.I8_Default, DataType.F32_Default) \
+    .dtype_format(DataType.I8_Default, DataType.F16_Default) \
+    .dtype_format(DataType.I8_Default, DataType.I32_Default) \
+    .dtype_format(DataType.I8_Default, DataType.I16_Default) \
+    .dtype_format(DataType.I8_Default, DataType.I64_Default) \
     .dtype_format(DataType.BOOL_Default, DataType.F32_Default) \
+    .dtype_format(DataType.BOOL_Default, DataType.F16_Default) \
+    .dtype_format(DataType.BOOL_Default, DataType.F64_Default) \
+    .dtype_format(DataType.BOOL_Default, DataType.I8_Default) \
+    .dtype_format(DataType.BOOL_Default, DataType.I16_Default) \
+    .dtype_format(DataType.BOOL_Default, DataType.I32_Default) \
+    .dtype_format(DataType.BOOL_Default, DataType.I64_Default) \
+    .dtype_format(DataType.U8_Default, DataType.F32_Default) \
+    .dtype_format(DataType.U8_Default, DataType.F16_Default) \
+    .dtype_format(DataType.U8_Default, DataType.I32_Default) \
+    .dtype_format(DataType.I16_Default, DataType.F64_Default) \
+    .dtype_format(DataType.I16_Default, DataType.F32_Default) \
+    .dtype_format(DataType.I16_Default, DataType.F16_Default) \
+    .dtype_format(DataType.I16_Default, DataType.I32_Default) \
+    .dtype_format(DataType.I16_Default, DataType.I64_Default) \
+    .dtype_format(DataType.I64_Default, DataType.F64_Default) \
+    .dtype_format(DataType.I64_Default, DataType.F32_Default) \
+    .dtype_format(DataType.I64_Default, DataType.F16_Default) \
+    .dtype_format(DataType.F32_Default, DataType.I32_Default) \
+    .dtype_format(DataType.F32_Default, DataType.F16_Default) \
     .get_op_info()
 
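The table above only registers the legal (input dtype, output dtype) pairs for the AKG GPU Cast kernel;
the kernel code itself is generated by AKG. As a minimal sketch of exercising one of the newly added
pairs from Python (assumptions: a GPU build of MindSpore, and the illustrative `CastNet` wrapper below,
which simply mirrors the Net cell style used by the tests that follow):

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor, context
from mindspore.common import dtype as mstype
from mindspore.ops import operations as P

class CastNet(nn.Cell):
    """Illustrative wrapper: casts its input to the dst_type fixed at construction."""
    def __init__(self, dst_type):
        super(CastNet, self).__init__()
        self.cast = P.Cast()
        self.dst_type = dst_type

    def construct(self, x):
        return self.cast(x, self.dst_type)

context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
x = Tensor(np.arange(6).reshape((2, 3)).astype(np.int8))
out = CastNet(mstype.float64)(x)  # int8 -> float64 is one of the pairs added above
assert out.asnumpy().dtype == 'float64'

diff --git a/tests/st/ops/gpu/test_cast_op.py b/tests/st/ops/gpu/test_cast_op.py
index 793d92d7bc4..b3b48fcfa0f 100644
--- a/tests/st/ops/gpu/test_cast_op.py
+++ 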
b/tests/st/ops/gpu/test_cast_op.py @@ -70,3 +70,275 @@ def test_cast1(): assert type0 == 'float32' type1 = output[1].asnumpy().dtype assert type1 == 'float32' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast2(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.float16)) + t0 = mstype.int32 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.float16)) + t1 = mstype.float64 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'int32' + type1 = output[1].asnumpy().dtype + assert type1 == 'float64' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast3(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.float16)) + t0 = mstype.int32 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.float32)) + t1 = mstype.int32 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'int32' + type1 = output[1].asnumpy().dtype + assert type1 == 'int32' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast4(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int32)) + t0 = mstype.float16 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int32)) + t1 = mstype.int8 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'float16' + type1 = output[1].asnumpy().dtype + assert type1 == 'int8' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast5(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int32)) + t0 = mstype.uint8 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int32)) + t1 = mstype.bool_ + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'uint8' + type1 = output[1].asnumpy().dtype + assert type1 == 'bool' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast6(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int8)) + t0 = mstype.float64 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int8)) + t1 = mstype.float32 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'float64' + type1 = output[1].asnumpy().dtype + assert type1 == 'float32' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast7(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int8)) + t0 = mstype.float32 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int8)) + t1 = mstype.float16 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'float32' + type1 = output[1].asnumpy().dtype + assert type1 == 'float16' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast8(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int8)) + t0 = mstype.int32 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int8)) + t1 = mstype.int16 + + 
context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'int32' + type1 = output[1].asnumpy().dtype + assert type1 == 'int16' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast9(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int8)) + t0 = mstype.int64 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.bool)) + t1 = mstype.float16 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'int64' + type1 = output[1].asnumpy().dtype + assert type1 == 'float16' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast10(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.bool)) + t0 = mstype.int8 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.bool)) + t1 = mstype.float64 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'int8' + type1 = output[1].asnumpy().dtype + assert type1 == 'float64' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast11(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.bool)) + t0 = mstype.int16 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.bool)) + t1 = mstype.int32 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'int16' + type1 = output[1].asnumpy().dtype + assert type1 == 'int32' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast12(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.bool)) + t0 = mstype.int64 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.uint8)) + t1 = mstype.float32 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'int64' + type1 = output[1].asnumpy().dtype + assert type1 == 'float32' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast13(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.uint8)) + t0 = mstype.int32 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.uint8)) + t1 = mstype.float16 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'int32' + type1 = output[1].asnumpy().dtype + assert type1 == 'float16' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast14(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int16)) + t0 = mstype.float64 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int16)) + t1 = mstype.float32 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'float64' + type1 = output[1].asnumpy().dtype + assert type1 == 'float32' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast15(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int16)) + t0 = mstype.float16 + x1 = 
Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int16)) + t1 = mstype.int32 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'float16' + type1 = output[1].asnumpy().dtype + assert type1 == 'int32' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast16(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int16)) + t0 = mstype.float16 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int64)) + t1 = mstype.float64 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'float16' + type1 = output[1].asnumpy().dtype + assert type1 == 'float64' + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_cast17(): + x0 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int16)) + t0 = mstype.float32 + x1 = Tensor(np.arange(24).reshape((4, 3, 2)).astype(np.int16)) + t1 = mstype.float16 + + context.set_context(mode=context.GRAPH_MODE, device_target='GPU') + net = Net(t0, t1) + output = net(x0, x1) + type0 = output[0].asnumpy().dtype + assert type0 == 'float32' + type1 = output[1].asnumpy().dtype + assert type1 == 'float16' From 891b80b9f580e0f942df84571eaf97ec47f0aa7c Mon Sep 17 00:00:00 2001 From: kswang Date: Thu, 16 Jul 2020 11:58:54 +0800 Subject: [PATCH 48/68] add bucket reduce sparse gradient --- .../backend/kernel_compiler/common_utils.cc | 511 +++++++++++------- .../backend/kernel_compiler/common_utils.h | 28 +- .../cpu/sparse_apply_adam_cpu_kernel.cc | 18 +- .../cpu/sparse_apply_ftrl_cpu_kernel.cc | 17 +- .../cpu/sparse_apply_lazy_adam_cpu_kernel.cc | 16 +- ...parse_apply_proximal_adagrad_cpu_kernel.cc | 16 +- .../ccsrc/backend/session/cpu_session.cc | 45 +- tests/ut/cpp/kernel/common_utils_test.cc | 67 ++- 8 files changed, 465 insertions(+), 253 deletions(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc index f4495cdb9df..ef2f75ee6e1 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "nlohmann/json.hpp" #include "backend/session/anf_runtime_algorithm.h" @@ -499,235 +500,329 @@ int Sign(float x) { return 0; } -void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, - size_t outer_dim) { - MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_); - MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_); - MS_EXCEPTION_IF_NULL(unique_grad); - MS_EXCEPTION_IF_NULL(unique_grad->value_); - MS_EXCEPTION_IF_NULL(unique_grad->indices_); +namespace { +struct BucketSparseGradient { + float *value_; + int *indices_; + int *global_indices_; + size_t indices_size_; +}; + +struct MultiThreadReduceSparseGradientParam { + SparseGradient *input_grad_{nullptr}; + SparseGradient *workspace_grad_{nullptr}; + SparseGradient *output_grad_{nullptr}; + size_t max_index_{0}; + size_t value_stride_{0}; + size_t thread_num_{0}; + bool use_sort_reduce_{false}; +}; + +void CalculateEachBucketSize(const std::shared_ptr &sparse_grad, size_t max_index, + std::vector *each_bucket_size) { + MS_LOG(DEBUG) << "Start"; + MS_EXCEPTION_IF_NULL(sparse_grad); + MS_EXCEPTION_IF_NULL(sparse_grad->indices_); + 
MS_EXCEPTION_IF_NULL(each_bucket_size);
+  size_t bucket_num = each_bucket_size->size();
+  for (size_t i = 0; i < sparse_grad->indices_size_; ++i) {
+    int index = sparse_grad->indices_[i];
+    if (index >= 0 && IntToSize(index) < max_index) {
+      auto bucket_id = index % bucket_num;
+      each_bucket_size->at(bucket_id)++;
+    }
+  }
+  MS_LOG(DEBUG) << "End";
+}
+
+void SplitAndCalculateSegmentBucketSize(const MultiThreadReduceSparseGradientParam &param,
+                                        std::vector<std::shared_ptr<SparseGradient>> *segments_ptr,
+                                        std::vector<std::shared_ptr<std::vector<size_t>>> *segment_bucket_sizes_ptr) {
+  MS_EXCEPTION_IF_NULL(param.input_grad_);
+  MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
+  MS_EXCEPTION_IF_NULL(segments_ptr);
+  auto &segments = *segments_ptr;
+  auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
+  auto input_grad = param.input_grad_;
+  if (param.thread_num_ < 1) {
+    MS_EXCEPTION(ArgumentError) << "Input param thread num must be > 0!";
+  }
+  size_t thread_indices_size = input_grad->indices_size_ / param.thread_num_;
+  size_t left_indices_size = input_grad->indices_size_ % param.thread_num_;
+  std::vector<std::thread> threads;
+  threads.reserve(param.thread_num_);
+  segments.reserve(param.thread_num_);
+
+  size_t current_indices_offset = 0;
+  for (size_t i = 0; i < param.thread_num_; ++i) {
+    segment_bucket_sizes.emplace_back(std::make_shared<std::vector<size_t>>(param.thread_num_, 0));
+    size_t indices_size = thread_indices_size;
+    if (i < left_indices_size) {
+      indices_size += 1;
+    }
+    segments.emplace_back(std::make_shared<SparseGradient>());
+    segments[i]->value_ = input_grad->value_ + current_indices_offset * param.value_stride_;
+    segments[i]->indices_ = input_grad->indices_ + current_indices_offset;
+    segments[i]->indices_size_ = indices_size;
+    threads.emplace_back(
+      std::thread(CalculateEachBucketSize, segments[i], param.max_index_, segment_bucket_sizes[i].get()));
+    current_indices_offset += indices_size;
+  }
+
+  for (size_t i = 0; i < param.thread_num_; ++i) {
+    threads[i].join();
+  }
+}
+
+void CopySegmentIndicesToBucket(const MultiThreadReduceSparseGradientParam &param,
+                                const std::shared_ptr<SparseGradient> &segment, size_t bucket_offset,
+                                const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets) {
+  MS_LOG(DEBUG) << "Start";
+  MS_EXCEPTION_IF_NULL(segment);
+  MS_EXCEPTION_IF_NULL(segment->indices_);
+  std::vector<size_t> bucket_data_num(param.thread_num_, 0);
+  for (size_t i = 0; i < segment->indices_size_; ++i) {
+    int index = segment->indices_[i];
+    if (index >= 0 && IntToSize(index) < param.max_index_) {
+      auto bucket_id = index % param.thread_num_;
+      auto bucket_index = bucket_data_num[bucket_id];
+      buckets[bucket_id]->indices_[bucket_index] = index;
+      buckets[bucket_id]->global_indices_[bucket_index] = bucket_offset + i;
+      bucket_data_num[bucket_id]++;
+    }
+  }
+  MS_LOG(DEBUG) << "End";
+}
+
+void GatherSegmentIndicesToOutputBucket(const MultiThreadReduceSparseGradientParam &param,
+                                        const std::vector<std::shared_ptr<SparseGradient>> &segments,
+                                        const std::vector<std::shared_ptr<std::vector<size_t>>> &segment_bucket_sizes,
+                                        std::vector<std::shared_ptr<BucketSparseGradient>> *buckets_ptr) {
+  MS_EXCEPTION_IF_NULL(param.output_grad_);
+  MS_EXCEPTION_IF_NULL(param.output_grad_->value_);
+  MS_EXCEPTION_IF_NULL(param.output_grad_->indices_);
+  MS_EXCEPTION_IF_NULL(buckets_ptr);
+  auto &buckets = *buckets_ptr;
+  size_t thread_num = param.thread_num_;
+  if (thread_num != segment_bucket_sizes.size()) {
+    MS_EXCEPTION(ArgumentError) << "Input param thread num is not equal to segment size!";
+  }
+  std::vector<size_t> bucket_data_size(thread_num, 0);
+  for (size_t i = 0; i < thread_num; ++i) {
+    for (size_t j = 0; j < thread_num; ++j) {
+      bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
+    }
+  }
+  size_t current_indices_offset = 0;
+  for (size_t i = 0; i < thread_num; ++i) {
+    buckets.emplace_back(std::make_shared<BucketSparseGradient>());
+    buckets[i]->value_ = param.output_grad_->value_ + current_indices_offset * param.value_stride_;
+    buckets[i]->indices_ = param.output_grad_->indices_ + current_indices_offset;
+    buckets[i]->global_indices_ = param.workspace_grad_->indices_ + current_indices_offset;
+    buckets[i]->indices_size_ = bucket_data_size[i];
+    current_indices_offset += bucket_data_size[i];
+  }
+  std::vector<size_t> tmp_bucket_data_size(thread_num, 0);
+  std::vector<std::vector<std::shared_ptr<BucketSparseGradient>>> each_thread_buckets;
+  for (size_t i = 0; i < thread_num; ++i) {
+    std::vector<std::shared_ptr<BucketSparseGradient>> thread_buckets;
+    for (size_t j = 0; j < thread_num; ++j) {
+      thread_buckets.emplace_back(std::make_shared<BucketSparseGradient>());
+      thread_buckets[j]->indices_ = buckets[j]->indices_ + tmp_bucket_data_size[j];
+      thread_buckets[j]->global_indices_ = buckets[j]->global_indices_ + tmp_bucket_data_size[j];
+      thread_buckets[j]->value_ = buckets[j]->value_ + tmp_bucket_data_size[j] * param.value_stride_;
+      thread_buckets[j]->indices_size_ = segment_bucket_sizes[i]->at(j);
+      tmp_bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
+    }
+    each_thread_buckets.emplace_back(thread_buckets);
+  }
+  std::vector<std::thread> threads;
+  threads.reserve(thread_num);
+  current_indices_offset = 0;
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads.emplace_back(
+      std::thread(CopySegmentIndicesToBucket, param, segments[i], current_indices_offset, each_thread_buckets[i]));
+    current_indices_offset += segments[i]->indices_size_;
+  }
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads[i].join();
+  }
+}
+
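+// Sort-based variant of the per-bucket reduction: the (index, global index)
+// pairs are sorted so that duplicate indices become adjacent, then each run of
+// equal indices is copied once from the input gradient and accumulated on top.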
+void SortAndReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
+                                       const std::shared_ptr<BucketSparseGradient> &bucket,
+                                       const std::shared_ptr<SparseGradient> &reduced_bucket) {
+  MS_LOG(DEBUG) << "Start";
+  MS_EXCEPTION_IF_NULL(bucket);
+  MS_EXCEPTION_IF_NULL(bucket->value_);
+  MS_EXCEPTION_IF_NULL(bucket->indices_);
+  MS_EXCEPTION_IF_NULL(reduced_bucket);
+  MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
+  MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
+  std::vector<std::pair<int, int>> sorted_indices;
+  sorted_indices.reserve(bucket->indices_size_);
+  for (size_t i = 0; i < bucket->indices_size_; ++i) {
+    int index = bucket->indices_[i];
+    int global_index = bucket->global_indices_[i];
+    sorted_indices.emplace_back(std::pair<int, int>(index, global_index));
+  }
+  std::sort(sorted_indices.begin(), sorted_indices.end());
+
+  float *global_value = param.input_grad_->value_;
+  size_t unique_indices_size = 0;
+  size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
+  int last_index{0};
+  size_t value_offset{0};
+  for (size_t i = 0; i < sorted_indices.size(); ++i) {
+    int index = sorted_indices[i].first;
+    int global_index = sorted_indices[i].second;
+    int global_value_offset = global_index * param.value_stride_;
+    if (i == 0 || index != last_index) {
+      if (i != 0) {
+        unique_indices_size++;
+      }
+      reduced_bucket->indices_[unique_indices_size] = index;
+      value_offset = unique_indices_size * param.value_stride_;
+      auto ret_code = memcpy_s(reduced_bucket->value_ + value_offset, (max_length - value_offset) * sizeof(float),
+                               global_value + global_value_offset, param.value_stride_ * sizeof(float));
+      if (ret_code != EOK) {
+        MS_LOG(EXCEPTION) << "Failed to copy data!";
+      }
+    } else {
+      for (size_t j = 0; j < param.value_stride_; ++j) {
+        reduced_bucket->value_[value_offset + j] += global_value[global_value_offset + j];
+      }
+    }
+    last_index = index;
+  }
+  // Convert the slot of the last written unique index into a count; without
+  // this a non-empty bucket would report one unique index too few.
+  if (!sorted_indices.empty()) {
+    unique_indices_size++;
+  }
+  reduced_bucket->indices_size_ = unique_indices_size;
+  MS_LOG(DEBUG) << "End";
+}
+
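+// Hash-map variant of the per-bucket reduction: index_map remembers the value
+// offset of the first occurrence of each index, and later occurrences are
+// accumulated in place without sorting.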
+void ReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
+                                const std::shared_ptr<BucketSparseGradient> &bucket,
+                                const std::shared_ptr<SparseGradient> &reduced_bucket) {
+  MS_LOG(DEBUG) << "Start";
+  MS_EXCEPTION_IF_NULL(bucket);
+  MS_EXCEPTION_IF_NULL(bucket->value_);
+  MS_EXCEPTION_IF_NULL(bucket->indices_);
+  MS_EXCEPTION_IF_NULL(reduced_bucket);
+  MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
+  MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
+
+  float *global_value = param.input_grad_->value_;
   std::unordered_map<int, size_t> index_map;
   size_t unique_indices_size = 0;
-  for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
-    int index = origin_sparse_grad.indices_[i];
-    if (index < 0 || IntToSize(index) >= first_dim) {
-      continue;
-    }
+  size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
+  for (size_t i = 0; i < bucket->indices_size_; ++i) {
+    int index = bucket->indices_[i];
+    int global_index = bucket->global_indices_[i];
     auto iter = index_map.find(index);
     if (iter == index_map.end()) {
-      index_map[index] = unique_indices_size;
-      unique_grad->indices_[unique_indices_size] = index;
-      size_t start_index = unique_indices_size * outer_dim;
-      size_t end_index = start_index + outer_dim;
-      for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
-        unique_grad->value_[j] = origin_sparse_grad.value_[k];
+      reduced_bucket->indices_[unique_indices_size] = index;
+      size_t start_index = unique_indices_size * param.value_stride_;
+      index_map[index] = start_index;
+      auto ret_code = memcpy_s(reduced_bucket->value_ + start_index, (max_length - start_index) * sizeof(float),
+                               global_value + global_index * param.value_stride_, param.value_stride_ * sizeof(float));
+      if (ret_code != EOK) {
+        MS_LOG(EXCEPTION) << "Failed to copy data!";
       }
       unique_indices_size++;
     } else {
-      size_t first_index = iter->second;
-      size_t start_index = first_index * outer_dim;
-      size_t end_index = start_index + outer_dim;
-      for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
-        unique_grad->value_[j] += origin_sparse_grad.value_[k];
+      size_t start_index = iter->second;
+      size_t end_index = start_index + param.value_stride_;
+      for (size_t j = start_index, k = global_index * param.value_stride_; j < end_index; ++j, ++k) {
+        reduced_bucket->value_[j] += global_value[k];
       }
     }
   }
-  unique_grad->indices_size_ = unique_indices_size;
-}
-
-struct WorkerParamsForReduceSparseGradient {
-  size_t slice_start_{0};
-  size_t slice_end_{0};
-  size_t max_length_{0};
-  size_t outer_dim_{0};
-  std::vector<std::pair<int, size_t>> *sorted_indices_{nullptr};
-  std::vector<size_t> *slice_positions_{nullptr};
-  float *src_value_{nullptr};
-  SparseGradient *unique_grad_{nullptr};
-};
-
-void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) {
-  MS_EXCEPTION_IF_NULL(param.sorted_indices_);
-  MS_EXCEPTION_IF_NULL(param.slice_positions_);
-  MS_EXCEPTION_IF_NULL(param.src_value_);
-  MS_EXCEPTION_IF_NULL(param.unique_grad_);
-  auto outer_dim = param.outer_dim_;
-  auto &sorted_indices = *(param.sorted_indices_);
-  auto &slice_positions = *(param.slice_positions_);
-  auto unique_grad = param.unique_grad_;
-  for (size_t slice_id = param.slice_start_; slice_id < param.slice_end_; ++slice_id) {
-    size_t cur_pos = slice_positions[slice_id];
-    int index = sorted_indices[cur_pos].first;
-    unique_grad->indices_[slice_id] = index;
-    size_t start_index = slice_id * outer_dim;
-    auto ret_code = memcpy_s(unique_grad->value_ + start_index, (param.max_length_ - start_index) * sizeof(float),
-                             param.src_value_ + sorted_indices[cur_pos].second, outer_dim * sizeof(float));
-    if
(ret_code != EOK) { - MS_LOG(EXCEPTION) << "Failed to copy data!"; - } - cur_pos++; - size_t end_pos; - if (slice_id + 1 < slice_positions.size()) { - end_pos = slice_positions[slice_id + 1]; - } else { - end_pos = sorted_indices.size(); - } - while (cur_pos < end_pos) { - for (size_t i = 0; i < outer_dim; ++i) { - unique_grad->value_[start_index + i] += param.src_value_[sorted_indices[cur_pos].second + i]; - } - cur_pos++; - } - } -} - -void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, - size_t outer_dim, std::vector> *sorted_indices, - std::vector *slice_positions) { - MS_LOG(DEBUG) << "Start"; - size_t thread_num = 24; - if (slice_positions->size() < thread_num) { - thread_num = slice_positions->size(); - } - size_t stride = (slice_positions->size() + thread_num - 1) / thread_num; - thread_num = (slice_positions->size() + stride - 1) / stride; - std::vector threads; - size_t max_length = sorted_indices->size() * outer_dim; - for (size_t i = 0; i < thread_num; ++i) { - size_t slice_start = i * stride; - size_t slice_end = 0; - if (i == thread_num - 1) { - slice_end = slice_positions->size(); - } else { - slice_end = slice_start + stride; - } - WorkerParamsForReduceSparseGradient params{ - slice_start, slice_end, max_length, outer_dim, sorted_indices, slice_positions, origin_sparse_grad.value_, - unique_grad}; - threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params)); - } - for (size_t i = 0; i < thread_num; ++i) { - threads[i].join(); - } + reduced_bucket->indices_size_ = unique_indices_size; MS_LOG(DEBUG) << "End"; } -void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, - size_t outer_dim, bool use_multi_threads) { - MS_LOG(DEBUG) << "Start"; - MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_); - MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_); - MS_EXCEPTION_IF_NULL(unique_grad); - MS_EXCEPTION_IF_NULL(unique_grad->value_); - MS_EXCEPTION_IF_NULL(unique_grad->indices_); - std::vector> sorted_indices; - sorted_indices.reserve(origin_sparse_grad.indices_size_); - for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) { - int index = origin_sparse_grad.indices_[i]; - if (index >= 0 && IntToSize(index) < first_dim) { - sorted_indices.emplace_back(std::pair(index, i * outer_dim)); - } - } - std::sort( - sorted_indices.begin(), sorted_indices.end(), - [](const std::pair &left, const std::pair &right) { return left.first < right.first; }); - int last_index = 0; - std::vector slice_positions; - slice_positions.reserve(sorted_indices.size()); - for (size_t i = 0; i < sorted_indices.size(); ++i) { - if (i == 0 || last_index != sorted_indices[i].first) { - slice_positions.emplace_back(i); - } - last_index = sorted_indices[i].first; - } - if (use_multi_threads) { - RunMultiThreadReduceSparseGradient(origin_sparse_grad, unique_grad, outer_dim, &sorted_indices, &slice_positions); - } else { - size_t max_length = sorted_indices.size() * outer_dim; - WorkerParamsForReduceSparseGradient params{0, - slice_positions.size(), - max_length, - outer_dim, - &sorted_indices, - &slice_positions, - origin_sparse_grad.value_, - unique_grad}; - WorkerForReduceSparseGradient(params); - } - unique_grad->indices_size_ = slice_positions.size(); - MS_LOG(DEBUG) << "End"; -} - -void ReduceMultiSparseGradient(const std::vector> &unique_slice_grads, - SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim, - size_t outer_dim) { - MS_LOG(DEBUG) << "Start"; - if 
(unique_slice_grads.empty()) { - return; - } - size_t index_data_size = outer_dim * sizeof(float); - size_t unique_indices_size = 0; - for (size_t i = 0; i < unique_slice_grads.size(); ++i) { - auto &slice_grad = unique_slice_grads[i]; - auto ret_code = memcpy_s(tmp_grad->value_ + unique_indices_size * outer_dim, - (tmp_grad->indices_size_ - unique_indices_size) * index_data_size, slice_grad->value_, - slice_grad->indices_size_ * index_data_size); - if (ret_code != EOK) { - MS_LOG(EXCEPTION) << "Failed to copy data!"; - } - ret_code = - memcpy_s(tmp_grad->indices_ + unique_indices_size, (tmp_grad->indices_size_ - unique_indices_size) * sizeof(int), - slice_grad->indices_, slice_grad->indices_size_ * sizeof(int)); - if (ret_code != EOK) { - MS_LOG(EXCEPTION) << "Failed to copy data!"; - } - unique_indices_size += slice_grad->indices_size_; - } - tmp_grad->indices_size_ = unique_indices_size; - ReduceSparseGradient(*tmp_grad, unique_grad, first_dim, outer_dim); - MS_LOG(DEBUG) << "End"; -} - -void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad, - SparseGradient *unique_grad, size_t first_dim, size_t outer_dim) { - MS_LOG(DEBUG) << "Start"; - MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_); - MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_); - MS_EXCEPTION_IF_NULL(unique_grad); - MS_EXCEPTION_IF_NULL(unique_grad->value_); - MS_EXCEPTION_IF_NULL(unique_grad->indices_); - MS_EXCEPTION_IF_NULL(tmp_grad); - MS_EXCEPTION_IF_NULL(tmp_grad->value_); - MS_EXCEPTION_IF_NULL(tmp_grad->indices_); - size_t thread_num = 24; - if (origin_sparse_grad.indices_size_ < thread_num) { - thread_num = origin_sparse_grad.indices_size_; - } - size_t thread_indices_size = origin_sparse_grad.indices_size_ / thread_num; - size_t left_indices_size = origin_sparse_grad.indices_size_ % thread_num; +void ReduceBucketSparseGradientToWorkspace(const MultiThreadReduceSparseGradientParam ¶m, + const std::vector> &buckets, + std::vector> *reduced_buckets_ptr) { + MS_EXCEPTION_IF_NULL(param.workspace_grad_); + MS_EXCEPTION_IF_NULL(param.workspace_grad_->value_); + MS_EXCEPTION_IF_NULL(param.workspace_grad_->indices_); + MS_EXCEPTION_IF_NULL(reduced_buckets_ptr); + auto &reduced_buckets = *reduced_buckets_ptr; + size_t thread_num = buckets.size(); std::vector threads; threads.reserve(thread_num); - std::vector> unique_slice_grads; + + size_t current_indices_offset = 0; for (size_t i = 0; i < thread_num; ++i) { - size_t indices_size = thread_indices_size; - if (i == thread_num - 1) { - indices_size = thread_indices_size + left_indices_size; + reduced_buckets.emplace_back(std::make_shared()); + reduced_buckets[i]->value_ = param.workspace_grad_->value_ + current_indices_offset * param.value_stride_; + reduced_buckets[i]->indices_ = param.workspace_grad_->indices_ + current_indices_offset; + reduced_buckets[i]->indices_size_ = buckets[i]->indices_size_; + if (param.use_sort_reduce_) { + threads.emplace_back(std::thread(SortAndReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i])); + } else { + threads.emplace_back(std::thread(ReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i])); } - size_t value_offset = i * thread_indices_size * outer_dim; - size_t indices_offset = i * thread_indices_size; - auto slice_grad = SparseGradient( - {origin_sparse_grad.value_ + value_offset, origin_sparse_grad.indices_ + indices_offset, indices_size}); - unique_slice_grads.emplace_back(std::make_shared()); - unique_slice_grads[i]->value_ = unique_grad->value_ + value_offset; - 
unique_slice_grads[i]->indices_ = unique_grad->indices_ + indices_offset; - unique_slice_grads[i]->indices_size_ = indices_size; - threads.emplace_back( - std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim, false)); + current_indices_offset += buckets[i]->indices_size_; } for (size_t i = 0; i < thread_num; ++i) { threads[i].join(); } - ReduceMultiSparseGradient(unique_slice_grads, tmp_grad, unique_grad, first_dim, outer_dim); +} + +void MergeReduceSparseGradient(const MultiThreadReduceSparseGradientParam ¶m, + const std::vector> &reduced_buckets) { + MS_EXCEPTION_IF_NULL(param.output_grad_); + auto output_grad = param.output_grad_; + MS_EXCEPTION_IF_NULL(output_grad->value_); + MS_EXCEPTION_IF_NULL(output_grad->indices_); + size_t stride_data_size = param.value_stride_ * sizeof(float); + size_t unique_indices_size = 0; + for (size_t i = 0; i < reduced_buckets.size(); ++i) { + auto &bucket = reduced_buckets[i]; + MS_EXCEPTION_IF_NULL(bucket); + if (bucket->indices_size_ == 0) { + continue; + } + auto ret_code = memcpy_s(output_grad->value_ + unique_indices_size * param.value_stride_, + (output_grad->indices_size_ - unique_indices_size) * stride_data_size, bucket->value_, + bucket->indices_size_ * stride_data_size); + if (ret_code != EOK) { + MS_LOG(EXCEPTION) << "Failed to copy data!"; + } + ret_code = memcpy_s(output_grad->indices_ + unique_indices_size, + (output_grad->indices_size_ - unique_indices_size) * sizeof(int), bucket->indices_, + bucket->indices_size_ * sizeof(int)); + if (ret_code != EOK) { + MS_LOG(EXCEPTION) << "Failed to copy data!"; + } + unique_indices_size += bucket->indices_size_; + } + output_grad->indices_size_ = unique_indices_size; +} +} // namespace + +void BucketReduceSparseGradient(const ReduceSparseGradientParam ¶m) { + MS_LOG(DEBUG) << "Start"; + MS_EXCEPTION_IF_NULL(param.input_grad_); + size_t thread_num = 23; + if (param.input_grad_->indices_size_ < thread_num) { + thread_num = param.input_grad_->indices_size_; + } + MultiThreadReduceSparseGradientParam multi_thread_param({param.input_grad_, param.workspace_grad_, param.output_grad_, + param.max_index_, param.value_stride_, thread_num, + param.use_sort_reduce_}); + std::vector> segments; + std::vector>> segment_bucket_sizes; + SplitAndCalculateSegmentBucketSize(multi_thread_param, &segments, &segment_bucket_sizes); + + std::vector> buckets; + GatherSegmentIndicesToOutputBucket(multi_thread_param, segments, segment_bucket_sizes, &buckets); + + std::vector> reduced_buckets; + ReduceBucketSparseGradientToWorkspace(multi_thread_param, buckets, &reduced_buckets); + + MergeReduceSparseGradient(multi_thread_param, reduced_buckets); MS_LOG(DEBUG) << "End"; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h index 8c9ea84b34e..4f48d70b2cc 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h +++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h @@ -73,9 +73,18 @@ class KernelMeta { }; struct SparseGradient { - float *value_; - int *indices_; - size_t indices_size_; + float *value_{nullptr}; + int *indices_{nullptr}; + size_t indices_size_{0}; +}; + +struct ReduceSparseGradientParam { + SparseGradient *input_grad_{nullptr}; + SparseGradient *workspace_grad_{nullptr}; + SparseGradient *output_grad_{nullptr}; + size_t max_index_{0}; + size_t value_stride_{0}; + bool use_sort_reduce_{false}; }; struct MultiThreadComputeParams { @@ -112,10 +121,6 @@ void SaveJsonInfo(const std::string 
&json_name, const std::string &info); std::string GetProcessor(const AnfNodePtr &anf_node); bool IsSameShape(const std::vector &shape_a, const std::vector &shape_b); int Sign(float x); -void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, - size_t outer_dim); -void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim, - size_t outer_dim, bool use_multi_threads = true); std::pair GetKernelInput(const AnfNodePtr &anf_node, size_t index); std::vector>> GetInputIndex(const std::vector &node_list, const std::vector &input_list); @@ -130,14 +135,7 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector> *sorted_indices, - std::vector *slice_positions); -void ReduceMultiSparseGradient(const std::vector> &unique_slice_grads, - SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim, - size_t outer_dim); -void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad, - SparseGradient *unique_grad, size_t first_dim, size_t outer_dim); +void BucketReduceSparseGradient(const ReduceSparseGradientParam ¶m); std::vector GetReduceAttrAxis(const CNodePtr &cnode); } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_adam_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_adam_cpu_kernel.cc index 2ff8e77fcd1..9a247611e7f 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_adam_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_adam_cpu_kernel.cc @@ -81,6 +81,8 @@ void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) MS_EXCEPTION_IF_NULL(kernel_node); workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); + workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); + workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); workspace_size_list_.emplace_back(var_first_dim_size_ * var_outer_dim_size_ * sizeof(float)); } @@ -142,11 +144,21 @@ bool SparseApplyAdamCPUKernel::Launch(const std::vector &inp auto indices = reinterpret_cast(inputs[10]->addr); auto new_grad = reinterpret_cast(workspace[0]->addr); auto new_indices = reinterpret_cast(workspace[1]->addr); - auto m_t = reinterpret_cast(workspace[2]->addr); + auto workspace_grad = reinterpret_cast(workspace[2]->addr); + auto workspace_indices = reinterpret_cast(workspace[3]->addr); + auto m_t = reinterpret_cast(workspace[4]->addr); SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_}); - ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_, - var_outer_dim_size_); + SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_}); + SparseGradient input_sparse_grad({grad, indices, indices_size_}); + ReduceSparseGradientParam param; + param.input_grad_ = &input_sparse_grad; + param.workspace_grad_ = &workspace_sparse_grad; + param.output_grad_ = &unique_sparse_grad; + param.max_index_ = var_first_dim_size_; + param.value_stride_ = var_outer_dim_size_; + BucketReduceSparseGradient(param); + size_t total_dim_size = var_first_dim_size_ * var_outer_dim_size_; lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power); diff --git 
a/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_ftrl_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_ftrl_cpu_kernel.cc index 2662604e196..1a1405f3b39 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_ftrl_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_ftrl_cpu_kernel.cc @@ -132,12 +132,19 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector &inp auto indices = reinterpret_cast(inputs[4]->addr); auto new_grad = reinterpret_cast(workspace[0]->addr); auto new_indices = reinterpret_cast(workspace[1]->addr); - auto tmp_grad = reinterpret_cast(workspace[2]->addr); - auto tmp_indices = reinterpret_cast(workspace[3]->addr); + auto workspace_grad = reinterpret_cast(workspace[2]->addr); + auto workspace_indices = reinterpret_cast(workspace[3]->addr); + SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_}); - SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_}); - TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad, - var_first_dim_size_, var_outer_dim_size_); + SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_}); + SparseGradient input_sparse_grad({grad, indices, indices_size_}); + ReduceSparseGradientParam param; + param.input_grad_ = &input_sparse_grad; + param.workspace_grad_ = &workspace_sparse_grad; + param.output_grad_ = &unique_sparse_grad; + param.max_index_ = var_first_dim_size_; + param.value_stride_ = var_outer_dim_size_; + BucketReduceSparseGradient(param); MultiThreadComputeParams input_params; input_params.var_ = var; diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_lazy_adam_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_lazy_adam_cpu_kernel.cc index 636d92dcbb1..a19b014829b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_lazy_adam_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_lazy_adam_cpu_kernel.cc @@ -123,13 +123,19 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector auto indices = reinterpret_cast(inputs[10]->addr); auto new_grad = reinterpret_cast(workspace[0]->addr); auto new_indices = reinterpret_cast(workspace[1]->addr); - auto tmp_grad = reinterpret_cast(workspace[2]->addr); - auto tmp_indices = reinterpret_cast(workspace[3]->addr); + auto workspace_grad = reinterpret_cast(workspace[2]->addr); + auto workspace_indices = reinterpret_cast(workspace[3]->addr); SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_}); - SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_}); - TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad, - var_first_dim_size_, var_outer_dim_size_); + SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_}); + SparseGradient input_sparse_grad({grad, indices, indices_size_}); + ReduceSparseGradientParam param; + param.input_grad_ = &input_sparse_grad; + param.workspace_grad_ = &workspace_sparse_grad; + param.output_grad_ = &unique_sparse_grad; + param.max_index_ = var_first_dim_size_; + param.value_stride_ = var_outer_dim_size_; + BucketReduceSparseGradient(param); lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power); MultiThreadComputeParams input_params; diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_proximal_adagrad_cpu_kernel.cc 
b/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_proximal_adagrad_cpu_kernel.cc index efba35ad8c0..46fa07c3793 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_proximal_adagrad_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/sparse_apply_proximal_adagrad_cpu_kernel.cc @@ -61,6 +61,8 @@ void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &ke MS_EXCEPTION_IF_NULL(kernel_node); workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); + workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); + workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); } void SparseApplyProximalAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) { @@ -119,9 +121,19 @@ bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector(inputs[6]->addr); auto new_grad = reinterpret_cast(workspace[0]->addr); auto new_indices = reinterpret_cast(workspace[1]->addr); + auto workspace_grad = reinterpret_cast(workspace[2]->addr); + auto workspace_indices = reinterpret_cast(workspace[3]->addr); + SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_}); - ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_, - var_outer_dim_size_); + SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_}); + SparseGradient input_sparse_grad({grad, indices, indices_size_}); + ReduceSparseGradientParam param; + param.input_grad_ = &input_sparse_grad; + param.workspace_grad_ = &workspace_sparse_grad; + param.output_grad_ = &unique_sparse_grad; + param.max_index_ = var_first_dim_size_; + param.value_stride_ = var_outer_dim_size_; + BucketReduceSparseGradient(param); MultiThreadComputeParams input_params; input_params.var_ = var; diff --git a/mindspore/ccsrc/backend/session/cpu_session.cc b/mindspore/ccsrc/backend/session/cpu_session.cc index ca1c78d2066..b29b1cb1fb2 100644 --- a/mindspore/ccsrc/backend/session/cpu_session.cc +++ b/mindspore/ccsrc/backend/session/cpu_session.cc @@ -16,6 +16,7 @@ #include "backend/session/cpu_session.h" #include +#include #include "ir/tensor.h" #include "ir/anf.h" #include "backend/kernel_compiler/kernel.h" @@ -119,6 +120,48 @@ void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) { } } +namespace { +void KernelNotSupportException(const AnfNodePtr &kernel_node) { + std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); + std::stringstream operator_info; + operator_info << "Operator[" << kernel_name << "] "; + auto kernel_info = dynamic_cast(kernel_node->kernel_info()); + if (kernel_info == nullptr) { + operator_info << "is not support."; + MS_LOG(EXCEPTION) << operator_info.str(); + } + auto kernel_build_Info = kernel_info->select_kernel_build_info(); + if (kernel_build_Info == nullptr) { + operator_info << "is not support."; + MS_LOG(EXCEPTION) << operator_info.str(); + } + size_t input_num = kernel_build_Info->GetInputNum(); + if (input_num > 0) { + operator_info << " input("; + for (size_t i = 0; i < input_num; ++i) { + operator_info << TypeIdLabel(kernel_build_Info->GetInputDeviceType(i)); + if (i != input_num - 1) { + operator_info << ","; + } + } + operator_info << ") "; + } + size_t output_num = kernel_build_Info->GetOutputNum(); + if (output_num > 0) { + operator_info << "output("; + for (size_t i = 0; i < output_num; ++i) { + operator_info << 
TypeIdLabel(kernel_build_Info->GetOutputDeviceType(i)); + if (i != kernel_build_Info->GetOutputNum() - 1) { + operator_info << ","; + } + } + operator_info << ") "; + } + operator_info << "is not support."; + MS_LOG(EXCEPTION) << operator_info.str(); +} +} // namespace + void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); auto &kernel_nodes = kernel_graph->execution_order(); @@ -129,7 +172,7 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { std::shared_ptr cpu_kernel = kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node); if (cpu_kernel == nullptr) { - MS_LOG(EXCEPTION) << "Operator[" << kernel_name << "] is not support."; + KernelNotSupportException(kernel_node); } cpu_kernel->Init(kernel_node); AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get()); diff --git a/tests/ut/cpp/kernel/common_utils_test.cc b/tests/ut/cpp/kernel/common_utils_test.cc index 83f7c59e523..4e016cd4953 100644 --- a/tests/ut/cpp/kernel/common_utils_test.cc +++ b/tests/ut/cpp/kernel/common_utils_test.cc @@ -25,7 +25,7 @@ class CommonUtilTest : public UT::Common { CommonUtilTest() = default; }; -TEST_F(CommonUtilTest, DeduplicateIndexedSlicesTest1) { +TEST_F(CommonUtilTest, BucketReduceSparseGradient1) { // The indices is a vector and the grad is a tensor with shape (6, 2) /* 0 * 0 @@ -46,20 +46,39 @@ TEST_F(CommonUtilTest, DeduplicateIndexedSlicesTest1) { for (int i = 0; i < 6 * 2; i++) { grad.push_back(i); } - std::vector unique_indices(3); - std::vector summed_grad(6); - SparseGradient unique_grad({summed_grad.data(), unique_indices.data(), 0}); - ReduceSparseGradient(SparseGradient({grad.data(), indices.data(), 6}), &unique_grad, 6, 2); + std::vector unique_indices(6); + std::vector summed_grad(12); + std::vector tmp_indices(6); + std::vector tmp_grad(12); + + SparseGradient unique_grad({summed_grad.data(), unique_indices.data(), 6}); + SparseGradient workspace_grad({tmp_grad.data(), tmp_indices.data(), 6}); + SparseGradient input_grad({grad.data(), indices.data(), 6}); + + ReduceSparseGradientParam param; + param.input_grad_ = &input_grad; + param.workspace_grad_ = &workspace_grad; + param.output_grad_ = &unique_grad; + param.max_index_ = 6; + param.value_stride_ = 2; + BucketReduceSparseGradient(param); + EXPECT_EQ(unique_grad.indices_size_, 3); - EXPECT_EQ(unique_indices, std::vector({0, 1, 3})); + std::vector expect_indices({0, 1, 3}); + for (size_t i = 0; i < unique_grad.indices_size_; ++i) { + EXPECT_EQ(unique_grad.indices_[i], expect_indices[i]); + } /* 10 13 * 10 12 * 10 11 */ - EXPECT_EQ(summed_grad, std::vector({10, 13, 10, 12, 10, 11})); + std::vector expect_value({10, 13, 10, 12, 10, 11}); + for (size_t i = 0; i < unique_grad.indices_size_ * 2; ++i) { + EXPECT_EQ(unique_grad.value_[i], expect_value[i]); + } } -TEST_F(CommonUtilTest, DeduplicateIndexedSlicesTest2) { +TEST_F(CommonUtilTest, BucketReduceSparseGradient2) { // The indices is a vector and the grad is a tensor with shape (6, 2) /* 0 * 0 @@ -80,16 +99,36 @@ TEST_F(CommonUtilTest, DeduplicateIndexedSlicesTest2) { for (int i = 0; i < 6 * 2; i++) { grad.push_back(i); } - std::vector unique_indices(2); - std::vector summed_grad(4); - SparseGradient unique_grad({summed_grad.data(), unique_indices.data(), 0}); - ReduceSparseGradient(SparseGradient({grad.data(), indices.data(), 6}), &unique_grad, 6, 2); + std::vector unique_indices(6); + std::vector summed_grad(12); + std::vector tmp_indices(6); + std::vector tmp_grad(12); + SparseGradient 
unique_grad({summed_grad.data(), unique_indices.data(), 6}); + SparseGradient workspace_grad({tmp_grad.data(), tmp_indices.data(), 6}); + SparseGradient input_grad({grad.data(), indices.data(), 6}); + + ReduceSparseGradientParam param; + param.input_grad_ = &input_grad; + param.workspace_grad_ = &workspace_grad; + param.output_grad_ = &unique_grad; + param.max_index_ = 6; + param.value_stride_ = 2; + BucketReduceSparseGradient(param); + EXPECT_EQ(unique_grad.indices_size_, 2); - EXPECT_EQ(unique_indices, std::vector({0, 1})); + + std::vector expect_indices({0, 1}); + for (size_t i = 0; i < unique_grad.indices_size_; ++i) { + EXPECT_EQ(unique_grad.indices_[i], expect_indices[i]); + } + /* 10 13 * 10 12 */ - EXPECT_EQ(summed_grad, std::vector({10, 13, 10, 12})); + std::vector expect_value({10, 13, 10, 12}); + for (size_t i = 0; i < unique_grad.indices_size_ * 2; ++i) { + EXPECT_EQ(unique_grad.value_[i], expect_value[i]); + } } } // namespace kernel } // namespace mindspore From 6bca22f85f063ecc51690159598f3ca2293f44ee Mon Sep 17 00:00:00 2001 From: Eric Date: Thu, 16 Jul 2020 13:16:49 -0400 Subject: [PATCH 49/68] Added fix to include for windows Removed log header --- .../dataset/include/dataset/core/constants.h | 1 - .../dataset/include/dataset/core/data_type.h | 1 - .../include/dataset/core/tensor_shape.h | 1 - .../dataset/include/dataset/util/status.h | 1 - .../ccsrc/minddata/dataset/include/status.h | 138 +++- .../ccsrc/minddata/dataset/include/tensor.h | 669 +++++++++++++++++- .../dataset/include/utils/log_adapter.h | 1 - .../minddata/dataset/include/utils/overload.h | 1 - 8 files changed, 805 insertions(+), 8 deletions(-) delete mode 120000 mindspore/ccsrc/minddata/dataset/include/dataset/core/constants.h delete mode 120000 mindspore/ccsrc/minddata/dataset/include/dataset/core/data_type.h delete mode 120000 mindspore/ccsrc/minddata/dataset/include/dataset/core/tensor_shape.h delete mode 120000 mindspore/ccsrc/minddata/dataset/include/dataset/util/status.h mode change 120000 => 100644 mindspore/ccsrc/minddata/dataset/include/status.h mode change 120000 => 100644 mindspore/ccsrc/minddata/dataset/include/tensor.h delete mode 120000 mindspore/ccsrc/minddata/dataset/include/utils/log_adapter.h delete mode 120000 mindspore/ccsrc/minddata/dataset/include/utils/overload.h diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/core/constants.h b/mindspore/ccsrc/minddata/dataset/include/dataset/core/constants.h deleted file mode 120000 index 22fe6d07e1e..00000000000 --- a/mindspore/ccsrc/minddata/dataset/include/dataset/core/constants.h +++ /dev/null @@ -1 +0,0 @@ -../../../core/constants.h \ No newline at end of file diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/core/data_type.h b/mindspore/ccsrc/minddata/dataset/include/dataset/core/data_type.h deleted file mode 120000 index 37a0e1b686e..00000000000 --- a/mindspore/ccsrc/minddata/dataset/include/dataset/core/data_type.h +++ /dev/null @@ -1 +0,0 @@ -../../../core/data_type.h \ No newline at end of file diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/core/tensor_shape.h b/mindspore/ccsrc/minddata/dataset/include/dataset/core/tensor_shape.h deleted file mode 120000 index 1fb7a24d912..00000000000 --- a/mindspore/ccsrc/minddata/dataset/include/dataset/core/tensor_shape.h +++ /dev/null @@ -1 +0,0 @@ -../../../core/tensor_shape.h \ No newline at end of file diff --git a/mindspore/ccsrc/minddata/dataset/include/dataset/util/status.h b/mindspore/ccsrc/minddata/dataset/include/dataset/util/status.h deleted file mode 
120000 index b06279c05b5..00000000000 --- a/mindspore/ccsrc/minddata/dataset/include/dataset/util/status.h +++ /dev/null @@ -1 +0,0 @@ -../../../util/status.h \ No newline at end of file diff --git a/mindspore/ccsrc/minddata/dataset/include/status.h b/mindspore/ccsrc/minddata/dataset/include/status.h deleted file mode 120000 index bba92b63ad9..00000000000 --- a/mindspore/ccsrc/minddata/dataset/include/status.h +++ /dev/null @@ -1 +0,0 @@ -../util/status.h \ No newline at end of file diff --git a/mindspore/ccsrc/minddata/dataset/include/status.h b/mindspore/ccsrc/minddata/dataset/include/status.h new file mode 100644 index 00000000000..7a480f42391 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/include/status.h @@ -0,0 +1,137 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DATASET_UTIL_STATUS_H_ +#define DATASET_UTIL_STATUS_H_ + +#if defined(__GNUC__) || defined(__clang__) +#define DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define DEPRECATED __declspec(deprecated) +#else +#pragma message("WARNING: You need to implement DEPRECATED for this compiler") +#define DEPRECATED +#endif + +#include +#include +#include + +namespace mindspore { +namespace dataset { +#define RETURN_IF_NOT_OK(_s) \ + do { \ + Status __rc = (_s); \ + if (__rc.IsError()) { \ + return __rc; \ + } \ + } while (false) + +#define RETURN_STATUS_UNEXPECTED(_e) \ + do { \ + return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, _e); \ + } while (false) + +#define CHECK_FAIL_RETURN_UNEXPECTED(_condition, _e) \ + do { \ + if (!(_condition)) { \ + return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, _e); \ + } \ + } while (false) + +#define RETURN_UNEXPECTED_IF_NULL(_ptr) \ + do { \ + if ((_ptr) == nullptr) { \ + std::string err_msg = "The pointer[" + std::string(#_ptr) + "] is null."; \ + RETURN_STATUS_UNEXPECTED(err_msg); \ + } \ + } while (false) + +enum class StatusCode : char { + kOK = 0, + kOutOfMemory = 1, + kShapeMisMatch = 2, + kInterrupted = 3, + kNoSpace = 4, + kPyFuncException = 5, + kDuplicateKey = 6, + kPythonInterpreterFailure = 7, + kTDTPushFailure = 8, + kFileNotExist = 9, + kProfilingError = 10, + kBoundingBoxOutOfBounds = 11, + kBoundingBoxInvalidShape = 12, + // Make this error code the last one. Add new error code above it. 
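+  // A composition sketch for the macros defined above (the caller below is
+  // hypothetical and CheckSchema is an assumed helper, shown only to
+  // illustrate the early-return style these macros enable):
+  //   Status LoadColumn(const std::string &name) {
+  //     CHECK_FAIL_RETURN_UNEXPECTED(!name.empty(), "column name is empty");
+  //     RETURN_IF_NOT_OK(CheckSchema(name));
+  //     return Status::OK();
+  //   }
+  // Errors that fit no specific category fall through to kUnexpectedError, which is kept last: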
+ kUnexpectedError = 127 +}; + +std::string CodeAsString(const StatusCode c); + +class Status { + public: + Status() noexcept; + + explicit Status(StatusCode c) noexcept; + + ~Status() noexcept; + + // Copy constructor + Status(const Status &s); + + Status &operator=(const Status &s); + + // Move constructor + Status(Status &&) noexcept; + + Status &operator=(Status &&) noexcept; + + Status(const StatusCode code, const std::string &msg); + + Status(const StatusCode code, int line_of_code, const char *file_name, const std::string &extra = ""); + + // Return a success status + static Status OK() { return Status(StatusCode::kOK); } + + std::string ToString() const; + + StatusCode get_code() const; + + friend std::ostream &operator<<(std::ostream &os, const Status &s); + + explicit operator bool() const { return (get_code() == StatusCode::kOK); } + + bool operator==(const Status &other) const { return (this->get_code() == other.get_code()); } + + bool operator!=(const Status &other) const { return !(*this == other); } + + bool IsOk() const { return (get_code() == StatusCode::kOK); } + + bool IsError() const { return !IsOk(); } + + bool IsOutofMemory() const { return (get_code() == StatusCode::kOutOfMemory); } + + bool IsInterrupted() const { return (get_code() == StatusCode::kInterrupted); } + + bool IsShapeIncorrect() const { return (get_code() == StatusCode::kShapeMisMatch); } + + bool IsNoSpace() const { return (get_code() == StatusCode::kNoSpace); } + + private: + StatusCode code_; + std::string err_msg_; +}; +} // namespace dataset +} // namespace mindspore +#endif // DATASET_UTIL_STATUS_H_ diff --git a/mindspore/ccsrc/minddata/dataset/include/tensor.h b/mindspore/ccsrc/minddata/dataset/include/tensor.h deleted file mode 120000 index 34b5e020a9f..00000000000 --- a/mindspore/ccsrc/minddata/dataset/include/tensor.h +++ /dev/null @@ -1 +0,0 @@ -../core/tensor.h \ No newline at end of file diff --git a/mindspore/ccsrc/minddata/dataset/include/tensor.h b/mindspore/ccsrc/minddata/dataset/include/tensor.h new file mode 100644 index 00000000000..8707cbd7c06 --- /dev/null +++ b/mindspore/ccsrc/minddata/dataset/include/tensor.h @@ -0,0 +1,668 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef DATASET_CORE_TENSOR_H_
+#define DATASET_CORE_TENSOR_H_
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+#include "./securec.h"
+#include "utils/log_adapter.h"
+#if defined(_WIN32) || defined(_WIN64)
+#undef HAVE_STDDEF_H
+#undef HAVE_STDLIB_H
+#endif
+
+#ifdef ENABLE_PYTHON
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+#endif
+
+#include "minddata/dataset/core/constants.h"
+#include "minddata/dataset/core/data_type.h"
+#include "minddata/dataset/core/tensor_shape.h"
+#include "minddata/dataset/util/status.h"
+#include "proto/example.pb.h"
+
+#ifdef ENABLE_PYTHON
+namespace py = pybind11;
+#endif
+namespace mindspore {
+namespace dataset {
+class Tensor;
+template <typename T>
+class Allocator;
+
+using CharAllocPtr = std::unique_ptr<Allocator<unsigned char>>;
+using TensorAllocPtr = std::shared_ptr<Allocator<Tensor>>;  // An allocator shared_ptr for Tensors
+
+class Tensor {
+ public:
+  Tensor() = delete;
+
+  // Create a new tensor, does not internally allocate storage. This constructor is protected, use CreateTensor.
+  // @note The shape and type information should be known and valid.
+  // @param shape TensorShape
+  // @param type DataType
+  Tensor(const TensorShape &shape, const DataType &type);
+
+  // Create a new tensor, allocates storage and copies in data. This constructor is protected, use CreateTensor.
+  // @note The buffer should be valid and the shape and type information should be known and valid.
+  // @param shape TensorShape
+  // @param type DataType
+  // @param data unsigned char*, pointer to the data.
+  Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data);
+
+  Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length);
+
+  Tensor(const Tensor &other) = delete;
+
+  Tensor &operator=(const Tensor &other) = delete;
+
+  Tensor(Tensor &&other) noexcept;
+
+  Tensor &operator=(Tensor &&other) noexcept;
+
+  Status AllocateBuffer(const dsize_t &length);
+
+  // type of offset values used to store string information
+  using offset_t = uint32_t;
+  // constant for the size of the offset variable
+  static constexpr uint8_t kOffsetSize = sizeof(offset_t);
+  // Tensor base class which holds the data in an unsigned char* buffer.
+
+  // Construct a scalar string Tensor
+  explicit Tensor(const std::string &str) : Tensor(std::vector<std::string>{str}, TensorShape::CreateScalar()) {}
+
+  // Construct a tensor from a list of strings. Reshape the tensor with `shape` if given, otherwise assume the shape is
+  // the size of the vector `strings`.
+  // The memory layout of a Tensor of strings consists of the offset array followed by the strings.
+  // The offset array will store one extra value to find the length of the last string.
+  // OFFSET1, OFFSET2, ..., OFFSETn+1, STRING1, STRING2, ..., STRINGn
+  // The value of each offset is the start index of the corresponding string.
+  // Offsets are of type offset_t.
+  // Strings will be null-terminated.
+  // example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING)
+  // |----------------------------------------------------------------|
+  // |         OFFSET ARRAY                |          STRINGS          |
+  // | bytes 0-3 | bytes 4-7 | bytes 8-11  | bytes 12-15 | bytes 16-18 |
+  // |    12     |    16     |    19       |    abc\0    |    de\0     |
+  // |----------------------------------------------------------------|
+  explicit Tensor(const std::vector<std::string> &strings,
+                  const TensorShape &shape = TensorShape::CreateUnknownRankShape());
+
+  // Same as Tensor(vector<std::string>) but the input is a protobuf bytelist
+  explicit Tensor(const dataengine::BytesList &bytes_list,
+                  const TensorShape &shape = TensorShape::CreateUnknownRankShape());
+
+  // A static factory method to create the given flavour of derived Tensor.
+  // Returns the base class reference for the Tensor.
+  // @param ptr output argument to hold the created Tensor of given tensor_impl
+  // @param tensor_impl - which implementation of Tensor
+  // @param shape - shape of the tensor
+  // @param type - datatype of the tensor
+  // @param data - data to be copied to Tensor new allocation
+  // @return Status Code
+  static Status CreateTensor(std::shared_ptr<Tensor> *, TensorImpl tensor_impl, const TensorShape &shape,
+                             DataType type, const unsigned char *data = nullptr);
+
+  // Create a copy of the input tensor
+  // @param out [out] output tensor to be generated
+  // @param in [in] original tensor to be copied
+  // @return Status
+  static Status CreateTensor(std::shared_ptr<Tensor> *out, const std::shared_ptr<Tensor> &in) {
+    const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
+    *out = std::allocate_shared<Tensor>(*alloc, in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes());
+    return Status::OK();
+  }
+
+#ifdef ENABLE_PYTHON
+  // A static factory method to create a Tensor from a given py::array.
+  // @param ptr output argument to hold the created Tensor
+  // @param arr py::array
+  // @return Status Code
+  static Status CreateTensor(std::shared_ptr<Tensor> *ptr, py::array arr);
+
+  // Helper function to create a tensor from Numpy arrays of strings
+  static Status CreateTensorFromNumpyString(std::shared_ptr<Tensor> *ptr, py::array arr);
+#endif
+
+  // A static factory method to create a Tensor from a given list of strings.
+  // @param ptr output argument to hold the created Tensor
+  // @param strings elements of the tensor
+  // @param shape shape of the tensor
+  // @return Status Code
+  static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std::string> &strings,
+                             const TensorShape &shape = TensorShape::CreateUnknownRankShape());
+
+  // create tensor from protobuf bytelist with strings
+  static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list,
+                             const TensorShape &shape);
+
+  // A static factory method to create a Tensor from a given list of numbers.
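+  //
+  // A usage sketch for the numeric overload declared below (the calling code is hypothetical):
+  // the element type of `items` fixes the Tensor's DataType via DataType::FromCType<T>(), and an
+  // omitted shape defaults to the 1-D shape {items.size()}, e.g.
+  //   std::shared_ptr<Tensor> t;
+  //   Status rc = Tensor::CreateTensor(&t, std::vector<float>{1.0f, 2.0f, 3.0f});
+  //   // on success, t is a float32 tensor of shape {3}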
+  // @param ptr output argument to hold the created Tensor
+  // @param items elements of the tensor
+  // @param shape shape of the tensor
+  // @return Status Code
+  template <typename T>
+  static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<T> &items,
+                             const TensorShape &shape_req = TensorShape::CreateUnknownRankShape()) {
+    DataType type = DataType::FromCType<T>();
+    auto items_ptr = reinterpret_cast<const unsigned char *>(&items[0]);
+    TensorShape shape = shape_req;
+    if (!shape.known()) {
+      shape = TensorShape({static_cast<dsize_t>(items.size())});
+    }
+    return CreateTensor(ptr, TensorImpl::kFlexible, shape, type, items_ptr);
+  }
+
+  // A static factory method to create a Tensor from a given number.
+  // @param ptr output argument to hold the created Tensor
+  // @param item value
+  // @return Status Code
+  template <typename T>
+  static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const T &item) {
+    return CreateTensor(ptr, {item}, TensorShape::CreateScalar());
+  }
+
+  // Create tensor from protobuf bytelist with uint8 or int8 types
+  static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list,
+                             const TensorShape &shape, const DataType &type, dsize_t pad_size);
+
+  static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const std::string &path);
+
+  // Copy raw data of an array based on shape and strides to the destination pointer
+  // @param dst Pointer to the destination array where the content is to be copied
+  // @param src Pointer to the source strided array to be copied
+  // @param shape - shape of the source array
+  // @param strides - strides of the source array
+  // @param type_size - number of bytes needed to store one array element's type
+  // @return Status Code
+  static Status CopyStridedArray(unsigned char *dst, unsigned char *src, std::vector<dsize_t> shape,
+                                 std::vector<dsize_t> strides, uint8_t type_size);
+
+  // Release the memory using the allocator
+  virtual ~Tensor();
+
+  // compare the tensor shape and data
+  bool operator==(const Tensor &rhs) const;
+
+  bool operator!=(const Tensor &rhs) const { return !((*this) == rhs); }
+
+  // Get item located at `index`, caller needs to provide the type.
+  // @tparam T
+  // @param index vector<dsize_t>
+  // @return return the item specified at index
+  template <typename T>
+  Status GetItemAt(T *o, const std::vector<dsize_t> &index) const;
+
+  // Get string located at `index`.
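+  //
+  // A read-back sketch covering both overloads (illustrative; `t` is an assumed 2-D tensor):
+  //   float v;
+  //   RETURN_IF_NOT_OK(t->GetItemAt<float>(&v, {0, 1}));  // numeric overload above
+  //   std::string_view s;
+  //   RETURN_IF_NOT_OK(t->GetItemAt(&s, {0}));            // string overload documented next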
+  // @param index vector<dsize_t>
+  // @return return std::string_view specified at index
+  Status GetItemAt(std::string_view *o, const std::vector<dsize_t> &index) const;
+
+  template <typename T>
+  Status GetUnsignedIntAt(T *o, const std::vector<dsize_t> &index) const;
+
+  template <typename T>
+  Status GetSignedIntAt(T *o, const std::vector<dsize_t> &index) const;
+
+  template <typename T>
+  Status GetFloatAt(T *o, const std::vector<dsize_t> &index) const;
+
+  // set item at location specified by index
+  // @tparam `T`
+  // @param index
+  // @param value of type `T`
+  template <typename T>
+  Status SetItemAt(const std::vector<dsize_t> &index, const T &value) {
+    RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes()));
+    T *ptr = nullptr;
+    RETURN_IF_NOT_OK(GetItemPtr(&ptr, index));
+    *ptr = value;
+    return Status::OK();
+  }
+
+  // set string item at location specified by index
+  // @param index
+  // @param value of type std::string
+  Status SetItemAt(const std::vector<dsize_t> &index, const std::string &value) {
+    RETURN_UNEXPECTED_IF_NULL(data_);
+    uchar *ptr = nullptr;
+    offset_t length = 0;
+    RETURN_IF_NOT_OK(GetItemPtr(&ptr, index, &length));
+    if (value.length() != length) {
+      RETURN_STATUS_UNEXPECTED("Length of the new string does not match the item.");
+    }
+    memcpy_s(reinterpret_cast<char *>(ptr), length, value.c_str(), length);
+
+    return Status::OK();
+  }
+  // fill tensor with zeros. Does not support strings.
+  Status Zero() {
+    CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use Zero on tensor of strings.");
+    dsize_t size = SizeInBytes();
+    CHECK_FAIL_RETURN_UNEXPECTED(memset_sp(GetMutableBuffer(), size, 0, size) == 0,
+                                 "Failed to fill tensor with zeroes.");
+    return Status::OK();
+  }
+
+  // Fill all elements in the Tensor with the given value of type `T`. Does not support strings.
+  // @tparam T
+  // @param value
+  template <typename T>
+  Status Fill(const T &value) {
+    CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use Fill on tensor of strings.");
+    RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes()));
+    int64_t cellSize = type_.SizeInBytes();
+    if ((data_ != nullptr) && type_.IsCompatible<T>()) {
+      for (dsize_t i = 0; i < Size(); i++) {
+        CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s((data_ + i * cellSize), cellSize, &value, cellSize) == 0, "memcpy err");
+      }
+      return Status::OK();
+    } else {
+      std::string err;
+      err += (data_ == nullptr) ? "data_ is nullptr \t" : "";
+      err += !type_.IsCompatible<T>() ? "data type not compatible\t" : "";
+      return Status(StatusCode::kUnexpectedError, err);
+    }
+  }
+
+  // Getter function for shape
+  // @return
+  const TensorShape &shape() const { return shape_; }
+
+  /// Check if tensor has data
+  /// \return bool - true if tensor is not empty
+  bool HasData() const;
+
+  // Reshape the tensor. The given shape should have the same number of elements as the Tensor.
+  // @param shape
+  virtual Status Reshape(const TensorShape &shape);
+
+  // @return number of elements in this tensor
+  dsize_t Size() const { return shape().NumOfElements(); }
+
+  // @return the number of bytes this tensor needs
+  dsize_t SizeInBytes() const {
+    if (data_end_ == nullptr) return type_.SizeInBytes() * shape_.NumOfElements();
+    return data_end_ - data_;
+  }
+
+  // @return the rank of the tensor
+  dsize_t Rank() const { return shape().Rank(); }
+
+  // Get the starting memory address as a constant for the data of the tensor. This potentially
+  // drives an allocation if the data is null.
+  // @return const unsigned char*
+  const unsigned char *GetBuffer() const;
+
+  // Skip the offsets and return the start of the buffer where the real strings are stored. Caller needs to check if the
+  // tensor's type is a string, otherwise an undefined address would be returned.
+  // @return address of the first string of the tensor.
+  uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; }
+
+  // Getter of the type
+  // @return
+  DataType type() const { return type_; }
+
+  // Provide stream operator for displaying it
+  // @param output stream
+  // @param so the Tensor object to be printed
+  // @return output stream
+  friend std::ostream &operator<<(std::ostream &out, const Tensor &so) {
+    so.Print(out);
+    return out;
+  }
+
+  // Invalidate this Tensor by setting the type and shape to unknown and the data to null.
+  // Calling this method will make the Tensor and its data inaccessible, use it with caution.
+  void Invalidate();
+
+  // Copy input tensor into self at the location index.
+  // Index is a vector of axes which can be incomplete:
+  // Ex: shape <2,3>, inserting into index {0} will replace the first row. index {1,2} will replace the last cell.
+  // @param index
+  // @param input
+  // @return Status code
+  Status InsertTensor(const std::vector<dsize_t> &index, const std::shared_ptr<Tensor> &input);
+
+  // Find the address of the given index. Used in InsertTensor.
+  // Example:
+  //      Tensor t= [[1,2],[3,4]] , StartAddrOfIndex({0}) -> &1
+  // @param index incomplete index
+  // @param output: startAddrofIndex
+  // @param output: remaining
+  // @return Status code
+  Status StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_index, TensorShape *remaining);
+
+  // Expand the shape of the Tensor with one extra dimension.
+  // For example, if the shape is <512,512,3>:
+  //     *- ExpandDim(0) gives: <1,512,512,3>
+  //     *- ExpandDim(1) gives: <512,1,512,3>
+  //     *- ExpandDim(3) gives: <512,512,3,1>
+  // @param axis location of the dim
+  virtual Status ExpandDim(const dsize_t &axis);
+
+  virtual void Squeeze();
+
+  // Calculates the strides of the Tensor
+  // Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte)
+  // The strides will be {4,2,1}.
+  // Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 bytes)
+  // The strides will be {16,8,4}.
+  // @return vector of integers
+  std::vector<dsize_t> Strides();
+
+  std::string ToString() {
+    std::stringstream ss;
+    this->Print(ss);
+    return ss.str();
+  }
+
+  // Handle negative indices.
+  static inline dsize_t HandleNeg(dsize_t index, dsize_t length) { return (index < 0) ? (index + length) : index; }
+
+  // Slice tensor based on the given indices. Copy the sliced data into the out tensor. Only rank-1 tensors are
+  // supported. Based on the type of tensor, SliceNumeric or SliceString will be called.
+  // @param out Tensor
+  // @param indices vector of indices
+  // @return Status error code
+  Status Slice(std::shared_ptr<Tensor> *out, const std::vector<dsize_t> &indices);
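+  //
+  // A usage sketch (illustrative; `t` is an assumed rank-1 tensor):
+  //   std::shared_ptr<Tensor> sliced;
+  //   RETURN_IF_NOT_OK(t->Slice(&sliced, std::vector<dsize_t>{0, 2}));  // keep elements 0 and 2
+  // Negative indices can first be normalized with HandleNeg() above.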
+
+  // Slice numeric tensors.
+  Status SliceNumeric(std::shared_ptr<Tensor> *out, const std::vector<dsize_t> &indices);
+
+  // Slice string tensors
+  Status SliceString(std::shared_ptr<Tensor> *out, const std::vector<dsize_t> &indices);
+
+#ifdef ENABLE_PYTHON
+  // Constructs a numpy array from the input tensor
+  // @param data this data is the location of python data
+  // @return Status code
+  Status GetDataAsNumpy(py::array *data);
+
+  Status GetDataAsNumpyStrings(py::array *data);
+
+  static Status GetBufferInfo(Tensor *t, py::buffer_info *out);
+#endif
+
+  // Concatenate based on the given tensor; can fill in the current tensor with a smaller one, unlike InsertTensor
+  Status Concatenate(const std::vector<dsize_t> &index, const std::shared_ptr<Tensor> &input);
+
+  // TensorIterator is a linear iterator that can be used to iterate over the elements of the Tensor
+  // The order of elements is as the memory layout (i.e., row-major) [[1,2,3],[4,5,6]] --> 1,2,3,4,5,6
+  // @tparam T type of values in the Tensor Iterator
+  template <typename T, bool = true>
+  class TensorIterator {
+   public:
+    using iterator_category = std::random_access_iterator_tag;
+    using value_type = T;
+    using difference_type = ptrdiff_t;
+    using pointer = T *;
+    using reference = T &;
+
+    explicit TensorIterator(uchar *ptr = nullptr) { ptr_ = reinterpret_cast<T *>(ptr); }
+
+    TensorIterator(const TensorIterator &raw_iterator) { ptr_ = raw_iterator.ptr_; }
+
+    ~TensorIterator() = default;
+
+    TensorIterator &operator=(const TensorIterator &rhs) {
+      ptr_ = rhs.ptr_;
+      return *this;
+    }
+
+    TensorIterator &operator=(T *rhs) {
+      ptr_ = rhs;
+      return *this;
+    }
+
+    bool operator==(const TensorIterator &rhs) { return ptr_ == rhs.ptr_; }
+
+    bool operator!=(const TensorIterator &rhs) { return !(*this == rhs); }
+
+    operator bool() const { return ptr_ != nullptr; }
+
+    T &operator*() { return *ptr_; }
+
+    const T &operator*() const { return *ptr_; }
+
+    T *operator->() { return ptr_; }
+
+    TensorIterator &operator+=(const ptrdiff_t &inc) {
+      ptr_ += inc;
+      return *this;
+    }
+
+    TensorIterator &operator-=(const ptrdiff_t &inc) {
+      ptr_ -= inc;
+      return *this;
+    }
+
+    TensorIterator &operator++() {
+      ++ptr_;
+      return *this;
+    }
+
+    TensorIterator &operator--() {
+      --ptr_;
+      return *this;
+    }
+
+    TensorIterator operator++(int) {
+      auto temp(*this);
+      ++ptr_;
+      return temp;
+    }
+
+    TensorIterator operator--(int) {
+      auto temp(*this);
+      --ptr_;
+      return temp;
+    }
+
+    TensorIterator operator+(const ptrdiff_t &inc) {
+      auto oldPtr = ptr_;
+      ptr_ += inc;
+      auto temp(*this);
+      ptr_ = oldPtr;
+      return temp;
+    }
+
+    TensorIterator operator-(const ptrdiff_t &inc) {
+      auto oldPtr = ptr_;
+      ptr_ -= inc;
+      auto temp(*this);
+      ptr_ = oldPtr;
+      return temp;
+    }
+
+   protected:
+    T *ptr_;
+  };
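+
+  // An iteration sketch (illustrative; `t` is an assumed float32 tensor created as above):
+  //   double sum = 0.0;
+  //   for (auto it = t->begin<float>(); it != t->end<float>(); ++it) {
+  //     sum += *it;  // elements are visited in row-major order
+  //   }
+  // begin()/end() themselves are declared after the string specialization below.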
+
+  // Specialization of TensorIterator for strings. It returns std::string_view for every item.
+  // @tparam DUMMY, used to be able to specialize the inner class
+  template <bool DUMMY>
+  class TensorIterator<std::string_view, DUMMY> {
+   public:
+    using iterator_category = std::random_access_iterator_tag;
+    using value_type = std::string_view;
+    using difference_type = ptrdiff_t;
+    using pointer = std::string_view *;
+    using reference = std::string_view &;
+
+    explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) {
+      data_ = reinterpret_cast<const char *>(data);
+      index_ = index;
+    }
+
+    TensorIterator(const TensorIterator &raw_iterator) {
+      data_ = raw_iterator.data_;
+      index_ = raw_iterator.index_;
+    }
+
+    ~TensorIterator() = default;
+
+    bool operator==(const TensorIterator &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; }
+
+    bool operator!=(const TensorIterator &rhs) { return !(*this == rhs); }
+
+    operator bool() const { return data_ != nullptr; }
+
+    std::string_view operator*() const {
+      auto offset_ = reinterpret_cast<const offset_t *>(data_);
+      offset_t start = offset_[index_];
+      return std::string_view{data_ + start};
+    }
+
+    TensorIterator &operator+=(const dsize_t &inc) {
+      index_ += inc;
+      return *this;
+    }
+
+    TensorIterator &operator-=(const dsize_t &inc) {
+      index_ -= inc;
+      return *this;
+    }
+
+    TensorIterator &operator++() {
+      ++index_;
+      return *this;
+    }
+
+    TensorIterator &operator--() {
+      --index_;
+      return *this;
+    }
+
+    TensorIterator operator++(int) {
+      auto temp(*this);
+      ++index_;
+      return temp;
+    }
+
+    TensorIterator operator--(int) {
+      auto temp(*this);
+      --index_;
+      return temp;
+    }
+
+    TensorIterator operator+(const dsize_t &inc) {
+      auto oldPtr = index_;
+      index_ += inc;
+      auto temp(*this);
+      index_ = oldPtr;
+      return temp;
+    }
+
+    TensorIterator operator-(const dsize_t &inc) {
+      auto oldPtr = index_;
+      index_ -= inc;
+      auto temp(*this);
+      index_ = oldPtr;
+      return temp;
+    }
+
+   protected:
+    dsize_t index_;
+    const char *data_;
+  };
+
+  // Return a TensorIterator that points to the start of the Tensor.
+  // It's the user's responsibility to use the correct type that matches the Tensor type.
+  // @param T The type of values in the Tensor
+  // @return TensorIterator
+  template <typename T>
+  TensorIterator<T> begin() {
+    AllocateBuffer(SizeInBytes());
+    return TensorIterator<T>(data_);
+  }
+
+  // Return a linear iterator that points to the place after the last element of the Tensor.
+  // @tparam T The type of values in the Tensor
+  // @return TensorIterator
+  template <typename T>
+  TensorIterator<T> end() {
+    return TensorIterator<T>(data_end_);
+  }
+
+  // Copies the last dimension at `index` from Tensor `src` to this Tensor.
+  // @param src Tensor
+  // @param index vector to the start of the dimension. The last dim should be 0.
+  // @return Status
+  Status CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index);
+
+ protected:
+  // Get the starting memory address for the data of the tensor. This potentially
+  // drives an allocation if the data is null.
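+  // (A sketch of the intended pattern, with a hypothetical member or friend as the caller:
+  //    unsigned char *buf = GetMutableBuffer();  // may allocate
+  //    memcpy_s(buf, SizeInBytes(), src, src_len);
+  // GetBuffer() above is the const counterpart used by readers.)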
+ // @return unsigned char* + unsigned char *GetMutableBuffer(); + + // A function that prints Tensor recursively, first called by print + // @param out + // @param cur_dim + // @param cur_index + void PrintRecursive(std::ostream &out, int32_t cur_dim, const std::vector &cur_index) const; + + // A function that prints info about the tensor + // @param out output stream + void Print(std::ostream &out) const; + + // A function that print the value as specified by its index + // @param index vector representing the index + // @param out + void PrintItemAt(const std::vector &index, std::ostream &out) const; + + // Get pointer to item located at `index`, caller needs to provide the type. + // @tparam T + // @param index vector + // @return return a pointer to the item specified at index of type `T` + template + Status GetItemPtr(T **, const std::vector &index) const; + + // Get pointer to string located at `index` and the length of string + // @param index vector + // @return return a pointer to the string specified at index and the length of the string + Status GetItemPtr(uchar **, const std::vector &index, offset_t *length = nullptr) const; + + // Given a flat index of an item string, return the start and length of the item + // @param index flat index of the item + // @return start address of the ths string + // @return length of the string + Status GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const; + + // all access to shape_ should be via shape + TensorShape shape_; + // data type of tensor + DataType type_; + // pointer to the start of the physical data + unsigned char *data_; + // An allocator for data_ + CharAllocPtr data_allocator_; + // pointer to the end of the physical data + unsigned char *data_end_ = nullptr; +}; +template <> +inline Tensor::TensorIterator Tensor::end() { + return TensorIterator(data_, shape_.NumOfElements()); +} +} // namespace dataset +} // namespace mindspore +#endif // DATASET_CORE_TENSOR_H_ diff --git a/mindspore/ccsrc/minddata/dataset/include/utils/log_adapter.h b/mindspore/ccsrc/minddata/dataset/include/utils/log_adapter.h deleted file mode 120000 index f2c939bc0be..00000000000 --- a/mindspore/ccsrc/minddata/dataset/include/utils/log_adapter.h +++ /dev/null @@ -1 +0,0 @@ -../../../../utils/log_adapter.h \ No newline at end of file diff --git a/mindspore/ccsrc/minddata/dataset/include/utils/overload.h b/mindspore/ccsrc/minddata/dataset/include/utils/overload.h deleted file mode 120000 index 7dc313d512a..00000000000 --- a/mindspore/ccsrc/minddata/dataset/include/utils/overload.h +++ /dev/null @@ -1 +0,0 @@ -../../../../utils/overload.h \ No newline at end of file From abebb2004b3c276b19ac2421f1794ec790b07486 Mon Sep 17 00:00:00 2001 From: yao_yf Date: Fri, 17 Jul 2020 10:46:52 +0800 Subject: [PATCH 50/68] remove 4 reshape ut --- tests/ut/python/parallel/test_reshape.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ut/python/parallel/test_reshape.py b/tests/ut/python/parallel/test_reshape.py index 5fe48da1339..ce0af6d4979 100644 --- a/tests/ut/python/parallel/test_reshape.py +++ b/tests/ut/python/parallel/test_reshape.py @@ -344,11 +344,11 @@ def test_reshape_net2_2(): reshape_net2(ReshapeNet2(((1, 8), (8, 2)))) -def test_reshape_net3_1(): +def _test_reshape_net3_1(): reshape_net2(ReshapeNet3(((1, 8), (8, 1)))) -def test_reshape_net3_2(): +def _test_reshape_net3_2(): reshape_net2(ReshapeNet3(((1, 8), (8, 2)))) @@ -464,11 +464,11 @@ def test_reshape_common2_3(): 
reshape_common2(ParallelMode.SEMI_AUTO_PARALLEL, ReshapeNet2(((1, 8), (8, 2)))) -def test_reshape_common2_4(): +def _test_reshape_common2_4(): reshape_common2(ParallelMode.SEMI_AUTO_PARALLEL, ReshapeNet3(((1, 8), (8, 1)))) -def test_reshape_common2_5(): +def _test_reshape_common2_5(): reshape_common2(ParallelMode.SEMI_AUTO_PARALLEL, ReshapeNet3(((1, 8), (8, 2)))) From acbccea6445eb9489d38d1103efa449bed8e16ca Mon Sep 17 00:00:00 2001 From: Wei Luning Date: Wed, 15 Jul 2020 13:04:34 +0800 Subject: [PATCH 51/68] remove redundant phi --- mindspore/ccsrc/frontend/optimizer/py_pass.cc | 3 +- .../pipeline/jit/parse/function_block.cc | 29 +++++----- .../ccsrc/pipeline/jit/parse/function_block.h | 2 +- mindspore/ccsrc/pipeline/jit/parse/parse.cc | 56 +++++++++++++------ mindspore/ccsrc/utils/graph_utils.cc | 21 +++++++ mindspore/ccsrc/utils/graph_utils.h | 1 + mindspore/core/ir/anf.cc | 11 ++++ mindspore/core/ir/anf.h | 2 +- mindspore/core/ir/func_graph.cc | 9 +++ mindspore/core/ir/func_graph.h | 3 +- mindspore/core/ir/func_graph_cloner.cc | 16 ++++-- mindspore/core/ir/manager.cc | 7 +-- tests/ut/cpp/common/py_func_graph_fetcher.h | 1 + .../pynative_mode/test_multigraph_sink.py | 23 ++++++++ 14 files changed, 136 insertions(+), 48 deletions(-) diff --git a/mindspore/ccsrc/frontend/optimizer/py_pass.cc b/mindspore/ccsrc/frontend/optimizer/py_pass.cc index c1bf40fcbb3..34c46c6b66e 100644 --- a/mindspore/ccsrc/frontend/optimizer/py_pass.cc +++ b/mindspore/ccsrc/frontend/optimizer/py_pass.cc @@ -52,9 +52,10 @@ std::string GetNodeRepr(AnfNodePtr node) { void ResolveFuncGraph_(const FuncGraphPtr &fg) { auto manager = Manage(fg, false); + auto use_sig = parse::python_adapter::UseSignatureInResolve(); parse::python_adapter::set_use_signature_in_resolve(false); parse::ResolveAll(manager); - parse::python_adapter::set_use_signature_in_resolve(true); + parse::python_adapter::set_use_signature_in_resolve(use_sig); } bool Match(const AnfNodePtr &pattern, const AnfNodePtr &node, const NodeEquivPtr &equiv_ptr) { diff --git a/mindspore/ccsrc/pipeline/jit/parse/function_block.cc b/mindspore/ccsrc/pipeline/jit/parse/function_block.cc index b52dddda66f..14e9f739d52 100644 --- a/mindspore/ccsrc/pipeline/jit/parse/function_block.cc +++ b/mindspore/ccsrc/pipeline/jit/parse/function_block.cc @@ -145,6 +145,12 @@ AnfNodePtr FunctionBlock::MakeResolve(const NameSpacePtr &name_space, const Symb void FunctionBlock::SetPhiArgument(const ParameterPtr &phi) { std::string var = phi_nodes_[phi]; MS_LOG(DEBUG) << "graph " << func_graph_->ToString() << " set phi " << phi->ToString() << " for var " << var; + auto removable = CollectRemovablePhi(phi); + // If the phi node is not necessary, not need to add to jumps_ of the prev blocks. + if (removable) { + MS_LOG(DEBUG) << "remove the phi when call graph " << func_graph_->ToString() << " var " << var; + return; + } for (auto &pred : prev_blocks_) { MS_EXCEPTION_IF_NULL(pred); MS_LOG(DEBUG) << "graph " << func_graph_->ToString() << " pred_blocks_ " << pred->func_graph_->ToString(); @@ -152,16 +158,6 @@ void FunctionBlock::SetPhiArgument(const ParameterPtr &phi) { CNodePtr jump = pred->jumps_[this]; jump->add_input(arg_node); } - // If the phi node in the body part of a for/while loop is being removed, - // then the closure convert phase will generate a cycle in graph if the - // loop is kept after specialization. This should be investigate further. 
- // Just now user has to set a flag on a function to indicate the for loop - // will definitely can be unroll as the sequence in for statement is fixed - // size in compile time. - if (parser_.func_graph()->has_flag(GRAPH_FLAG_LOOP_CAN_UNROLL) || - parser_.func_graph()->has_flag(GRAPH_FLAG_HAS_EFFECT)) { - CollectRemovablePhi(phi); - } } AnfNodePtr FunctionBlock::SearchReplaceNode(const std::string &var, const ParameterPtr &phi) { @@ -207,13 +203,13 @@ AnfNodePtr FunctionBlock::SearchReplaceNode(const std::string &var, const Parame // 2. it's costly to iterate the graph to replace the phi for each phi. // Args : // phi : This parameter node is functioning as a phi node. -void FunctionBlock::CollectRemovablePhi(const ParameterPtr &phi) { +bool FunctionBlock::CollectRemovablePhi(const ParameterPtr &phi) { MS_EXCEPTION_IF_NULL(phi); std::string var = phi_nodes_[phi]; - MS_LOG(DEBUG) << "check phi " << phi->ToString() << " for " << var << " in graph " << func_graph_->ToString(); + MS_LOG(DEBUG) << "check phi " << phi->DebugString() << " for " << var; if (prev_blocks_.size() == 0) { - MS_LOG(DEBUG) << "no phi " << phi->ToString() << " for var " << var << " in graph " << func_graph_->ToString(); - return; + MS_LOG(DEBUG) << "no phi " << phi->DebugString() << " for var " << var; + return false; } AnfNodePtr arg_node = SearchReplaceNode(var, phi); if (arg_node != nullptr) { @@ -235,13 +231,16 @@ void FunctionBlock::CollectRemovablePhi(const ParameterPtr &phi) { const auto ¶m = phi_iter.second->cast(); if (param == phi) { MS_LOG(DEBUG) << "graph " << prev->func_graph_->ToString() << " var " << phi_iter.first->DebugString() - << " can be replaced from " << param->DebugString() << " with " << arg_node->DebugString(); + << " can be replaced from " << param->DebugString() << " with " << arg_node->DebugString() + << " in graph " << arg_node->func_graph()->ToString(); prev->removable_phis_[phi_iter.first] = arg_node; } } } } + return true; } + return false; } // A block should be marked matured if its predecessor blocks have been processed diff --git a/mindspore/ccsrc/pipeline/jit/parse/function_block.h b/mindspore/ccsrc/pipeline/jit/parse/function_block.h index cbf75a3dd84..2331eeca47e 100644 --- a/mindspore/ccsrc/pipeline/jit/parse/function_block.h +++ b/mindspore/ccsrc/pipeline/jit/parse/function_block.h @@ -52,7 +52,7 @@ class FunctionBlock : public std::enable_shared_from_this { AnfNodePtr ReadVariable(const std::string &var_name); void AddPrevBlock(const FunctionBlockPtr &block); void SetPhiArgument(const ParameterPtr &phi); - void CollectRemovablePhi(const ParameterPtr &phi); + bool CollectRemovablePhi(const ParameterPtr &phi); // A block is matured if all its predecessors is generated void Mature(); CNodePtr ForceToBoolNode(const AnfNodePtr &cond); diff --git a/mindspore/ccsrc/pipeline/jit/parse/parse.cc b/mindspore/ccsrc/pipeline/jit/parse/parse.cc index edc9a66594b..b2e95c50703 100644 --- a/mindspore/ccsrc/pipeline/jit/parse/parse.cc +++ b/mindspore/ccsrc/pipeline/jit/parse/parse.cc @@ -1436,6 +1436,15 @@ FunctionBlockPtr Parser::ParsePass(const FunctionBlockPtr &block, const py::obje return block; } +AnfNodePtr FindPhis(const std::unordered_map &removable_phis, const AnfNodePtr &node) { + const auto &inp = node->cast(); + const auto &iter = removable_phis.find(inp); + if (iter == removable_phis.end()) { + return node; + } + return FindPhis(removable_phis, iter->second); +} + void Parser::RemoveUnnecessaryPhis() { // merge all removable phis to one map; std::unordered_map removable_phis; @@ 
-1443,28 +1452,39 @@ void Parser::RemoveUnnecessaryPhis() { MS_EXCEPTION_IF_NULL(block); removable_phis.insert(block->removable_phis().begin(), block->removable_phis().end()); } - if (removable_phis.size() == 0) { return; } - for (auto &node : DeepUsedGraphSearch(func_graph_->get_return())) { - if (node->isa()) { - const auto &cnode = node->cast(); - auto &inputs = cnode->inputs(); - for (std::size_t i = 0; i < inputs.size(); i++) { - if (inputs[i]->isa()) { - const auto &inp = inputs[i]->cast(); - const auto &iter = removable_phis.find(inp); - if (iter == removable_phis.end()) { - continue; - } - auto &argNode = iter->second; - MS_LOG(DEBUG) << "graph " << cnode->func_graph()->ToString() << " replace phi " << inp->ToString() << " in " - << cnode->DebugString() << " with " << argNode->DebugString(); - cnode->set_input(i, argNode); - } - } + + auto fg_name = func_graph_->ToString(); + auto mng = Manage(func_graph_, false); + // replace the nodes + for (auto iter : removable_phis) { + auto new_node = FindPhis(removable_phis, iter.first); + MS_LOG(DEBUG) << "phi " << iter.first->DebugString() << " to " << new_node->DebugString(); + mng->Replace(iter.first, new_node); + } + // remove the parameter + for (FunctionBlockPtr &block : func_block_list_) { + MS_EXCEPTION_IF_NULL(block); + auto &local_removable_phis = block->removable_phis(); + if (local_removable_phis.size() == 0) { + continue; } + auto func_graph = block->func_graph(); + auto ¶meters = func_graph->parameters(); + std::vector new_parameters(parameters.size()); + auto it = std::copy_if( + parameters.begin(), parameters.end(), new_parameters.begin(), [&local_removable_phis](AnfNodePtr param) { + return local_removable_phis.find(param->cast()) == local_removable_phis.end(); + }); + + // shrink container to new size + new_parameters.resize(std::distance(new_parameters.begin(), it)); + func_graph->set_parameters(new_parameters); + } + for (auto fg : mng->func_graphs()) { + fg->ClearAllManagerInfo(); } } diff --git a/mindspore/ccsrc/utils/graph_utils.cc b/mindspore/ccsrc/utils/graph_utils.cc index 03ac14573df..6689719fccd 100644 --- a/mindspore/ccsrc/utils/graph_utils.cc +++ b/mindspore/ccsrc/utils/graph_utils.cc @@ -111,6 +111,27 @@ std::vector BroadFirstSearchGraphCNodes(CNodePtr ret) { return sorted_nodes; } +std::vector BroadFirstSearchGraphUsed(FuncGraphPtr root) { + std::deque todo; + todo.push_back(root); + std::vector sorted; + auto seen = NewSeenGeneration(); + while (!todo.empty()) { + FuncGraphPtr top = todo.front(); + todo.pop_front(); + sorted.push_back(top); + auto used = top->func_graphs_used(); + for (auto &item : used) { + if (item.first->seen_ == seen) { + continue; + } + todo.push_back(item.first); + item.first->seen_ = seen; + } + } + return sorted; +} + std::vector SuccDeeper(const AnfNodePtr &node) { std::vector vecs; if (node == nullptr) { diff --git a/mindspore/ccsrc/utils/graph_utils.h b/mindspore/ccsrc/utils/graph_utils.h index 2a9240ac849..8eb75f67993 100644 --- a/mindspore/ccsrc/utils/graph_utils.h +++ b/mindspore/ccsrc/utils/graph_utils.h @@ -70,6 +70,7 @@ std::vector TopoSort(const AnfNodePtr &root, const SuccFunc &succ = const IncludeFunc &include = AlwaysInclude); std::vector BroadFirstSearchGraphCNodes(CNodePtr ret); +std::vector BroadFirstSearchGraphUsed(FuncGraphPtr root); class FuncGraphIndex { public: explicit FuncGraphIndex(const FuncGraphPtr &fg, const SearchFunc &search = DeepScopedGraphSearch, diff --git a/mindspore/core/ir/anf.cc b/mindspore/core/ir/anf.cc index 0d96ddf263c..275bd3b206a 100644 --- 
a/mindspore/core/ir/anf.cc +++ b/mindspore/core/ir/anf.cc @@ -77,6 +77,17 @@ std::string CNode::DebugString(int recursive_level) const { return buffer.str(); } +std::string Parameter::DebugString(int recursive_level) const { + std::ostringstream buffer; + if (recursive_level > 0) { + if (func_graph() != nullptr) { + buffer << func_graph()->ToString() << ":"; + } + } + buffer << ToString(); + return buffer.str(); +} + std::string ValueNode::ToString() const { MS_EXCEPTION_IF_NULL(value_); if (value_->isa()) { diff --git a/mindspore/core/ir/anf.h b/mindspore/core/ir/anf.h index c1a28d57f18..961dcde8a78 100644 --- a/mindspore/core/ir/anf.h +++ b/mindspore/core/ir/anf.h @@ -249,7 +249,7 @@ class Parameter : public ANode { MS_DECLARE_PARENT(Parameter, ANode); void accept(AnfVisitor *v) override; - + std::string DebugString(int recursive_level = 1) const override; std::string name() const { return name_; } void set_name(const std::string &name) { name_ = name; } std::string fullname_with_scope() override { return name(); }; diff --git a/mindspore/core/ir/func_graph.cc b/mindspore/core/ir/func_graph.cc index fabdd3e7d32..570ed61f96a 100644 --- a/mindspore/core/ir/func_graph.cc +++ b/mindspore/core/ir/func_graph.cc @@ -417,6 +417,15 @@ std::shared_ptr> FuncGraph::recursive_graphs() { return mng->recursive_graphs(shared_from_base()); } +void FuncGraph::ClearAllManagerInfo() { + ClearNodes(); + ClearValueNodes(); + ClearFuncGraphCNodesIndex(); + ClearFreeVariables(); + ClearFuncGraphsUsed(); + ClearJFuncGraphs(); +} + AnfNodePtr FuncGraph::GetDefaultValueByName(const std::string &name) { auto itr = this->parameter_default_value_.find(name); if (itr == parameter_default_value_.end()) { diff --git a/mindspore/core/ir/func_graph.h b/mindspore/core/ir/func_graph.h index 712c75b4315..fd7f5d9d48d 100644 --- a/mindspore/core/ir/func_graph.h +++ b/mindspore/core/ir/func_graph.h @@ -229,7 +229,8 @@ class FuncGraph : public FuncGraphBase { } this->debug_info_ = info; } - + // clear all info from manager + void ClearAllManagerInfo(); // get all nodes belonging to this func graph const AnfNodeSet &nodes(); void CopyNodes(const FuncGraphPtr &source); diff --git a/mindspore/core/ir/func_graph_cloner.cc b/mindspore/core/ir/func_graph_cloner.cc index 0857770cad5..432a924b1e5 100644 --- a/mindspore/core/ir/func_graph_cloner.cc +++ b/mindspore/core/ir/func_graph_cloner.cc @@ -25,6 +25,7 @@ #include "utils/log_adapter.h" #include "utils/profile.h" #include "utils/context/ms_context.h" +#include "utils/graph_utils.h" // namespace to support intermediate representation definition namespace mindspore { @@ -400,11 +401,16 @@ void Cloner::LiftParameters(const FuncGraphPtr &func_graph_user, const FuncGraph } void Cloner::Lift() { - for (auto &func_graph_params : repl_func_graph_params_) { - auto &func_graph = func_graph_params.first; - auto ¶ms = func_graph_params.second; - for (auto &cnode : func_graph->func_graph_cnodes_index()) { - LiftParameters(cnode.first->first->func_graph(), func_graph, params); + // lift inner graph first + auto sorted = BroadFirstSearchGraphUsed(*(manager_->roots().begin())); + for (auto r_iter = sorted.rbegin(); r_iter != sorted.rend(); ++r_iter) { + auto func_graph = *r_iter; + auto iter = repl_func_graph_params_.find(func_graph); + if (iter != repl_func_graph_params_.end()) { + auto ¶ms = iter->second; + for (auto &cnode : func_graph->func_graph_cnodes_index()) { + LiftParameters(cnode.first->first->func_graph(), func_graph, params); + } } } } diff --git a/mindspore/core/ir/manager.cc 
b/mindspore/core/ir/manager.cc index 00c39679cd5..5c996bcdab5 100644 --- a/mindspore/core/ir/manager.cc +++ b/mindspore/core/ir/manager.cc @@ -520,12 +520,7 @@ void FuncGraphManager::MoveAllNodes(FuncGraphPtr source, FuncGraphPtr target) { target->CopyFuncGraphsUsed(source); target->CopyJFuncGraphs(source); signals_->InvalidateComputer(); - source->ClearNodes(); - source->ClearValueNodes(); - source->ClearFuncGraphCNodesIndex(); - source->ClearFreeVariables(); - source->ClearFuncGraphsUsed(); - source->ClearJFuncGraphs(); + source->ClearAllManagerInfo(); } FuncGraphTransaction FuncGraphManager::Transact() { diff --git a/tests/ut/cpp/common/py_func_graph_fetcher.h b/tests/ut/cpp/common/py_func_graph_fetcher.h index d864842760e..ae9467cef1f 100644 --- a/tests/ut/cpp/common/py_func_graph_fetcher.h +++ b/tests/ut/cpp/common/py_func_graph_fetcher.h @@ -72,6 +72,7 @@ class PyFuncGraphFetcher { mindspore::FuncGraphPtr func_graph = mindspore::parse::ParsePythonCode(fn); if (doResolve_) { std::shared_ptr manager = mindspore::Manage(func_graph, false); + mindspore::parse::python_adapter::set_use_signature_in_resolve(false); mindspore::parse::ResolveAll(manager); } return func_graph; diff --git a/tests/ut/python/pynative_mode/test_multigraph_sink.py b/tests/ut/python/pynative_mode/test_multigraph_sink.py index e8ebe03797d..c4ef44ef5a2 100644 --- a/tests/ut/python/pynative_mode/test_multigraph_sink.py +++ b/tests/ut/python/pynative_mode/test_multigraph_sink.py @@ -131,3 +131,26 @@ def test_while_in_while(): output = while_in_while(c1, c2, c3) expect = Tensor([1274], mstype.int32) assert output == expect + + +@ms_function +def while_by_while_in_while(x, y, z): + out = c4 + while x < c2: + y = c4 + c4 + while y < c2: + y = y + 1 + out = out + y + z = c4 + c4 + while z < c2: + z = z + 1 + out = out + z + x = x + 1 + out = out + x + return out + + +def test_while_by_while_in_while(): + output = while_by_while_in_while(c1, c2, c3) + expect = Tensor([350], mstype.int32) + assert output == expect From f3f9fc958ab043e98797e9e6b1a4d304e0896862 Mon Sep 17 00:00:00 2001 From: CaoJian Date: Thu, 16 Jul 2020 22:54:53 +0800 Subject: [PATCH 52/68] add GPU operator: abs and floor --- .../gpu/cuda_impl/unary_op_impl.cu | 43 +++++++++++++++++++ .../gpu/cuda_impl/unary_op_impl.cuh | 4 ++ .../gpu/math/unary_op_gpu_kernel.cc | 8 ++++ .../gpu/math/unary_op_gpu_kernel.h | 14 +++++- 4 files changed, 68 insertions(+), 1 deletion(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/unary_op_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/unary_op_impl.cu index 09b347e3d51..629c4c29dc7 100755 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/unary_op_impl.cu +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/unary_op_impl.cu @@ -103,6 +103,35 @@ __global__ void ZeroslikeKernel(T *output, size_t count) { return; } template +__global__ void AbsKernel(T *input, T *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = abs(input[i]); + } + return; +} +template <> +__global__ void AbsKernel(half *input, half *output, size_t count) { + half zero = 0.0; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = input[i] < zero ? 
-input[i] : input[i]; + } + return; +} +template +__global__ void FloorKernel(T *input, T *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = floor(input[i]); + } + return; +} +template <> +__global__ void FloorKernel(half *input, half *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = hfloor(input[i]); + } + return; +} +template void Exponential(T *input, T *output, size_t count, cudaStream_t cuda_stream) { ExponentialKernel<<>>(input, output, count); return; @@ -147,6 +176,16 @@ void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream) { ZeroslikeKernel<<>>(output, count); return; } +template +void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream) { + AbsKernel<<>>(input, output, count); + return; +} +template +void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream) { + FloorKernel<<>>(input, output, count); + return; +} template void Exponential(float *input, float *output, size_t count, cudaStream_t cuda_stream); template void Logarithm(float *input, float *output, size_t count, cudaStream_t cuda_stream); @@ -156,6 +195,8 @@ template void Square(float *input, float *output, size_t count, cudaStrea template void Sqrt(float *input, float *output, size_t count, cudaStream_t cuda_stream); template void Rsqrt(float *input, float *output, size_t count, cudaStream_t cuda_stream); template void Zeroslike(float *output, size_t count, cudaStream_t cuda_stream); +template void Abs(float *input, float *output, size_t count, cudaStream_t cuda_stream); +template void Floor(float *input, float *output, size_t count, cudaStream_t cuda_stream); template void Exponential(half *input, half *output, size_t count, cudaStream_t cuda_stream); template void Logarithm(half *input, half *output, size_t count, cudaStream_t cuda_stream); template void Negative(half *input, half *output, size_t count, cudaStream_t cuda_stream); @@ -164,3 +205,5 @@ template void Square(half *input, half *output, size_t count, cudaStream_t template void Sqrt(half *input, half *output, size_t count, cudaStream_t cuda_stream); template void Rsqrt(half *input, half *output, size_t count, cudaStream_t cuda_stream); template void Zeroslike(half *output, size_t count, cudaStream_t cuda_stream); +template void Abs(half *input, half *output, size_t count, cudaStream_t cuda_stream); +template void Floor(half *input, half *output, size_t count, cudaStream_t cuda_stream); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/unary_op_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/unary_op_impl.cuh index cf8b30866e7..4020f93df2e 100755 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/unary_op_impl.cuh +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/unary_op_impl.cuh @@ -34,5 +34,9 @@ template void Rsqrt(T *input, T *output, size_t count, cudaStream_t cuda_stream); template void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream); +template +void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream); +template +void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream); #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.cc index ae8e7bbd0b1..d646ef417ca 100644 --- 
a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.cc @@ -46,5 +46,13 @@ MS_REG_GPU_KERNEL_ONE(Sqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOut UnaryOpGpuKernel, float) MS_REG_GPU_KERNEL_ONE(Rsqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), UnaryOpGpuKernel, float) +MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + UnaryOpGpuKernel, float) +MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + UnaryOpGpuKernel, half) +MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + UnaryOpGpuKernel, float) +MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + UnaryOpGpuKernel, half) } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h index 26993bc3bd8..a02b94130cb 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/unary_op_gpu_kernel.h @@ -36,6 +36,8 @@ enum UnaryOptype { UNARY_OP_SQUARE, UNARY_OP_SQRT, UNARY_OP_RSQRT, + UNARY_OP_ABS, + UNARY_OP_FLOOR, UNARY_OP_INVALID_TYPE = 255 }; static const std::map kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP}, @@ -45,7 +47,9 @@ static const std::map kUnaryOpTypeMap = {{"Exp", UNARY {"ZerosLike", UNARY_OP_ZEROSLIKE}, {"Square", UNARY_OP_SQUARE}, {"Sqrt", UNARY_OP_SQRT}, - {"Rsqrt", UNARY_OP_RSQRT}}; + {"Rsqrt", UNARY_OP_RSQRT}, + {"Abs", UNARY_OP_ABS}, + {"Floor", UNARY_OP_FLOOR}}; template class UnaryOpGpuKernel : public GpuKernel { public: @@ -100,6 +104,14 @@ class UnaryOpGpuKernel : public GpuKernel { Zeroslike(output_addr, output_size_ / sizeof(T), reinterpret_cast(stream_ptr)); return true; } + case UNARY_OP_ABS: { + Abs(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast(stream_ptr)); + break; + } + case UNARY_OP_FLOOR: { + Floor(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast(stream_ptr)); + break; + } default: { MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported."; } From 28d1d3708508e180f3b04b798139ba6ffcfa7d85 Mon Sep 17 00:00:00 2001 From: Wei Luning Date: Thu, 16 Jul 2020 19:37:57 +0800 Subject: [PATCH 53/68] End at validate when export. 
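This patch unifies the export phases under an 'export.' prefix so the compile pipeline can end at the
'validate' stage for every export format, not only ONNX. A rough usage sketch of the affected Python
entry point (editorial illustration, not part of the patch; `net` is assumed to be an already built
network Cell, and the file names are placeholders):

    import numpy as np
    from mindspore import Tensor
    from mindspore.train.serialization import export

    inp = Tensor(np.ones((1, 3, 224, 224)).astype(np.float32))
    # Each call below compiles under phase 'export.<format>'; FilterActions
    # then drops every pipeline action after the 'validate' stage.
    export(net, inp, file_name="net.geir", file_format="GEIR")    # phase 'export.geir'
    export(net, inp, file_name="net.onnx", file_format="ONNX")    # phase 'export.onnx'
    export(net, inp, file_name="net.bin", file_format="BINARY")   # phase 'export.binary'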
---
 mindspore/ccsrc/pipeline/jit/pipeline.cc      | 85 ++++++++-----------
 mindspore/ccsrc/pipeline/jit/pipeline.h       |  1 -
 mindspore/ccsrc/transform/graph_ir/convert.cc |  4 +-
 mindspore/nn/layer/quant.py                   |  2 +-
 mindspore/ops/operations/_inner_ops.py        |  8 +-
 mindspore/train/quant/quant.py                | 23 ++---
 mindspore/train/quant/quant_utils.py          |  2 +-
 mindspore/train/serialization.py              |  7 +-
 tests/ut/python/ops/test_ops.py               | 32 +++----
 tests/ut/python/train/quant/test_quant.py     | 14 ++-
 10 files changed, 90 insertions(+), 88 deletions(-)

diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.cc b/mindspore/ccsrc/pipeline/jit/pipeline.cc
index 49bebfb3c42..21d20c893fd 100644
--- a/mindspore/ccsrc/pipeline/jit/pipeline.cc
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc
@@ -383,16 +383,6 @@ void ExecutorPy::SaveCompiledGraph(const std::string &phase_s) {
   MS_LOG(INFO) << "End save compiled func graph!";
 }

-bool ExecutorPy::ChangeExportGeirUseVmFlag(bool use_vm, const std::string &phase_s) const {
-  std::string phase_prefix = GetPhasePrefix(phase_s);
-
-  if (use_vm && phase_prefix == "export") {
-    MS_LOG(INFO) << "Use ge backend to export geir";
-    use_vm = false;
-  }
-  return use_vm;
-}
-
 void ExecutorPy::GetGeBackendPolicy() const {
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
@@ -402,6 +392,40 @@ void ExecutorPy::GetGeBackendPolicy() const {
   }
 }

+bool IsPhaseExportGeir(const std::string &phase_s) {
+  auto phase_to_export = "export.geir";
+  return phase_s.rfind(phase_to_export, 0) != std::string::npos;
+}
+
+std::vector GetPipeline(const ResourcePtr &resource, const std::string &phase_s, bool use_vm) {
+  bool is_geir = IsPhaseExportGeir(phase_s);
+
+  std::string backend = MsContext::GetInstance()->backend_policy();
+
+#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
+  if (mindspore::parallel::ps::Util::IsParamServerMode()) {
+    mindspore::parallel::ps::Util::SetInternalEnvVar();
+  }
+  if (parallel::ps::Util::IsRoleOfPServer()) {
+    resource->results()[kBackend] = compile::CreateBackend();
+    return PServerPipeline();
+  }
+  if (parallel::ps::Util::IsRoleOfScheduler()) {
+    return PSchedulerPipeline();
+  }
+#endif
+
+  if (use_vm && backend != "ge" && !is_geir) {
+    // Create backend and session
+    auto backend_ptr = compile::CreateBackend();
+    // Connect session to debugger
+    backend_ptr->SetDebugger();
+    resource->results()[kBackend] = backend_ptr;
+    return VmPipeline();
+  }
+  return GePipeline();
+}
+
 bool ExecutorPy::CompileInner(const py::object &obj, const py::tuple &args, const py::object &phase, bool use_vm) {
   MS_LOG(DEBUG) << "Start ExecutorPy compile!";
   if ((!py::isinstance(phase))) {
@@ -420,43 +444,8 @@ bool ExecutorPy::CompileInner(const py::object &obj, const py::tuple &args, cons
   std::string phase_s = py::cast(phase);
   MS_LOG(INFO) << "ExecutorPy compile phase:" << phase_s << "!";
   ResourcePtr resource = std::make_shared(obj);
-  std::vector p_actions;
-
-  use_vm = ChangeExportGeirUseVmFlag(use_vm, phase_s);
-
-  std::string backend = MsContext::GetInstance()->backend_policy();
-#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
-  if (mindspore::parallel::ps::Util::IsParamServerMode()) {
-    mindspore::parallel::ps::Util::SetInternalEnvVar();
-  }
-  if (parallel::ps::Util::IsRoleOfPServer()) {
-    resource->results()[kBackend] = compile::CreateBackend();
-    p_actions = PServerPipeline();
-  } else if (parallel::ps::Util::IsRoleOfScheduler()) {
-    p_actions = PSchedulerPipeline();
-  } else if (use_vm && backend != "ge") {
-    // Create backend and session
-    auto backend_ptr = compile::CreateBackend();
-    // Connect session to debugger
-    backend_ptr->SetDebugger();
-    resource->results()[kBackend] = backend_ptr;
-    p_actions = VmPipeline();
-  } else {
-    p_actions = GePipeline();
-  }
-#else
-  if (use_vm && backend != "ge") {
-    // Create backend and session
-    auto backend_ptr = compile::CreateBackend();
-    // Connect session to debugger
-    backend_ptr->SetDebugger();
-    resource->results()[kBackend] = backend_ptr;
-    p_actions = VmPipeline();
-  } else {
-    p_actions = GePipeline();
-  }
-#endif
+  auto p_actions = GetPipeline(resource, phase_s, use_vm);
   std::shared_ptr pip = std::make_shared(resource, FilterActions(p_actions, phase_s));

   // get the parameters items and add the value to args_spec
@@ -490,8 +479,8 @@ bool ExecutorPy::CompileInner(const py::object &obj, const py::tuple &args, cons
 }

 std::vector ExecutorPy::FilterActions(const std::vector &actions, const std::string &phase) {
-  // phase does not contain 'export_onnx'
-  if (GetPhasePrefix(phase).find("export_onnx") == std::string::npos) {
+  // Filter out actions after 'validate' when the phase prefix is 'export'.
+  if (GetPhasePrefix(phase).rfind("export", 0) == std::string::npos) {
     return actions;
   }
   MS_LOG(INFO) << "Phase is '" << phase << "', filter out actions after stage 'validate'";
diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.h b/mindspore/ccsrc/pipeline/jit/pipeline.h
index 705853d0860..d018d736231 100644
--- a/mindspore/ccsrc/pipeline/jit/pipeline.h
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.h
@@ -101,7 +101,6 @@ class ExecutorPy : public std::enable_shared_from_this {
  private:
   ExecutorPy();
   void ConvertObjectToTensors(const py::dict &dict, std::map *tensors);
-  bool ChangeExportGeirUseVmFlag(bool use_vm, const std::string &phase_s) const;
   void GetGeBackendPolicy() const;
   // filter some pipeline actions according to phase, e.g. when exporting onnx, there is no need to execute actions
   // after 'validate' stage
diff --git a/mindspore/ccsrc/transform/graph_ir/convert.cc b/mindspore/ccsrc/transform/graph_ir/convert.cc
index 56028bbdd90..132fabe561b 100644
--- a/mindspore/ccsrc/transform/graph_ir/convert.cc
+++ b/mindspore/ccsrc/transform/graph_ir/convert.cc
@@ -205,8 +205,8 @@ const char kNameL2Loss[] = "L2Loss";
 const char kNameCTCLoss[] = "CTCLoss";
 const char kNameRange[] = "Range";
 const char kNameSquareSumAll[] = "SquareSumAll";
-const char kNameAscendQuant[] = "AscendQuant";
-const char kNameAscendDequant[] = "AscendDequant";
+const char kNameAscendQuant[] = "Quant";
+const char kNameAscendDequant[] = "Dequant";
 const char kNameCase[] = "Case";

 // -----------------OpAdapter initialization--------------
diff --git a/mindspore/nn/layer/quant.py b/mindspore/nn/layer/quant.py
index 2f4f2032904..dc30d33ac18 100644
--- a/mindspore/nn/layer/quant.py
+++ b/mindspore/nn/layer/quant.py
@@ -1107,7 +1107,7 @@ class QuantBlock(Cell):
     r"""
     A quant block of Conv/Dense, activation layer for Ascend deploy.

-    Calculate Conv or Dense in Int8, with AscendQuant and AscendDeQuant.
+    Calculate Conv or Dense in Int8, with Quant and Dequant.

     Notes:
         This block is only for deploy, and not trainable.
diff --git a/mindspore/ops/operations/_inner_ops.py b/mindspore/ops/operations/_inner_ops.py
index 3c5e34e25e5..014998b4bee 100644
--- a/mindspore/ops/operations/_inner_ops.py
+++ b/mindspore/ops/operations/_inner_ops.py
@@ -160,7 +160,7 @@ class Range(PrimitiveWithInfer):
         return x_dtype


-class AscendQuant(PrimitiveWithInfer):
+class Quant(PrimitiveWithInfer):
     r"""
     Returns the quantized value of input_x.
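(For orientation: a rough numpy model of what the renamed Quant op computes. The exact Ascend kernel
semantics are not spelled out in this patch, so the formula below, round(x * scale + offset) saturated
to int8, ignoring `sqrt_mode` and the non-'Round' rounding modes, is an editorial assumption.)

    import numpy as np

    def quant_ref(x, scale=80.0, offset=0.0):
        # Assumed per-layer quantization: scale, add offset, round, saturate to int8.
        y = np.round(x * scale + offset)
        return np.clip(y, -128, 127).astype(np.int8)

    print(quant_ref(np.array([1.0, -0.5], np.float32)))  # -> [80, -40]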
@@ -192,7 +192,7 @@ class AscendQuant(PrimitiveWithInfer):
     Examples:
         >>> input_x = Tensor([100.0, 150.0], mstype.float32)
-        >>> quant = P.AscendQuant(80.0, 0.0, False, "Round")
+        >>> quant = P.Quant(80.0, 0.0, False, "Round")
         >>> y = quant(input_x)
     """
@@ -213,7 +213,7 @@ class AscendQuant(PrimitiveWithInfer):
         return mstype.int8


-class AscendDequant(PrimitiveWithInfer):
+class Dequant(PrimitiveWithInfer):
     r"""
     Returns the dequantized value of input_x.
     This operation will do ReLU to the dequantized value if `relu_flag` is True.
@@ -245,7 +245,7 @@ class AscendDequant(PrimitiveWithInfer):
     Examples:
         >>> input_x = Tensor([100.0, 150.0], mstype.float32)
-        >>> dequant = P.AscendDequant(False, False)
+        >>> dequant = P.Dequant(False, False)
         >>> y = dequant(input_x)
     """
     @prim_attr_register
diff --git a/mindspore/train/quant/quant.py b/mindspore/train/quant/quant.py
index b553373f105..4048525029e 100644
--- a/mindspore/train/quant/quant.py
+++ b/mindspore/train/quant/quant.py
@@ -329,14 +329,14 @@ class ExportToQuantInferNetwork:
             return None

         # Build the `Quant` `Dequant` op.
-        # AscendQuant only support perlayer version. Need check here.
-        quant_op = inner.AscendQuant(float(scale_a_in), float(zp_a_in))
+        # Quant only supports the per-layer version. Need to check here.
+        quant_op = inner.Quant(float(scale_a_in), float(zp_a_in))
         sqrt_mode = False
         scale_deq = scale_a_out * scale_w
         if (scale_deq < 2 ** -14).all():
             scale_deq = np.sqrt(scale_deq)
             sqrt_mode = True
-        dequant_op = inner.AscendDequant(sqrt_mode)
+        dequant_op = inner.Dequant(sqrt_mode)

         # get op
         op_core = cell_core.matmul if isinstance(cell_core, quant.DenseQuant) else cell_core.conv
@@ -411,11 +411,15 @@ def export(network, *inputs, file_name, mean=127.5, std_dev=127.5, file_format='
         file_name (str): File name of model to export.
         mean (int): Input data mean. Default: 127.5.
         std_dev (int, float): Input data variance. Default: 127.5.
-        file_format (str): MindSpore currently supports 'GEIR' format for exported quantization aware model.
-
-            - GEIR: Graph Engine Intermediate Representation. An Intermediate representation format of Ascend model.
+        file_format (str): MindSpore currently supports 'GEIR' and 'BINARY' formats for the exported
+            quantization aware model. Default: 'GEIR'.
+
+            - GEIR: Graph Engine Intermediate Representation. An intermediate representation format of
+              Ascend model.
+            - BINARY: Binary format for model. An intermediate representation format for models.
""" supported_device = ["Ascend"] - supported_formats = ['GEIR'] + supported_formats = ['GEIR', 'BINARY'] mean = validator.check_type("mean", mean, (int, float)) std_dev = validator.check_type("std_dev", std_dev, (int, float)) @@ -428,10 +432,9 @@ def export(network, *inputs, file_name, mean=127.5, std_dev=127.5, file_format=' network.set_train(False) - if file_format == 'GEIR': - exporter = ExportToQuantInferNetwork(network, mean, std_dev, *inputs) - deploy_net = exporter.run() - serialization.export(deploy_net, *inputs, file_name=file_name, file_format=file_format) + exporter = ExportToQuantInferNetwork(network, mean, std_dev, *inputs) + deploy_net = exporter.run() + serialization.export(deploy_net, *inputs, file_name=file_name, file_format=file_format) def convert_quant_network(network, diff --git a/mindspore/train/quant/quant_utils.py b/mindspore/train/quant/quant_utils.py index 69505970fd8..5d524391bef 100644 --- a/mindspore/train/quant/quant_utils.py +++ b/mindspore/train/quant/quant_utils.py @@ -104,7 +104,7 @@ def weight2int(data, scale, zero_point): raise ValueError("`scale` and `zero_point` should have the same shape.") if scale.shape[0] < 0: raise ValueError("`scale` and `zero_point` shape should greater than zero.") - if len(scale.shape) > 1: + if len(scale.shape) >= 1 and scale.shape[0] > 1: # for perchannel if scale.shape[0] == data.shape[0]: # `Conv2d` or `Dense` op weight diff --git a/mindspore/train/serialization.py b/mindspore/train/serialization.py index bd1cdab43d3..9d77a920b66 100644 --- a/mindspore/train/serialization.py +++ b/mindspore/train/serialization.py @@ -451,19 +451,20 @@ def export(net, *inputs, file_name, file_format='GEIR'): # export model net.init_parameters_data() if file_format == 'GEIR': - _executor.compile(net, *inputs, phase='export') + phase_name = 'export.geir' + _executor.compile(net, *inputs, phase=phase_name) _executor.export(net, file_name, file_format) elif file_format == 'ONNX': # file_format is 'ONNX' # NOTICE: the pahse name `export_onnx` is used for judging whether is exporting onnx in the compile pipeline, # do not change it to other values. 
- phase_name = 'export_onnx' + phase_name = 'export.onnx' graph_id, _ = _executor.compile(net, *inputs, phase=phase_name, do_convert=False) onnx_stream = _executor._get_func_graph_proto(graph_id) with open(file_name, 'wb') as f: os.chmod(file_name, stat.S_IWUSR | stat.S_IRUSR) f.write(onnx_stream) elif file_format == 'BINARY': # file_format is 'BINARY' - phase_name = 'export_binary' + phase_name = 'export.binary' graph_id, _ = _executor.compile(net, *inputs, phase=phase_name, do_convert=False) onnx_stream = _executor._get_func_graph_proto(graph_id, 'binary_ir') with open(file_name, 'wb') as f: diff --git a/tests/ut/python/ops/test_ops.py b/tests/ut/python/ops/test_ops.py index f25196eef75..ef5b0953362 100755 --- a/tests/ut/python/ops/test_ops.py +++ b/tests/ut/python/ops/test_ops.py @@ -2180,36 +2180,36 @@ test_case_other_ops = [ ] test_case_quant_ops = [ - ('AscendQuant_1', { - 'block': inner.AscendQuant(0.5, 0.0, False, "Round"), + ('Quant_1', { + 'block': inner.Quant(0.5, 0.0, False, "Round"), 'desc_inputs': [Tensor(np.random.rand(1, 2, 4, 4), mstype.float32)], 'skip': ['backward']}), - ('AscendQuant_2', { - 'block': inner.AscendQuant(80.0, 10.0, True, "Round"), + ('Quant_2', { + 'block': inner.Quant(80.0, 10.0, True, "Round"), 'desc_inputs': [Tensor([100.0, 200.0], mstype.float32)], 'skip': ['backward']}), - ('AscendQuant_3', { - 'block': inner.AscendQuant(80.0, 0.0, False, "Floor"), + ('Quant_3', { + 'block': inner.Quant(80.0, 0.0, False, "Floor"), 'desc_inputs': [Tensor([100.0, 200.0], mstype.float32)], 'skip': ['backward']}), - ('AscendQuant_4', { - 'block': inner.AscendQuant(80.0, 0.0, False, "Ceil"), + ('Quant_4', { + 'block': inner.Quant(80.0, 0.0, False, "Ceil"), 'desc_inputs': [Tensor([100.0, 200.0], mstype.float32)], 'skip': ['backward']}), - ('AscendQuant_5', { - 'block': inner.AscendQuant(80.0, 0.0, False, "Trunc"), + ('Quant_5', { + 'block': inner.Quant(80.0, 0.0, False, "Trunc"), 'desc_inputs': [Tensor([100.0, 200.0], mstype.float32)], 'skip': ['backward']}), - ('AscendQuant_6', { - 'block': inner.AscendQuant(-80.0, 10.0, False, "Round"), + ('Quant_6', { + 'block': inner.Quant(-80.0, 10.0, False, "Round"), 'desc_inputs': [Tensor([100.0, 200.0], mstype.float32)], 'skip': ['backward']}), - ('AscendQuant_7', { - 'block': inner.AscendQuant(80.0, -10.0, False, "Round"), + ('Quant_7', { + 'block': inner.Quant(80.0, -10.0, False, "Round"), 'desc_inputs': [Tensor([100.0, 200.0], mstype.float32)], 'skip': ['backward']}), - ('AscendQuant_8', { - 'block': inner.AscendQuant(80.0, 10.0, False, "Round"), + ('Quant_8', { + 'block': inner.Quant(80.0, 10.0, False, "Round"), 'desc_inputs': [Tensor([100.0, 200.0], mstype.float16)], 'skip': ['backward']}), ] diff --git a/tests/ut/python/train/quant/test_quant.py b/tests/ut/python/train/quant/test_quant.py index 39e887170ca..4816af89360 100644 --- a/tests/ut/python/train/quant/test_quant.py +++ b/tests/ut/python/train/quant/test_quant.py @@ -75,10 +75,20 @@ def test_qat_lenet(): @pytest.mark.skip(reason="no `te.lang.cce` in ut env") -def test_qat_mobile(): +def test_qat_mobile_per_channel_tf(): network = mobilenetV2(num_classes=1000) img = Tensor(np.ones((1, 3, 224, 224)).astype(np.float32)) - network = qat.convert_quant_network(network, bn_fold=True, per_channel=[True, False], symmetric=[True, False]) + network = qat.convert_quant_network(network, bn_fold=True, per_channel=[False, True], symmetric=[True, False]) + # should load the checkpoint. 
mock here
+    for param in network.get_parameters():
+        param.init_data()
+    qat.export(network, img, file_name="quant.pb")
+
+@pytest.mark.skip(reason="no `te.lang.cce` in ut env")
+def test_qat_mobile_per_channel_ff():
+    network = mobilenetV2(num_classes=1000)
+    img = Tensor(np.ones((1, 3, 224, 224)).astype(np.float32))
+    network = qat.convert_quant_network(network, bn_fold=True, per_channel=[False, False], symmetric=[True, False])
     # should load the checkpoint. mock here
     for param in network.get_parameters():
         param.init_data()
From cb453a80821390c545389fabd651b113450bc32d Mon Sep 17 00:00:00 2001
From: liuxiao93
Date: Fri, 17 Jul 2020 11:48:18 +0800
Subject: [PATCH 54/68] change ApplyMomentumD->ApplyMomentum for GE.

---
 mindspore/ccsrc/transform/graph_ir/convert.cc    |  2 +-
 mindspore/ccsrc/transform/graph_ir/op_declare.cc | 10 +++++-----
 mindspore/ccsrc/transform/graph_ir/op_declare.h  |  4 ++--
 mindspore/ops/operations/nn_ops.py               |  5 +++--
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/mindspore/ccsrc/transform/graph_ir/convert.cc b/mindspore/ccsrc/transform/graph_ir/convert.cc
index 56028bbdd90..21984f80518 100644
--- a/mindspore/ccsrc/transform/graph_ir/convert.cc
+++ b/mindspore/ccsrc/transform/graph_ir/convert.cc
@@ -216,7 +216,7 @@ std::unordered_map &DfGraphConvertor::get_adpt_ma
   {string(kNameIOU), ADPT_DESC(Iou)},
   {string(kNameGreaterEqual), ADPT_DESC(GreaterEqual)},
   {string(kNameSlice), ADPT_DESC(SliceD)},
-  {string(kNameApplyMomentum), ADPT_DESC(ApplyMomentumD)},
+  {string(kNameApplyMomentum), ADPT_DESC(ApplyMomentum)},
   {string(kNameMaxPool), ADPT_DESC(MaxPool)},
   {string(kNameAvgPool), ADPT_DESC(AvgPool)},
   {string(kNameMaxPoolWithArgmax), ADPT_DESC(MaxPoolWithArgmax)},
diff --git a/mindspore/ccsrc/transform/graph_ir/op_declare.cc b/mindspore/ccsrc/transform/graph_ir/op_declare.cc
index 939e5feba18..372051926c4 100644
--- a/mindspore/ccsrc/transform/graph_ir/op_declare.cc
+++ b/mindspore/ccsrc/transform/graph_ir/op_declare.cc
@@ -143,12 +143,12 @@ INPUT_MAP(Constant) = EMPTY_INPUT_MAP;
 ATTR_MAP(Constant) = {{"value", ATTR_DESC(value, AnyTraits())}};
 OUTPUT_MAP(Constant) = {{0, OUTPUT_DESC(y)}};

-// ApplyMomentumD
-INPUT_MAP(ApplyMomentumD) = {
+// ApplyMomentum
+INPUT_MAP(ApplyMomentum) = {
   {1, INPUT_DESC(var)}, {2, INPUT_DESC(accum)}, {3, INPUT_DESC(lr)}, {4, INPUT_DESC(grad)}, {5, INPUT_DESC(momentum)}};
-ATTR_MAP(ApplyMomentumD) = {{"use_nesterov", ATTR_DESC(use_nesterov, AnyTraits())},
-                            {"use_locking", ATTR_DESC(use_locking, AnyTraits())}};
-OUTPUT_MAP(ApplyMomentumD) = {{0, OUTPUT_DESC(var)}, {1, OUTPUT_DESC(accum)}};
+ATTR_MAP(ApplyMomentum) = {{"use_nesterov", ATTR_DESC(use_nesterov, AnyTraits())},
+                           {"use_locking", ATTR_DESC(use_locking, AnyTraits())}};
+OUTPUT_MAP(ApplyMomentum) = {{0, OUTPUT_DESC(var)}};

 // ScalarSummary
 INPUT_MAP(Summary) = {{2, INPUT_DESC(x)}};
diff --git a/mindspore/ccsrc/transform/graph_ir/op_declare.h b/mindspore/ccsrc/transform/graph_ir/op_declare.h
index 2774ac1ff83..93462c4071a 100755
--- a/mindspore/ccsrc/transform/graph_ir/op_declare.h
+++ b/mindspore/ccsrc/transform/graph_ir/op_declare.h
@@ -334,8 +334,8 @@ DECLARE_OP_ADAPTER(Assign)
 DECLARE_OP_USE_OUTPUT(Assign)
 DECLARE_OP_ADAPTER(Constant)
 DECLARE_OP_USE_OUTPUT(Constant)
-DECLARE_OP_ADAPTER(ApplyMomentumD)
-DECLARE_OP_USE_OUTPUT(ApplyMomentumD)
+DECLARE_OP_ADAPTER(ApplyMomentum)
+DECLARE_OP_USE_OUTPUT(ApplyMomentum)

 // ** Summary Operations **
 DECLARE_OP_ADAPTER(Summary)
diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py
index
d2b47357d93..2504a6bcaa9 100644 --- a/mindspore/ops/operations/nn_ops.py +++ b/mindspore/ops/operations/nn_ops.py @@ -1591,9 +1591,10 @@ class ApplyMomentum(PrimitiveWithInfer): self.init_prim_io_names(inputs=['variable', 'accumulation', 'learning_rate', 'gradient', 'momentum'], outputs=['output']) self.is_tbe = context.get_context("device_target") == "Ascend" + self.is_ge = context.get_context("enable_ge") def infer_shape(self, v_shape, a_shape, l_shape, g_shape, m_shape): - if self.is_tbe: + if not self.is_ge and self.is_tbe: return v_shape, v_shape return v_shape @@ -1605,7 +1606,7 @@ class ApplyMomentum(PrimitiveWithInfer): validator.check_scalar_or_tensor_type_same({"l_dtype": l_dtype}, valid_types, self.name) validator.check_scalar_or_tensor_type_same({"g_dtype": g_dtype}, valid_types, self.name) validator.check_scalar_or_tensor_type_same({"m_dtype": m_dtype}, valid_types, self.name) - if self.is_tbe: + if not self.is_ge and self.is_tbe: return g_dtype, g_dtype return g_dtype From 951ac06782bb41f8f051613ce5305d0cda9252a1 Mon Sep 17 00:00:00 2001 From: kingfo Date: Fri, 17 Jul 2020 15:02:42 +0800 Subject: [PATCH 55/68] replace unsafe function --- .../ccsrc/frontend/optimizer/irpass/arithmetic_simplify.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspore/ccsrc/frontend/optimizer/irpass/arithmetic_simplify.cc b/mindspore/ccsrc/frontend/optimizer/irpass/arithmetic_simplify.cc index 5c7af1eb597..7b7c62790dd 100644 --- a/mindspore/ccsrc/frontend/optimizer/irpass/arithmetic_simplify.cc +++ b/mindspore/ccsrc/frontend/optimizer/irpass/arithmetic_simplify.cc @@ -479,7 +479,7 @@ AnfNodePtr ConstantDuplicateMul::MulConstantTensors(const AnfNodePtr &vnode_1, c auto new_tensor_ptr = std::make_shared(tensor_3_type_ptr->type_id(), tensor_out_shape); size_t mem_size = GetTypeByte(tensor_3_type_ptr) * IntToSize(new_tensor_ptr->ElementsNum()); char *data = reinterpret_cast(new_tensor_ptr->data_c()); - memcpy(data, data_out, mem_size); + memcpy_s(data, mem_size, data_out, mem_size); auto new_vnode = NewValueNode(new_tensor_ptr); new_vnode->set_abstract(new_tensor_ptr->ToAbstract()); From 2ae6dfe95aa02de06c36a6ef0179e3bcd92d814e Mon Sep 17 00:00:00 2001 From: yanzhenxiang2020 Date: Tue, 26 May 2020 09:57:48 +0800 Subject: [PATCH 56/68] add RNNTLoss and RandomCategorical op for aicpu --- mindspore/ops/_grad/grad_nn_ops.py | 10 ++++ mindspore/ops/_op_impl/aicpu/__init__.py | 2 + .../ops/_op_impl/aicpu/random_categorical.py | 48 +++++++++++++++ mindspore/ops/_op_impl/aicpu/rnnt_loss.py | 37 ++++++++++++ mindspore/ops/operations/__init__.py | 5 +- mindspore/ops/operations/nn_ops.py | 56 ++++++++++++++++++ mindspore/ops/operations/random_ops.py | 58 +++++++++++++++++++ .../test_aicpu_ops/test_random_categorical.py | 38 ++++++++++++ .../ascend/test_aicpu_ops/test_rnnt_loss.py | 41 +++++++++++++ 9 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 mindspore/ops/_op_impl/aicpu/random_categorical.py create mode 100644 mindspore/ops/_op_impl/aicpu/rnnt_loss.py create mode 100644 tests/st/ops/ascend/test_aicpu_ops/test_random_categorical.py create mode 100644 tests/st/ops/ascend/test_aicpu_ops/test_rnnt_loss.py diff --git a/mindspore/ops/_grad/grad_nn_ops.py b/mindspore/ops/_grad/grad_nn_ops.py index 42d81882339..63e63770e41 100755 --- a/mindspore/ops/_grad/grad_nn_ops.py +++ b/mindspore/ops/_grad/grad_nn_ops.py @@ -567,6 +567,16 @@ def get_bprop_l2_loss(self): return bprop +@bprop_getters.register(P.RNNTLoss) +def get_bprop_rnnt_loss(self): + """Grad definition for 
`RNNTLoss` operation.""" + + def bprop(acts, labels, act_lens, label_lens, out, dout): + grad = out[1] + return grad, zeros_like(labels), zeros_like(act_lens), zeros_like(label_lens) + return bprop + + @bprop_getters.register(P.PReLU) def get_bprop_prelu(self): """Grad definition for `PReLU` operation.""" diff --git a/mindspore/ops/_op_impl/aicpu/__init__.py b/mindspore/ops/_op_impl/aicpu/__init__.py index 7b86e47e36b..8914ee8719f 100644 --- a/mindspore/ops/_op_impl/aicpu/__init__.py +++ b/mindspore/ops/_op_impl/aicpu/__init__.py @@ -30,3 +30,5 @@ from .ctcloss import _ctcloss_aicpu from .reverse_sequence import _reverse_sequence_aicpu from .crop_and_resize import _crop_and_resize_aicpu from .end_of_sequence import _end_of_sequence_aicpu +from .rnnt_loss import _rnnt_loss_aicpu +from .random_categorical import _random_categorical_aicpu diff --git a/mindspore/ops/_op_impl/aicpu/random_categorical.py b/mindspore/ops/_op_impl/aicpu/random_categorical.py new file mode 100644 index 00000000000..a0c6f64c975 --- /dev/null +++ b/mindspore/ops/_op_impl/aicpu/random_categorical.py @@ -0,0 +1,48 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""RandomCategorical op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType + +random_categorical_op_info = AiCPURegOp("RandomCategorical") \ + .fusion_type("OPAQUE") \ + .input(0, "logits", "required") \ + .input(1, "num_sample", "required") \ + .input(2, "seed", "required") \ + .output(0, "output", "required") \ + .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.I32_Default, DataType.I16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I16_Default) \ + .dtype_format(DataType.F64_Default, DataType.I32_Default, DataType.I32_Default, DataType.I16_Default) \ + .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default) \ + .dtype_format(DataType.F16_Default, DataType.I32_Default, DataType.I32_Default, DataType.I64_Default) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I64_Default) \ + .dtype_format(DataType.F64_Default, DataType.I32_Default, DataType.I32_Default, DataType.I64_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default, DataType.I64_Default, DataType.I16_Default) \ + .dtype_format(DataType.F32_Default, DataType.I64_Default, DataType.I64_Default, DataType.I16_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default, DataType.I64_Default, DataType.I16_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default, DataType.I64_Default, DataType.I32_Default) \ + .dtype_format(DataType.F32_Default, 
DataType.I64_Default, DataType.I64_Default, DataType.I32_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default, DataType.I64_Default, DataType.I32_Default) \ + .dtype_format(DataType.F16_Default, DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ + .dtype_format(DataType.F32_Default, DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ + .dtype_format(DataType.F64_Default, DataType.I64_Default, DataType.I64_Default, DataType.I64_Default) \ + .get_op_info() + +@op_info_register(random_categorical_op_info) +def _random_categorical_aicpu(): + """RandomCategorical AiCPU register""" + return diff --git a/mindspore/ops/_op_impl/aicpu/rnnt_loss.py b/mindspore/ops/_op_impl/aicpu/rnnt_loss.py new file mode 100644 index 00000000000..d35d1020485 --- /dev/null +++ b/mindspore/ops/_op_impl/aicpu/rnnt_loss.py @@ -0,0 +1,37 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""RNNTLoss op""" +from mindspore.ops.op_info_register import op_info_register, AiCPURegOp, DataType + +rnnt_loss_op_info = AiCPURegOp("RNNTLoss") \ + .fusion_type("OPAQUE") \ + .input(0, "acts", "required") \ + .input(1, "labels", "required") \ + .input(2, "input_lengths", "required") \ + .input(3, "label_lengths", "required") \ + .output(0, "costs", "required") \ + .output(1, "grads", "required") \ + .attr("blank_label", "int") \ + .dtype_format(DataType.F32_NCHW, DataType.I32_NCHW, DataType.I32_NCHW, DataType.I32_NCHW, DataType.F32_NCHW, + DataType.F32_NCHW) \ + .dtype_format(DataType.F32_Default, DataType.I32_Default, DataType.I32_Default, DataType.I32_Default, + DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + +@op_info_register(rnnt_loss_op_info) +def _rnnt_loss_aicpu(): + """RNNTLoss AiCPU register""" + return diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index a992c0edd52..1e881289235 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -55,7 +55,7 @@ from .math_ops import (Abs, ACos, Asin, Asinh, AddN, AccumulateNV2, AssignAdd, A Sin, Sqrt, Rsqrt, BesselI0e, BesselI1e, TruncateDiv, TruncateMod, Square, Sub, TensorAdd, Sign, Round, SquareSumAll, Atan, Atanh, Cosh, Sinh, Eps, Tan) -from .random_ops import (RandomChoiceWithMask, Normal) +from .random_ops import (RandomChoiceWithMask, Normal, RandomCategorical) from .nn_ops import (LSTM, SGD, Adam, SparseApplyAdam, SparseApplyLazyAdam, ApplyMomentum, BatchNorm, BiasAdd, Conv2D, DepthwiseConv2dNative, @@ -70,6 +70,7 @@ from .nn_ops import (LSTM, SGD, Adam, SparseApplyAdam, SparseApplyLazyAdam, Appl ResizeBilinear, Sigmoid, SigmoidCrossEntropyWithLogits, SmoothL1Loss, Softmax, Softsign, Softplus, LRN, + RNNTLoss, SoftmaxCrossEntropyWithLogits, ROIAlign, SparseSoftmaxCrossEntropyWithLogits, Tanh, TopK, BinaryCrossEntropy, SparseApplyAdagrad, LARSUpdate, ApplyFtrl, SparseApplyFtrl, @@ -171,6 +172,7 @@ __all__ = [ 
    'Tanh',
     'RandomChoiceWithMask',
     'Normal',
+    'RandomCategorical',
     'ResizeBilinear',
     'ScalarSummary',
     'ImageSummary',
@@ -202,6 +204,7 @@ __all__ = [
     'SmoothL1Loss',
     'L2Loss',
     'CTCLoss',
+    'RNNTLoss',
     'ReduceAll',
     'ScalarToArray',
     'ScalarToTensor',
diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py
index d2b47357d93..b96557a16a9 100644
--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -1735,6 +1735,62 @@ class DataFormatDimMap(PrimitiveWithInfer):
         return x_type


+class RNNTLoss(PrimitiveWithInfer):
+    """
+    Computes the RNNTLoss and its gradient with respect to the softmax outputs.
+
+    Args:
+        blank_label (int): The blank label. Default: 0.
+
+    Inputs:
+        - **acts** (Tensor[float32]) - Tensor of shape :math:`(B, T, U, V)`.
+        - **labels** (Tensor[int32]) - Tensor of shape :math:`(B, U-1)`.
+        - **input_lengths** (Tensor[int32]) - Tensor of shape :math:`(B,)`.
+        - **label_lengths** (Tensor[int32]) - Tensor of shape :math:`(B,)`.
+
+    Outputs:
+        - **costs** (Tensor[float32]) - Tensor of shape :math:`(B,)`.
+        - **grads** (Tensor[float32]) - Has the same shape as `acts`.
+
+    Examples:
+        >>> B, T, U, V = 1, 2, 3, 5
+        >>> acts = np.random.random((B, T, U, V)).astype(np.float32)
+        >>> labels = np.array([[1, 2]]).astype(np.int32)
+        >>> input_length = np.array([T] * B).astype(np.int32)
+        >>> label_length = np.array([len(l) for l in labels]).astype(np.int32)
+        >>> rnnt_loss = P.RNNTLoss(blank_label=0)
+        >>> costs, grads = rnnt_loss(Tensor(acts), Tensor(labels), Tensor(input_length), Tensor(label_length))
+    """
+    @prim_attr_register
+    def __init__(self, blank_label=0):
+        validator.check_value_type('blank_label', blank_label, [int], self.name)
+        self.init_prim_io_names(inputs=['acts', 'labels', 'input_length', 'label_length'],
+                                outputs=['costs', 'grads'])
+
+    def infer_shape(self, acts_shape, labels_shape, input_length_shape, label_length_shape):
+        validator.check_integer('acts_rank', len(acts_shape), 4, Rel.EQ, self.name)
+        validator.check_integer('labels_rank', len(labels_shape), 2, Rel.EQ, self.name)
+        validator.check_integer('input_length_rank', len(input_length_shape), 1, Rel.EQ, self.name)
+        validator.check_integer('label_length_rank', len(label_length_shape), 1, Rel.EQ, self.name)
+        validator.check('labels shape[0]', labels_shape[0], 'acts shape[0]', acts_shape[0], Rel.EQ, self.name)
+        validator.check('labels shape[1]', labels_shape[1], 'acts shape[2]-1', acts_shape[2]-1, Rel.EQ, self.name)
+        validator.check('input_length size', input_length_shape[0], 'acts shape[0]', acts_shape[0], Rel.EQ, self.name)
+        validator.check('label_length size', label_length_shape[0], 'acts shape[0]', acts_shape[0], Rel.EQ, self.name)
+        costs_shape = (acts_shape[0],)
+        return (costs_shape, acts_shape)
+
+    def infer_dtype(self, acts_type, labels_type, input_length_type, label_length_type):
+        validator.check_subclass("acts_type", acts_type, mstype.tensor, self.name)
+        validator.check_subclass("labels_type", labels_type, mstype.tensor, self.name)
+        validator.check_subclass("input_length_type", input_length_type, mstype.tensor, self.name)
+        validator.check_subclass("label_length_type", label_length_type, mstype.tensor, self.name)
+        validator.check_tensor_type_same({"acts_type": acts_type}, [mstype.float32], self.name)
+        validator.check_tensor_type_same({"labels_type": labels_type}, [mstype.int32], self.name)
+        validator.check_tensor_type_same({"input_length_type": input_length_type}, [mstype.int32], self.name)
+        validator.check_tensor_type_same({"label_length_type": label_length_type}, [mstype.int32], self.name)
+        return (acts_type, acts_type)
+
+
 class SGD(PrimitiveWithInfer):
     """
     Computes stochastic gradient descent (optionally with momentum).
diff --git a/mindspore/ops/operations/random_ops.py b/mindspore/ops/operations/random_ops.py
index 7a457d09981..9ebe127228a 100644
--- a/mindspore/ops/operations/random_ops.py
+++ b/mindspore/ops/operations/random_ops.py
@@ -108,3 +108,61 @@ class Normal(PrimitiveWithInfer):
                "dtype": mstype.float32,
                "value": None}
         return out
+
+
+class RandomCategorical(PrimitiveWithInfer):
+    """
+    Generates random samples from a given categorical distribution tensor.
+
+    Args:
+        dtype (mindspore.dtype): The type of output. Its value should be one of [mindspore.int16,
+            mindspore.int32, mindspore.int64]. Default: mindspore.int64.
+
+    Inputs:
+        - **logits** (Tensor) - The input tensor. 2-D Tensor with shape [batch_size, num_classes].
+        - **num_sample** (int) - Number of samples to be drawn. Only a constant value is allowed.
+        - **seed** (int) - Random seed. Default: 0. Only a constant value is allowed.
+
+    Outputs:
+        - **output** (Tensor) - The output Tensor with shape [batch_size, num_samples].
+
+    Examples:
+        >>> class Net(nn.Cell):
+        >>>     def __init__(self, num_sample):
+        >>>         super(Net, self).__init__()
+        >>>         self.random_categorical = P.RandomCategorical(mindspore.int64)
+        >>>         self.num_sample = num_sample
+        >>>     def construct(self, logits, seed=0):
+        >>>         return self.random_categorical(logits, self.num_sample, seed)
+        >>>
+        >>> x = np.random.random((10, 5)).astype(np.float32)
+        >>> net = Net(8)
+        >>> output = net(Tensor(x))
+    """
+    @prim_attr_register
+    def __init__(self, dtype=mstype.int64):
+        """Init RandomCategorical"""
+        self.dtype = dtype
+
+        valid_values = (mstype.int32, mstype.int16, mstype.int64)
+        validator.check_type_name("dtype", dtype, valid_values, self.name)
+        self.init_prim_io_names(inputs=['logits', 'num_samples', 'seed'],
+                                outputs=['output'])
+
+    def __infer__(self, logits, num_samples, seed):
+        logits_dtype = logits['dtype']
+        valid_types = (mstype.float32, mstype.float16, mstype.float64)
+        validator.check_tensor_type_same({'logits': logits_dtype}, valid_types, self.name)
+        num_samples_v = num_samples['value']
+        seed_v = seed['value']
+        validator.check_value_type('num_samples', num_samples_v, (int,), self.name)
+        validator.check_value_type('seed', seed_v, (int,), self.name)
+        validator.check_integer("num_samples", num_samples_v, 0, Rel.GT, self.name)
+        x_shape = list(logits['shape'])
+        if len(x_shape) != 2:
+            raise ValueError("RandomCategorical logits should be 2-dimensional.")
+        ndim = len(x_shape) - 1
+        x_shape[ndim] = num_samples_v
+        return {'shape': (x_shape),
+                'dtype': (self.dtype),
+                'value': None}
diff --git a/tests/st/ops/ascend/test_aicpu_ops/test_random_categorical.py b/tests/st/ops/ascend/test_aicpu_ops/test_random_categorical.py
new file mode 100644
index 00000000000..a581636ad4a
--- /dev/null
+++ b/tests/st/ops/ascend/test_aicpu_ops/test_random_categorical.py
@@ -0,0 +1,38 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import numpy as np +import mindspore +import mindspore.nn as nn +import mindspore.context as context +from mindspore import Tensor +from mindspore.ops import operations as P + +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +class Net(nn.Cell): + def __init__(self, num_sample): + super(Net, self).__init__() + self.random_categorical = P.RandomCategorical(mindspore.int64) + self.num_sample = num_sample + + def construct(self, logits, seed=0): + return self.random_categorical(logits, self.num_sample, seed) + +def test_net(): + x = np.random.random((10, 5)).astype(np.float32) + net = Net(8) + output = net(Tensor(x)) + print(x) + print(output.asnumpy()) + #print(output.dtype()) diff --git a/tests/st/ops/ascend/test_aicpu_ops/test_rnnt_loss.py b/tests/st/ops/ascend/test_aicpu_ops/test_rnnt_loss.py new file mode 100644 index 00000000000..705aa0e5949 --- /dev/null +++ b/tests/st/ops/ascend/test_aicpu_ops/test_rnnt_loss.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import numpy as np +import mindspore.nn as nn +import mindspore.context as context +from mindspore import Tensor +from mindspore.ops import operations as P + +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +class Net(nn.Cell): + def __init__(self): + super(Net, self).__init__() + self.rnnt_loss = P.RNNTLoss(blank_label=0) + + def construct(self, acts, labels, act_lens, label_lens): + return self.rnnt_loss(acts, labels, act_lens, label_lens) + + +def test_net(): + B, T, U, V = 1, 2, 3, 5 + acts = np.random.random((B, T, U, V)).astype(np.float32) + labels = np.array([[np.random.randint(1, V-1) for _ in range(U-1)]]).astype(np.int32) + input_length = np.array([T] * B).astype(np.int32) + label_length = np.array([len(l) for l in labels]).astype(np.int32) + rnnt_loss = Net() + costs, grads = rnnt_loss(Tensor(acts), Tensor(labels), Tensor(input_length), Tensor(label_length)) + print(Tensor(acts), Tensor(labels), Tensor(input_length), Tensor(label_length)) + print(costs.asnumpy()) + print(grads.asnumpy()) From 59e519e8ed3a196091b01d77bdc9f58545cf7980 Mon Sep 17 00:00:00 2001 From: jinyaohui Date: Fri, 17 Jul 2020 11:18:53 +0800 Subject: [PATCH 57/68] allreduce add ps filter --- mindspore/nn/wrap/grad_reducer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mindspore/nn/wrap/grad_reducer.py b/mindspore/nn/wrap/grad_reducer.py index 3d754977d45..77a55f69bf5 100644 --- a/mindspore/nn/wrap/grad_reducer.py +++ b/mindspore/nn/wrap/grad_reducer.py @@ -50,8 +50,8 @@ def _init_allreduce_operators(length): return opt_list -@reduce_opt.register("Number", "Bool", "Function", "Bool", "Tensor", "Function") -def _tensors_allreduce(degree, mean, allgather, allreduce_filter, grad, allreduce): +@reduce_opt.register("Number", "Bool", "Function", "Bool", "Tensor", "Function", "Bool") +def _tensors_allreduce(degree, mean, allgather, allreduce_filter, grad, allreduce, ps_parameter): """ Apply allreduce on gradient. @@ -66,7 +66,7 @@ def _tensors_allreduce(degree, mean, allgather, allreduce_filter, grad, allreduc Returns: Tensor, the gradient tensor after operation. 
""" - if allreduce_filter: + if not ps_parameter and allreduce_filter: grad = allreduce(grad) if mean: degree = F.scalar_cast(degree, F.dtype(grad)) @@ -257,6 +257,8 @@ class DistributedGradReducer(Cell): self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters) self.opt_list = _init_allreduce_operators(len(parameters)) self.allgather = AllGather(GlobalComm.WORLD_COMM_GROUP) + ps_filter = lambda x: x.is_param_ps + self.ps_parameters = tuple(ps_filter(x) for x in parameters) def construct(self, grads): """ @@ -273,7 +275,7 @@ class DistributedGradReducer(Cell): datatypes = self.map_(F.partial(_get_datatype), grads) grads = self.map_(F.partial(_cast_datatype, mstype.float32), grads) new_grad = self.map_(F.partial(reduce_opt, self.degree, self.mean, self.allgather), - self.allreduce_filter, grads, self.opt_list) + self.allreduce_filter, grads, self.opt_list, self.ps_parameters) new_grad = self.map_(F.partial(_cast_datatype), datatypes, new_grad) return new_grad From 86889c59cb6a7dd768caa9ff7ef6b7fa97630071 Mon Sep 17 00:00:00 2001 From: "wangnan39@huawei.com" Date: Wed, 15 Jul 2020 18:02:20 +0800 Subject: [PATCH 58/68] optimizer adapt IndexedSlices --- mindspore/nn/optim/adam.py | 18 +- mindspore/nn/optim/ftrl.py | 18 +- mindspore/nn/optim/lazyadam.py | 6 +- mindspore/nn/optim/optimizer.py | 14 +- mindspore/nn/optim/proximal_ada_grad.py | 5 +- mindspore/nn/wrap/grad_reducer.py | 31 ++-- mindspore/ops/_grad/grad_array_ops.py | 5 +- mindspore/ops/_grad/grad_comm_ops.py | 25 +-- .../nn/optim/test_adam_with_tuple_grad.py | 174 ------------------ .../parallel/test_sparse_feature_bprop.py | 10 +- 10 files changed, 73 insertions(+), 233 deletions(-) delete mode 100644 tests/ut/python/nn/optim/test_adam_with_tuple_grad.py diff --git a/mindspore/nn/optim/adam.py b/mindspore/nn/optim/adam.py index 39abec5664d..1dbfb940ee8 100755 --- a/mindspore/nn/optim/adam.py +++ b/mindspore/nn/optim/adam.py @@ -108,24 +108,26 @@ def _check_learning_rate_value(learning_rate, end_learning_rate, decay_steps, po validator.check_integer('decay_steps', decay_steps, 0, Rel.GT, prim_name) -@_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tuple", +@_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "IndexedSlices", "Tensor", "Tensor", "Tensor", "Bool") def _run_opt_with_sparse(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, moment1, moment2, ps_parameter): """Apply sparse adam optimizer to the weight parameter when the gradient is sparse.""" success = True + indices = gradient.indices() + values = gradient.values() if ps_parameter: op_shape = P.Shape() _ps_pull = P.Pull() _ps_push = P.Push("Adam", [0, 1, 2]) shapes = (op_shape(params), op_shape(moment1), op_shape(moment2), op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1), - op_shape(beta2), op_shape(eps), op_shape(gradient[1]), op_shape(gradient[0])) + op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices)) success = F.depend(success, _ps_pull(_ps_push((beta1_power, beta2_power, lr, beta1, beta2, - eps, gradient[1], gradient[0]), shapes), params)) + eps, values, indices), shapes), params)) else: success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, - eps, gradient[1], gradient[0])) + eps, values, indices)) return success @@ -149,17 +151,19 @@ def _run_opt_with_one_number(opt, sparse_opt, beta1_power, beta2_power, 
beta1, b @_adam_push_pull_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", - "Tensor", "Tuple", "Tensor", "Tensor", "Tensor") + "Tensor", "IndexedSlices", "Tensor", "Tensor", "Tensor") def _run_push_pull_opt_with_sparse(push, pull, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, moment1, moment2): """Apply sparse adam optimizer by push and pull to the weight parameter when the gradient is sparse.""" success = True op_shape = P.Shape() + values = gradient.values() + indices = gradient.indices() shapes = (op_shape(params), op_shape(moment1), op_shape(moment2), op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1), - op_shape(beta2), op_shape(eps), op_shape(gradient[1]), op_shape(gradient[0])) + op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices)) success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, - eps, gradient[1], gradient[0]), shapes), params)) + eps, values, indices), shapes), params)) return success diff --git a/mindspore/nn/optim/ftrl.py b/mindspore/nn/optim/ftrl.py index 97e139f2634..fd755d703aa 100644 --- a/mindspore/nn/optim/ftrl.py +++ b/mindspore/nn/optim/ftrl.py @@ -25,20 +25,22 @@ _ftrl_opt = C.MultitypeFuncGraph("ftrl_opt") _ftrl_push_pull_opt = C.MultitypeFuncGraph("ftrl_opt") -@_ftrl_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tuple", "Tensor", +@_ftrl_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "IndexedSlices", "Tensor", "Tensor", "Bool") def _tensor_run_opt_with_sparse(opt, spars_opt, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment, ps_parameter): """Apply sparse ftrl optimizer to the weight parameter when the gradient is sparse.""" success = True + indices = gradient.indices() + values = gradient.values() if ps_parameter: op_shape = P.Shape() _ps_pull = P.Pull() _ps_push = P.Push("Ftrl", [0, 1, 2]) - shapes = (op_shape(weight), op_shape(moment), op_shape(linear), op_shape(gradient[1]), op_shape(gradient[0])) - success = F.depend(success, _ps_pull(_ps_push((gradient[1], gradient[0]), shapes), weight)) + shapes = (op_shape(weight), op_shape(moment), op_shape(linear), op_shape(values), op_shape(indices)) + success = F.depend(success, _ps_pull(_ps_push((values, indices), shapes), weight)) else: - success = F.depend(success, spars_opt(weight, moment, linear, gradient[1], gradient[0])) + success = F.depend(success, spars_opt(weight, moment, linear, values, indices)) return success @@ -58,14 +60,16 @@ def _tensor_run_opt(opt, spars_opt, learning_rate, l1, l2, lr_power, linear, gra return success -@_ftrl_push_pull_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "Tuple", +@_ftrl_push_pull_opt.register("Function", "Function", "Tensor", "Number", "Number", "Number", "Tensor", "IndexedSlices", "Tensor", "Tensor") def _tensor_run_push_pull_opt_with_sparse(push, pull, learning_rate, l1, l2, lr_power, linear, gradient, weight, moment): success = True op_shape = P.Shape() - shapes = (op_shape(weight), op_shape(moment), op_shape(linear), op_shape(gradient[1]), op_shape(gradient[0])) - success = F.depend(success, pull(push((gradient[1], gradient[0]), shapes), weight)) + values = gradient.values() + indices = gradient.indices() + shapes = (op_shape(weight), op_shape(moment), op_shape(linear), op_shape(values), op_shape(indices)) + success = F.depend(success, pull(push((values, indices), shapes), weight)) return success diff 
--git a/mindspore/nn/optim/lazyadam.py b/mindspore/nn/optim/lazyadam.py index 79053984374..756200c41b8 100644 --- a/mindspore/nn/optim/lazyadam.py +++ b/mindspore/nn/optim/lazyadam.py @@ -27,14 +27,14 @@ from .optimizer import Optimizer _lazy_adam_opt = C.MultitypeFuncGraph("lazy_adam_opt") -@_lazy_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tuple", - "Tensor", "Tensor", "Tensor") +@_lazy_adam_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", + "IndexedSlices", "Tensor", "Tensor", "Tensor") def _run_opt_with_sparse(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, params, moment1, moment2): """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse.""" success = True success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2, - eps, gradient[1], gradient[0])) + eps, gradient.values(), gradient.indices())) return success diff --git a/mindspore/nn/optim/optimizer.py b/mindspore/nn/optim/optimizer.py index 7023efc8ab2..868b2a4d998 100755 --- a/mindspore/nn/optim/optimizer.py +++ b/mindspore/nn/optim/optimizer.py @@ -22,7 +22,7 @@ from mindspore.ops import functional as F, composite as C, operations as P from mindspore.nn.cell import Cell from mindspore.common.parameter import Parameter, ParameterTuple from mindspore.common.initializer import initializer -from mindspore.common.tensor import Tensor +from mindspore.common.tensor import Tensor, IndexedSlices import mindspore.common.dtype as mstype from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel @@ -490,12 +490,14 @@ op_gather = P.GatherV2() _apply_decay = C.MultitypeFuncGraph("apply_decay") -@_apply_decay.register("Number", "Bool", "Tensor", "Tuple") +@_apply_decay.register("Number", "Bool", "Tensor", "IndexedSlices") def _tensor_apply_decay_with_sparse(weight_decay, if_apply, weight, gradient): """Get grad with weight_decay.""" if if_apply: - weight = op_gather(weight, gradient[0], 0) - return gradient[0], op_add((weight * weight_decay, gradient[1])), gradient[2] + indices = gradient.indices() + values = op_add((op_gather(weight, indices, 0) * weight_decay, gradient.values())) + shape = gradient.dense_shape() + return IndexedSlices(indices, values, shape) return gradient @@ -518,9 +520,9 @@ def tensor_grad_scale(scale, grad): return grad * scale -@_grad_scale.register("Number", "Tuple") +@_grad_scale.register("Number", "IndexedSlices") def tensor_grad_scale_with_sparse(scale, grad): """Get grad with scale.""" if scale == 1.0: return grad - return grad[0], grad[1] * scale, grad[2] + return IndexedSlices(grad.indices(), grad.values() * scale, grad.dense_shape()) diff --git a/mindspore/nn/optim/proximal_ada_grad.py b/mindspore/nn/optim/proximal_ada_grad.py index 25cf4380344..2b965fc5b53 100644 --- a/mindspore/nn/optim/proximal_ada_grad.py +++ b/mindspore/nn/optim/proximal_ada_grad.py @@ -23,11 +23,12 @@ from .optimizer import Optimizer _proximal_ada_grad_opt = C.MultitypeFuncGraph("proximal_ada_grad_opt") -@_proximal_ada_grad_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tuple", "Tensor", "Tensor") +@_proximal_ada_grad_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "IndexedSlices", "Tensor", + "Tensor") def _tensor_run_opt_with_sparse(opt, sparse_opt, learning_rate, l1, l2, gradient, weight, accum): """Apply sparse proximal_ada_grad optimizer to the 
weight parameter.""" success = True - success = F.depend(success, sparse_opt(weight, accum, learning_rate, l1, l2, gradient[1], gradient[0])) + success = F.depend(success, sparse_opt(weight, accum, learning_rate, l1, l2, gradient.values(), gradient.indices())) return success diff --git a/mindspore/nn/wrap/grad_reducer.py b/mindspore/nn/wrap/grad_reducer.py index 3d754977d45..1766db686d3 100644 --- a/mindspore/nn/wrap/grad_reducer.py +++ b/mindspore/nn/wrap/grad_reducer.py @@ -16,6 +16,7 @@ from mindspore import context from mindspore.nn.cell import Cell from mindspore.communication.management import GlobalComm, get_group_size +from mindspore.common.tensor import IndexedSlices from mindspore.ops import functional as F, composite as C, operations as P from mindspore.ops.operations.comm_ops import AllReduce, AllGather from mindspore.parallel._auto_parallel_context import auto_parallel_context @@ -77,7 +78,7 @@ def _tensors_allreduce(degree, mean, allgather, allreduce_filter, grad, allreduc return grad -@reduce_opt.register("Number", "Bool", "Function", "Bool", "Tuple", "Function") +@reduce_opt.register("Number", "Bool", "Function", "Bool", "IndexedSlices", "Function") def _tensors_allreduce_with_sparse(degree, mean, allgather, allreduce_filter, grad, allreduce): """ Apply allgather on gradient instead of allreduce for sparse feature. @@ -88,21 +89,21 @@ def _tensors_allreduce_with_sparse(degree, mean, allgather, allreduce_filter, gr mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients. allgather (Primitive): The communication operator for sparse gradients. allreduce_filter (bool): When it is true, allgather would apply. - grad (tuple): The indices, gradient tensor and tensor_shape before operation. + grad (IndexedSlices): The gradient before operation. allreduce (Primitive): The communication operator for gradients. Returns: - Tuple, include indices, the gradient tensor and tensor_shape after operation. + IndexedSlices, the gradient after operation. """ if allreduce_filter: - indices = allgather(grad[0]) - dout = allgather(grad[1]) + indices = allgather(grad.indices()) + dout = allgather(grad.values()) if mean: - degree = F.scalar_cast(degree, F.dtype(grad[1])) + degree = F.scalar_cast(degree, F.dtype(grad.values())) cast_op = P.Cast() mul_op = P.Mul() dout = mul_op(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout))) - grad = (indices, dout, grad[2]) + grad = IndexedSlices(indices, dout, grad.dense_shape()) return grad @@ -123,18 +124,18 @@ def _tensors_get_datatype(grad): return F.dtype(grad) -@_get_datatype.register("Tuple") +@_get_datatype.register("IndexedSlices") def _tensors_get_datatype_with_sparse(grad): """ Acquire gradient datatype. Args: - grad (Tuple): The gradient tensor before operation. + grad (IndexedSlices): The gradient before operation. Returns: mstype, the datatype of gradient. """ - return F.dtype(grad[1]) + return F.dtype(grad.values()) _cast_datatype = C.MultitypeFuncGraph("_cast_datatype") @@ -155,20 +156,20 @@ def _tensors_cast_datatype(datatype, grad): return F.cast(grad, datatype) -@_cast_datatype.register("TypeType", "Tuple") +@_cast_datatype.register("TypeType", "IndexedSlices") def _tensors_cast_datatype_with_sparse(datatype, grad): """ Cast gradient to datatype. Args: datatype (mstype): the destination datatype of gradient. - grad (Tuple): The gradient tensor before operation. + grad (IndexedSlices): The gradient before operation. Returns: - Tuple, the gradient tuple after operation. 
+ IndexedSlices, the gradient after operation. """ - dout = F.cast(grad[1], datatype) - return (grad[0], dout, grad[2]) + dout = F.cast(grad.values(), datatype) + return IndexedSlices(grad.indices(), dout, grad.dense_shape()) class DistributedGradReducer(Cell): diff --git a/mindspore/ops/_grad/grad_array_ops.py b/mindspore/ops/_grad/grad_array_ops.py index 005fdbc8952..1560425ac20 100644 --- a/mindspore/ops/_grad/grad_array_ops.py +++ b/mindspore/ops/_grad/grad_array_ops.py @@ -25,6 +25,7 @@ from .grad_base import bprop_getters from ..primitive import constexpr from ... import context from ...common import dtype as mstype +from ...common.tensor import IndexedSlices reduce_sum = P.ReduceSum() unsorted_segment_sum = P.UnsortedSegmentSum() @@ -206,7 +207,7 @@ def get_bprop_embedding_lookup(self): actual_dout_shape_changed = new_indices_shape_changed + x_shp_tail # Reshape the 'actual_dout' on device actual_dout = reshape_op(dout, actual_dout_shape_changed) - return (new_indices, actual_dout, x_shp), zeros_like(indices), zeros_like(offset) + return IndexedSlices(new_indices, actual_dout, x_shp), zeros_like(indices), zeros_like(offset) return bprop_sparse @@ -335,7 +336,7 @@ def get_bprop_sparse_gather_v2(self): values_shape = indices_size + x_tail_shp values = reshape(dout, values_shape) indices = reshape(indices, indices_size) - return (indices, values, x_shp), zeros_like(indices), zeros_like(axis) + return IndexedSlices(indices, values, x_shp), zeros_like(indices), zeros_like(axis) if F.rank(dout) == 0: dout = P.ExpandDims()(dout, -1) if F.rank(indices) == 0: diff --git a/mindspore/ops/_grad/grad_comm_ops.py b/mindspore/ops/_grad/grad_comm_ops.py index 34df18beba4..e4029737847 100644 --- a/mindspore/ops/_grad/grad_comm_ops.py +++ b/mindspore/ops/_grad/grad_comm_ops.py @@ -17,6 +17,7 @@ import mindspore.common.dtype as mstype from mindspore.ops import functional as F from .. 
import operations as P +from ...common.tensor import IndexedSlices from ..composite.multitype_ops.zeros_like_impl import zeros_like from ..operations.comm_ops import (AllGather, _HostAllGather, AllReduce, _AlltoAll, Broadcast, _GetTensorSlice, _MirrorOperator, ReduceOp, @@ -46,9 +47,9 @@ def get_bprop_all_reduce(self): if F.issubclass_(F.typeof(dout), mstype.tensor): dx = all_reduce_grad(dout) else: - indices = all_gather(dout[0]) - grad = all_gather(dout[1]) - dx = (indices, grad, dout[2]) + indices = all_gather(dout.indices()) + grad = all_gather(dout.values()) + dx = IndexedSlices(indices, grad, dout.dense_shape()) return (dx,) else: @@ -59,12 +60,12 @@ def get_bprop_all_reduce(self): z = cast(z, dtype(dx)) dx = mul(dx, z) else: - indices = all_gather(dout[0]) - grad = all_gather(dout[1]) + indices = all_gather(dout.indices()) + grad = all_gather(dout.values()) z = equal(x, out) z = cast(z, dtype(grad)) grad = mul(grad, z) - dx = (indices, grad, dout[2]) + dx = IndexedSlices(indices, grad, dout.dense_shape()) return (dx,) return bprop @@ -194,19 +195,19 @@ def get_bprop_mirror_operator(self): num = F.scalar_cast(dev_num, F.dtype(dx)) dx = mul(dx, cast(F.scalar_to_array(float_one/num), F.dtype(dx))) else: - indices = all_gather(dout[0]) - grad = all_gather(dout[1]) + indices = all_gather(dout.indices()) + grad = all_gather(dout.values()) float_one = F.scalar_cast(1.0, F.dtype(grad)) num = F.scalar_cast(dev_num, F.dtype(grad)) grad = mul(grad, cast(F.scalar_to_array(float_one/num), F.dtype(grad))) - dx = (indices, grad, dout[2]) + dx = (indices, grad, dout.dense_shape()) else: if F.issubclass_(F.typeof(dout), mstype.tensor): dx = all_reduce(dout) else: - indices = all_gather(dout[0]) - grad = all_gather(dout[1]) - dx = (indices, grad, dout[2]) + indices = all_gather(dout.indices()) + grad = all_gather(dout.values()) + dx = (indices, grad, dout.dense_shape()) return (dx,) return bprop diff --git a/tests/ut/python/nn/optim/test_adam_with_tuple_grad.py b/tests/ut/python/nn/optim/test_adam_with_tuple_grad.py deleted file mode 100644 index 23aad24c475..00000000000 --- a/tests/ut/python/nn/optim/test_adam_with_tuple_grad.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
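The file deleted below pinned the old tuple contract: its sparse branch was registered on the `"Tuple"` type name and poked into `gradient[2][2]`. Under the new contract the equivalent registration dispatches on `"IndexedSlices"`, matching the register() changes elsewhere in this commit. A hedged sketch of what such a registration looks like (the body is a placeholder, as the deleted stub's was):

```python
from mindspore.ops import composite as C

adam_opt_for_map = C.MultitypeFuncGraph("adam_opt_for_map")

@adam_opt_for_map.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
                           "Tensor", "Tensor", "IndexedSlices", "Bool")
def _update_run_op_sparse_for_map(beta1, beta2, eps, lr, weight_decay_tensor,
                                  param, m, v, gradient, decay_flag):
    # Placeholder body: the sparse payload is now read via accessors
    # (gradient.values(), gradient.indices()) rather than tuple indexing.
    return gradient.values()
```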
-# ============================================================================ -""" test adam """ -import numpy as np - -import mindspore.nn as nn -from mindspore import Tensor, Parameter, context -from mindspore.common.api import _executor -from mindspore.common import dtype as mstype -from mindspore.nn import TrainOneStepCell, WithLossCell -from mindspore.nn.optim import Optimizer -from mindspore.ops import operations as P -from mindspore.ops import composite as C -from mindspore.ops import functional as F -from mindspore._checkparam import Validator as validator -from mindspore._checkparam import Rel - -context.set_context(enable_sparse=True) - -adam_opt_for_map = C.MultitypeFuncGraph("adam_opt_for_map") -@adam_opt_for_map.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", - "Tensor", "Tensor", "Tensor", "Bool") -def _update_run_op_for_map(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, gradient, decay_flag): - op_mul = P.Mul() - op_square = P.Square() - op_sqrt = P.Sqrt() - op_cast = P.Cast() - op_reshape = P.Reshape() - op_shape = P.Shape() - - param_fp32 = op_cast(param, mstype.float32) - m_fp32 = op_cast(m, mstype.float32) - v_fp32 = op_cast(v, mstype.float32) - gradient_fp32 = op_cast(gradient, mstype.float32) - - next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient_fp32) - - next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - - beta2, op_square(gradient_fp32)) - - update = next_m / (op_sqrt(next_v) + eps) - if decay_flag: - update = update + op_mul(weight_decay_tensor, param_fp32) - - update_with_lr = op_mul(lr, update) - next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32)) - - next_v = F.depend(next_v, F.assign(param, next_param)) - next_v = F.depend(next_v, F.assign(m, next_m)) - next_v = F.depend(next_v, F.assign(v, next_v)) - return next_v - - -@adam_opt_for_map.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", - "Tensor", "Tensor", "Tuple", "Bool") -def _update_run_op_sparse_for_map(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, gradient, decay_flag): - return gradient[2][2] - -def _check_param_value(beta1, beta2, eps, weight_decay, prim_name): - """Check the type of inputs.""" - validator.check_value_type("beta1", beta1, [float], prim_name) - validator.check_value_type("beta2", beta2, [float], prim_name) - validator.check_value_type("eps", eps, [float], prim_name) - validator.check_value_type("weight_dacay", weight_decay, [float], prim_name) - validator.check_number_range("beta1", beta1, 0.0, 1.0, Rel.INC_NEITHER, prim_name) - validator.check_number_range("beta2", beta2, 0.0, 1.0, Rel.INC_NEITHER, prim_name) - validator.check_number_range("eps", eps, 0.0, float("inf"), Rel.INC_NEITHER, prim_name) - validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, prim_name) - - -class AdamWeightDecaySparse(Optimizer): - """ - Implements Adam algorithm weight decay fix. - - Args: - params (list[Parameter]): A list of parameter, which will be updated. The element in `params` - should be class mindspore.Parameter. - learning_rate (Union[float, Tensor, Iterable]): A value for the learning rate. When the learning_rate is - Iterable or a Tensor and the dims of the Tensor is 1, - use dynamic learning rate, then the i-th step will - take the i-th value as the learning rate. 
- When the learning_rate is float or learning_rate is a Tensor - but the dims of the Tensor is 0, use fixed learning rate. - Other cases are not supported. Default: 1e-3. - beta1 (float): The exponential decay rate for the 1st moment estimates. Default: 0.9. - Should be in range (0.0, 1.0). - beta2 (float): The exponential decay rate for the 2nd moment estimates. Default: 0.999. - Should be in range (0.0, 1.0). - eps (float): Term added to the denominator to improve numerical stability. Default: 1e-6. - Should be greater than 0. - weight_decay (float): Weight decay (L2 penalty). Default: 0.0. - decay_filter (Function): A function to determine whether to apply weight decay on parameters. Default: - lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name. - - Inputs: - - **gradients** (tuple[Tensor]) - The gradients of `params`, the shape is the same as `params`, - and might be in sparse format. - - Outputs: - tuple[Parameter], the updated velocity value, the shape is the same as `params`. - - Examples: - >>> net = Net() - >>> loss = nn.SoftmaxCrossEntropyWithLogits() - >>> optim = nn.AdamWeightDecay(params=net.trainable_params()) - >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None) - """ - def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-6, weight_decay=0.0, - decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name): - super(AdamWeightDecaySparse, self).__init__(learning_rate, params) - if self.is_group: - raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.") - _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name) - self.beta1 = Tensor(np.array([beta1]).astype(np.float32)) - self.beta2 = Tensor(np.array([beta2]).astype(np.float32)) - self.eps = Tensor(np.array([eps]).astype(np.float32)) - self.weight_decay_tensor = Tensor(np.array([weight_decay]).astype(np.float32)) - - self.params = self.parameters - self.moments1 = self.params.clone(prefix="adam_m", init='zeros') - self.moments2 = self.params.clone(prefix="adam_v", init='zeros') - self.decay_flag = tuple(decay_filter(x) for x in self.params) - - self.map = C.Map() - - def construct(self, gradients): - lr = self.get_lr() - updated_velocity = self.map(F.partial(adam_opt_for_map, self.beta1, self.beta2, self.eps, lr, - self.weight_decay_tensor), - self.params, self.moments1, self.moments2, gradients, self.decay_flag) - - return updated_velocity - - -def test_AdamWeightDecaySparse(): - """ test_AdamWeightDecaySparse """ - context.set_context(mode=context.GRAPH_MODE) - class Loss(nn.Cell): - def __init__(self): - super(Loss, self).__init__() - def construct(self, base, target): - return base - class NetWithSparseGatherV2(nn.Cell): - def __init__(self): - super(NetWithSparseGatherV2, self).__init__() - self.w1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="w1") - self.w2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="w2") - self.gatherv2 = P.SparseGatherV2() - self.axis = 0 - def construct(self, indices): - return self.gatherv2(self.w1, indices, self.axis) * self.w2 - - inputs = Tensor(np.array([0, 1]).astype(np.int32)) - label = Tensor(np.zeros([2, 1, 2]).astype(np.float32)) - net = NetWithSparseGatherV2() - net.set_train() - loss = Loss() - optimizer = AdamWeightDecaySparse(net.trainable_params()) - - net_with_loss = WithLossCell(net, loss) - train_network = TrainOneStepCell(net_with_loss, optimizer) - _executor.compile(train_network, inputs, label) diff --git 
a/tests/ut/python/parallel/test_sparse_feature_bprop.py b/tests/ut/python/parallel/test_sparse_feature_bprop.py index cd58261dbd0..515be06e450 100644 --- a/tests/ut/python/parallel/test_sparse_feature_bprop.py +++ b/tests/ut/python/parallel/test_sparse_feature_bprop.py @@ -19,8 +19,8 @@ import mindspore as ms import mindspore.nn as nn from mindspore import context from mindspore.common import dtype as mstype -from mindspore.common.tensor import Tensor -from mindspore.ops import composite as C +from mindspore.common.tensor import Tensor, IndexedSlices +from mindspore.ops import composite as C, operations as P from mindspore.ops.operations.comm_ops import AllReduce, _MirrorOperator from mindspore.ops._grad.grad_base import bprop_getters from mindspore._checkparam import Validator as validator @@ -65,7 +65,7 @@ def get_bprop_gather_v2(self): """Generate bprop for GatherV2""" def bprop(x, indices, axis, out, dout): - return (indices, dout, x), axis, out + return IndexedSlices(indices, dout, x), axis, out return bprop @@ -78,7 +78,7 @@ def test_bprop_with_sparse_feature_allreduce(): if shape is None: shape = [8, 8] self.all_reduce = AllReduce() - self.gatherv2 = VirtualGatherV2() + self.gatherv2 = P.GatherV2() self.index = Tensor(np.ones(shape), dtype=ms.int32) self.axis = axis @@ -102,7 +102,7 @@ def test_bprop_with_sparse_feature_mirror(): if shape is None: shape = [8, 8] self.mirror = _MirrorOperator(group=HCCL_WORLD_COMM_GROUP) - self.gatherv2 = VirtualGatherV2() + self.gatherv2 = P.GatherV2() self.index = Tensor(np.ones(shape), dtype=ms.int32) self.axis = axis From 0d2495c5ce703c3db083ebf6fe3f67c699b103b4 Mon Sep 17 00:00:00 2001 From: Wei Luning Date: Fri, 17 Jul 2020 16:42:03 +0800 Subject: [PATCH 59/68] add opt for list --- mindspore/ccsrc/frontend/optimizer/irpass.cc | 2 +- .../optimizer/irpass/item_tuple_eliminate.h | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/mindspore/ccsrc/frontend/optimizer/irpass.cc b/mindspore/ccsrc/frontend/optimizer/irpass.cc index efc3795a4cc..4d74e38c842 100644 --- a/mindspore/ccsrc/frontend/optimizer/irpass.cc +++ b/mindspore/ccsrc/frontend/optimizer/irpass.cc @@ -64,7 +64,7 @@ OptimizeIRPassLib::OptimizeIRPassLib() { // ops eliminate item_tuple_eliminate_ = MakeSubstitution(std::make_shared(), "item_tuple_eliminate", - {prim::kPrimTupleGetItem, prim::kPrimTupleSetItem}); + {prim::kPrimTupleGetItem, prim::kPrimTupleSetItem, prim::kPrimListGetItem}); tile_eliminate_ = MakeSubstitution(std::make_shared(), "tile_eliminate", prim::kPrimTile); cast_eliminate_ = MakeSubstitution(std::make_shared(), "cast_eliminate", prim::kPrimCast); reshape_eliminate_ = MakeSubstitution(std::make_shared(), "reshape_eliminate", prim::kPrimReshape); diff --git a/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_eliminate.h b/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_eliminate.h index acd6844ee74..6ae41eaa2a9 100644 --- a/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_eliminate.h +++ b/mindspore/ccsrc/frontend/optimizer/irpass/item_tuple_eliminate.h @@ -38,6 +38,7 @@ class GetitemEliminater : public AnfVisitor { AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { Reset(); AnfVisitor::Match(prim::kPrimTupleGetItem, {IsCNode, IsVNode})(node); + AnfVisitor::Match(prim::kPrimListGetItem, {IsCNode, IsVNode})(node); if (is_match_) { return tuple_->input(id_); @@ -46,14 +47,18 @@ class GetitemEliminater : public AnfVisitor { } void Visit(const CNodePtr &cnode) override { - if (IsPrimitiveCNode(cnode, 
prim::kPrimMakeTuple)) { + if (IsPrimitiveCNode(cnode, prim::kPrimMakeTuple) || IsPrimitiveCNode(cnode, prim::kPrimMakeList)) { tuple_ = cnode; } } void Visit(const ValueNodePtr &vnode) override { if (tuple_ != nullptr && IsValueNode(vnode)) { - id_ = IntToSize(GetValue(vnode->value()) + 1); + int idx = GetValue(vnode->value()); + if (idx < 0) { + idx = idx + tuple_->size() - 1; + } + id_ = IntToSize(idx + 1); if (tuple_->size() > id_) { is_match_ = true; } @@ -80,6 +85,7 @@ class GetitemConstEliminater : public AnfVisitor { AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { Reset(); AnfVisitor::Match(prim::kPrimTupleGetItem, {IsVNode, IsVNode})(node); + AnfVisitor::Match(prim::kPrimListGetItem, {IsVNode, IsVNode})(node); if (is_match_) { return NewValueNode((*tuple_)[id_]); @@ -138,7 +144,7 @@ class SetitemEliminater : public AnfVisitor { } void Visit(const CNodePtr &cnode) override { - if (IsPrimitiveCNode(cnode, prim::kPrimMakeTuple)) { + if (IsPrimitiveCNode(cnode, prim::kPrimMakeTuple) || IsPrimitiveCNode(cnode, prim::kPrimMakeList)) { auto &inputs = cnode->inputs(); (void)std::copy(inputs.begin(), inputs.end(), std::back_inserter(args_)); } @@ -234,6 +240,7 @@ class GetitemDependReorder : public AnfVisitor { AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { Reset(); AnfVisitor::Match(prim::kPrimTupleGetItem, {IsCNode, IsValueNode})(node); + AnfVisitor::Match(prim::kPrimListGetItem, {IsCNode, IsValueNode})(node); if (x_ == nullptr) { return nullptr; } From 188d74f15e64b6499ffcc672e5c73347c89cd1d2 Mon Sep 17 00:00:00 2001 From: yujianfeng Date: Wed, 15 Jul 2020 09:38:53 +0800 Subject: [PATCH 60/68] Remove transdata and cast for internal outputs --- .../ascend/ascend_backend_optimization.cc | 3 + .../backend/optimizer/ascend/ascend_helper.cc | 14 +- .../ascend/format_type/insert_cast.cc | 12 ++ .../ascend/format_type/insert_trans_op.cc | 16 +- .../format_type/remove_internal_output.cc | 83 +++++++++ .../format_type/remove_internal_output.h | 51 +++++ .../ccsrc/backend/session/kernel_graph.cc | 44 +++-- .../ccsrc/backend/session/kernel_graph.h | 6 +- .../ccsrc/backend/session/session_basic.cc | 7 +- .../st/host_device/test_host_device_lenet.py | 89 +++++++++ tests/st/ops/cpu/test_sparse_apply_adam_op.py | 4 + tests/st/ops/cpu/test_sparse_apply_ftrl_op.py | 4 + .../test_sparse_apply_proximal_adagrad_op.py | 4 + .../remove_internal_output_test.cc | 174 ++++++++++++++++++ .../remove_internal_output_test.py | 83 +++++++++ 15 files changed, 564 insertions(+), 30 deletions(-) create mode 100644 mindspore/ccsrc/backend/optimizer/ascend/format_type/remove_internal_output.cc create mode 100644 mindspore/ccsrc/backend/optimizer/ascend/format_type/remove_internal_output.h create mode 100644 tests/st/host_device/test_host_device_lenet.py create mode 100644 tests/ut/cpp/pre_activate/ascend/format_type/remove_internal_output_test.cc create mode 100644 tests/ut/cpp/python_input/gtest_input/pre_activate/remove_internal_output_test.py diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc b/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc index 40e7a29c921..2636def192a 100644 --- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc +++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc @@ -96,6 +96,7 @@ #include "backend/optimizer/ascend/format_type/modify_ops_attrs.h" #include "backend/optimizer/ascend/format_type/remove_no_use_reshape_op.h" #include 
"backend/optimizer/ascend/ir_fusion/add_input_to_output.h" +#include "backend/optimizer/ascend/format_type/remove_internal_output.h" #include "utils/context/ms_context.h" #include "utils/config_manager.h" #include "debug/anf_ir_dump.h" @@ -199,6 +200,7 @@ void AscendDataLayout(const std::shared_ptr &kernel_graph) data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); data_layout_pm->AddPass(std::make_shared()); + data_layout_pm->AddPass(std::make_shared()); optimizer->AddPassManager(data_layout_pm); (void)optimizer->Optimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); @@ -220,6 +222,7 @@ void AscendMixPrecision(const std::shared_ptr &kernel_grap mixed_precision_pm->AddPass(std::make_shared()); mixed_precision_pm->AddPass(std::make_shared()); mixed_precision_pm->AddPass(std::make_shared()); + mixed_precision_pm->AddPass(std::make_shared()); optimizer->AddPassManager(mixed_precision_pm); (void)optimizer->Optimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); diff --git a/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc b/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc index fd4c0e59522..9e1f6234b97 100644 --- a/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc +++ b/mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc @@ -142,6 +142,7 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const MS_EXCEPTION_IF_NULL(node); std::vector make_tuple_inputs; make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); + auto kernel_graph = func_graph->cast(); for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(node); ++output_idx) { std::string output_format = AnfAlgo::GetOutputFormat(node, output_idx); if (output_format == kOpFormat_NC1KHKWHWC0) { @@ -151,7 +152,11 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const auto tuple_getitem = CreatTupleGetItemNode(func_graph, node, output_idx); std::vector origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx); if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) { - make_tuple_inputs.emplace_back(AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false)); + auto trans_op = AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false); + if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { + kernel_graph->ReplaceInternalOutput(node, trans_op, output_idx, 0); + } + make_tuple_inputs.emplace_back(trans_op); } else { // No need insert trans op. 
make_tuple_inputs.push_back(tuple_getitem); @@ -249,9 +254,14 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP if (outputs_num == 0) { return node; } + auto kernel_graph = func_graph->cast(); // Single output if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) { - return InsertTransOpForSingleOutput(func_graph, node, kernel_select); + auto new_node = InsertTransOpForSingleOutput(func_graph, node, kernel_select); + if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { + kernel_graph->ReplaceInternalOutput(node, new_node); + } + return new_node; } // Multiple output return InsertTransOpForMultipleOutput(func_graph, node, kernel_select); diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc b/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc index c3f79006452..bc68511bb2b 100644 --- a/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc +++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_cast.cc @@ -40,6 +40,7 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo std::vector make_tuple_inputs; AbstractBasePtrList abstract_list; make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); + auto kernel_graph = func_graph->cast(); for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(cnode); ++output_idx) { AnfNodePtr replace_node = nullptr; const auto origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx); @@ -64,6 +65,9 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo MS_EXCEPTION_IF_NULL(replace_node); replace_node->set_scope(cnode->scope()); AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); + if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) { + kernel_graph->ReplaceInternalOutput(cnode, replace_node, output_idx, 0); + } } else { replace_node = getitem; } @@ -87,6 +91,7 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c return cnode; } MS_EXCEPTION_IF_NULL(cnode->Type()); + auto kernel_graph = func_graph->cast(); // Single output if (!cnode->Type()->isa()) { if (!need_insert_cast[0]) { @@ -109,6 +114,9 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c MS_EXCEPTION_IF_NULL(replace_node); replace_node->set_scope(cnode->scope()); AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); + if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) { + kernel_graph->ReplaceInternalOutput(cnode, replace_node); + } } return replace_node; } @@ -188,6 +196,10 @@ const AnfNodePtr InsertCast::Process(const FuncGraphPtr &func_graph, const AnfNo CNodePtr cnode = node->cast(); MS_EXCEPTION_IF_NULL(cnode); auto new_node = InsertCastForInput(func_graph, cnode); + auto kernel_graph = func_graph->cast>(); + if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { + kernel_graph->ReplaceInternalOutput(node, new_node); + } // process output return InsertCastForOutput(func_graph, new_node, std::vector(AnfAlgo::GetOutputTensorNum(new_node), true)); } diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_trans_op.cc b/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_trans_op.cc index a22a1faa5fd..8f0d5dd48e4 100644 --- a/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_trans_op.cc +++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/insert_trans_op.cc @@ -46,14 +46,13 @@ const AnfNodePtr 
InsertTransOp::Process(const FuncGraphPtr &func_graph, const An if (node == nullptr || !AnfAlgo::IsRealKernel(node)) { return nullptr; } - AnfNodePtr front_node; + AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); + MS_LOG(DEBUG) << "process op: " << node->DebugString(); + AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_); auto kernel_graph = func_graph->cast>(); if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { - front_node = kernel_graph->GetFrontNodeByInternalOutput(node); + kernel_graph->ReplaceInternalOutput(node, new_node); } - AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); - MS_LOG(DEBUG) << "====process op: " << node->DebugString(); - AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_); auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); if (ms_context->execution_mode() == kPynativeMode && !ms_context->enable_pynative_hook()) { @@ -61,12 +60,7 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An return new_node; } } - auto final_node = InsertTransOpForOutput(func_graph, new_node, kernel_select_); - if (kernel_graph != nullptr && front_node != nullptr) { - auto old_node = kernel_graph->GetInternalOutputByFrontNode(front_node); - kernel_graph->ReplaceInternalOutput(old_node, final_node); - } - return final_node; + return InsertTransOpForOutput(func_graph, new_node, kernel_select_); } } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/remove_internal_output.cc b/mindspore/ccsrc/backend/optimizer/ascend/format_type/remove_internal_output.cc new file mode 100644 index 00000000000..e9238fe0066 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/remove_internal_output.cc @@ -0,0 +1,83 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "backend/optimizer/ascend/format_type/remove_internal_output.h"
+#include <memory>
+#include "backend/session/anf_runtime_algorithm.h"
+
+namespace mindspore {
+namespace opt {
+namespace {
+bool UsedForOutputOnly(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  auto manager = func_graph->manager();
+  MS_EXCEPTION_IF_NULL(manager);
+  auto &node_users = manager->node_users();
+  auto iter = node_users.find(node);
+  if (iter == node_users.end()) {
+    return false;
+  }
+  const auto &node_set = iter->second;
+  for (const auto &node_index : node_set) {
+    if (!AnfAlgo::CheckPrimitiveType(node_index.first, prim::kPrimMakeTuple)) {
+      return false;
+    }
+  }
+  return true;
+}
+}  // namespace
+
+const BaseRef RemoveInternalOutputTransOp::DefinePattern() const {
+  VarPtr X = std::make_shared<Var>();
+  auto prim = std::make_shared<Primitive>(kTransDataOpName);
+  return VectorRef({prim, X});
+}
+
+const BaseRef RemoveInternalOutputCast::DefinePattern() const {
+  VarPtr X = std::make_shared<Var>();
+  return VectorRef({prim::kPrimCast, X});
+}
+
+const AnfNodePtr RemoveInternalOutput::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
+                                               const EquivPtr &) const {
+  MS_EXCEPTION_IF_NULL(func_graph);
+  MS_EXCEPTION_IF_NULL(node);
+  auto kernel_graph = func_graph->cast<KernelGraphPtr>();
+  if (kernel_graph == nullptr) {
+    return nullptr;
+  }
+  if (!kernel_graph->IsInternalOutput(node)) {
+    return nullptr;
+  }
+  if (!UsedForOutputOnly(func_graph, node)) {
+    return nullptr;
+  }
+  auto cnode = node->cast<CNodePtr>();
+  MS_EXCEPTION_IF_NULL(cnode);
+  CheckCNodeInputSize(cnode, kTransOpInputNum);
+  auto input_node = cnode->input(1);
+  if (!AnfAlgo::CheckPrimitiveType(input_node, prim::kPrimTupleGetItem)) {
+    kernel_graph->ReplaceInternalOutput(node, input_node);
+  } else {
+    auto tuple_getitem = input_node->cast<CNodePtr>();
+    MS_EXCEPTION_IF_NULL(tuple_getitem);
+    int idx = AnfAlgo::GetTupleGetItemOutIndex(tuple_getitem);
+    AnfNodePtr real_input_node = AnfAlgo::GetTupleGetItemRealInput(tuple_getitem);
+    kernel_graph->ReplaceInternalOutput(node, real_input_node, 0, idx);
+  }
+  return input_node;
+}
+}  // namespace opt
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/optimizer/ascend/format_type/remove_internal_output.h b/mindspore/ccsrc/backend/optimizer/ascend/format_type/remove_internal_output.h
new file mode 100644
index 00000000000..6fa9b7421c3
--- /dev/null
+++ b/mindspore/ccsrc/backend/optimizer/ascend/format_type/remove_internal_output.h
@@ -0,0 +1,51 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
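Taken together, the pass above fires only when the `TransData` or `Cast` is itself the graph's internal output and feeds nothing but the output `make_tuple`; it then retargets the bookkeeping and hands users the unconverted value. A schematic restatement on a toy plain-Python graph (none of these names are the real C++ API; the tuple_getitem branch, which additionally remaps the output index, is omitted):

```python
def process(graph, node):
    """Sketch of RemoveInternalOutput::Process on a dict-based toy graph."""
    if node not in graph["internal_outputs"]:
        return None                    # pattern matched, but not an internal output
    if any(u != "make_tuple" for u in graph["users"][node]):
        return None                    # a real kernel still consumes the trans op
    src = graph["inputs"][node][0]     # the value TransData/Cast was converting
    graph["internal_outputs"][src] = graph["internal_outputs"].pop(node)
    return src                         # users are rewritten to read src directly

graph = {
    "internal_outputs": {"trans_data": {0: "front_out"}},
    "users": {"trans_data": ["make_tuple"]},
    "inputs": {"trans_data": ["add"]},
}
assert process(graph, "trans_data") == "add"
```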
+ */
+
+#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
+#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
+
+#include <string>
+#include "backend/optimizer/common/optimizer.h"
+
+namespace mindspore {
+namespace opt {
+class RemoveInternalOutput : public PatternProcessPass {
+ public:
+  explicit RemoveInternalOutput(const std::string &name, bool multigraph = true)
+      : PatternProcessPass(name, multigraph) {}
+  ~RemoveInternalOutput() override = default;
+  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
+};
+
+class RemoveInternalOutputTransOp : public RemoveInternalOutput {
+ public:
+  explicit RemoveInternalOutputTransOp(bool multigraph = true)
+      : RemoveInternalOutput("remove_internal_output_trans_op", multigraph) {}
+  ~RemoveInternalOutputTransOp() override = default;
+  const BaseRef DefinePattern() const override;
+};
+
+class RemoveInternalOutputCast : public RemoveInternalOutput {
+ public:
+  explicit RemoveInternalOutputCast(bool multigraph = true)
+      : RemoveInternalOutput("remove_internal_output_cast", multigraph) {}
+  ~RemoveInternalOutputCast() override = default;
+  const BaseRef DefinePattern() const override;
+};
+}  // namespace opt
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
diff --git a/mindspore/ccsrc/backend/session/kernel_graph.cc b/mindspore/ccsrc/backend/session/kernel_graph.cc
index b1cae89a405..3e462ca6184 100644
--- a/mindspore/ccsrc/backend/session/kernel_graph.cc
+++ b/mindspore/ccsrc/backend/session/kernel_graph.cc
@@ -929,10 +929,15 @@ void KernelGraph::AddInternalOutput(const AnfNodePtr &front_node, const AnfNodeP
   }
   MS_LOG(INFO) << "Add internal node " << node->DebugString() << " with front node " << front_node->DebugString();
   front_to_internal_outputs_map_[front_node] = node;
-  internal_outputs_to_front_map_[node] = front_node;
+  int output_idx = 0;
+  if (AnfAlgo::CheckPrimitiveType(front_node, prim::kPrimTupleGetItem)) {
+    output_idx = AnfAlgo::GetTupleGetItemOutIndex(front_node->cast<CNodePtr>());
+  }
+  internal_outputs_to_front_map_[node][output_idx] = front_node;
 }
 
-void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node) {
+void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx,
+                                        int dst_output_idx) {
   if (new_node == nullptr || node == nullptr) {
     MS_LOG(INFO) << "New node or node is nullptr";
     return;
@@ -947,9 +952,30 @@ void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr
     return;
   }
   MS_LOG(INFO) << "Replace internal node " << node->DebugString() << " To " << new_node->DebugString();
-  internal_outputs_to_front_map_[new_node] = iter->second;
-  front_to_internal_outputs_map_[iter->second] = new_node;
-  internal_outputs_to_front_map_.erase(iter);
+  auto &front_nodes = iter->second;
+  // Move all front nodes to new node mapping
+  if (src_output_idx == -1) {
+    internal_outputs_to_front_map_[new_node] = front_nodes;
+    for (const auto &front_node_iter : front_nodes) {
+      front_to_internal_outputs_map_[front_node_iter.second] = new_node;
+    }
+    internal_outputs_to_front_map_.erase(iter);
+    return;
+  }
+  // Move specified front node to new node mapping
+  int index = SizeToInt(src_output_idx);
+  auto front_node_iter = front_nodes.find(index);
+  if (front_node_iter == front_nodes.end()) {
+    MS_LOG(INFO) << "The output " << src_output_idx << " of node " << node->DebugString() << " is not
an internal node"; + return; + } + auto front_node = front_node_iter->second; + internal_outputs_to_front_map_[new_node][dst_output_idx] = front_node; + front_to_internal_outputs_map_[front_node] = new_node; + front_nodes.erase(index); + if (front_nodes.empty()) { + internal_outputs_to_front_map_.erase(iter); + } } AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const { @@ -967,14 +993,6 @@ bool KernelGraph::IsInternalOutput(const AnfNodePtr &node) const { return false; } -AnfNodePtr KernelGraph::GetFrontNodeByInternalOutput(const AnfNodePtr &node) const { - auto iter = internal_outputs_to_front_map_.find(node); - if (iter != internal_outputs_to_front_map_.end()) { - return iter->second; - } - return nullptr; -} - void KernelGraph::AddFinalOutputKernel(const AnfNodePtr &node) { if (node == nullptr) { return; diff --git a/mindspore/ccsrc/backend/session/kernel_graph.h b/mindspore/ccsrc/backend/session/kernel_graph.h index 48df351120b..3ba5f333da4 100644 --- a/mindspore/ccsrc/backend/session/kernel_graph.h +++ b/mindspore/ccsrc/backend/session/kernel_graph.h @@ -148,10 +148,10 @@ class KernelGraph : public FuncGraph { const std::map> &summary_nodes() const { return summary_nodes_; } void set_summary_nodes(const std::map> &nodes) { summary_nodes_ = nodes; } void AddInternalOutput(const AnfNodePtr &front_node, const AnfNodePtr &node); - void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node); + void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx = -1, + int dst_output_idx = -1); AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const; bool IsInternalOutput(const AnfNodePtr &node) const; - AnfNodePtr GetFrontNodeByInternalOutput(const AnfNodePtr &node) const; void AddFinalOutputKernel(const AnfNodePtr &node); bool IsFinalOutputKernel(const AnfNodePtr &node) const; uint32_t current_epoch() const { return current_epoch_; } @@ -223,7 +223,7 @@ class KernelGraph : public FuncGraph { CNodePtr end_goto_; bool null_output_; std::unordered_map front_to_internal_outputs_map_; - std::unordered_map internal_outputs_to_front_map_; + std::unordered_map> internal_outputs_to_front_map_; std::set final_output_kernels_; uint32_t current_epoch_; }; diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc index fa55b07fe5c..80777482dd1 100644 --- a/mindspore/ccsrc/backend/session/session_basic.cc +++ b/mindspore/ccsrc/backend/session/session_basic.cc @@ -300,7 +300,11 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const MS_LOG(INFO) << "No corresponding internal output for output node"; return; } - auto real_kernel = AnfAlgo::VisitKernel(ref_node, 0); + size_t output_idx = 0; + if (AnfAlgo::CheckPrimitiveType(out_node, prim::kPrimTupleGetItem)) { + output_idx = AnfAlgo::GetTupleGetItemOutIndex(out_node->cast()); + } + auto real_kernel = AnfAlgo::VisitKernel(ref_node, output_idx); auto ref_real_node = real_kernel.first; auto ref_real_node_index = real_kernel.second; if (ref_real_node->isa() && node_graph->IsInternalOutput(ref_real_node) && @@ -325,6 +329,7 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const builder.SetOutputsFormat({format}); d_kernel_info->set_select_kernel_build_info(builder.Build()); AnfAlgo::SetOutputAddr(address, 0, parameter.get()); + AnfAlgo::SetOutputInferTypeAndShape({type}, {AnfAlgo::GetOutputInferShape(parameter, 0)}, parameter.get()); } } diff 
--git a/tests/st/host_device/test_host_device_lenet.py b/tests/st/host_device/test_host_device_lenet.py new file mode 100644 index 00000000000..d1c49dc1e48 --- /dev/null +++ b/tests/st/host_device/test_host_device_lenet.py @@ -0,0 +1,89 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import numpy as np +import pytest + +import mindspore.context as context +import mindspore.nn as nn +from mindspore import Tensor +from mindspore.nn import TrainOneStepCell, WithLossCell +from mindspore.nn.optim import Momentum +from mindspore.ops import operations as P + +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") + + +class LeNet(nn.Cell): + def __init__(self): + super(LeNet, self).__init__() + self.relu = P.ReLU() + self.batch_size = 32 + + self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0, has_bias=False, pad_mode='valid') + self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0, has_bias=False, pad_mode='valid') + self.pool = nn.MaxPool2d(kernel_size=2, stride=2) + self.reshape = P.Reshape() + self.fc1 = nn.Dense(400, 120) + self.fc1.matmul.add_prim_attr("primitive_target", "CPU") + self.fc1.bias_add.add_prim_attr("primitive_target", "CPU") + self.fc2 = nn.Dense(120, 84) + self.fc2.matmul.add_prim_attr("primitive_target", "CPU") + self.fc2.bias_add.add_prim_attr("primitive_target", "CPU") + self.fc3 = nn.Dense(84, 10) + self.fc3.matmul.add_prim_attr("primitive_target", "CPU") + self.fc3.bias_add.add_prim_attr("primitive_target", "CPU") + + def construct(self, input_x): + output = self.conv1(input_x) + output = self.relu(output) + output = self.pool(output) + output = self.conv2(output) + output = self.relu(output) + output = self.pool(output) + output = self.reshape(output, (self.batch_size, -1)) + output = self.fc1(output) + output = self.relu(output) + output = self.fc2(output) + output = self.relu(output) + output = self.fc3(output) + return output + + +def train(net, data, label): + learning_rate = 0.01 + momentum = 0.9 + + optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) + criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True) + net_with_criterion = WithLossCell(net, criterion) + train_network = TrainOneStepCell(net_with_criterion, optimizer) # optimizer + train_network.set_train() + res = train_network(data, label) + print("+++++++++Loss+++++++++++++") + print(res) + print("+++++++++++++++++++++++++++") + diff = res.asnumpy()[0] - 2.3025851 + assert np.all(diff < 1.e-7) + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +def test_lenet(): + data = Tensor(np.ones([32, 1, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([32]).astype(np.int32)) + net = LeNet() + train(net, data, label) diff --git a/tests/st/ops/cpu/test_sparse_apply_adam_op.py 
b/tests/st/ops/cpu/test_sparse_apply_adam_op.py index 06b4a70b390..6dd866e96cb 100644 --- a/tests/st/ops/cpu/test_sparse_apply_adam_op.py +++ b/tests/st/ops/cpu/test_sparse_apply_adam_op.py @@ -14,6 +14,7 @@ # ============================================================================ import numpy as np +import pytest import mindspore.context as context import mindspore.nn as nn from mindspore import Tensor @@ -43,6 +44,9 @@ class Net(nn.Cell): return out +@pytest.mark.level0 +@pytest.mark.platform_x86_cpu +@pytest.mark.env_onecard def test_net(): gradient = Tensor(np.ones([3, 3, 3]).astype(np.float32)) indices = Tensor([0, 1, 2], mstype.int32) diff --git a/tests/st/ops/cpu/test_sparse_apply_ftrl_op.py b/tests/st/ops/cpu/test_sparse_apply_ftrl_op.py index babaefbd864..dca5cf7a776 100644 --- a/tests/st/ops/cpu/test_sparse_apply_ftrl_op.py +++ b/tests/st/ops/cpu/test_sparse_apply_ftrl_op.py @@ -14,6 +14,7 @@ # ============================================================================ import numpy as np +import pytest import mindspore.context as context import mindspore.nn as nn from mindspore import Tensor @@ -35,6 +36,9 @@ class Net(nn.Cell): return out +@pytest.mark.level0 +@pytest.mark.platform_x86_cpu +@pytest.mark.env_onecard def test_net(): gradient = Tensor(np.ones([3, 3, 3]).astype(np.float32)) indices = Tensor([0, 1, 2], mstype.int32) diff --git a/tests/st/ops/cpu/test_sparse_apply_proximal_adagrad_op.py b/tests/st/ops/cpu/test_sparse_apply_proximal_adagrad_op.py index c2a129a86cb..5d52e718964 100644 --- a/tests/st/ops/cpu/test_sparse_apply_proximal_adagrad_op.py +++ b/tests/st/ops/cpu/test_sparse_apply_proximal_adagrad_op.py @@ -14,6 +14,7 @@ # ============================================================================ import numpy as np +import pytest import mindspore.context as context import mindspore.nn as nn from mindspore import Tensor @@ -37,6 +38,9 @@ class Net(nn.Cell): return out +@pytest.mark.level0 +@pytest.mark.platform_x86_cpu +@pytest.mark.env_onecard def test_net(): gradient = Tensor(np.ones([3, 3, 3]).astype(np.float32)) indices = Tensor([0, 1, 2], mstype.int32) diff --git a/tests/ut/cpp/pre_activate/ascend/format_type/remove_internal_output_test.cc b/tests/ut/cpp/pre_activate/ascend/format_type/remove_internal_output_test.cc new file mode 100644 index 00000000000..72b7c6e3614 --- /dev/null +++ b/tests/ut/cpp/pre_activate/ascend/format_type/remove_internal_output_test.cc @@ -0,0 +1,174 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "common/backend_common_test.h" +#include "debug/anf_ir_dump.h" +#include "common/py_func_graph_fetcher.h" +#include "backend/optimizer/ascend/format_type/remove_internal_output.h" + +#define private public +#define protected public +#include "backend/optimizer/ascend/format_type/insert_trans_op.h" +#undef private +#undef protected + +namespace mindspore { +namespace opt { +using KernelBuildInfoBuilder = kernel::KernelBuildInfo::KernelBuildInfoBuilder; + +class TestHWRemoveInternalOutput : public BackendCommon { + public: + TestHWRemoveInternalOutput() : getPyFun_("gtest_input.pre_activate.remove_internal_output_test", true) {} + ~TestHWRemoveInternalOutput() override = default; + + AnfNodePtr GetMakeTuple(const KernelGraphPtr &kg) { + auto ret = kg->get_return(); + MS_EXCEPTION_IF_NULL(ret); + auto make_tuple = ret->input(1); + return make_tuple; + } + + KernelGraphPtr GetSingleOutputGraph(const std::string &func_name, const std::string &sub_func_name) { + FuncGraphPtr g = getPyFun_.CallAndParseRet(func_name, sub_func_name); + std::vector shp{2, 32, 224, 224}; + auto x_abstract = std::make_shared(kFloat32, shp); + AbstractBasePtrList args_spec_list{x_abstract, x_abstract}; + auto kg = GetKernelGraph(g, args_spec_list); + auto make_tuple = GetMakeTuple(kg); + auto add = make_tuple->cast()->input(1); + MS_EXCEPTION_IF_NULL(add); + kg->AddInternalOutput(add, add); + KernelBuildInfoBuilder builder; + builder.SetInputsFormat({kOpFormat_DEFAULT, kOpFormat_DEFAULT}); + builder.SetInputsDeviceType({kFloat32->type_id(), kFloat32->type_id()}); + builder.SetOutputsFormat({kOpFormat_NC1HWC0}); + builder.SetOutputsDeviceType({kFloat16->type_id()}); + add->set_kernel_info(std::make_shared()); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), add.get()); + return kg; + } + + KernelGraphPtr GetMutilpleOutputGraph(const std::string &func_name, const std::string &sub_func_name) { + FuncGraphPtr g = getPyFun_.CallAndParseRet(func_name, sub_func_name); + std::vector shp{2, 32, 224, 224}; + auto x_abstract = std::make_shared(kFloat32, shp); + AbstractBasePtrList args_spec_list{x_abstract}; + auto kg = GetKernelGraph(g, args_spec_list); + auto output_make_tuple = GetMakeTuple(kg); + auto make_tuple = output_make_tuple->cast()->input(1); + MS_EXCEPTION_IF_NULL(make_tuple); + auto tuple_getitem1 = make_tuple->cast()->input(1); + MS_EXCEPTION_IF_NULL(tuple_getitem1); + auto tuple_getitem2 = make_tuple->cast()->input(2); + MS_EXCEPTION_IF_NULL(tuple_getitem2); + auto max_pool = tuple_getitem1->cast()->input(1); + MS_EXCEPTION_IF_NULL(max_pool); + kg->AddInternalOutput(tuple_getitem1, max_pool); + kg->AddInternalOutput(tuple_getitem2, max_pool); + KernelBuildInfoBuilder builder; + builder.SetInputsFormat({kOpFormat_DEFAULT}); + builder.SetInputsDeviceType({kFloat32->type_id()}); + builder.SetOutputsFormat({kOpFormat_NC1HWC0, kOpFormat_NC1HWC0}); + builder.SetOutputsDeviceType({kFloat16->type_id(), kFloat16->type_id()}); + max_pool->set_kernel_info(std::make_shared()); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), max_pool.get()); + return kg; + } + UT::PyFuncGraphFetcher getPyFun_; +}; + +class MockRemoveInternalOutputTransOpKernelSelect : public KernelSelect { + public: + MockRemoveInternalOutputTransOpKernelSelect() = default; + ~MockRemoveInternalOutputTransOpKernelSelect() override = default; + void SelectKernel(const CNodePtr &cnode) override { + KernelBuildInfoBuilder builder; + builder.SetInputsFormat({kOpFormat_NC1HWC0}); + builder.SetInputsDeviceType({kFloat16->type_id()}); + 
builder.SetOutputsFormat({kOpFormat_DEFAULT}); + builder.SetOutputsDeviceType({kFloat32->type_id()}); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), cnode.get()); + } +}; + +TEST_F(TestHWRemoveInternalOutput, test_remove_internal_output_trans_op_for_single_output) { + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + ms_context->set_execution_mode(kGraphMode); + auto kg = GetSingleOutputGraph("test_remove_internal_output_trans_op_for_single_output", "before"); + // insert trans op for output + auto graph_optimizer = std::make_shared(); + auto pass_manager = std::make_shared(); + auto insert_trans_op_pass = std::make_shared(); + insert_trans_op_pass->kernel_select_ = std::make_shared(); + pass_manager->AddPass(insert_trans_op_pass); + graph_optimizer->AddPassManager(pass_manager); + auto new_g = graph_optimizer->Optimize(kg); + FuncGraphPtr g_after = + getPyFun_.CallAndParseRet("test_remove_internal_output_trans_op_for_single_output", "after_insert_trans_op"); + EXPECT_TRUE(CheckEqualGraph(g_after, new_g)); + + auto make_tuple = GetMakeTuple(kg); + auto trans_data = make_tuple->cast()->input(1); + EXPECT_TRUE(kg->IsInternalOutput(trans_data)); + + // remove trans op for internal output + auto graph_optimizer1 = std::make_shared(); + auto pass_manager1 = std::make_shared(); + auto remove_internal_output_trans_op_pass = std::make_shared(); + pass_manager1->AddPass(remove_internal_output_trans_op_pass); + graph_optimizer1->AddPassManager(pass_manager1); + auto new_g1 = graph_optimizer1->Optimize(new_g); + FuncGraphPtr g_after1 = getPyFun_.CallAndParseRet("test_remove_internal_output_trans_op_for_single_output", + "after_remove_internal_output_trans_op"); + EXPECT_TRUE(CheckEqualGraph(g_after1, new_g1)); +} + +TEST_F(TestHWRemoveInternalOutput, test_remove_internal_output_trans_op_for_multiple_output) { + auto kg = GetMutilpleOutputGraph("test_remove_internal_output_trans_op_for_multiple_output", "before"); + // insert trans op for output + auto graph_optimizer = std::make_shared(); + auto pass_manager = std::make_shared(); + auto insert_trans_op_pass = std::make_shared(); + insert_trans_op_pass->kernel_select_ = std::make_shared(); + pass_manager->AddPass(insert_trans_op_pass); + graph_optimizer->AddPassManager(pass_manager); + auto new_g = graph_optimizer->Optimize(kg); + FuncGraphPtr g_after = + getPyFun_.CallAndParseRet("test_remove_internal_output_trans_op_for_multiple_output", "after_insert_trans_op"); + EXPECT_TRUE(CheckEqualGraph(g_after, new_g)); + + auto output_make_tuple = GetMakeTuple(kg); + auto make_tuple = output_make_tuple->cast()->input(1); + auto tuple_getitem = make_tuple->cast()->input(1); + auto make_tuple1 = tuple_getitem->cast()->input(1); + auto trans_data1 = make_tuple1->cast()->input(1); + auto trans_data2 = make_tuple1->cast()->input(2); + EXPECT_TRUE(kg->IsInternalOutput(trans_data1)); + EXPECT_TRUE(kg->IsInternalOutput(trans_data2)); + + // remove trans op for internal output + auto graph_optimizer1 = std::make_shared(); + auto pass_manager1 = std::make_shared(); + auto remove_internal_output_trans_op_pass = std::make_shared(); + pass_manager1->AddPass(remove_internal_output_trans_op_pass); + graph_optimizer1->AddPassManager(pass_manager1); + auto new_g1 = graph_optimizer1->Optimize(new_g); + FuncGraphPtr g_after1 = getPyFun_.CallAndParseRet("test_remove_internal_output_trans_op_for_multiple_output", + "after_remove_internal_output_trans_op"); + EXPECT_TRUE(CheckEqualGraph(g_after1, new_g1)); +} +} // namespace opt +} // 
namespace mindspore diff --git a/tests/ut/cpp/python_input/gtest_input/pre_activate/remove_internal_output_test.py b/tests/ut/cpp/python_input/gtest_input/pre_activate/remove_internal_output_test.py new file mode 100644 index 00000000000..0c02864816c --- /dev/null +++ b/tests/ut/cpp/python_input/gtest_input/pre_activate/remove_internal_output_test.py @@ -0,0 +1,83 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +from mindspore.ops import Primitive +from mindspore.ops import operations as P + +tuple_getitem = Primitive('tuple_getitem') +add = P.TensorAdd() +max_pool = P.MaxPoolWithArgmax(padding="same", ksize=3, strides=2) +make_tuple = Primitive('make_tuple') +trans_data = Primitive("TransData") + + +class FnDict: + def __init__(self): + self.fnDict = {} + + def __call__(self, fn): + self.fnDict[fn.__name__] = fn + + def __getitem__(self, name): + return self.fnDict[name] + + +def test_remove_internal_output_trans_op_for_single_output(tag): + fns = FnDict() + + @fns + def before(x, y): + res = add(x, y) + return res + + @fns + def after_insert_trans_op(x, y): + output = add(x, y) + res = trans_data(output) + return make_tuple(res) + + @fns + def after_remove_internal_output_trans_op(x, y): + res = add(x, y) + return make_tuple(res) + + return fns[tag] + + +def test_remove_internal_output_trans_op_for_multiple_output(tag): + fns = FnDict() + + @fns + def before(x): + max_pool_res = max_pool(x) + res = make_tuple(tuple_getitem(max_pool_res, 0), tuple_getitem(max_pool_res, 1)) + return res + + @fns + def after_insert_trans_op(x): + output = max_pool(x) + trans_data0 = trans_data(tuple_getitem(output, 0)) + trans_data1 = trans_data(tuple_getitem(output, 1)) + new_make_tuple = make_tuple(trans_data0, trans_data1) + res = make_tuple(tuple_getitem(new_make_tuple, 0), tuple_getitem(new_make_tuple, 1)) + return make_tuple(res) + + @fns + def after_remove_internal_output_trans_op(x): + output = max_pool(x) + new_make_tuple = make_tuple(tuple_getitem(output, 0), tuple_getitem(output, 1)) + res = make_tuple(tuple_getitem(new_make_tuple, 0), tuple_getitem(new_make_tuple, 1)) + return make_tuple(res) + + return fns[tag] From e2ea1fa0dfbc78ee00b6da9954a43230772ed695 Mon Sep 17 00:00:00 2001 From: liyong Date: Thu, 16 Jul 2020 19:07:29 +0800 Subject: [PATCH 61/68] activate num_samples in distributed samplers --- .../minddata/dataset/api/python_bindings.cc | 2 +- .../include/shard_distributed_sample.h | 5 +- .../mindrecord/include/shard_sample.h | 2 +- .../meta/shard_distributed_sample.cc | 9 +-- .../minddata/mindrecord/meta/shard_sample.cc | 10 ++- mindspore/dataset/engine/samplers.py | 4 +- tests/ut/python/dataset/test_minddataset.py | 66 +++++++++++++++++++ .../dataset/test_minddataset_exception.py | 21 ++++++ 8 files changed, 108 insertions(+), 11 deletions(-) diff --git a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc 
b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc index 08016ee0613..94c4ec40d70 100644 --- a/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc +++ b/mindspore/ccsrc/minddata/dataset/api/python_bindings.cc @@ -784,7 +784,7 @@ void bindSamplerOps(py::module *m) { (void)py::class_>(*m, "MindrecordDistributedSampler") - .def(py::init()); + .def(py::init()); (void)py::class_>( *m, "MindrecordRandomSampler") diff --git a/mindspore/ccsrc/minddata/mindrecord/include/shard_distributed_sample.h b/mindspore/ccsrc/minddata/mindrecord/include/shard_distributed_sample.h index f166ec1e6c6..9244c16f9f5 100644 --- a/mindspore/ccsrc/minddata/mindrecord/include/shard_distributed_sample.h +++ b/mindspore/ccsrc/minddata/mindrecord/include/shard_distributed_sample.h @@ -29,9 +29,10 @@ namespace mindspore { namespace mindrecord { class ShardDistributedSample : public ShardSample { public: - ShardDistributedSample(int num_shards, int shard_id, int no_of_padded_samples, bool shuffle, uint32_t seed); + ShardDistributedSample(int num_shards, int shard_id, int no_of_padded_samples, bool shuffle, uint32_t seed, + int no_of_samples = 0); - ShardDistributedSample(int num_shards, int shard_id, bool shuffle, uint32_t seed); + ShardDistributedSample(int num_shards, int shard_id, bool shuffle, uint32_t seed, int no_of_samples = 0); void SetNumPaddedSamples(int no_of_padded_samples) { no_of_padded_samples_ = no_of_padded_samples; } diff --git a/mindspore/ccsrc/minddata/mindrecord/include/shard_sample.h b/mindspore/ccsrc/minddata/mindrecord/include/shard_sample.h index ce813bc4bf4..c3d695e8e8c 100644 --- a/mindspore/ccsrc/minddata/mindrecord/include/shard_sample.h +++ b/mindspore/ccsrc/minddata/mindrecord/include/shard_sample.h @@ -32,7 +32,7 @@ class ShardSample : public ShardOperator { ShardSample(int num, int den); - ShardSample(int num, int den, int par); + ShardSample(int num, int den, int par, int no_of_samples = 0); ShardSample(const std::vector &indices, uint32_t seed); diff --git a/mindspore/ccsrc/minddata/mindrecord/meta/shard_distributed_sample.cc b/mindspore/ccsrc/minddata/mindrecord/meta/shard_distributed_sample.cc index 4c7abbb4b48..6bc1c1408d4 100644 --- a/mindspore/ccsrc/minddata/mindrecord/meta/shard_distributed_sample.cc +++ b/mindspore/ccsrc/minddata/mindrecord/meta/shard_distributed_sample.cc @@ -23,16 +23,17 @@ using mindspore::MsLogLevel::ERROR; namespace mindspore { namespace mindrecord { ShardDistributedSample::ShardDistributedSample(int num_shards, int shard_id, int no_of_padded_samples, bool shuffle, - uint32_t seed) - : ShardSample(1, num_shards, shard_id), + uint32_t seed, int no_of_samples) + : ShardSample(1, num_shards, shard_id, no_of_samples), shuffle_(shuffle), no_of_padded_samples_(no_of_padded_samples), first_epoch_(true) { shuffle_op_ = std::make_shared(seed, kShuffleSample); } -ShardDistributedSample::ShardDistributedSample(int num_shards, int shard_id, bool shuffle, uint32_t seed) - : ShardDistributedSample(num_shards, shard_id, 0, shuffle, seed) {} +ShardDistributedSample::ShardDistributedSample(int num_shards, int shard_id, bool shuffle, uint32_t seed, + int no_of_samples) + : ShardDistributedSample(num_shards, shard_id, 0, shuffle, seed, no_of_samples) {} int64_t ShardDistributedSample::GetNumSamples(int64_t dataset_size, int64_t num_classes) { if (no_of_padded_samples_ <= 0) { diff --git a/mindspore/ccsrc/minddata/mindrecord/meta/shard_sample.cc b/mindspore/ccsrc/minddata/mindrecord/meta/shard_sample.cc index 808ab55bfbe..b8be83735b7 100644 --- 
a/mindspore/ccsrc/minddata/mindrecord/meta/shard_sample.cc +++ b/mindspore/ccsrc/minddata/mindrecord/meta/shard_sample.cc @@ -38,11 +38,11 @@ ShardSample::ShardSample(int num, int den) indices_({}), sampler_type_(kCustomTopPercentSampler) {} -ShardSample::ShardSample(int num, int den, int par) +ShardSample::ShardSample(int num, int den, int par, int no_of_samples) : numerator_(num), denominator_(den), partition_id_(par), - no_of_samples_(0), + no_of_samples_(no_of_samples), indices_({}), sampler_type_(kCustomTopPercentSampler) {} @@ -110,8 +110,11 @@ MSRStatus ShardSample::Execute(ShardTask &tasks) { new_tasks.InsertTask(tasks.GetTaskByID(index)); // different mod result between c and python } } else { + int count = 0; for (int i = partition_id_ * taking; i < (partition_id_ + 1) * taking; i++) { + if (no_of_samples_ != 0 && count == no_of_samples_) break; new_tasks.InsertTask(tasks.GetTaskByID(i % total_no)); // rounding up. if overflow, go back to start + count++; } } std::swap(tasks, new_tasks); @@ -121,8 +124,11 @@ MSRStatus ShardSample::Execute(ShardTask &tasks) { return FAILED; } total_no = static_cast(tasks.permutation_.size()); + int count = 0; for (size_t i = partition_id_ * taking; i < (partition_id_ + 1) * taking; i++) { + if (no_of_samples_ != 0 && count == no_of_samples_) break; new_tasks.InsertTask(tasks.GetTaskByID(tasks.permutation_[i % total_no])); + count++; } std::swap(tasks, new_tasks); } diff --git a/mindspore/dataset/engine/samplers.py b/mindspore/dataset/engine/samplers.py index b74874f9cf3..22c0e44d0d4 100644 --- a/mindspore/dataset/engine/samplers.py +++ b/mindspore/dataset/engine/samplers.py @@ -270,7 +270,9 @@ class DistributedSampler(BuiltinSampler): return c_sampler def create_for_minddataset(self): - c_sampler = cde.MindrecordDistributedSampler(self.num_shards, self.shard_id, self.shuffle, self.seed) + num_samples = self.num_samples if self.num_samples is not None else 0 + c_sampler = cde.MindrecordDistributedSampler(self.num_shards, self.shard_id, self.shuffle, + self.seed, num_samples) c_child_sampler = self.create_child_for_minddataset() c_sampler.add_child(c_child_sampler) return c_sampler diff --git a/tests/ut/python/dataset/test_minddataset.py b/tests/ut/python/dataset/test_minddataset.py index 7d613d414f6..8d22bd6c50f 100644 --- a/tests/ut/python/dataset/test_minddataset.py +++ b/tests/ut/python/dataset/test_minddataset.py @@ -238,6 +238,72 @@ def test_cv_minddataset_partition_tutorial(add_and_remove_cv_file): assert partitions(5) == 2 assert partitions(9) == 2 +def test_cv_minddataset_partition_num_samples_0(add_and_remove_cv_file): + """tutorial for cv minddataset.""" + columns_list = ["data", "file_name", "label"] + num_readers = 4 + + def partitions(num_shards): + for partition_id in range(num_shards): + data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, + num_shards=num_shards, + shard_id=partition_id, num_samples=1) + num_iter = 0 + for item in data_set.create_dict_iterator(): + logger.info("-------------- partition : {} ------------------------".format(partition_id)) + logger.info("-------------- item[file_name]: {}-----------------------".format(item["file_name"])) + logger.info("-------------- item[label]: {} -----------------------".format(item["label"])) + num_iter += 1 + return num_iter + + assert partitions(4) == 1 + assert partitions(5) == 1 + assert partitions(9) == 1 + +def test_cv_minddataset_partition_num_samples_1(add_and_remove_cv_file): + """tutorial for cv minddataset.""" + columns_list = ["data", "file_name", 
"label"] + num_readers = 4 + + def partitions(num_shards): + for partition_id in range(num_shards): + data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, + num_shards=num_shards, + shard_id=partition_id, num_samples=2) + num_iter = 0 + for item in data_set.create_dict_iterator(): + logger.info("-------------- partition : {} ------------------------".format(partition_id)) + logger.info("-------------- item[file_name]: {}-----------------------".format(item["file_name"])) + logger.info("-------------- item[label]: {} -----------------------".format(item["label"])) + num_iter += 1 + return num_iter + + assert partitions(4) == 2 + assert partitions(5) == 2 + assert partitions(9) == 2 + +def test_cv_minddataset_partition_num_samples_2(add_and_remove_cv_file): + """tutorial for cv minddataset.""" + columns_list = ["data", "file_name", "label"] + num_readers = 4 + + def partitions(num_shards): + for partition_id in range(num_shards): + data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, + num_shards=num_shards, + shard_id=partition_id, num_samples=3) + num_iter = 0 + for item in data_set.create_dict_iterator(): + logger.info("-------------- partition : {} ------------------------".format(partition_id)) + logger.info("-------------- item[file_name]: {}-----------------------".format(item["file_name"])) + logger.info("-------------- item[label]: {} -----------------------".format(item["label"])) + num_iter += 1 + return num_iter + + assert partitions(4) == 3 + assert partitions(5) == 2 + assert partitions(9) == 2 + def test_cv_minddataset_partition_tutorial_check_shuffle_result(add_and_remove_cv_file): """tutorial for cv minddataset.""" diff --git a/tests/ut/python/dataset/test_minddataset_exception.py b/tests/ut/python/dataset/test_minddataset_exception.py index 0b4d0dfc8fe..0bfb7a03427 100644 --- a/tests/ut/python/dataset/test_minddataset_exception.py +++ b/tests/ut/python/dataset/test_minddataset_exception.py @@ -228,3 +228,24 @@ def test_minddataset_shard_id_bigger_than_num_shard(): os.remove(CV_FILE_NAME) os.remove("{}.db".format(CV_FILE_NAME)) + +def test_cv_minddataset_partition_num_samples_equals_0(): + """tutorial for cv minddataset.""" + create_cv_mindrecord(1) + columns_list = ["data", "label"] + num_readers = 4 + + def partitions(num_shards): + for partition_id in range(num_shards): + data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers, + num_shards=num_shards, + shard_id=partition_id, num_samples=0) + num_iter = 0 + for _ in data_set.create_dict_iterator(): + num_iter += 1 + with pytest.raises(Exception) as error_info: + partitions(5) + assert 'num_samples should be a positive integer value, but got num_samples=0' in str(error_info) + + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) From 23cc01f21d23b5a5fc885de9d4bb8e2c9685078f Mon Sep 17 00:00:00 2001 From: zhoufeng Date: Fri, 17 Jul 2020 16:09:02 +0800 Subject: [PATCH 62/68] dumpir can dump subgraphs of ascend kernel graph Signed-off-by: zhoufeng --- .../backend/session/ascend_control_parser.cc | 29 +++++++++++++++++-- .../backend/session/ascend_control_parser.h | 3 +- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/mindspore/ccsrc/backend/session/ascend_control_parser.cc b/mindspore/ccsrc/backend/session/ascend_control_parser.cc index 4c6c7ab9cf4..4306a3d7855 100644 --- a/mindspore/ccsrc/backend/session/ascend_control_parser.cc +++ b/mindspore/ccsrc/backend/session/ascend_control_parser.cc @@ -107,7 +107,7 @@ static void ReuseParameter(NotNull 
root_kg, static CNodePtr GetNextRealKernel(const std::vector &list, size_t start) { for (size_t i = start; i < list.size() - 1; ++i) { - if (!IsPrimitiveCNode(list[i], prim::kPrimPartial) && AnfAlgo::IsRealKernel(list[i])) { + if (AnfAlgo::IsRealKernel(list[i])) { return list[i]; } } @@ -171,18 +171,43 @@ static void EraseNodeFromExecOrder(const AnfNodePtr &node, const NotNullerase(exec_iter); } +void AscendControlParser::AttachChildGraphToReturnNode(NotNull graph, + const NotNull *> memo) { + if (memo->find(graph) != memo->end()) { + return; + } + memo->insert(graph.get()); + const std::vector> &child_graph_order = graph->child_graph_order(); + if (child_graph_order.empty()) { + return; + } + + std::vector depend_inputs = {NewValueNode(std::make_shared(prim::kPrimPartial->name()))}; + for (auto &cg : child_graph_order) { + MS_EXCEPTION_IF_NULL(cg); + auto fg = cg->cast(); + MS_EXCEPTION_IF_NULL(fg); + depend_inputs.emplace_back(NewValueNode(fg)); + AttachChildGraphToReturnNode(NOT_NULL(cg), memo); + } + auto child_graphs = graph->NewCNode(depend_inputs); + InsertDependToGraph(graph, NOT_NULL(child_graphs)); +} + void AscendControlParser::LinkGraph(NotNull kg) { std::set memo; std::vector> link_list; // Insert Assign ChildGraphDataAssign(kg, NOT_NULL(&link_list), NOT_NULL(&memo)); + memo.clear(); // Reuse Parameter ReuseParameter(kg, link_list); // replace call by label goto / label switch - memo.clear(); (void)ProcessKernelGraph(kg, nullptr, nullptr, NOT_NULL(&memo)); + memo.clear(); // assign label resource device::ascend::AscendLabelAssign::GetInstance().AssignLabel(kg); + AttachChildGraphToReturnNode(kg, NOT_NULL(&memo)); } void AscendControlParser::EraseParameter(NotNull root_graph, diff --git a/mindspore/ccsrc/backend/session/ascend_control_parser.h b/mindspore/ccsrc/backend/session/ascend_control_parser.h index ac247351390..4e62629d985 100644 --- a/mindspore/ccsrc/backend/session/ascend_control_parser.h +++ b/mindspore/ccsrc/backend/session/ascend_control_parser.h @@ -66,7 +66,8 @@ class AscendControlParser { static AnfNodePtr InsertAssignToGraph(NotNull kg, NotNull from, NotNull to); static std::vector>> ParseCallNode(NotNull call_node); static std::tuple> ParsePartial(NotNull node); - + static void AttachChildGraphToReturnNode(NotNull graph, + const NotNull *> memo); // root graph order static bool CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cnode, NotNull graph); From a596dd6e433a2bdb4a99f5c41bb0375971095568 Mon Sep 17 00:00:00 2001 From: limingqi107 Date: Fri, 17 Jul 2020 18:08:20 +0800 Subject: [PATCH 63/68] gpu fix the graph of 'nop node + depend + node' --- .../backend/optimizer/mem_reuse/mem_reuse.h | 1 + .../runtime/device/gpu/gpu_kernel_runtime.cc | 58 +++++++++++++------ .../runtime/device/gpu/gpu_kernel_runtime.h | 4 +- 3 files changed, 42 insertions(+), 21 deletions(-) diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse.h b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse.h index b286bcbc2c6..011b20c4abf 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse.h +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse.h @@ -83,6 +83,7 @@ class MemReuseUtil { void set_mem_base(uint8_t *mem_base) { mem_base_ = mem_base; } uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const; uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const; + bool is_all_nop_node() const { return is_all_nop_node_; } private: int util_index_; diff --git 
a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index ddf73841b77..185df37e4df 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -160,6 +160,12 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) { } mem_swap_manager_ = iter->second; MS_EXCEPTION_IF_NULL(mem_swap_manager_); + auto mem_reuse_iter = mem_reuse_util_map_.find(graph_id); + if (mem_reuse_iter == mem_reuse_util_map_.end()) { + MS_LOG(EXCEPTION) << "Find memory reuse map failed."; + } + mem_reuse_util_ = mem_reuse_iter->second; + MS_EXCEPTION_IF_NULL(mem_reuse_util_); while (!LaunchKernelDynamic(graph)) { MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment."; if (!UpdateMemorySwapInfo(graph)) { @@ -246,18 +252,11 @@ void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *grap bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(graph); - auto graph_id = graph->graph_id(); - auto iter = mem_reuse_util_map_.find(graph_id); - if (iter == mem_reuse_util_map_.end()) { - MS_LOG(EXCEPTION) << "Find memory reuse map failed."; - } - auto mem_reuse_util_ptr = iter->second; - MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr); + MS_EXCEPTION_IF_NULL(mem_reuse_util_); // Reset the reference count. - mem_reuse_util_ptr->ResetDynamicUsedRefCount(); + mem_reuse_util_->ResetDynamicUsedRefCount(); // The inputs and outputs memory of communication kernel need be continuous, so separate processing. AllocCommunicationOpDynamicRes(graph); - auto &kernels = graph->execution_order(); for (const auto &kernel : kernels) { auto kernel_mod = AnfAlgo::GetKernelMod(kernel); @@ -272,7 +271,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph) { if (!kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_)) { MS_LOG(EXCEPTION) << "Launch kernel failed."; } - FreeKernelDynamicRes(kernel, kernel_workspaces, graph_id); + FreeKernelDynamicRes(kernel, kernel_workspaces); UpdateMemorySwapTask(kernel); } CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); @@ -450,9 +449,16 @@ bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs) { MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(kernel_inputs); + MS_EXCEPTION_IF_NULL(mem_reuse_util_); for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { - // Graph may be all nop nodes and not remove nop node, so this can not skip nop node. - auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); + DeviceAddressPtr device_address; + if (mem_reuse_util_->is_all_nop_node()) { + // Graph may be all nop nodes and not remove nop node, so this can not skip nop node. + device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); + } else { + // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. 
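// Note (illustration, not part of the patch): every input-address lookup touched in
// this file follows the same rule, sketched here as a hypothetical helper -- skip
// nop nodes whenever the graph still contains real kernels, so that a
// "nop node + depend + node" chain resolves to the real producer's device address.
// DeviceAddressPtr GetInputDeviceAddress(const AnfNodePtr &kernel, size_t input_idx,
//                                        const MemReuseUtilPtr &mem_reuse_util) {
//   bool skip_nop_node = !mem_reuse_util->is_all_nop_node();
//   return AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, input_idx, skip_nop_node);
// }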
+ device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); + } MS_EXCEPTION_IF_NULL(device_address); UpdateHostSwapQueue(device_address); MS_EXCEPTION_IF_NULL(device_address->ptr_); @@ -525,13 +531,21 @@ void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) { MS_EXCEPTION_IF_NULL(kernel); + MS_EXCEPTION_IF_NULL(mem_reuse_util_); bool is_need_alloc_memory = false; bool is_need_free_memory = false; size_t total_size = 0; std::vector size_list; DeviceAddressPtrList addr_list; for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { - auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); + DeviceAddressPtr device_address; + if (mem_reuse_util_->is_all_nop_node()) { + // Graph may be all nop nodes and not remove nop node, so this can not skip nop node. + device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); + } else { + // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. + device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); + } MS_EXCEPTION_IF_NULL(device_address); if (device_address->ptr_ == nullptr) { is_need_alloc_memory = true; @@ -593,11 +607,10 @@ void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, boo } void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, - const AddressPtrList &kernel_workspaces, uint32_t graph_id) { + const AddressPtrList &kernel_workspaces) { MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(mem_manager_); - auto mem_reuse_util_ptr = mem_reuse_util_map_[graph_id]; - MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr); + MS_EXCEPTION_IF_NULL(mem_reuse_util_); auto cnode = kernel->cast(); MS_EXCEPTION_IF_NULL(cnode); if (AnfAlgo::IsCommunicationOp(kernel)) { @@ -605,7 +618,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, } // Free the input of kernel by reference count. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { - auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetKernelInputRef(cnode, i); + auto kernel_ref_count_ptr = mem_reuse_util_->GetKernelInputRef(cnode, i); if (kernel_ref_count_ptr == nullptr) { continue; } @@ -614,14 +627,21 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, MS_LOG(EXCEPTION) << "Check dynamic reference count failed."; } if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) { - auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); + DeviceAddressPtr device_address; + if (mem_reuse_util_->is_all_nop_node()) { + // Graph may be all nop nodes and not remove nop node, so this can not skip nop node. + device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); + } else { + // Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. + device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); + } mem_manager_->FreeMemFromMemPool(device_address); device_address->set_status(DeviceAddressStatus::kInDevice); } } // Free the output of kernel, if output has no reference. 
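// Note (illustration, not part of the patch): the input-freeing loop above and the
// output-freeing loop below rely on the same contract -- a buffer goes back to the
// memory pool exactly when its dynamic reference count reaches zero. Condensed
// (ordering of the decrement and the free is my reading of the surrounding code):
//   kernel_ref_count_ptr->ref_count_dynamic_use_--;
//   if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
//     mem_manager_->FreeMemFromMemPool(device_address);
//     device_address->set_status(DeviceAddressStatus::kInDevice);
//   }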
for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(kernel); ++i) { - auto kernel_ref_count_ptr = mem_reuse_util_ptr->GetRef(cnode, i); + auto kernel_ref_count_ptr = mem_reuse_util_->GetRef(cnode, i); if (kernel_ref_count_ptr == nullptr) { continue; } diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h index 2b1f8198ce1..e1ba3458661 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h @@ -72,8 +72,7 @@ class GPUKernelRuntime : public KernelRuntime { void AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory, const DeviceAddressPtrList addr_list, size_t total_size, std::vector size_list); - void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, const AddressPtrList &kernel_workspaces, - uint32_t graph_id); + void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, const AddressPtrList &kernel_workspaces); bool AddMemorySwapTask(const AnfNodePtr &kernel); bool UpdateMemorySwapInfo(const session::KernelGraph *graph); bool UpdateMemorySwapTask(const AnfNodePtr &kernel); @@ -82,6 +81,7 @@ class GPUKernelRuntime : public KernelRuntime { void ClearSwapQueue(); std::unordered_map mem_reuse_util_map_; std::unordered_map mem_swap_map_; + MemReuseUtilPtr mem_reuse_util_{nullptr}; MemSwapManagerPtr mem_swap_manager_{nullptr}; }; MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime); From 5a10383cc32476973076fb915673f19bbf42cc24 Mon Sep 17 00:00:00 2001 From: panyifeng Date: Wed, 15 Jul 2020 11:43:31 +0800 Subject: [PATCH 64/68] add coo_tensor --- mindspore/_extends/parse/resources.py | 3 +- mindspore/ccsrc/debug/dump_proto.cc | 2 + .../frontend/operator/composite/composite.cc | 12 ++ .../operator/composite/multitype_funcgraph.cc | 36 ------ mindspore/ccsrc/frontend/operator/ops.cc | 7 +- mindspore/ccsrc/frontend/operator/ops.h | 7 +- .../ccsrc/frontend/operator/prim_others.cc | 115 ++++++++++++++++-- .../ccsrc/frontend/optimizer/ad/kprim.cc | 2 +- mindspore/ccsrc/frontend/optimizer/irpass.cc | 6 + mindspore/ccsrc/frontend/optimizer/irpass.h | 3 + .../irpass/sparse_tensor_eliminate.h | 75 ++++++++++++ mindspore/ccsrc/pipeline/jit/pass.cc | 1 + mindspore/ccsrc/pipeline/jit/resource.cc | 6 + .../pipeline/jit/static_analysis/prim.cc | 6 +- .../ccsrc/pipeline/jit/static_analysis/prim.h | 10 +- mindspore/ccsrc/pipeline/jit/validator.cc | 3 +- mindspore/common/__init__.py | 4 +- mindspore/common/tensor.py | 6 +- mindspore/core/abstract/abstract_value.cc | 59 +++++++++ mindspore/core/abstract/abstract_value.h | 33 ++++- mindspore/core/abstract/param_validator.h | 1 + mindspore/core/ir/dtype.cc | 42 +++++++ mindspore/core/ir/dtype.h | 23 ++++ mindspore/core/ir/dtype/type.cc | 2 + mindspore/core/ir/dtype/type_id.h | 1 + mindspore/core/ir/dtype_extends.cc | 20 +++ mindspore/core/ir/dtype_py.cc | 2 + mindspore/core/ir/meta_func_graph.cc | 40 ++++++ mindspore/core/ir/meta_func_graph.h | 1 + mindspore/core/ir/param_value.h | 8 -- mindspore/core/ir/param_value_py.cc | 8 +- mindspore/ops/functional.py | 4 + tests/ut/cpp/optimizer/lib_test.cc | 13 ++ .../gtest_input/optimizer/opt_test.py | 35 ++++++ tests/ut/python/ir/test_indexed_slices.py | 71 ++++++++++- tests/ut/python/ir/test_sparse_tensor.py | 61 ++++++++++ 36 files changed, 652 insertions(+), 76 deletions(-) create mode 100644 mindspore/ccsrc/frontend/optimizer/irpass/sparse_tensor_eliminate.h create mode 100644 tests/ut/python/ir/test_sparse_tensor.py diff --git 
a/mindspore/_extends/parse/resources.py b/mindspore/_extends/parse/resources.py index e60b70eface..e2b83331f52 100644 --- a/mindspore/_extends/parse/resources.py +++ b/mindspore/_extends/parse/resources.py @@ -17,7 +17,7 @@ """Resources for ast tree parse.""" import ast import math -from mindspore import IndexedSlices +from mindspore import IndexedSlices, SparseTensor from mindspore.ops.composite import multitype_ops from mindspore.ops import functional as F, composite as C from . import standard_method as M @@ -140,4 +140,5 @@ convert_object_map = { # user defined IndexedSlices: F.make_indexed_slices, + SparseTensor: F.make_sparse_tensor, } diff --git a/mindspore/ccsrc/debug/dump_proto.cc b/mindspore/ccsrc/debug/dump_proto.cc index 35cdfafe26e..9172d11471a 100644 --- a/mindspore/ccsrc/debug/dump_proto.cc +++ b/mindspore/ccsrc/debug/dump_proto.cc @@ -124,6 +124,8 @@ void ProtoExporter::SetNodeOutputType(const TypePtr &type, const BaseShapePtr &s // Do Nothing } else if (type->isa()) { // Do Nothing + } else if (type->isa()) { + // Do Nothing } else if (type->isa()) { TuplePtr tuple_type = dyn_cast(type); type_proto->set_data_type(irpb::DT_TUPLE); diff --git a/mindspore/ccsrc/frontend/operator/composite/composite.cc b/mindspore/ccsrc/frontend/operator/composite/composite.cc index 7d2573e50ab..0586572dd1f 100644 --- a/mindspore/ccsrc/frontend/operator/composite/composite.cc +++ b/mindspore/ccsrc/frontend/operator/composite/composite.cc @@ -803,6 +803,18 @@ FuncGraphPtr TupleAdd::GenerateFuncGraph(const AbstractBasePtrList &args_spec_li abstract::AbstractTuplePtr a_tuple = dyn_cast(abs_a); abstract::AbstractTuplePtr b_tuple = dyn_cast(abs_b); if (a_tuple == nullptr || b_tuple == nullptr) { + TypePtrList types; + (void)std::transform(args_spec_list.begin(), args_spec_list.end(), std::back_inserter(types), + [](const AbstractBasePtr &arg) -> TypePtr { + MS_EXCEPTION_IF_NULL(arg); + return arg->BuildType(); + }); + auto stub = GenerateStubFunc(types); + if (stub != nullptr) { + MS_LOG(DEBUG) << "GenerateStubFunc for TupleAdd " + << ", function: " << stub->ToString(); + return stub; + } MS_LOG(EXCEPTION) << "TupleAdd argument should be tuple,but " << args_spec_list[0]->ToString() << ", " << args_spec_list[1]->ToString(); } diff --git a/mindspore/ccsrc/frontend/operator/composite/multitype_funcgraph.cc b/mindspore/ccsrc/frontend/operator/composite/multitype_funcgraph.cc index ba0d3d9ebb8..16aa6f654bd 100644 --- a/mindspore/ccsrc/frontend/operator/composite/multitype_funcgraph.cc +++ b/mindspore/ccsrc/frontend/operator/composite/multitype_funcgraph.cc @@ -119,42 +119,6 @@ const py::function MultitypeFuncGraph::SignMatch(const TypePtrList &types) { return py::none(); } -FuncGraphPtr GenerateStubFunc(const TypePtrList &types) { - auto context = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context); - bool enable_sparse = context->enable_sparse(); - if (!enable_sparse) { - return nullptr; - } - - std::vector parameters; - ParameterPtr undetermined_param = nullptr; - auto stub = std::make_shared(); - for (size_t i = 0; i < types.size(); ++i) { - auto param = stub->add_parameter(); - parameters.push_back(param); - if (types[i]->type_id() == kObjectTypeUndeterminedType) { - undetermined_param = param; - } - } - if (undetermined_param != nullptr) { - std::vector inputs{NewValueNode(prim::kPrimMakeTuple)}; - for (size_t i = 0; i < types.size(); ++i) { - if (types[i]->type_id() == kObjectTypeFunction) { - std::vector call_prim{parameters[i], undetermined_param}; - 
inputs.push_back(stub->NewCNode(call_prim)); - } else { - inputs.push_back(parameters[i]); - } - } - auto stub_output = stub->NewCNode(inputs); - stub->set_output(stub_output); - stub->set_stub(true); - return stub; - } - return nullptr; -} - FuncGraphPtr MultitypeFuncGraph::GenerateFromTypes(const TypePtrList &types) { auto py_fn = SignMatch(types); std::ostringstream buffer; diff --git a/mindspore/ccsrc/frontend/operator/ops.cc b/mindspore/ccsrc/frontend/operator/ops.cc index 5c7672ee3c6..bf3d55678e1 100755 --- a/mindspore/ccsrc/frontend/operator/ops.cc +++ b/mindspore/ccsrc/frontend/operator/ops.cc @@ -283,6 +283,11 @@ const PrimitivePtr kPrimMakeIndexedSlices = std::make_shared("MakeInd const PrimitivePtr kPrimIndexedSlicesGetValues = std::make_shared("IndexedSlicesGetValues"); const PrimitivePtr kPrimIndexedSlicesGetIndices = std::make_shared("IndexedSlicesGetIndices"); const PrimitivePtr kPrimIndexedSlicesGetDenseShape = std::make_shared("IndexedSlicesGetDenseShape"); -const PrimitivePtr kPrimIsIndexedSlices = std::make_shared("IsIndexedSlices"); + +// SparseTensor +const PrimitivePtr kPrimMakeSparseTensor = std::make_shared("MakeSparseTensor"); +const PrimitivePtr kPrimSparseTensorGetValues = std::make_shared("SparseTensorGetValues"); +const PrimitivePtr kPrimSparseTensorGetIndices = std::make_shared("SparseTensorGetIndices"); +const PrimitivePtr kPrimSparseTensorGetDenseShape = std::make_shared("SparseTensorGetDenseShape"); } // namespace prim } // namespace mindspore diff --git a/mindspore/ccsrc/frontend/operator/ops.h b/mindspore/ccsrc/frontend/operator/ops.h index 0dea045a6ea..d57b681ff26 100755 --- a/mindspore/ccsrc/frontend/operator/ops.h +++ b/mindspore/ccsrc/frontend/operator/ops.h @@ -292,7 +292,12 @@ extern const PrimitivePtr kPrimMakeIndexedSlices; extern const PrimitivePtr kPrimIndexedSlicesGetValues; extern const PrimitivePtr kPrimIndexedSlicesGetIndices; extern const PrimitivePtr kPrimIndexedSlicesGetDenseShape; -extern const PrimitivePtr kPrimIsIndexedSlices; + +// SparseTensor +extern const PrimitivePtr kPrimMakeSparseTensor; +extern const PrimitivePtr kPrimSparseTensorGetValues; +extern const PrimitivePtr kPrimSparseTensorGetIndices; +extern const PrimitivePtr kPrimSparseTensorGetDenseShape; // attribute 'unroll_flag' of primitive 'switch', when 'unroll_flag' is '0', 'switch' will not unroll const char SWITCH_UNROLL_FLAG[] = "unroll_flag"; diff --git a/mindspore/ccsrc/frontend/operator/prim_others.cc b/mindspore/ccsrc/frontend/operator/prim_others.cc index 530ad6a10c9..25f41860f68 100644 --- a/mindspore/ccsrc/frontend/operator/prim_others.cc +++ b/mindspore/ccsrc/frontend/operator/prim_others.cc @@ -349,6 +349,26 @@ AbstractBasePtr InferImplMakeIndexedSlices(const AnalysisEnginePtr &, const Prim auto values = CheckArg(op_name, args_spec_list, 1); auto dense_shape = CheckArg(op_name, args_spec_list, 2); + auto indices_dtype = indices->element()->BuildType(); + if (!indices_dtype->isa()) { + MS_EXCEPTION(TypeError) << "The dtype of indices must be a Int, but got " << indices_dtype->ToString(); + } + auto indices_shp = indices->shape()->shape(); + if (indices_shp.size() != 1) { + MS_EXCEPTION(TypeError) << "Indices must be a 1 dimension tensor, but got a " << indices_shp.size() + << " dimension tensor"; + } + auto values_shp = values->shape()->shape(); + if (indices_shp[0] != values_shp[0]) { + MS_EXCEPTION(TypeError) << "The first dimension of indices must be the same with the first dimension of values " + << values_shp[0] << ", but got " << indices_shp[0]; + } + + 
for (auto elem_type : dense_shape->ElementsType()) { + if (!elem_type->isa()) { + MS_EXCEPTION(TypeError) << "The element type of dense_shape must be Int, but got " << elem_type->ToString(); + } + } auto dense_shape_value = dense_shape->BuildValue()->cast(); MS_EXCEPTION_IF_NULL(dense_shape_value); auto shp = dense_shape_value->value(); @@ -358,6 +378,12 @@ AbstractBasePtr InferImplMakeIndexedSlices(const AnalysisEnginePtr &, const Prim auto elem = GetValue(e); return elem; }); + for (auto dense_shape_elem : dense_shape_vec) { + if (dense_shape_elem < 0) { + MS_EXCEPTION(TypeError) << "The element of dense_shape must be positive, but got " + << dense_shape_value->ToString(); + } + } auto ret = std::make_shared(values->element()->BuildType(), dense_shape_vec); ret->set_indices(indices); ret->set_values(values); @@ -395,16 +421,89 @@ AbstractBasePtr InferImplIndexedSlicesGetDenseShape(const AnalysisEnginePtr &, c return indexed_slices->dense_shape(); } -AbstractBasePtr InferImplIsIndexedSlices(const AnalysisEnginePtr &, const PrimitivePtr &primitive, - const AbstractBasePtrList &args_spec_list) { +AbstractBasePtr InferImplMakeSparseTensor(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list) { + // Inputs: two tensors and a tuple. + const std::string op_name = primitive->name(); + CheckArgsSize(op_name, args_spec_list, 3); + auto indices = CheckArg(op_name, args_spec_list, 0); + auto values = CheckArg(op_name, args_spec_list, 1); + auto dense_shape = CheckArg(op_name, args_spec_list, 2); + + auto indices_dtype = indices->element()->BuildType(); + if (!indices_dtype->isa()) { + MS_EXCEPTION(TypeError) << "The dtype of indices must be a Int, but got " << indices_dtype->ToString(); + } + auto indices_shp = indices->shape()->shape(); + if (indices_shp.size() != 2) { + MS_EXCEPTION(TypeError) << "Indices must be a 2 dimension tensor, but got a " << indices_shp.size() + << " dimension tensor"; + } + auto values_shp = values->shape()->shape(); + if (values_shp.size() != 1) { + MS_EXCEPTION(TypeError) << "Values must be a 1 dimension tensor, but got a " << values_shp.size() + << " dimension tensor"; + } + if (indices_shp[0] != values_shp[0]) { + MS_EXCEPTION(TypeError) << "The first dimension of indices must be the same with the first dimension of values " + << values_shp[0] << ", but got " << indices_shp[0]; + } + + for (auto elem_type : dense_shape->ElementsType()) { + if (!elem_type->isa()) { + MS_EXCEPTION(TypeError) << "The element type of dense_shape must be Int, but got " << elem_type->ToString(); + } + } + auto dense_shape_value = dense_shape->BuildValue()->cast(); + MS_EXCEPTION_IF_NULL(dense_shape_value); + auto shp = dense_shape_value->value(); + std::vector dense_shape_vec; + (void)std::transform(std::begin(shp), std::end(shp), std::back_inserter(dense_shape_vec), + [](const ValuePtr &e) -> int { + auto elem = GetValue(e); + return elem; + }); + for (auto dense_shape_elem : dense_shape_vec) { + if (dense_shape_elem < 0) { + MS_EXCEPTION(TypeError) << "The element of dense_shape must be positive, but got " + << dense_shape_value->ToString(); + } + } + auto ret = std::make_shared(values->element()->BuildType(), dense_shape_vec); + ret->set_indices(indices); + ret->set_values(values); + ret->set_dense_shape(dense_shape); + return ret; +} + +AbstractBasePtr InferImplSparseTensorGetValues(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list) { + // Inputs: two tensors and a tuple. 
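// Note (illustration, not part of the patch): the checks in InferImplMakeSparseTensor
// above amount to this contract for a SparseTensor with n non-zero entries:
//   indices     : integer tensor, 2-D, with indices.shape[0] == values.shape[0]
//   values      : 1-D tensor of shape [n]
//   dense_shape : tuple of positive integers
// For example, indices = [[0, 1], [1, 2]], values = [1.0, 2.0], dense_shape = (3, 4)
// describes a 3x4 tensor whose only non-zero entries are (0,1) = 1.0 and (1,2) = 2.0.
// Each SparseTensorGet* infer function below takes the single AbstractSparseTensor
// built here and returns the corresponding recorded component.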
const std::string op_name = primitive->name(); CheckArgsSize(op_name, args_spec_list, 1); - bool ret = false; - if (args_spec_list[0]->isa()) { - ret = true; - } - MS_LOG(DEBUG) << "IsIndexedSlices result: " << ret << ", input: " << args_spec_list[0]->ToString(); - return std::make_shared(ret); + auto sparse_tensor = CheckArg(op_name, args_spec_list, 0); + MS_EXCEPTION_IF_NULL(sparse_tensor->values()); + return sparse_tensor->values(); +} + +AbstractBasePtr InferImplSparseTensorGetIndices(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list) { + // Inputs: two tensors and a tuple. + const std::string op_name = primitive->name(); + CheckArgsSize(op_name, args_spec_list, 1); + auto sparse_tensor = CheckArg(op_name, args_spec_list, 0); + MS_EXCEPTION_IF_NULL(sparse_tensor->indices()); + return sparse_tensor->indices(); +} + +AbstractBasePtr InferImplSparseTensorGetDenseShape(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list) { + // Inputs: two tensors and a tuple. + const std::string op_name = primitive->name(); + CheckArgsSize(op_name, args_spec_list, 1); + auto sparse_tensor = CheckArg(op_name, args_spec_list, 0); + MS_EXCEPTION_IF_NULL(sparse_tensor->dense_shape()); + return sparse_tensor->dense_shape(); } } // namespace abstract } // namespace mindspore diff --git a/mindspore/ccsrc/frontend/optimizer/ad/kprim.cc b/mindspore/ccsrc/frontend/optimizer/ad/kprim.cc index 5ca2ca6c43d..aa76d279d53 100644 --- a/mindspore/ccsrc/frontend/optimizer/ad/kprim.cc +++ b/mindspore/ccsrc/frontend/optimizer/ad/kprim.cc @@ -264,7 +264,7 @@ FuncGraphPtr KPrim::FakeBprop(const ValueNodePtr &value_node, const pipeline::Re return IsPrimitiveCNode(user.first, prim); }); if (cnode == users.end()) { - MS_LOG(EXCEPTION) << "Fail to find cnode."; + MS_LOG(EXCEPTION) << "Fail to find user for " << prim->ToString(); } auto inputs_num = cnode->first->cast()->inputs().size() - 1; diff --git a/mindspore/ccsrc/frontend/optimizer/irpass.cc b/mindspore/ccsrc/frontend/optimizer/irpass.cc index efc3795a4cc..23321074f7f 100644 --- a/mindspore/ccsrc/frontend/optimizer/irpass.cc +++ b/mindspore/ccsrc/frontend/optimizer/irpass.cc @@ -43,6 +43,7 @@ #include "frontend/optimizer/irpass/transpose_eliminate.h" #include "frontend/optimizer/opt.h" #include "frontend/optimizer/irpass/indexed_slices_eliminate.h" +#include "frontend/optimizer/irpass/sparse_tensor_eliminate.h" namespace mindspore { namespace opt { @@ -159,6 +160,11 @@ OptimizeIRPassLib::OptimizeIRPassLib() { indexed_slices_eliminate_ = MakeSubstitution( std::make_shared(), "indexed_slices_eliminate", {prim::kPrimIndexedSlicesGetIndices, prim::kPrimIndexedSlicesGetValues, prim::kPrimIndexedSlicesGetDenseShape}); + + // SparseTensor Eliminate + sparse_tensor_eliminate_ = MakeSubstitution( + std::make_shared(), "sparse_tensor_eliminate", + {prim::kPrimSparseTensorGetIndices, prim::kPrimSparseTensorGetValues, prim::kPrimSparseTensorGetDenseShape}); } ResolveIRPassLib::ResolveIRPassLib() { diff --git a/mindspore/ccsrc/frontend/optimizer/irpass.h b/mindspore/ccsrc/frontend/optimizer/irpass.h index 4af8c0789dc..718302a1e01 100644 --- a/mindspore/ccsrc/frontend/optimizer/irpass.h +++ b/mindspore/ccsrc/frontend/optimizer/irpass.h @@ -107,6 +107,9 @@ class OptimizeIRPassLib { // IndexedSlices Eliminate SubstitutionPtr indexed_slices_eliminate_; + + // SparseTensor Eliminate + SubstitutionPtr sparse_tensor_eliminate_; }; // the collection of irpass for resolve action diff --git 
a/mindspore/ccsrc/frontend/optimizer/irpass/sparse_tensor_eliminate.h b/mindspore/ccsrc/frontend/optimizer/irpass/sparse_tensor_eliminate.h new file mode 100644 index 00000000000..ac8f2449f3c --- /dev/null +++ b/mindspore/ccsrc/frontend/optimizer/irpass/sparse_tensor_eliminate.h @@ -0,0 +1,75 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_OPTIMIZER_IRPASS_SPARSE_TENSOR_ELIMINATE_H_ +#define MINDSPORE_CCSRC_OPTIMIZER_IRPASS_SPARSE_TENSOR_ELIMINATE_H_ + +#include +#include + +#include "frontend/optimizer/irpass.h" +#include "frontend/optimizer/optimizer.h" +#include "ir/visitor.h" +#include "frontend/operator/ops.h" + +namespace mindspore { +namespace opt { +namespace irpass { +// {prim::kPrimSparseTensorGetIndices, {prim::kPrimMakeSparseTensor, Xs}} +// {prim::kPrimSparseTensorGetValues, {prim::kPrimMakeSparseTensor, Xs}} +// {prim::kPrimSparseTensorGetDenseShape, {prim::kPrimMakeSparseTensor, Xs}} +class SparseTensorEliminater : public AnfVisitor { + public: + AnfNodePtr operator()(const OptimizerPtr &, const AnfNodePtr &node) override { + Reset(); + AnfVisitor::Match(prim::kPrimSparseTensorGetIndices, {IsCNode})(node); + + if (is_match_) { + return tuple_->input(1); + } + AnfVisitor::Match(prim::kPrimSparseTensorGetValues, {IsCNode})(node); + + if (is_match_) { + return tuple_->input(2); + } + AnfVisitor::Match(prim::kPrimSparseTensorGetDenseShape, {IsCNode})(node); + + if (is_match_) { + return tuple_->input(3); + } + return nullptr; + } + + void Visit(const CNodePtr &cnode) override { + if (IsPrimitiveCNode(cnode, prim::kPrimMakeSparseTensor)) { + tuple_ = cnode; + is_match_ = true; + } + } + + void Reset() { + tuple_ = nullptr; + is_match_ = false; + } + + private: + bool is_match_{false}; + CNodePtr tuple_{nullptr}; +}; +} // namespace irpass +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_OPTIMIZER_IRPASS_SPARSE_TENSOR_ELIMINATE_H_ diff --git a/mindspore/ccsrc/pipeline/jit/pass.cc b/mindspore/ccsrc/pipeline/jit/pass.cc index bb9a517556e..f3a03658a2e 100644 --- a/mindspore/ccsrc/pipeline/jit/pass.cc +++ b/mindspore/ccsrc/pipeline/jit/pass.cc @@ -157,6 +157,7 @@ OptPassGroupMap GetOptPassesB(const opt::irpass::OptimizeIRPassLib &irpass) { irpass.make_ref_eliminate_, irpass.get_ref_param_eliminate_, irpass.indexed_slices_eliminate_, + irpass.sparse_tensor_eliminate_, }); OptPassGroupMap map({ {"b_1", b_1}, diff --git a/mindspore/ccsrc/pipeline/jit/resource.cc b/mindspore/ccsrc/pipeline/jit/resource.cc index ece128b77b7..16d4a00346e 100644 --- a/mindspore/ccsrc/pipeline/jit/resource.cc +++ b/mindspore/ccsrc/pipeline/jit/resource.cc @@ -179,6 +179,12 @@ MethodMap &GetMethodMap() { {"indices", prim::kPrimIndexedSlicesGetIndices}, // F.indexed_slices_get_indices {"dense_shape", prim::kPrimIndexedSlicesGetDenseShape}, // F.indexed_slices_get_dense_shape }}, + {kObjectTypeSparseTensorType, + { + {"values", prim::kPrimSparseTensorGetValues}, // F.sparse_tensor_get_values + 
{"indices", prim::kPrimSparseTensorGetIndices}, // F.sparse_tensor_get_indices + {"dense_shape", prim::kPrimSparseTensorGetDenseShape}, // F.sparse_tensor_get_dense_shape + }}, {kObjectTypeJTagged, {}}, {kObjectTypeSymbolicKeyType, {}}, {kObjectTypeEnvType, {}}}; diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc b/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc index 9f3011d1187..90d4aaa125f 100644 --- a/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc +++ b/mindspore/ccsrc/pipeline/jit/static_analysis/prim.cc @@ -138,7 +138,11 @@ PrimitiveEvalImplMap &GetPrimitiveToEvalImplMap() { {prim::kPrimIndexedSlicesGetValues, {InferImplIndexedSlicesGetValues, true}}, {prim::kPrimIndexedSlicesGetIndices, {InferImplIndexedSlicesGetIndices, true}}, {prim::kPrimIndexedSlicesGetDenseShape, {InferImplIndexedSlicesGetDenseShape, true}}, - {prim::kPrimIsIndexedSlices, {InferImplIsIndexedSlices, true}}, + // SparseTensor + {prim::kPrimMakeSparseTensor, {InferImplMakeSparseTensor, true}}, + {prim::kPrimSparseTensorGetValues, {InferImplSparseTensorGetValues, true}}, + {prim::kPrimSparseTensorGetIndices, {InferImplSparseTensorGetIndices, true}}, + {prim::kPrimSparseTensorGetDenseShape, {InferImplSparseTensorGetDenseShape, true}}, }; return prim_eval_implement_map; } diff --git a/mindspore/ccsrc/pipeline/jit/static_analysis/prim.h b/mindspore/ccsrc/pipeline/jit/static_analysis/prim.h index 692fbe66e88..b931bf6b7e8 100644 --- a/mindspore/ccsrc/pipeline/jit/static_analysis/prim.h +++ b/mindspore/ccsrc/pipeline/jit/static_analysis/prim.h @@ -358,8 +358,14 @@ AbstractBasePtr InferImplIndexedSlicesGetIndices(const AnalysisEnginePtr &, cons const AbstractBasePtrList &args_spec_list); AbstractBasePtr InferImplIndexedSlicesGetDenseShape(const AnalysisEnginePtr &, const PrimitivePtr &primitive, const AbstractBasePtrList &args_spec_list); -AbstractBasePtr InferImplIsIndexedSlices(const AnalysisEnginePtr &, const PrimitivePtr &primitive, - const AbstractBasePtrList &args_spec_list); +AbstractBasePtr InferImplMakeSparseTensor(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list); +AbstractBasePtr InferImplSparseTensorGetValues(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list); +AbstractBasePtr InferImplSparseTensorGetIndices(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list); +AbstractBasePtr InferImplSparseTensorGetDenseShape(const AnalysisEnginePtr &, const PrimitivePtr &primitive, + const AbstractBasePtrList &args_spec_list); } // namespace abstract } // namespace mindspore diff --git a/mindspore/ccsrc/pipeline/jit/validator.cc b/mindspore/ccsrc/pipeline/jit/validator.cc index 04aa6efd05b..53164f8ac0e 100644 --- a/mindspore/ccsrc/pipeline/jit/validator.cc +++ b/mindspore/ccsrc/pipeline/jit/validator.cc @@ -36,6 +36,7 @@ using mindspore::abstract::AbstractIndexedSlices; using mindspore::abstract::AbstractJTagged; using mindspore::abstract::AbstractList; using mindspore::abstract::AbstractScalar; +using mindspore::abstract::AbstractSparseTensor; using mindspore::abstract::AbstractTensor; using mindspore::abstract::AbstractTuple; using mindspore::abstract::AbstractType; @@ -95,7 +96,7 @@ void ValidateAbstract(const AnfNodePtr &node) { if (ptrBase->isa() || ptrBase->isa() || ptrBase->isa() || ptrBase->isa() || ptrBase->isa() || ptrBase->isa() || - ptrBase->isa()) { + ptrBase->isa() || ptrBase->isa()) { return; } diff --git 
a/mindspore/common/__init__.py b/mindspore/common/__init__.py index c896805d75a..570e0368c53 100644 --- a/mindspore/common/__init__.py +++ b/mindspore/common/__init__.py @@ -17,10 +17,10 @@ from . import dtype from .api import ms_function from .dtype import * from .parameter import Parameter, ParameterTuple -from .tensor import MetaTensor, Tensor, IndexedSlices +from .tensor import MetaTensor, Tensor, IndexedSlices, SparseTensor __all__ = [ - "MetaTensor", "Tensor", "IndexedSlices", # tensor + "MetaTensor", "Tensor", "IndexedSlices", "SparseTensor", # tensor 'ms_function', # api 'Parameter', 'ParameterTuple', # parameter "dtype" diff --git a/mindspore/common/tensor.py b/mindspore/common/tensor.py index 64a8eb46373..dde82186809 100644 --- a/mindspore/common/tensor.py +++ b/mindspore/common/tensor.py @@ -21,7 +21,7 @@ from .._checkparam import check_type, check_typename from . import dtype as mstype from ._register_for_tensor import tensor_operator_registry -__all__ = ['Tensor', 'MetaTensor', 'IndexedSlices'] +__all__ = ['Tensor', 'MetaTensor', 'IndexedSlices', 'SparseTensor'] np_types = (np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float16, np.float32, np.float64, np.bool_) @@ -211,3 +211,7 @@ class Tensor(Tensor_): class IndexedSlices: def __init__(self, indices, values, dense_shape): raise NotImplementedError + +class SparseTensor: + def __init__(self, indices, values, dense_shape): + raise NotImplementedError diff --git a/mindspore/core/abstract/abstract_value.cc b/mindspore/core/abstract/abstract_value.cc index 7bef3829a61..fb16cf0161c 100644 --- a/mindspore/core/abstract/abstract_value.cc +++ b/mindspore/core/abstract/abstract_value.cc @@ -1093,5 +1093,64 @@ std::string AbstractIndexedSlices::ToString() const { << ", dense_shape: " << dense_shape_->ToString(); return buffer.str(); } + +// SparseTensor +TypePtr AbstractSparseTensor::BuildType() const { + MS_EXCEPTION_IF_NULL(element()); + TypePtr element_type = element()->BuildType(); + return std::make_shared(element_type); +} + +AbstractBasePtr AbstractSparseTensor::Clone() const { + MS_EXCEPTION_IF_NULL(element()); + auto clone = std::make_shared(element()->Clone()); + ShapePtr shp = shape(); + clone->set_shape(shp->Clone()); + clone->set_value(GetValueTrack()); + clone->set_indices(indices_->Clone()->cast()); + clone->set_values(values_->Clone()->cast()); + clone->set_dense_shape(dense_shape_->Clone()->cast()); + return clone; +} + +AbstractBasePtr AbstractSparseTensor::Broaden() const { + MS_EXCEPTION_IF_NULL(element()); + auto broaden = std::make_shared(element()->Broaden()); + auto shp = shape(); + broaden->set_shape(shp->Clone()); + broaden->set_value(kAnyValue); + broaden->set_indices(indices_->Clone()->cast()); + broaden->set_values(values_->Clone()->cast()); + broaden->set_dense_shape(dense_shape_->Clone()->cast()); + return broaden; +} + +AbstractBasePtr AbstractSparseTensor::BroadenWithShape() const { + MS_EXCEPTION_IF_NULL(element()); + auto broaden = std::make_shared(element()->Broaden()); + auto shp = shape()->Clone(); + shp->Broaden(); + broaden->set_shape(shp); + broaden->set_value(kAnyValue); + broaden->set_indices(indices_->Clone()->cast()); + broaden->set_values(values_->Clone()->cast()); + broaden->set_dense_shape(dense_shape_->Clone()->cast()); + return broaden; +} + +std::string AbstractSparseTensor::ToString() const { + std::ostringstream buffer; + BaseShapePtr shape_track = GetShapeTrack(); + MS_EXCEPTION_IF_NULL(shape_track); + MS_EXCEPTION_IF_NULL(element()); + auto 
value_track = GetValueTrack();
+  MS_EXCEPTION_IF_NULL(value_track);
+  buffer << type_name() << "("
+         << "shape: " << shape_track->ToString() << ", element: " << element()->ToString()
+         << ", value_ptr: " << value_track << ", value: " << value_track->ToString() << ")"
+         << ", indices: " << indices_->ToString() << ", values: " << values_->ToString()
+         << ", dense_shape: " << dense_shape_->ToString();
+  return buffer.str();
+}
 }  // namespace abstract
 }  // namespace mindspore
diff --git a/mindspore/core/abstract/abstract_value.h b/mindspore/core/abstract/abstract_value.h
index d922f93e70b..5f2ca8f3f32 100644
--- a/mindspore/core/abstract/abstract_value.h
+++ b/mindspore/core/abstract/abstract_value.h
@@ -604,10 +604,39 @@ class AbstractIndexedSlices : public AbstractUndetermined {
   MS_DECLARE_PARENT(AbstractIndexedSlices, AbstractUndetermined)

   const AbstractTensorPtr indices() const { return indices_; }
-  const AbstractTensorPtr values() const { return values_; }
-  const AbstractTuplePtr dense_shape() const { return dense_shape_; }
   void set_indices(const AbstractTensorPtr &indices) { indices_ = indices; }
+  const AbstractTensorPtr values() const { return values_; }
   void set_values(const AbstractTensorPtr &values) { values_ = values; }
+  const AbstractTuplePtr dense_shape() const { return dense_shape_; }
   void set_dense_shape(const AbstractTuplePtr &dense_shape) { dense_shape_ = dense_shape; }
   TypePtr BuildType() const override;
   AbstractBasePtr Clone() const override;
   AbstractBasePtr Broaden() const override;
   AbstractBasePtr BroadenWithShape() const;
+
+  std::string ToString() const override;
+
+ private:
+  AbstractTensorPtr indices_;
+  AbstractTensorPtr values_;
+  AbstractTuplePtr dense_shape_;
+};
+
+// SparseTensor
+class AbstractSparseTensor : public AbstractUndetermined {
+ public:
+  explicit AbstractSparseTensor(const AbstractBasePtr &element, const BaseShapePtr &shape = std::make_shared<Shape>())
+      : AbstractUndetermined(element, shape) {}
+  AbstractSparseTensor(const TypePtr &element_type, const std::vector<int> &shape)
+      : AbstractUndetermined(element_type, shape) {}
+  ~AbstractSparseTensor() override = default;
+  MS_DECLARE_PARENT(AbstractSparseTensor, AbstractUndetermined)
+
+  const AbstractTensorPtr indices() const { return indices_; }
+  void set_indices(const AbstractTensorPtr &indices) { indices_ = indices; }
+  const AbstractTensorPtr values() const { return values_; }
+  void set_values(const AbstractTensorPtr &values) { values_ = values; }
+  const AbstractTuplePtr dense_shape() const { return dense_shape_; }
+  void set_dense_shape(const AbstractTuplePtr &dense_shape) { dense_shape_ = dense_shape; }
+  TypePtr BuildType() const override;
+  AbstractBasePtr Clone() const override;
diff --git a/mindspore/core/abstract/param_validator.h b/mindspore/core/abstract/param_validator.h
index 434235abda3..e08d4fc8e85 100644
--- a/mindspore/core/abstract/param_validator.h
+++ b/mindspore/core/abstract/param_validator.h
@@ -67,6 +67,7 @@ ABSTRACT_REPORT_NAME_TRAITS(Type)
 ABSTRACT_REPORT_NAME_TRAITS(KeywordArg)
 ABSTRACT_REPORT_NAME_TRAITS(Class)
 ABSTRACT_REPORT_NAME_TRAITS(IndexedSlices)
+ABSTRACT_REPORT_NAME_TRAITS(SparseTensor)
 ABSTRACT_REPORT_NAME_TRAITS(Sequeue)

 template <typename T>
diff --git a/mindspore/core/ir/dtype.cc b/mindspore/core/ir/dtype.cc
index 71a78bdcf67..89ab2ac0fa4 100644
--- a/mindspore/core/ir/dtype.cc
+++ b/mindspore/core/ir/dtype.cc
@@ -221,6 +221,48 @@ bool IndexedSlicesType::operator==(const Type &other) const {
   return *element_type_ == *other_elem_type;
 }

+TypePtr SparseTensorType::DeepCopy() const {
+  MS_EXCEPTION_IF_NULL(element_type_);
+  if (IsGeneric()) {
+    return std::make_shared<SparseTensorType>();
+  }
+  return std::make_shared<SparseTensorType>(element_type_->DeepCopy());
+}
+
+std::string SparseTensorType::ToReprString() const {
+  if (element_type_ == nullptr) {
+    return "SparseTensor";
+  }
+  return "SparseTensor[" + element_type_->ToReprString() + "]";
+}
+
+std::string SparseTensorType::ToString() const {
+  if (element_type_ == nullptr) {
+    return "SparseTensor";
+  }
+  return "SparseTensor[" + element_type_->ToString() + "]";
+}
+
+std::string SparseTensorType::DumpText() const {
+  if (element_type_ == nullptr) {
+    return "SparseTensor";
+  }
+  return "SparseTensor[" + element_type_->DumpText() + "]";
+}
+
+bool SparseTensorType::operator==(const Type &other) const {
+  if (!IsSameObjectType(*this, other)) {
+    return false;
+  }
+  auto other_elem_type = static_cast<const SparseTensorType &>(other).element_type_;
+  if (element_type_ == nullptr && other_elem_type == nullptr) {
+    return true;
+  } else if (element_type_ == nullptr || other_elem_type == nullptr) {
+    return false;
+  }
+  return *element_type_ == *other_elem_type;
+}
+
 Function::Function() : Object(kObjectTypeFunction) {
   args_ = std::vector<TypePtr>();
   retval_ = nullptr;
diff --git a/mindspore/core/ir/dtype.h b/mindspore/core/ir/dtype.h
index dc277c031c6..0ff152a4f46 100644
--- a/mindspore/core/ir/dtype.h
+++ b/mindspore/core/ir/dtype.h
@@ -177,6 +177,29 @@ class IndexedSlicesType : public Object {
 };
 using IndexedSlicesTypePtr = std::shared_ptr<IndexedSlicesType>;

+class SparseTensorType : public Object {
+ public:
+  SparseTensorType() : Object(kObjectTypeSparseTensorType, kObjectTypeUndeterminedType) {}
+  explicit SparseTensorType(const TypePtr &ele)
+      : Object(kObjectTypeSparseTensorType, kObjectTypeUndeterminedType, false), element_type_(ele) {}
+  ~SparseTensorType() override = default;
+  MS_DECLARE_PARENT(SparseTensorType, Object)
+
+  TypeId generic_type_id() const override { return kObjectTypeSparseTensorType; }
+  const TypePtr element() const { return element_type_; }
+  void set_element(const TypePtr &element_type) { element_type_ = element_type; }
+
+  TypePtr DeepCopy() const override;
+  std::string ToString() const override;
+  std::string ToReprString() const override;
+  std::string DumpText() const override;
+  bool operator==(const Type &other) const override;
+
+ private:
+  TypePtr element_type_;
+};
+using SparseTensorTypePtr = std::shared_ptr<SparseTensorType>;
+
 class Function : public Object {
  public:
   Function();
diff --git a/mindspore/core/ir/dtype/type.cc b/mindspore/core/ir/dtype/type.cc
index 754876a366a..39586602e75 100644
--- a/mindspore/core/ir/dtype/type.cc
+++ b/mindspore/core/ir/dtype/type.cc
@@ -117,6 +117,8 @@ const char *ObjectIdLabel(const TypeId &v) {
       return "kObjectTypeTensorType";
     case kObjectTypeIndexedSlicesType:
       return "kObjectTypeIndexedSlicesType";
+    case kObjectTypeSparseTensorType:
+      return "kObjectTypeSparseTensorType";
     case kObjectTypeUndeterminedType:
       return "kObjectTypeUndeterminedType";
     case kObjectTypeDictionary:
diff --git a/mindspore/core/ir/dtype/type_id.h b/mindspore/core/ir/dtype/type_id.h
index 6fb2a354c17..960c2f320d2 100644
--- a/mindspore/core/ir/dtype/type_id.h
+++ b/mindspore/core/ir/dtype/type_id.h
@@ -51,6 +51,7 @@ enum TypeId : int {
   kObjectTypeKeyword,
   kObjectTypeTensorType,
   kObjectTypeIndexedSlicesType,
+  kObjectTypeSparseTensorType,
   kObjectTypeUndeterminedType,
   kObjectTypeClass,
   kObjectTypeDictionary,
diff --git a/mindspore/core/ir/dtype_extends.cc b/mindspore/core/ir/dtype_extends.cc
index 099748217ed..9038646ceb7 100644
---
a/mindspore/core/ir/dtype_extends.cc +++ b/mindspore/core/ir/dtype_extends.cc @@ -207,6 +207,23 @@ TypePtr IndexedSlicesStrToType(const std::string &type_name) { return std::make_shared(element_type); } +TypePtr SparseTensorStrToType(const std::string &type_name) { + if (type_name == "SparseTensor") { + return std::make_shared(); + } + auto start = type_name.find_first_of('[') + 1; + auto end = type_name.find_last_of(']'); + if (start >= type_name.size()) { + return nullptr; + } + auto element_str = type_name.substr(start, end - start); + auto element_type = StringToType(element_str); + if (element_type == nullptr) { + return nullptr; + } + return std::make_shared(element_type); +} + TypePtr UndeterminedStrToType(const std::string &type_name) { if (type_name == "Undetermined") { return std::make_shared(); @@ -349,6 +366,8 @@ TypePtr StringToType(const std::string &type_name) { type = UndeterminedStrToType(type_name); } else if (type_name.compare(0, strlen("IndexedSlices"), "IndexedSlices") == 0) { type = IndexedSlicesStrToType(type_name); + } else if (type_name.compare(0, strlen("SparseTensor"), "SparseTensor") == 0) { + type = SparseTensorStrToType(type_name); } else if (type_name.compare(0, strlen("List"), "List") == 0) { type = ListStrToType(type_name); } else if (type_name.compare(0, strlen("Tuple"), "Tuple") == 0) { @@ -428,6 +447,7 @@ const TypePtr kTypeEnv = std::make_shared(); const TypePtr kTypeType = std::make_shared(); const TypePtr kTensorType = std::make_shared(); const TypePtr kIndexedSlicesType = std::make_shared(); +const TypePtr kSparseTensorType = std::make_shared(); const TypePtr kUndeterminedType = std::make_shared(); const TypePtr kString = std::make_shared(); const TypePtr kList = std::make_shared(); diff --git a/mindspore/core/ir/dtype_py.cc b/mindspore/core/ir/dtype_py.cc index b1e2151b6dd..7577a39f7a6 100644 --- a/mindspore/core/ir/dtype_py.cc +++ b/mindspore/core/ir/dtype_py.cc @@ -139,6 +139,8 @@ REGISTER_PYBIND_DEFINE( })); (void)py::class_>(m_sub, "IndexedSlicesType") .def(py::init()); + (void)py::class_>(m_sub, "SparseTensorType") + .def(py::init()); (void)py::class_>(m_sub, "UndeterminedType") .def(py::init()); (void)py::class_>(m_sub, "Function") diff --git a/mindspore/core/ir/meta_func_graph.cc b/mindspore/core/ir/meta_func_graph.cc index c0cf9d4d2f2..7953931e8f4 100644 --- a/mindspore/core/ir/meta_func_graph.cc +++ b/mindspore/core/ir/meta_func_graph.cc @@ -17,9 +17,49 @@ */ #include "ir/meta_func_graph.h" +#include "pipeline/jit/static_analysis/static_analysis.h" +#include "pipeline/jit/static_analysis/abstract_function.h" +#include "utils/context/ms_context.h" +#include "frontend/operator/ops.h" // namespace to support intermediate representation definition namespace mindspore { +FuncGraphPtr MetaFuncGraph::GenerateStubFunc(const TypePtrList &types) { + auto context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context); + bool enable_sparse = context->enable_sparse(); + if (!enable_sparse) { + return nullptr; + } + + std::vector parameters; + ParameterPtr undetermined_param = nullptr; + auto stub = std::make_shared(); + for (size_t i = 0; i < types.size(); ++i) { + auto param = stub->add_parameter(); + parameters.push_back(param); + if (types[i]->type_id() == kObjectTypeUndeterminedType) { + undetermined_param = param; + } + } + if (undetermined_param != nullptr) { + std::vector inputs{NewValueNode(prim::kPrimMakeTuple)}; + for (size_t i = 0; i < types.size(); ++i) { + if (types[i]->type_id() == kObjectTypeFunction) { + std::vector 
call_prim{parameters[i], undetermined_param}; + inputs.push_back(stub->NewCNode(call_prim)); + } else { + inputs.push_back(parameters[i]); + } + } + auto stub_output = stub->NewCNode(inputs); + stub->set_output(stub_output); + stub->set_stub(true); + return stub; + } + return nullptr; +} + FuncGraphPtr MetaFuncGraph::GenerateFuncGraph(const abstract::AbstractBasePtrList &args_spec_list) { TypePtrList types; (void)std::transform(args_spec_list.begin(), args_spec_list.end(), std::back_inserter(types), diff --git a/mindspore/core/ir/meta_func_graph.h b/mindspore/core/ir/meta_func_graph.h index 933c3f700d8..df1edc312de 100644 --- a/mindspore/core/ir/meta_func_graph.h +++ b/mindspore/core/ir/meta_func_graph.h @@ -79,6 +79,7 @@ class MetaFuncGraph : public FuncGraphBase { std::shared_ptr shared_from_base() { return std::static_pointer_cast(shared_from_this()); } + FuncGraphPtr GenerateStubFunc(const TypePtrList &types); std::string name_; std::vector signatures_; std::unordered_map cache_; diff --git a/mindspore/core/ir/param_value.h b/mindspore/core/ir/param_value.h index 00b79ae91ca..36026ce97fc 100644 --- a/mindspore/core/ir/param_value.h +++ b/mindspore/core/ir/param_value.h @@ -40,18 +40,12 @@ class ParamValue { const std::string &name() const { return name_; } void set_name(const std::string &name) { name_ = name; } - const std::string &sparse_grad() const { return sparse_grad_; } - void set_sparse_grad(const std::string &sparse_grad) { sparse_grad_ = sparse_grad; } - bool requires_grad() const { return requires_grad_; } void set_requires_grad(bool requires_grad) { requires_grad_ = requires_grad; } bool layerwise_parallel() const { return layerwise_parallel_; } void set_layerwise_parallel(bool layerwise_parallel) { layerwise_parallel_ = layerwise_parallel; } - bool has_indexed_slices_grad() const { return has_indexed_slices_grad_; } - void set_has_indexed_slices_grad(bool b) { has_indexed_slices_grad_ = b; } - // Whether the parameter clone from other parameter. 
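// Note (illustration, not part of the patch): with sparse_grad_ and
// has_indexed_slices_grad_ removed, sparseness is no longer a per-parameter flag;
// it travels with the gradient value itself (IndexedSlices / SparseTensor). The
// pickled ParamValue state in param_value_py.cc accordingly shrinks to the 4-tuple
// (value, name, requires_grad, layerwise_parallel), restored as, e.g.:
//   p->set_name(t[1].cast<std::string>());
//   p->set_requires_grad(t[2].cast<bool>());
//   p->set_layerwise_parallel(t[3].cast<bool>());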
diff --git a/mindspore/core/ir/param_value.h b/mindspore/core/ir/param_value.h
index 00b79ae91ca..36026ce97fc 100644
--- a/mindspore/core/ir/param_value.h
+++ b/mindspore/core/ir/param_value.h
@@ -40,18 +40,12 @@ class ParamValue {
   const std::string &name() const { return name_; }
   void set_name(const std::string &name) { name_ = name; }
 
-  const std::string &sparse_grad() const { return sparse_grad_; }
-  void set_sparse_grad(const std::string &sparse_grad) { sparse_grad_ = sparse_grad; }
-
   bool requires_grad() const { return requires_grad_; }
   void set_requires_grad(bool requires_grad) { requires_grad_ = requires_grad; }
 
   bool layerwise_parallel() const { return layerwise_parallel_; }
   void set_layerwise_parallel(bool layerwise_parallel) { layerwise_parallel_ = layerwise_parallel; }
 
-  bool has_indexed_slices_grad() const { return has_indexed_slices_grad_; }
-  void set_has_indexed_slices_grad(bool b) { has_indexed_slices_grad_ = b; }
-
   // Whether the parameter clone from other parameter.
   bool cloned() const { return cloned_; }
@@ -81,10 +75,8 @@ class ParamValue {
  private:
   tensor::MetaTensorPtr value_;
   std::string name_{"Parameter"};
-  std::string sparse_grad_;
   bool requires_grad_{true};
   bool layerwise_parallel_{false};
-  bool has_indexed_slices_grad_{false};
   bool be_cloned_{false};
   bool cloned_{false};
   std::vector<int32_t> be_cloned_index_;
diff --git a/mindspore/core/ir/param_value_py.cc b/mindspore/core/ir/param_value_py.cc
index fb4b313c228..c976d41cd21 100644
--- a/mindspore/core/ir/param_value_py.cc
+++ b/mindspore/core/ir/param_value_py.cc
@@ -29,14 +29,10 @@ REGISTER_PYBIND_DEFINE(ParamValue, ([](const py::module *m) {
                          .def_property("requires_grad", &ParamValue::requires_grad, &ParamValue::set_requires_grad)
                          .def_property("layerwise_parallel", &ParamValue::layerwise_parallel,
                                        &ParamValue::set_layerwise_parallel)
-                         .def_property("has_indexed_slices_grad", &ParamValue::has_indexed_slices_grad,
-                                       &ParamValue::set_has_indexed_slices_grad)
-                         .def_property("sparse_grad", &ParamValue::sparse_grad, &ParamValue::set_sparse_grad)
                          .def(py::pickle(
                            [](const ParamValue &p) {  // __getstate__
                              return py::make_tuple(py::cast(p.value()), p.name(), p.requires_grad(),
-                                                   p.layerwise_parallel(), p.has_indexed_slices_grad(),
-                                                   p.sparse_grad());
+                                                   p.layerwise_parallel());
                            },
                            [](const py::tuple &t) {  // __setstate__
                              if (t.size() != 6) {
@@ -47,8 +43,6 @@ REGISTER_PYBIND_DEFINE(ParamValue, ([](const py::module *m) {
                              p->set_name(t[1].cast<std::string>());
                              p->set_requires_grad(t[2].cast<bool>());
                              p->set_layerwise_parallel(t[3].cast<bool>());
-                             p->set_has_indexed_slices_grad(t[4].cast<bool>());
-                             p->set_sparse_grad(t[5].cast<std::string>());
                              return p;
                            }));
                        }));
diff --git a/mindspore/ops/functional.py b/mindspore/ops/functional.py
index 2be011cb773..36294fa4cdf 100644
--- a/mindspore/ops/functional.py
+++ b/mindspore/ops/functional.py
@@ -159,6 +159,10 @@ indexed_slices_get_values = Primitive('IndexedSlicesGetValues')
 indexed_slices_get_indices = Primitive('IndexedSlicesGetIndices')
 indexed_slices_get_dense_shape = Primitive('IndexedSlicesGetDenseShape')
+make_sparse_tensor = Primitive('MakeSparseTensor')
+sparse_tensor_get_values = Primitive('SparseTensorGetValues')
+sparse_tensor_get_indices = Primitive('SparseTensorGetIndices')
+sparse_tensor_get_dense_shape = Primitive('SparseTensorGetDenseShape')
 
 tensor_operator_registry.register('__add__', tensor_add)
 tensor_operator_registry.register('__sub__', tensor_sub)
diff --git a/tests/ut/cpp/optimizer/lib_test.cc b/tests/ut/cpp/optimizer/lib_test.cc
index 751b301283c..c0d5523edc2 100644
--- a/tests/ut/cpp/optimizer/lib_test.cc
+++ b/tests/ut/cpp/optimizer/lib_test.cc
@@ -616,5 +616,18 @@ TEST_F(TestOptLib, test_indexed_slices) {
   ASSERT_TRUE(CheckOpt(before_get_values, after_get_values, patterns));
   ASSERT_TRUE(CheckOpt(before_get_dense_shape, after_get_dense_shape, patterns));
 }
+
+TEST_F(TestOptLib, test_sparse_tensor) {
+  FuncGraphPtr before_get_indices = getPyFun.CallAndParseRet("test_sparse_tensor", "before_get_indices");
+  FuncGraphPtr after_get_indices = getPyFun.CallAndParseRet("test_sparse_tensor", "after_get_indices");
+  FuncGraphPtr before_get_values = getPyFun.CallAndParseRet("test_sparse_tensor", "before_get_values");
+  FuncGraphPtr after_get_values = getPyFun.CallAndParseRet("test_sparse_tensor", "after_get_values");
+  FuncGraphPtr before_get_dense_shape = getPyFun.CallAndParseRet("test_sparse_tensor", "before_get_dense_shape");
+  FuncGraphPtr after_get_dense_shape = getPyFun.CallAndParseRet("test_sparse_tensor", "after_get_dense_shape");
+  auto patterns = std::vector<SubstitutionPtr>({irpass.sparse_tensor_eliminate_});
+  ASSERT_TRUE(CheckOpt(before_get_indices, after_get_indices, patterns));
+  ASSERT_TRUE(CheckOpt(before_get_values, after_get_values, patterns));
+  ASSERT_TRUE(CheckOpt(before_get_dense_shape, after_get_dense_shape, patterns));
+}
 }  // namespace opt
 }  // namespace mindspore
diff --git a/tests/ut/cpp/python_input/gtest_input/optimizer/opt_test.py b/tests/ut/cpp/python_input/gtest_input/optimizer/opt_test.py
index 16c557adbe2..369dfb3316d 100644
--- a/tests/ut/cpp/python_input/gtest_input/optimizer/opt_test.py
+++ b/tests/ut/cpp/python_input/gtest_input/optimizer/opt_test.py
@@ -1163,3 +1163,38 @@ def test_indexed_slices(tag):
         return z
 
     return fns[tag]
+
+
+def test_sparse_tensor(tag):
+    """ test_sparse_tensor """
+    fns = FnDict()
+    make_sparse_tensor = Primitive('MakeSparseTensor')
+    sparse_tensor_get_values = Primitive('SparseTensorGetValues')
+    sparse_tensor_get_indices = Primitive('SparseTensorGetIndices')
+    sparse_tensor_get_dense_shape = Primitive('SparseTensorGetDenseShape')
+
+    @fns
+    def before_get_indices(x, y, z):
+        return sparse_tensor_get_indices(make_sparse_tensor(x, y, z))
+
+    @fns
+    def after_get_indices(x, y, z):
+        return x
+
+    @fns
+    def before_get_values(x, y, z):
+        return sparse_tensor_get_values(make_sparse_tensor(x, y, z))
+
+    @fns
+    def after_get_values(x, y, z):
+        return y
+
+    @fns
+    def before_get_dense_shape(x, y, z):
+        return sparse_tensor_get_dense_shape(make_sparse_tensor(x, y, z))
+
+    @fns
+    def after_get_dense_shape(x, y, z):
+        return z
+
+    return fns[tag]
diff --git a/tests/ut/python/ir/test_indexed_slices.py b/tests/ut/python/ir/test_indexed_slices.py
index 36dfe464cb4..ff0cfa1da5f 100644
--- a/tests/ut/python/ir/test_indexed_slices.py
+++ b/tests/ut/python/ir/test_indexed_slices.py
@@ -35,6 +35,9 @@ from mindspore._checkparam import Validator as validator
 from mindspore._checkparam import Rel
 from mindspore.nn import Optimizer
 from mindspore.nn import TrainOneStepCell, WithLossCell
+from mindspore.nn.optim import Momentum
+from mindspore.train import Model
+from ....dataset_mock import MindData
 
 context.set_context(mode=context.GRAPH_MODE, enable_sparse=True)
 
@@ -47,6 +50,40 @@ size_op = P.Size()
 invert_permutation = P.InvertPermutation()
 logical_and = P.LogicalAnd()
 
+def get_axis(x):
+    shape = shape_op(x)
+    length = F.tuple_len(shape)
+    perm = F.make_range(0, length)
+    return perm
+
+class MSELoss(nn.Cell):
+    def __init__(self):
+        super(MSELoss, self).__init__()
+        self.reduce_sum = P.ReduceSum()
+        self.square = P.Square()
+        self.reduce_mean = P.ReduceMean()
+
+    def construct(self, data, label):
+        diff = data - label
+        return self.reduce_mean(self.square(diff), get_axis(diff))
+
+
+class MindDataSet(MindData):
+    def __init__(self, dataset_types, dataset_shapes):
+        super(MindDataSet, self).__init__(size=2, batch_size=32,
+                                          np_types=dataset_types,
+                                          output_shapes=dataset_shapes,
+                                          input_indexs=(0, 1))
+    def __next__(self):
+        if self._size < self._iter_num:
+            raise StopIteration
+        self._iter_num += 1
+        lst = []
+        for shape_, type_ in zip(self._output_shapes, self._np_types):
+            lst.append(Tensor(np.ones(shape_).astype(type_)))
+        return tuple(lst)
+
+
 @constexpr
 def _generate_shape_index(out_shape, indices_shape, axis):
     out_rank = len(out_shape)
@@ -189,8 +226,8 @@ def test_indexed_slices_make_indexed_slices():
         def construct(self, indices, values):
             ret = (IndexedSlices(indices, values, self.dense_shape),)
             return ret[0]
-    indices = Tensor([[0, 0], [1, 2]])
-    values = Tensor([1, 2], dtype=ms.float32)
+    indices = Tensor([1, 2])
+    values = Tensor([[0, 0], [1, 2]], dtype=ms.float32)
     MakeIndexedSlices()(indices, values)
@@ -202,8 +239,8 @@ def test_indexed_slices_attr():
         def construct(self, indices, values):
             x = IndexedSlices(indices, values, self.dense_shape)
             return x.values(), x.indices(), x.dense_shape()
-    indices = Tensor([[0, 0], [1, 2]])
-    values = Tensor([1, 2], dtype=ms.float32)
+    indices = Tensor([0])
+    values = Tensor([[1, 2]], dtype=ms.float32)
     IndexedSlicesGetAttr()(indices, values)
@@ -279,3 +316,29 @@ def test_indexed_slices_env_get():
     net_with_loss = WithLossCell(net, loss)
     train_network = TrainOneStepCell(net_with_loss, optimizer)
     train_network(inputs, label)
+
+
+def test_indexed_slices_model_train():
+    class Net(nn.Cell):
+        def __init__(self, in_features, out_features):
+            super(Net, self).__init__()
+            self.weight = Parameter(Tensor(np.ones([out_features, in_features]).astype(np.float32)), name="weight")
+            self.add = P.TensorAdd()
+            self.cast = P.Cast()
+            self.flag = True
+
+        def construct(self, inputs, label):
+            x = self.add(inputs, self.weight)
+            if self.flag:
+                x = self.cast(x, mstype.float32)
+            return x
+
+    dataset_types = (np.float32, np.float32)
+    dataset_shapes = ((16, 16), (16, 16))
+    dataset = MindDataSet(dataset_types, dataset_shapes)
+    net = Net(16, 16)
+    net.set_train()
+
+    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
+    model = Model(net, optimizer=optimizer)
+    model.train(2, dataset, dataset_sink_mode=False)
diff --git a/tests/ut/python/ir/test_sparse_tensor.py b/tests/ut/python/ir/test_sparse_tensor.py
new file mode 100644
index 00000000000..3f8ca8b184c
--- /dev/null
+++ b/tests/ut/python/ir/test_sparse_tensor.py
@@ -0,0 +1,61 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+@File  : test_sparse_tensor.py
+@Author:
+@Date  : 2020-07-16
+@Desc  : test mindspore sparse_tensor's operation
+"""
+import mindspore as ms
+import mindspore.nn as nn
+from mindspore.ops import composite as C
+from mindspore import Tensor, SparseTensor, context
+
+context.set_context(mode=context.GRAPH_MODE, enable_sparse=True)
+
+def test_sparse_tensor_make_sparse_tensor():
+    class MakeSparseTensor(nn.Cell):
+        def __init__(self):
+            super(MakeSparseTensor, self).__init__()
+            self.dense_shape = (3, 4)
+        def construct(self, indices, values):
+            ret = (SparseTensor(indices, values, self.dense_shape),)
+            return ret[0]
+    indices = Tensor([[0, 1], [1, 2]])
+    values = Tensor([1, 2], dtype=ms.float32)
+    MakeSparseTensor()(indices, values)
+
+
+def test_sparse_tensor_attr():
+    grad_op = C.GradOperation('get_all', get_all=True)
+    class GradWrap(nn.Cell):
+        def __init__(self, network):
+            super(GradWrap, self).__init__()
+            self.network = network
+        def construct(self, input1, input2):
+            gout = grad_op(self.network)(input1, input2)
+            return gout
+
+    class SparseTensorGetAttr(nn.Cell):
+        def __init__(self):
+            super(SparseTensorGetAttr, self).__init__()
+            self.dense_shape = (3, 4)
+        def construct(self, indices, values):
+            x = SparseTensor(indices, values, self.dense_shape)
+            return x.values(), x.indices(), x.dense_shape()
+
+    indices = Tensor([[0, 1], [1, 2]])
+    values = Tensor([1, 2], dtype=ms.float32)
+    SparseTensorGetAttr()(indices, values)
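The two tests above pin down the COO convention behind the new SparseTensor: each row of indices is a coordinate, values[i] is the entry stored at that coordinate, and dense_shape fixes the size of the dense equivalent. Below is a minimal NumPy sketch of that mapping, reusing the data from test_sparse_tensor_make_sparse_tensor; the sketch is illustrative only and not part of the patch.

    import numpy as np

    # Same triple as in test_sparse_tensor_make_sparse_tensor above.
    indices = np.array([[0, 1], [1, 2]])          # one (row, col) pair per stored value
    values = np.array([1, 2], dtype=np.float32)
    dense_shape = (3, 4)

    # Scatter the stored values into the dense equivalent.
    dense = np.zeros(dense_shape, dtype=np.float32)
    for (row, col), val in zip(indices, values):
        dense[row, col] = val

    # dense is now:
    # [[0. 1. 0. 0.]
    #  [0. 0. 2. 0.]
    #  [0. 0. 0. 0.]]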
From 11f78fb8ad88418036164019dc3c9dd7ce6f14cd Mon Sep 17 00:00:00 2001
From: kswang
Date: Fri, 17 Jul 2020 19:35:34 +0800
Subject: [PATCH 65/68] check invalid internal output

---
 mindspore/ccsrc/backend/session/session_basic.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/mindspore/ccsrc/backend/session/session_basic.cc b/mindspore/ccsrc/backend/session/session_basic.cc
index fa55b07fe5c..4c7db3d84ba 100644
--- a/mindspore/ccsrc/backend/session/session_basic.cc
+++ b/mindspore/ccsrc/backend/session/session_basic.cc
@@ -978,6 +978,16 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std::shared_ptr<KernelGraph> &graph) {
     bool internal_output = true;
     std::string kernel_target = GetCNodeTarget(front_real_kernel.first);
     for (auto user : users) {
+      auto cnode = user.first->cast<CNodePtr>();
+      if (cnode == nullptr) {
+        internal_output = false;
+        break;
+      }
+      auto prim = cnode->input(kAnfPrimitiveIndex);
+      if (prim == nullptr || !prim->isa<ValueNode>()) {
+        internal_output = false;
+        break;
+      }
       if (!AnfAlgo::IsRealKernel(user.first) || kernel_target != GetCNodeTarget(user.first)) {
         internal_output = false;
         break;
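The guard added above rejects a graph-output consumer unless it is a call node whose first input (the primitive slot, index 0) holds a value node; only then can the front kernel still be treated as an internal output. A rough Python sketch of that invariant follows, using stand-in classes rather than MindSpore's C++ node types, so names here are illustrative.

    class Primitive:
        """Stand-in for a primitive value node."""

    class CNode:
        """Stand-in for a call node; inputs[0] carries the primitive."""
        def __init__(self, inputs):
            self.inputs = inputs

    def is_valid_internal_user(user):
        # Mirrors the new checks: a non-CNode user, or a CNode whose first
        # input is not a primitive value node, disqualifies internal output.
        if not isinstance(user, CNode) or not user.inputs:
            return False
        return isinstance(user.inputs[0], Primitive)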
From b570dec7abd5feac581fecef10fd410aa927518c Mon Sep 17 00:00:00 2001
From: laiyongqiang
Date: Sat, 18 Jul 2020 09:38:12 +0800
Subject: [PATCH 66/68] add right align border for communication op's single
 output

---
 .../backend/optimizer/mem_reuse/mem_reuse_allocator.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc
index d10d8cd9493..f57a78863a9 100644
--- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc
+++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_reuse_allocator.cc
@@ -194,13 +194,20 @@ void BestFitMemReuse::AssignCommunicationNodeOutputOffset() {
   // add left align border for the first output and right align border for the last output to alloc align border memory
   size_t output_index = 0;
-  for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
+  auto output_ref_indexes = current_kernel_->GetOutputRefIndexs();
+  for (auto &tensor_idx : output_ref_indexes) {
     size_t index = GetTensorIndex(tensor_idx);
     auto tensor_desc = tensor_ptr_list_[index];
     MS_EXCEPTION_IF_NULL(tensor_desc);
     if (output_index == 0 || output_index == output_num - 1) {
       tensor_desc->size_ += kDefaultMemAlignSize;
     }
+
+    if ((output_index == 0) && (output_ref_indexes.size() == 1)) {
+      // add right align border for single output
+      tensor_desc->size_ += kDefaultMemAlignSize;
+    }
+
     output_index++;
   }
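Before this patch, a communication op with exactly one output only ever received one alignment border: the output is both first and last, but the `output_index == 0 || output_index == output_num - 1` branch fires once. The new block adds the missing right border. A small sketch of the resulting size accounting; the concrete value of kDefaultMemAlignSize is an assumption here, with 512 used purely for illustration.

    K_ALIGN = 512  # stand-in for kDefaultMemAlignSize; exact value assumed

    def padded_size(raw_size, output_index, output_num):
        size = raw_size
        # left border for the first output, right border for the last
        if output_index == 0 or output_index == output_num - 1:
            size += K_ALIGN
        # the fix: a single output is both first and last, but the branch
        # above only fired once, so add the missing right border explicitly
        if output_index == 0 and output_num == 1:
            size += K_ALIGN
        return size

    assert padded_size(1024, 0, 1) == 1024 + 2 * K_ALIGN  # single output: both borders
    assert padded_size(1024, 0, 3) == 1024 + K_ALIGN      # first of three: left only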
From 4ad4d583ad2c66bc8647e995b404bd5e520d2748 Mon Sep 17 00:00:00 2001
From: caifubi
Date: Fri, 17 Jul 2020 11:38:20 +0800
Subject: [PATCH 67/68] Fix Security Bug

---
 mindspore/ccsrc/debug/common.cc               |  4 ++
 mindspore/ccsrc/debug/data_dump_parser.cc     | 41 ++++++++++++++++---
 mindspore/ccsrc/debug/data_dump_parser.h      |  7 ++--
 .../device/ascend/ascend_kernel_runtime.cc    |  3 ++
 .../runtime/device/ascend/dump/data_dumper.cc |  3 +-
 5 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/mindspore/ccsrc/debug/common.cc b/mindspore/ccsrc/debug/common.cc
index 6caf7e2c393..931e6d4b858 100644
--- a/mindspore/ccsrc/debug/common.cc
+++ b/mindspore/ccsrc/debug/common.cc
@@ -120,6 +120,10 @@ std::optional<std::string> Common::GetConfigFile(const std::string &env) {
     MS_LOG(ERROR) << dump_config_file << " not exist.";
     return {};
   }
+  auto suffix = dump_config_file.substr(dump_config_file.find_last_of('.') + 1);
+  if (suffix != "json") {
+    MS_LOG(EXCEPTION) << "[DataDump] dump config file suffix only supports json! But got: " << suffix;
+  }
   return dump_config_file;
 }
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/data_dump_parser.cc b/mindspore/ccsrc/debug/data_dump_parser.cc
index 55c66e055ba..0f8e1bb598e 100644
--- a/mindspore/ccsrc/debug/data_dump_parser.cc
+++ b/mindspore/ccsrc/debug/data_dump_parser.cc
@@ -29,7 +29,7 @@ void DataDumpParser::ResetParam() {
   net_name_.clear();
   dump_mode_ = 0;
   dump_step_ = 0;
-  kernel_set_.clear();
+  kernel_map_.clear();
 }
 
 bool DataDumpParser::DumpEnabled() const {
@@ -60,9 +60,18 @@ std::optional<std::string> DataDumpParser::GetDumpPath() const {
     return {};
   }
   std::string dump_path_str(dump_path);
+  if (!std::all_of(dump_path_str.begin(), dump_path_str.end(), ::isalpha)) {
+    MS_LOG(EXCEPTION) << "[DataDump] dump path only supports alphabetic characters, but got: " << dump_path_str;
+  }
   return dump_path_str;
 }
 
+std::string GetIfstreamString(const std::ifstream &ifstream) {
+  std::stringstream buffer;
+  buffer << ifstream.rdbuf();
+  return buffer.str();
+}
+
 void DataDumpParser::ParseDumpConfig() {
   std::lock_guard<std::mutex> guard(lock_);
   MS_LOG(INFO) << "[DataDump] parse start";
@@ -84,7 +93,12 @@ void DataDumpParser::ParseDumpConfig() {
   }
 
   nlohmann::json j;
-  json_file >> j;
+  try {
+    json_file >> j;
+  } catch (nlohmann::json::parse_error &e) {
+    MS_LOG(ERROR) << "[DataDump] json contents:" << GetIfstreamString(json_file);
+    MS_LOG(EXCEPTION) << "[DataDump] parse json failed, error:" << e.what();
+  }
   if (j.find("DumpSettings") == j.end()) {
     MS_LOG(EXCEPTION) << "[DataDump] DumpSettings is not exist.";
   }
@@ -111,8 +125,8 @@ bool DataDumpParser::NeedDump(const std::string &op_full_name) const {
   if (dump_mode_ == 0) {
     return true;
   }
-  auto iter = kernel_set_.find(op_full_name);
-  return iter != kernel_set_.end();
+  auto iter = kernel_map_.find(op_full_name);
+  return iter != kernel_map_.end();
 }
 
 bool DataDumpParser::IsConfigExist(const nlohmann::json &dump_settings) const {
@@ -145,8 +159,25 @@ bool DataDumpParser::ParseDumpSetting(const nlohmann::json &dump_settings) {
     auto kernel_str = kernel.dump();
     kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end());
     MS_LOG(INFO) << "[DataDump] Need dump kernel:" << kernel_str;
-    kernel_set_.insert(kernel_str);
+    kernel_map_.insert({kernel_str, 0});
   }
   return true;
 }
+
+void DataDumpParser::MatchKernel(const std::string &kernel_name) {
+  auto iter = kernel_map_.find(kernel_name);
+  if (iter == kernel_map_.end()) {
+    return;
+  }
+  iter->second = iter->second + 1;
+  MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second;
+}
+
+void DataDumpParser::PrintUnusedKernel() {
+  for (const auto &iter : kernel_map_) {
+    if (iter.second == 0) {
+      MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
+    }
+  }
+}
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/debug/data_dump_parser.h b/mindspore/ccsrc/debug/data_dump_parser.h
index 751c61dd1a1..535ef4f6150 100644
--- a/mindspore/ccsrc/debug/data_dump_parser.h
+++ b/mindspore/ccsrc/debug/data_dump_parser.h
@@ -18,7 +18,7 @@
 #define MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_
 
 #include <string>
-#include <set>
+#include <map>
 #include <mutex>
 #include <optional>
 #include "nlohmann/json.hpp"
@@ -39,7 +39,8 @@ class DataDumpParser {
   const std::string &net_name() const { return net_name_; }
   uint32_t dump_mode() const { return dump_mode_; }
   uint32_t dump_step() const { return dump_step_; }
-  const std::set<std::string> &kernel_set() const { return kernel_set_; }
+  void MatchKernel(const std::string &kernel_name);
+  void PrintUnusedKernel();
 
  private:
   DataDumpParser() = default;
@@ -55,7 +56,7 @@ class DataDumpParser {
   std::string net_name_;
   uint32_t dump_mode_{0};
   uint32_t dump_step_{0};
-  std::set<std::string> kernel_set_;
+  std::map<std::string, uint32_t> kernel_map_;
 };
 }  // namespace mindspore
 #endif  // MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index c1e3bff79fe..aafbf757654 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -100,6 +100,9 @@ void AscendKernelRuntime::ClearGraphModelMap() {
     iter.second->UnloadDumpInfo();
   }
   graph_data_dumper_.clear();
+  // tell users which dump kernel names were not used
+  DataDumpParser::GetInstance().PrintUnusedKernel();
+
   for (auto &iter : graph_model_map_) {
     MS_LOG(INFO) << "Ge UnloadModel " << iter.first;
     auto ret = ModelRunner::Instance().UnloadModel(iter.first);
diff --git a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
index 61b3b04f739..a4509197cee 100644
--- a/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/dump/data_dumper.cc
@@ -62,6 +62,7 @@ void DataDumper::LoadDumpInfo() {
     }
     MS_LOG(INFO) << "[DataDump] LoadDumpInfo kernel:" << kernel->fullname_with_scope();
     dump_kernel_names_.emplace_back(kernel->fullname_with_scope());
+    DataDumpParser::GetInstance().MatchKernel(kernel->fullname_with_scope());
 
     aicpu::dump::Task task;
     ConstructDumpTask(NOT_NULL(kernel), NOT_NULL(&task));
@@ -84,7 +85,7 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_info) const {
     MS_LOG(EXCEPTION) << "Dump path invalid";
   }
   auto device_id = context_ptr->device_id();
-  dump_info->set_dump_path(dump_path.value() + "_" + std::to_string(device_id) + "/");
+  dump_info->set_dump_path("/" + dump_path.value() + "_" + std::to_string(device_id) + "/");
   MS_LOG(INFO) << "[DataDump] dump_path:" << dump_path.value();
 
   dump_info->set_model_name(DataDumpParser::GetInstance().net_name() + "_" + std::to_string(kernel_graph_->graph_id()));
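The hardened dump pipeline now refuses config files without a .json suffix, restricts the dump path to alphabetic characters, reports JSON parse failures together with the offending contents, and warns about configured kernels that were never matched. Below is a sketch of a settings file the parser would accept; only the top-level "DumpSettings" key is visible in this diff, so the inner field names are assumptions chosen to line up with the parsed members (net_name_, dump_mode_, dump_step_, kernel_map_).

    import json

    settings = {
        "DumpSettings": {
            "net_name": "ResNet50",              # assumed key; fills net_name_
            "mode": 1,                           # assumed key; 0 would dump every kernel
            "iteration": 0,                      # assumed key; fills dump_step_
            "kernels": ["Default/Conv2D-op12"],  # assumed key; seeds kernel_map_ with count 0
        }
    }

    # The file must now end in .json, or Common::GetConfigFile raises.
    with open("data_dump.json", "w") as f:
        json.dump(settings, f, indent=2)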
"[DataDump] dump_path:" << dump_path.value(); dump_info->set_model_name(DataDumpParser::GetInstance().net_name() + "_" + std::to_string(kernel_graph_->graph_id())); From a42ebf2cbc388a45117605289e3b50a8125de309 Mon Sep 17 00:00:00 2001 From: liubuyu Date: Sat, 18 Jul 2020 10:30:07 +0800 Subject: [PATCH 68/68] cmake bug fix --- cmake/package_script.cmake | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/package_script.cmake b/cmake/package_script.cmake index 94ffc71b494..0ade0af696f 100644 --- a/cmake/package_script.cmake +++ b/cmake/package_script.cmake @@ -1,13 +1,16 @@ # find exec find_package(Python3 3.7 COMPONENTS Interpreter Development) if (NOT Python3_FOUND) - message("No python3 found.") - return () + message(FATAL_ERROR "No python3 found.") endif () set(PYTHON ${Python3_EXECUTABLE}) set(PYTHON_VERSION ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}) +if (NOT PYTHON_VERSION MATCHES "3.7") + message(FATAL_ERROR "FIND PYTHON VERSION ${PYTHON_VERSION} BUT CAN NOT MATCH PYTHON VERSION 3.7") +endif () + find_package(Git) if (NOT GIT_FOUND) message("No git found.")