From fcd345f2cdc49cf8c9f6f2c736210f3cd16175b8 Mon Sep 17 00:00:00 2001
From: z00478463
Date: Mon, 25 May 2020 17:36:06 +0800
Subject: [PATCH 1/8] for thor ops

---
 mindspore/ops/_op_impl/_custom_op/__init__.py |   11 +
 .../_op_impl/_custom_op/batch_matmul_impl.py  |  243 ++++
 .../_op_impl/_custom_op/cholesky_trsm_impl.py |  104 ++
 .../_custom_op/fused_abs_max1_impl.py         |  840 +++++++++++++
 .../ops/_op_impl/_custom_op/img2col_impl.py   | 1051 +++++++++++++++++
 .../_custom_op/matmul_cube_dense_left_impl.py |  444 +++++++
 .../matmul_cube_dense_right_impl.py           |  142 +++
 .../matmul_cube_fracz_left_cast_impl.py       |  517 ++++++++
 .../matmul_cube_fracz_right_mul_impl.py       |  244 ++++
 .../_op_impl/_custom_op/matmul_cube_impl.py   |  390 ++++++
 .../_custom_op/matrix_combine_impl.py         |   75 ++
 .../_custom_op/transpose02314_impl.py         |  238 ++++
 .../_op_impl/custom_op/batch_matmul_impl.py   |   76 --
 .../ops/_op_impl/custom_op/cholesky_trsm.py   |   64 -
 .../ops/_op_impl/custom_op/fused_abs_max1.py  |   69 --
 .../ops/_op_impl/custom_op/img2col_impl.py    |   87 --
 .../custom_op/matmul_cube_dense_left.py       |  101 --
 .../matmul_cube_fracz_left_cast_impl.py       |  102 --
 .../matmul_cube_fracz_right_mul_impl.py       |  113 --
 .../_op_impl/custom_op/matmul_cube_impl.py    |  114 --
 .../_op_impl/custom_op/matrix_combine_impl.py |   63 -
 .../_op_impl/custom_op/transpose02314_impl.py |   63 -
 mindspore/ops/operations/__init__.py          |    2 +
 mindspore/ops/operations/thor_ops.py          |   83 +-
 24 files changed, 4368 insertions(+), 868 deletions(-)
 create mode 100644 mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py
 create mode 100644 mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py
 create mode 100644 mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py
 create mode 100644 mindspore/ops/_op_impl/_custom_op/img2col_impl.py
 create mode 100644 mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py
 create mode 100644 mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py
 create mode 100644 mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py
 create mode 100644 mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py
 create mode 100644 mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py
 create mode 100644 mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py
 create mode 100644 mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py
 delete mode 100644 mindspore/ops/_op_impl/custom_op/batch_matmul_impl.py
 delete mode 100644 mindspore/ops/_op_impl/custom_op/cholesky_trsm.py
 delete mode 100644 mindspore/ops/_op_impl/custom_op/fused_abs_max1.py
 delete mode 100644 mindspore/ops/_op_impl/custom_op/img2col_impl.py
 delete mode 100644 mindspore/ops/_op_impl/custom_op/matmul_cube_dense_left.py
 delete mode 100644 mindspore/ops/_op_impl/custom_op/matmul_cube_fracz_left_cast_impl.py
 delete mode 100644 mindspore/ops/_op_impl/custom_op/matmul_cube_fracz_right_mul_impl.py
 delete mode 100644 mindspore/ops/_op_impl/custom_op/matmul_cube_impl.py
 delete mode 100644 mindspore/ops/_op_impl/custom_op/matrix_combine_impl.py
 delete mode 100644 mindspore/ops/_op_impl/custom_op/transpose02314_impl.py

diff --git a/mindspore/ops/_op_impl/_custom_op/__init__.py b/mindspore/ops/_op_impl/_custom_op/__init__.py
index 5fe583a60fc..c2f5675d38b 100644
--- a/mindspore/ops/_op_impl/_custom_op/__init__.py
+++ b/mindspore/ops/_op_impl/_custom_op/__init__.py
@@ -14,3 +14,14 @@
 # ============================================================================
 """custom ops"""
+from .batch_matmul_impl import CusBatchMatMul
+from .cholesky_trsm_impl import CusCholeskyTrsm
+from .fused_abs_max1_impl import CusFusedAbsMax1
+from .img2col_impl import CusImg2Col
+from .matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft
+from .matmul_cube_dense_right_impl import CusMatMulCubeDenseRight
+from .matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast
+from .matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul
+from .matmul_cube_impl import CusMatMulCube
+from .matrix_combine_impl import CusMatrixCombine
+from .transpose02314_impl import CusTranspose02314
diff --git a/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py b/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py
new file mode 100644
index 00000000000..ddbc26a03de
--- /dev/null
+++ b/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py
@@ -0,0 +1,243 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""batch_matmul_impl"""
+
+from te import tik
+from topi.cce import util
+from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+
+cus_batchmatmul_op_info = TBERegOp("CusBatchMatMul") \
+    .fusion_type("OPAQUE") \
+    .async_flag(False) \
+    .binfile_name("batchmatmul.so") \
+    .compute_cost(10) \
+    .kernel_name("CusBatchMatMul") \
+    .partial_flag(True) \
+    .input(0, "x1", False, "required", "all") \
+    .input(1, "x2", False, "required", "all") \
+    .output(0, "y", False, "required", "all") \
+    .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \
+    .get_op_info()
+
+def _get_flattern_shape(shape):
+    flattern_shape = 1
+    for dim in shape:
+        flattern_shape *= dim
+    return (flattern_shape,)
+
+def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index):
+    input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf)
+    t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf)
+    tik_instance.data_move(input_1_local_UB, input1[input1_index], 0, 1, 16, 0, 0)
+    with tik_instance.for_range(0, 2) as vec_i:
+        tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64], 0, 64, 1, 1, 16, 0)
+    with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2:
+        input_2_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="input_2_local_UB",
+                                               scope=tik.scope_ubuf)
+        t_1_local_UB = input_2_local_UB
+        bisec_last_axis_local_UB = input_2_local_UB
+        matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64], name="matmul_hybrid_f_t_local_UB",
+                                                         scope=tik.scope_ubuf)
+        matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(dtype, [64],
+                                                                 name="matmul_hybrid_f_t_local_UB_dst_tmp",
+                                                                 scope=tik.scope_ubuf)
+        tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8)
+        tik_instance.data_move(input_2_local_UB, input2[input2_index + thread_idx2 * 8192], 0, 1, 1024, 0, 0)
+        tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB, 128, 1, 1, 1, 8, 8, 8)
+
tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64], 64, 1, 1, 1, + 16, 16, 16) + tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8) + with tik_instance.for_range(0, 64) as cc6: + tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6], bisec_last_axis_local_UB[cc6 * 128], + 1, 1, 1, 8) + tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp, + matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[res_index + thread_idx2 * 64], + matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0) + +def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index): + input_1_local_UB = tik_instance.Tensor(dtype, [64], name="input_1_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, input1[input1_index], 0, 1, 8, 0, 0) + with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2: + input_2_local_UB = tik_instance.Tensor(dtype, [32*64], name="input_2_local_UB", + scope=tik.scope_ubuf) + t_1_local_UB = input_2_local_UB + matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [32], name="matmul_hybrid_f_t_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_2_local_UB, input2[input2_index + thread_idx2 * 2048], 0, 1, 256, 0, 0) + tik_instance.vmul(64, t_1_local_UB, input_1_local_UB, input_2_local_UB, 32, 1, 1, 1, 8, 0, 8) + with tik_instance.for_range(0, 32) as cc6: + tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB[cc6], t_1_local_UB[cc6 * 64], + 1, 1, 1, 8) + tik_instance.data_move(res[res_index + thread_idx2 * 32], + matmul_hybrid_f_t_local_UB, 0, 1, 4, 0, 0) +@op_info_register(cus_batchmatmul_op_info) +def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"): + if util.get_product_version() == util.VERSION_MINI: + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + else: + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + x1_shape = input_x1.get("shape") + dtype = input_x1.get("dtype").lower() + x2_shape = input_x2.get("shape") + if dtype != input_x2.get("dtype").lower(): + raise RuntimeError("dtype of input_x1 and input_x2 must be same, but got %s vs %s" % ( + dtype, input_x2.get("dtype").lower())) + input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b) + support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True), + ((36, 128, 128), (36, 128, 128), "float32", False, True), + ((5, 128, 128), (5, 128, 128), "float32", False, True), + ((18, 128, 128), (18, 128, 128), "float32", False, True), + ((16, 128, 128), (16, 128, 128), "float32", False, True), + ((9, 128, 128), (9, 128, 128), "float32", False, True), + ((1, 64, 64), (1, 64, 64), "float32", False, True), + ((1, 128, 128), (1, 128, 128), "float32", False, True), + ((4, 128, 128), (4, 128, 128), "float32", False, True), + ((2, 128, 128), (2, 128, 128), "float32", False, True)] + if input_shape not in support_shape: + raise RuntimeError("input_shape %s is not supported" % str(input_shape)) + + + # if not transpose_a and transpose_b: + batch, m, k = x1_shape + _, n, _ = x2_shape + + input1_shape = _get_flattern_shape(x1_shape) + input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm) + input2_shape = _get_flattern_shape(x2_shape) + input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm) + + output_shape = x1_shape + res_shape = _get_flattern_shape(output_shape) + res = tik_instance.Tensor(dtype, 
res_shape, name="res", scope=tik.scope_gm) + + if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True): + with tik_instance.for_range(0, 18, block_num=18) as block_idx: + with tik_instance.for_range(0, 2) as cc0: + with tik_instance.for_range(0, 128, thread_num=2) as cc1: + input1_index = block_idx * 32768 + cc0*16384 + cc1 * 128 + input2_index = block_idx * 32768 + cc0*16384 + res_index = block_idx*32768 + cc0*16384 + cc1*128 + _inner_matmul_new(tik_instance, dtype, + input1, input1_index, + input2, input2_index, + res, res_index) + if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True): + with tik_instance.for_range(0, 30, block_num=30) as block_idx: + with tik_instance.for_range(0, 11) as cc1_db: + with tik_instance.for_range(0, 2, thread_num=2) as thread_idx: + with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)): + input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf) + t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, input1[(block_idx//6)*16384 + (block_idx % 6)*2816 + cc1_db * 256 + thread_idx*128], 0, 1, 16, 0, 0) + with tik_instance.for_range(0, 2) as vec_i: + tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64], 0, + 64, 1, 1, 16, 0) + with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2: + input_2_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="input_2_local_UB", + scope=tik.scope_ubuf) + t_1_local_UB = input_2_local_UB + bisec_last_axis_local_UB = input_2_local_UB + matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64], name="matmul_hybrid_f_t_local_UB", + scope=tik.scope_ubuf) + matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(dtype, [64], + name="matmul_hybrid_f_t_local_UB_dst_tmp", + scope=tik.scope_ubuf) + tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8) + tik_instance.data_move(input_2_local_UB, input2[(block_idx//6) * 16384 + thread_idx2*8192], 0, 1, + 1024, 0, 0) + tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB, 128, 1, 1, 1, 8, 8, 8) + tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64], 64, 1, 1, 1, + 16, 16, 16) + tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8) + with tik_instance.for_range(0, 64) as cc6: + tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6], bisec_last_axis_local_UB[cc6*128], + 1, 1, 1, 8) + tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp, + matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[(block_idx//6)*16384 + (block_idx%6)*2816 + cc1_db*256 + + thread_idx*128 + thread_idx2*64], + matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0) + + if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True): + with tik_instance.for_range(0, 18, block_num=18) as block_idx: + with tik_instance.for_range(0, 128, thread_num=2) as cc0: + input1_index = block_idx * 16384 + cc0 * 128 + input2_index = block_idx * 16384 + res_index = block_idx*16384 + cc0*128 + _inner_matmul_new(tik_instance, dtype, + input1, input1_index, + input2, input2_index, + res, res_index) + + + if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True): + with tik_instance.for_range(0, 27, block_num=27) as block_idx: + with tik_instance.for_range(0, 42, thread_num=2) as cc0: + input1_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 
+ cc0 * 128 + input2_index = (block_idx//3) * 16384 + res_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + cc0*128 + _inner_matmul_new(tik_instance, dtype, + input1, input1_index, + input2, input2_index, + res, res_index) + with tik_instance.if_scope((block_idx % 3) < 2): + input1_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + 42*128 + input2_index = (block_idx // 3) * 16384 + res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42*128 + _inner_matmul_new(tik_instance, dtype, + input1, input1_index, + input2, input2_index, + res, res_index) + + if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True): + with tik_instance.for_range(0, 32, block_num=32) as block_idx: + with tik_instance.for_range(0, 2, thread_num=2) as cc0: + input1_index = block_idx * 128 + cc0 * 64 + input2_index = 0 + res_index = block_idx * 128 + cc0 * 64 + _inner_matmul_new_1_64_32_64(tik_instance, dtype, + input1, input1_index, + input2, input2_index, + res, res_index) + + input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True), + ((2, 128, 128), (2, 128, 128), "float32", False, True), + ((4, 128, 128), (4, 128, 128), "float32", False, True), + ((8, 128, 128), (8, 128, 128), "float32", False, True), + ((16, 128, 128), (16, 128, 128), "float32", False, True) + ] + if input_shape in input_shape_list: + block_num = 32 + input1_unit_size = 128 + input2_unint_size = 128*128 + with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx: + block_process_ele_num = (batch * m * k) // block_num + loop_time = (batch*m*k)//block_num//input1_unit_size + thread_num = 2 + with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0: + input1_index = block_idx*block_process_ele_num + cc0*input1_unit_size + if batch > 1: + input2_index = block_idx//(block_num//batch) * input2_unint_size + else: + input2_index = 0 + res_index = block_idx*block_process_ele_num + cc0*input1_unit_size + _inner_matmul_new(tik_instance, dtype, + input1, input1_index, + input2, input2_index, + res, res_index) + + tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res]) + return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py b/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py new file mode 100644 index 00000000000..99c4b651c25 --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py @@ -0,0 +1,104 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""CusCholeskyTrsm""" +from te import tik +from topi.cce import util +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +cus_cholesky_trsm_op_info = TBERegOp("CusCholeskyTrsm") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("choleskytrsm.so") \ + .compute_cost(10) \ + .kernel_name("CusCholeskyTrsm") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + +@op_info_register(cus_cholesky_trsm_op_info) +def CusCholeskyTrsm(input_x,output, kernel_name): + input_x_shape = input_x.get("shape") + output_shape = output.get("shape") + split_dim = 128 + matrix_dim = input_x_shape[0] + split_dim = min(matrix_dim, split_dim) + vector_repeat_times = int(split_dim // 64) + blocks = int(matrix_dim // split_dim) + if blocks == 0: + blocks = 1 + if util.get_product_version() == util.VERSION_MINI: + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + else: + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (split_dim,split_dim), name="input_x_ub", scope=tik.scope_ubuf) + temp_ub = tik_instance.Tensor("float32", (split_dim,split_dim), name="temp_ub", scope=tik.scope_ubuf) + assist_1_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_1_ub", scope=tik.scope_ubuf) + assist_2_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_2_ub", scope=tik.scope_ubuf) + with tik_instance.for_range(0,split_dim) as i: + tik_instance.data_move(input_x_ub[i,0], input_x[block_index * split_dim + i, block_index * split_dim], 0, 1, vector_repeat_times * 8, 0, 0) + scalar1 = tik_instance.Scalar("float32", init_value = -0.5) + + with tik_instance.for_range(0, split_dim) as i: + scalar2= tik_instance.Scalar("float32") + tik_instance.vln(64, assist_1_ub[0], input_x_ub[i,0], vector_repeat_times, 1, 1, 8, 8) + tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1, vector_repeat_times, 1, 1, 8, 8) + tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0], vector_repeat_times, 1, 1, 8, 8) + scalar2.set_as(assist_1_ub[i]) + tik_instance.vmuls(64, input_x_ub[i,0], input_x_ub[i,0], scalar2, vector_repeat_times, 1, 1, 8, 8) + with tik_instance.for_range(i + 1, split_dim) as j: + scalar3= tik_instance.Scalar("float32") + scalar3.set_as(input_x_ub[i, j]) + tik_instance.vmuls(64,temp_ub[j, 0], input_x_ub[i, 0], scalar3, vector_repeat_times, 1, 1, 8, 8) + tik_instance.vsub(64,input_x_ub[i+1,0], input_x_ub[i+1,0], temp_ub[i+1,0], (split_dim-1-i) * vector_repeat_times, 1, 1, 1, 8, 8, 8) + + zero = tik_instance.Scalar("float32") + zero.set_as(0.0) + one = tik_instance.Scalar("float32") + one.set_as(1.0) + with tik_instance.for_range(0, split_dim) as i: + tik_instance.vector_dup(64, temp_ub[i,0], zero, vector_repeat_times, 1, 8) + temp_ub.__setitem__(i * split_dim + i, one) + + chol_diag_element_final = tik_instance.Scalar("float32") + chol_diag_element_final.set_as(input_x_ub[split_dim * split_dim - 1]) + trsm_diag_element = tik_instance.Scalar("float32") + trsm_diag_element.set_as(1.0 / chol_diag_element_final) + temp_ub.__setitem__(split_dim * 
split_dim - 1, trsm_diag_element) + + with tik_instance.for_range(1, split_dim) as i: + index = split_dim - i - 1 + tik_instance.vector_dup(64, assist_1_ub, zero, vector_repeat_times, 1, 8) + with tik_instance.for_range(0, i) as j: + chol_diag_element_loop = tik_instance.Scalar("float32") + chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j]) + tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop, vector_repeat_times,1,1,8,8) + tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times,1,1,1,8,8,8) + temp_scalar = tik_instance.Scalar("float32") + temp_scalar.set_as(input_x_ub[index, index]) + chol_diag_element = tik_instance.Scalar("float32") + chol_diag_element.set_as(1.0 / temp_scalar) + tik_instance.vsub(64,temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times,1,1,1,8,8,8) + tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element,vector_repeat_times,1,1,8,8) + + tik_instance.data_move(res[block_index,0,0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim,0,0) + + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) + return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py new file mode 100644 index 00000000000..0a53313d59d --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py @@ -0,0 +1,840 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""CusFusedAbsMax1""" +from te import tik +from topi.cce import util +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +cus_fused_abs_max1_op_info = TBERegOp("CusFusedAbsMax1") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("fusedabsmax1.so") \ + .compute_cost(10) \ + .kernel_name("CusFusedAbsMax1") \ + .partial_flag(True) \ + .attr("origin_shape", "required", "listInt", "all") \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + +@op_info_register(cus_fused_abs_max1_op_info) +def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs_max1"): + input_x_shape = input_x.get("shape") + output_shape = output.get("shape") + + if util.get_product_version() == util.VERSION_MINI: + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + else: + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + + + if len(input_x_shape) > 2: + if (input_x_shape[0] == 1 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 4 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 4): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time,1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif (input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 16 and input_x_shape[1] == 8): + if 
origin_shape[0] == 147 and (input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128) : + assert origin_shape[0] == 147 + assert origin_shape[1] == 147 + phase_1 = 16384 + phase_2 = 1216 + blocks = 32 + each_block_element = phase_1 // blocks + 64 + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[512 * block_index], 0, 1, 512 // 8, 0, 0) + line_id = block_index % 19 + tik_instance.data_move(input_x_ub[512], input_x[16384 + 128 * line_id], 0, 1, 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(19, input_x_ub, input_x_ub, input_x_ub[512], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + else: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, 
input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif (input_x_shape[0] == 4 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 8 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 8): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and 
input_x_shape[2] == 128) or (input_x_shape[0] == 32 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 32): + if (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[0] == 1000: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + blocks = 32 + each_block_element = 7 * 128 * 128 // 32 + 4 * 128 + phase_1 = 7 * 128 * 128 // 32 + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, 0) + move_idx = block_index % 8 + tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, 128 // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + vmask = 1000 - 7 * 128 - 64 + with tik_instance.for_range(0, 4) as loop_idx : + tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub[512], input_x_ub[2048], 24, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + + with tik_instance.for_range(0, 4) as loop_idx: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + + elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[0] == 1001: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + blocks = 32 + each_block_element = 7 * 128 * 128 // 32 + 4 * 
128 + phase_1 = 7 * 128 * 128 // 32 + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, 0) + move_idx = block_index % 9 + tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, 128 // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + vmask = 1001 - 7 * 128 - 64 + with tik_instance.for_range(0, 4) as loop_idx: + tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub[512], input_x_ub[2048], 24, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 4) as loop_idx: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + else: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + 
repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif (input_x_shape[0] == 16 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 16 and input_x_shape[1] == 64) or (input_x_shape[0] == 64 and input_x_shape[1] == 16): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, 
broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_x_shape[0] == 5 and input_x_shape[1] == 128 and input_x_shape[2] == 128 and origin_shape[0] == 576: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 69632 + blocks = 32 + each_block_element = total_elements // blocks + phase_1 = 2048 + phase_2 = 128 + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[65536 + phase_2 * block_index * 2], 0, 1, 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1 + 64], input_x[65536 + 128 + phase_2 * block_index * 2], 0, 1, 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub[2048], input_x_ub[2048], input_x_ub[2048+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif (input_x_shape[0] == 9 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or 
(input_x_shape[0] == 72 and input_x_shape[1] == 8): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096+256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096+128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_x_shape[0] == 18 and input_x_shape[1] == 128 and input_x_shape[2] == 128: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, 
input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif (input_x_shape[0] == 36 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 144 and input_x_shape[1] == 16): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time_1 = 255 + repeat_time_2 = each_block_element // 64 - 255 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, 
input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_x_shape[0] == 128 and input_x_shape[1] == 63: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time_1 = 255 + repeat_time_2 = each_block_element // 64 - 255 * 3 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 3 * 64], input_x_ub[repeat_time_1 * 3 * 64], repeat_time_2, 1, 1, 8, 8) + loop_size = each_block_element // 16384 + with tik_instance.for_range(0, loop_size) as loop_idx: + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], 
input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, loop_size - 1) as loop_idx: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, 8) + tail_element = each_block_element - 16384 * loop_size + repeats = tail_element // 64 + with tik_instance.for_range(0, repeats) as i : + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * loop_size + i * 64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, input_x_ub[64 + cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[2048 + 64], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[1024 + 64], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[512 + 64], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[256 + 64], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[128 + 64], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[64 + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], input_x_ub[64], 0, 1, 8, 0, 0) + elif (input_x_shape[0] == 32 and input_x_shape[1] == 128) or (input_x_shape[0] == 128 and input_x_shape[1] == 32): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time_1 = 255 + repeat_time_2 = each_block_element // 64 - 255 * 2 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], 
repeat_time_2, 1, 1, 8, 8) + loop_size = each_block_element // 16384 + with tik_instance.for_range(0, loop_size) as loop_idx: + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, loop_size - 1) as loop_idx: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_x_shape[0] == 288 and input_x_shape[1] == 32: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + assist_ub = tik_instance.Tensor("float32", (64,), name="assist_ub", scope=tik.scope_ubuf) + zero = tik_instance.Scalar("float32") + zero.set_as(0) + tik_instance.vector_dup(64, assist_ub, zero, 1, 1, 8) + input_x_ub = tik_instance.Tensor("float32", (32768,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + repeat_time_1 = 255 + repeat_time_2 = 32768 // 64 - 255 * 2 + + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 0], 0, 1, 4096, 0, 0) + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, 
input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) + + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, 0) + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 65536], 0, 1, 1024, 0, 0) + tik_instance.vabs(64, input_x_ub, input_x_ub, 128, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) + + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + 
data_temp.set_as(assist_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_x_shape[0] == 64 and input_x_shape[1] == 128: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + assist_ub = tik_instance.Tensor("float32", (64,), name="assist_ub", scope=tik.scope_ubuf) + zero = tik_instance.Scalar("float32") + zero.set_as(0) + tik_instance.vector_dup(64, assist_ub, zero, 1, 1, 8) + input_x_ub = tik_instance.Tensor("float32", (32768,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + repeat_time_1 = 255 + repeat_time_2 = 32768 // 64 - 255 * 2 + + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 0], 0, 1, 4096, 0, 0) + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) + + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, 0) + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64],
input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) + + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(assist_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif (input_x_shape[0] == 64 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 64): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time_1 = 255 + repeat_time_2 = each_block_element // 64 - 255 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 
1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_x_shape[0] == 36 and input_x_shape[1] == 4: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, 
broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_x_shape[0] == 4 and input_x_shape[1] == 4: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_x_shape[0] == 49 and input_x_shape[1] == 4: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, 24, 1, 1, 8, 8) + tik_instance.vabs(32, input_x_ub[1536], input_x_ub[1536], 1, 1, 1, 8, 8) + tik_instance.vmax(32, input_x_ub[1504], input_x_ub[1504], input_x_ub[1536], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, 
input_x_ub[1024], input_x_ub[1024], input_x_ub[1024+256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024+128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + elif input_x_shape[0] == 1 and input_x_shape[1] == 64 and input_x_shape[2] == 64: + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + total_elements = 1 + for val in input_x_shape: + total_elements *= val + blocks = 32 + each_block_element = total_elements // blocks + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + repeat_time = each_block_element // 64 + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0,64) as cc0: + data_temp = tik_instance.Scalar("float32") + data_temp.set_as(input_x_ub[cc0]) + tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) + + else: + raise RuntimeError("UnSupportedShape") + elif len(input_x_shape) == 2 and (input_x_shape[0] == 32 and input_x_shape[1] == 64): + input_x = tik_instance.Tensor("float32", 
input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + input_x_ub = tik_instance.Tensor("float32", (32*64,), name="input_x_ub", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x, 0, 1, 256, 0, 0) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.data_move(res[0], input_x_ub, 0, 1, 1, 0, 0) + else: + raise RuntimeError("UnSupportedShape") + + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) + return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/img2col_impl.py b/mindspore/ops/_op_impl/_custom_op/img2col_impl.py new file mode 100644 index 00000000000..0d69240dc43 --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/img2col_impl.py @@ -0,0 +1,1051 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
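Note on the CusFusedAbsMax1 branches above: each shape-specific branch applies vabs and then a chain of strided vmax calls that repeatedly halves the working set down to 64 lanes, broadcasts those 64 partial maxima with vector_dup, and reduces once more, so every AI-core block ends up writing 64 identical float32 values holding max(|x|) over its slice of the input. A minimal NumPy sketch of that assumed behaviour follows (a reading aid only, not part of the patch; the helper name and the row-constant output layout are assumptions of this sketch):

import numpy as np

def fused_abs_max1_reference(x, blocks=32):
    # Per-block max of |x|, replicated across a 64-float row per block,
    # matching the final data_move of 64 floats into res[block_index, 0].
    flat = np.abs(np.asarray(x, dtype=np.float32).reshape(-1))
    per_block = flat.reshape(blocks, -1).max(axis=1)   # one scalar per AI-core block
    return np.tile(per_block[:, None], (1, 64))        # shape (blocks, 64), row-constant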
+# ============================================================================ +"""CusImg2ColNC1HWC0""" +from te import tik +from topi.cce import util +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +cus_img2col_info = TBERegOp("CusImg2Col") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("img2col.so") \ + .compute_cost(10) \ + .kernel_name("CusImg2Col") \ + .partial_flag(True) \ + .attr("ksizes", "required", "listInt", "all") \ + .attr("strides", "required", "listInt", "all") \ + .attr("dilates", "required", "listInt", "all") \ + .attr("mode", "required", "str", "all") \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_FracNZ) \ + .get_op_info() + +@op_info_register(cus_img2col_info) +def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img2col"): + input_x_shape = input_x.get("shape") + input_x_dtype = input_x.get("dtype") + N, C1, H, W, C0 = input_x_shape + C = C1 * C0 + padding = 'SAME' + _, filter_h, filter_w, _ = ksizes + _, stride_h, stride_w, _ = strides + _, dilation_filter_h, dilation_filter_w, _ = dilates + + input_shape = (tuple(input_x_shape), input_x_dtype, (filter_h, filter_w), (stride_h, stride_w)) + supported_shape = [((32, 32, 14, 14, 16), 'float16', (3, 3), (2, 2)), + ((32, 1, 224, 224, 16), 'float16', (7, 7), (2, 2)), + ((32, 4, 56, 56, 16), 'float16', (3, 3), (1, 1)), + ((32, 8, 56, 56, 16), 'float16', (3, 3), (2, 2)), + ((32, 8, 28, 28, 16), 'float16', (3, 3), (1, 1)), + ((32, 16, 28, 28, 16), 'float16', (3, 3), (2, 2)), + ((32, 16, 14, 14, 16), 'float16', (3, 3), (1, 1)), + ((32, 32, 7, 7, 16), 'float16', (3, 3), (1, 1)), + ((32, 64, 14, 14, 16), 'float16', (1, 1), (1, 1)), + ((32, 32, 7, 7, 16), 'float16', (1, 1), (1, 1)), + ((32, 4, 56, 56, 16), 'float16', (1, 1), (1, 1)), + ((32, 64, 14, 14, 16), 'float16', (1, 1), (2, 2)), + ((32, 128, 7, 7, 16), 'float16', (1, 1), (1, 1)), + ((32, 32, 28, 28, 16), 'float16', (1, 1), (2, 2)), + ((32, 16, 56, 56, 16), 'float16', (1, 1), (2, 2)), + ((32, 8, 28, 28, 16), 'float16', (1, 1), (1, 1)), + ((32, 32, 28, 28, 16), 'float16', (1, 1), (1, 1)), + ((32, 16, 14, 14, 16), 'float16', (1, 1), (1, 1)), + ((32, 16, 56, 56, 16), 'float16', (1, 1), (1, 1)),] + + + if input_shape not in supported_shape: + raise RuntimeError("input_shape %s is not supported" % str(input_shape)) + + output_tmp = [N * int(H // stride_h) * int(W // stride_w), filter_h * filter_w * C] + output_shape = [output_tmp[1] // 16, output_tmp[0] // 16, 16, 16] + if util.get_product_version() == util.VERSION_MINI: + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + else: + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + + input_x = tik_instance.Tensor("float16", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm) + + if input_shape == ((32, 1, 224, 224, 16), 'float16', (7, 7), (2, 2)): + pad = [3,3,3,3] + l1_h = 56 + l1_w = 224 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53760,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 12544, 0, 0) + with 
tik_instance.for_range(0,7) as eeb: + with tik_instance.for_range(0,7) as cc0: + temp = eeb % 2 + rep = ((55 - temp - (-3 + eeb)) // 2 + 1) * 7 + fetch_filter_w = cc0 + fetch_filter_h = eeb + left_top_w = -3 + left_top_h = -3 + + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + + with tik_instance.for_range(0, rep) as cc1: + tik_instance.data_move(res[cc0 + eeb * 7,cc1 + 784 * block_index,0,0], input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) + + with tik_instance.for_range(1,3) as eeb0: + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,56*eeb0,0,0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0,7) as eeb: + with tik_instance.for_range(0,7) as cc0: + temp = eeb % 2 + rep_prefix = ((55 - temp - (-3 + eeb)) // 2 + 1) * 7 + rep = 196 + fetch_filter_w = cc0 + fetch_filter_h = eeb + left_top_w = -3 + + left_top_h = 1 + ((55 - temp - (-3 + eeb)) // 2 - 29) * 2 + + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, rep) as cc1: + tik_instance.data_move(res[cc0 + eeb * 7,cc1 + rep_prefix + (eeb0 - 1) * rep + 784 * block_index, 0,0], input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) + + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,56*3,0,0], 0, 1, 12544, 0, 0) + + with tik_instance.for_range(0, 7) as eeb: + with tik_instance.for_range(0, 7) as cc0: + temp = eeb % 2 + rep_prefix = ((55 - temp - (-3 + eeb)) // 2 + 1) * 7 + 196 * 2 + rep = 784 - rep_prefix + fetch_filter_w = cc0 + fetch_filter_h = eeb + left_top_w = -3 + left_top_h = 1 + ((55 - temp - (-3 + eeb)) // 2 - 29) * 2 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + + with tik_instance.for_range(0, rep) as cc1: + tik_instance.data_move(res[cc0 + eeb * 7,cc1 + rep_prefix + 784 * block_index,0,0], input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) + + if input_shape == ((32, 4, 56, 56, 16), 'float16', (3, 3), (1, 1)): + pad = [1,1,1,1] + l1_h = 56 + l1_w = 56 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 9) as eeb0: + rep = 196 + fetch_filter_w = eeb0 % 3 + fetch_filter_h = eeb0 // 3 + left_top_w = -1 + left_top_h = -1 + with tik_instance.for_range(0, 4) as eeb1: + c1_index = eeb1 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + 
filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, rep) as i: + tik_instance.data_move(res[eeb1 * 9 + eeb0,i + 196 * block_index,0,0], input_1_1_fractal_L1_local_UB[i * 256], 0, 1, 16, 0, 0) + + if input_shape == ((32, 8, 56, 56, 16), 'float16', (3, 3), (2, 2)): + pad = [1,1,1,1] + l1_h = 56 + l1_w = 56 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (401408,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (112896,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0,8) as eeb0: + with tik_instance.for_range(0,9) as eeb1: + rep = 49 + fetch_filter_w = eeb1 % 3 + fetch_filter_h = eeb1 // 3 + left_top_w = -1 + left_top_h = -1 + c1_index = eeb0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[49*256*eeb1], input_1_1_local_L1, + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0,9) as eeb1: + with tik_instance.for_range(0,49) as i: + tik_instance.data_move(res[eeb1 + eeb0 * 9,49 * block_index + i,0,0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + + if input_shape == ((32, 8, 28, 28, 16), 'float16', (3, 3), (1, 1)): + pad = [1,1,1,1] + l1_h = 28 + l1_w = 28 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (112896,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 6272, 0, 0) + with tik_instance.for_range(0,8) as eeb0: + with tik_instance.for_range(0,9) as eeb1: + rep = 49 + fetch_filter_w = eeb1 % 3 + fetch_filter_h = eeb1 // 3 + left_top_w = -1 + left_top_h = -1 + c1_index = eeb0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[49*256*eeb1], input_1_1_local_L1, + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0,9) as eeb1: + with tik_instance.for_range(0,49) as i: + tik_instance.data_move(res[eeb1 + eeb0 * 9,49 * block_index + i,0,0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + + if input_shape == ((32, 16, 28, 28, 16), 'float16', (3, 3), (2, 2)): + pad = [1,1,1,1] + l1_h = 28 + l1_w = 28 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + eeb0 = block_index % 2 + eeb1 = block_index // 2 + input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = 
"input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0,16) as i: + tik_instance.data_move(input_1_1_local_L1[i * 12544], input_x[i + 16 * eeb0,eeb1,0,0,0], 0, 1, 784, 0, 0) + + with tik_instance.for_range(0,9) as eeb3: + rep = 13 + fetch_filter_w = eeb3 % 3 + fetch_filter_h = eeb3 // 3 + left_top_w = -1 + left_top_h = -1 + c1_index = 0 + with tik_instance.for_range(0,16) as i: + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * i], input_1_1_local_L1[12544 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0,16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196,0,0) + + with tik_instance.for_range(196 * eeb0, 196 * (eeb0 + 1)) as i: + tik_instance.data_move(res[eeb1 * 9 + eeb3,i,0,0], input_1_2_fractal_L1_local_UB[256 * (i - 196 * eeb0)],0,1,16,0,0) + + if input_shape == ((32, 16, 14, 14, 16), 'float16', (3, 3), (1, 1)): + pad = [1,1,1,1] + l1_h = 14 + l1_w = 14 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + eeb0 = block_index % 2 + eeb1 = block_index // 2 + input_1_1_local_L1 = tik_instance.Tensor("float16", (50176,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0,16) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i + 16 * eeb0,eeb1,0,0,0], 0, 1, 196, 0, 0) + + with tik_instance.for_range(0,9) as eeb3: + rep = 13 + fetch_filter_w = eeb3 % 3 + fetch_filter_h = eeb3 // 3 + left_top_w = -1 + left_top_h = -1 + c1_index = 0 + with tik_instance.for_range(0,16) as i: + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * i], input_1_1_local_L1[3136 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0,16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196,0,0) + + with tik_instance.for_range(196 * eeb0, 196 * (eeb0 + 1)) as i: + tik_instance.data_move(res[eeb1 * 9 + eeb3,i,0,0], input_1_2_fractal_L1_local_UB[256 * (i - 196 * eeb0)],0,1,16,0,0) + + if input_shape == ((32, 32, 14, 14, 16), 'float16', (3, 3), (2, 2)): + pad = [1,1,1,1] + l1_h = 14 + l1_w = 14 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,block_index,0,0,0], 0, 1, 196, 0, 0) + with tik_instance.for_range(0,9) as eeb: + rep 
= 4 + fetch_filter_w = eeb % 3 + fetch_filter_h = eeb // 3 + left_top_w = -1 + left_top_h = -1 + c1_index = 0 + with tik_instance.for_range(0,32) as i: + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[3136 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + + with tik_instance.for_range(0,98) as i: + tik_instance.data_move(res[eeb + block_index * 9, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + + if input_shape == ((32, 64, 14, 14, 16), 'float16', (1, 1), (2, 2)): + pad = [0,0,0,0] + l1_h = 14 + l1_w = 14 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + + with tik_instance.for_range(0,2) as eeb0: + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,block_index * 2 + eeb0,0,0,0], 0, 1, 196, 0, 0) + with tik_instance.for_range(0,32) as i: + rep = 4 + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_w = 0 + left_top_h = 0 + c1_index = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[3136 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + + with tik_instance.for_range(0,98) as i: + tik_instance.data_move(res[eeb0 + block_index * 2, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + + if input_shape == ((32, 32, 7, 7, 16), 'float16', (3, 3), (1, 1)): + pad = [1,1,1,1] + l1_h = 7 + l1_w = 7 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i,block_index,0,0,0], 0, 1, 49, 0, 0) + + with tik_instance.for_range(0, 9) as eeb: + rep = 4 + fetch_filter_w = eeb % 3 + fetch_filter_h = eeb // 3 + left_top_w = -1 + left_top_h = -1 + c1_index = 0 + with tik_instance.for_range(0,32) as i: + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + 
filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + + with tik_instance.for_range(0,98) as i: + tik_instance.data_move(res[eeb + block_index * 9, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + + if input_shape == ((32, 128, 7, 7, 16), 'float16', (1, 1), (1, 1)): + pad = [0,0,0,0] + l1_h = 7 + l1_w = 7 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0,4) as eeb0: + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i,eeb0 + block_index * 4,0,0,0], 0, 1, 49, 0, 0) + with tik_instance.for_range(0,32) as i: + rep = 4 + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_w = 0 + left_top_h = 0 + c1_index = 0 + with tik_instance.for_range(0,32) as i: + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + + with tik_instance.for_range(0,98) as i: + tik_instance.data_move(res[eeb0 + block_index * 4, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + + if input_shape == ((32, 64, 14, 14, 16), 'float16', (1, 1), (1, 1)): + pad = [0,0,0,0] + l1_h = 14 + l1_w = 14 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_2_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_2_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,block_index * 2,0,0,0], 0, 1, 196, 0, 0) + tik_instance.data_move(input_1_2_local_L1[i * 3136], input_x[i,block_index * 2 + 1,0,0,0], 0, 1, 196, 0, 0) + with tik_instance.for_range(0,2) as eeb1: + with tik_instance.for_range(eeb1 * 16, (eeb1 + 1) * 16) as i: + rep = 13 + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_w = 0 + left_top_h = 0 + c1_index = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - eeb1 * 16)], input_1_1_local_L1[3136 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + 
repeat_mode, + rep) + with tik_instance.for_range(0,16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) + with tik_instance.for_range(eeb1 * 196, (eeb1 + 1) * 196) as i: + tik_instance.data_move(res[block_index * 2, i, 0, 0], input_1_2_fractal_L1_local_UB[256*(i - eeb1 * 196)], 0,1,16,0,0) + + with tik_instance.for_range(0,2) as eeb1: + with tik_instance.for_range(eeb1 * 16, (eeb1 + 1) * 16) as i: + rep = 13 + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_w = 0 + left_top_h = 0 + c1_index = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - eeb1 * 16)], input_1_2_local_L1[3136 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0,16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) + with tik_instance.for_range(eeb1 * 196, (eeb1 + 1) * 196) as i: + tik_instance.data_move(res[block_index * 2 + 1, i, 0, 0], input_1_2_fractal_L1_local_UB[256*(i - eeb1 * 196)], 0,1,16,0,0) + + if input_shape == ((32, 32, 28, 28, 16), 'float16', (1, 1), (2, 2)): + pad = [0,0,0,0] + l1_h = 28 + l1_w = 28 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (401408,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 12544], input_x[i,block_index,0,0,0], 0, 1, 784, 0, 0) + with tik_instance.for_range(0,16) as i: + rep = 13 + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_w = 0 + left_top_h = 0 + c1_index = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * i], input_1_1_local_L1[12544 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0,16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) + with tik_instance.for_range(0,196) as i: + tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + + with tik_instance.for_range(16,32) as i: + rep = 13 + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_w = 0 + left_top_h = 0 + c1_index = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - 16)], input_1_1_local_L1[12544 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0,16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) + with tik_instance.for_range(196,392) as i: + tik_instance.data_move(res[block_index, i, 0, 0], 
input_1_2_fractal_L1_local_UB[256*(i-196)], 0,1,16,0,0) + + if input_shape == ((32, 32, 7, 7, 16), 'float16', (1, 1), (1, 1)): + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 7 + l1_w = 7 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088, ), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i, block_index, 0, 0, 0], 0, 1, 49, 0, 0) + + with tik_instance.for_range(0,32) as i: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 4) + + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) + with tik_instance.for_range(0,98) as i: + tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[i * 256], 0, 1, 16, 0, 0) + + if input_shape == ((32, 4, 56, 56, 16), 'float16', (1, 1), (1, 1)): + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 56 + l1_w = 56 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (12544 * 32 // 2, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (100352 // 2,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 4) as eeb: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1[eeb * 56 * 56 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 196) + with tik_instance.for_range(0, 196) as rep: + tik_instance.data_move(res[eeb, rep + block_index * 196, 0, 0], input_1_1_fractal_L1_local_UB[rep * 256], 0, 1, 16, 0, 0) + + if input_shape == ((32, 8, 28, 28, 16), 'float16', (1, 1), (1, 1)): + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 28 + l1_w = 28 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (6272 * 32 // 2, ), 
scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 6272, 0, 0) + with tik_instance.for_range(0, 1) as eeb0: + with tik_instance.for_range(0, 8) as eeb1: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], input_1_1_local_L1[(eeb1 + eeb0 * 8) * 28 * 28 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 49) + with tik_instance.for_range(0, 8) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + + if input_shape == ((32, 32, 28, 28, 16), 'float16', (1, 1), (1, 1)): + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 28 + l1_w = 28 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0, 4) as eeb0: + with tik_instance.for_range(0, 8) as eeb1: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], input_1_1_local_L1[(eeb1 + eeb0 * 8) * 28 * 28 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 49) + with tik_instance.for_range(0, 8) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + + if input_shape == ((32, 16, 14, 14, 16), 'float16', (1, 1), (1, 1)): + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 14 + l1_w = 14 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + eeb0 = block_index % 2 + eeb1 = block_index // 2 + input_1_1_local_L1 = tik_instance.Tensor("float16", (196 * 32 * 16,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (106496 // 2,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (196 * 16 * 16,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0,32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,eeb1,0,0,0], 0, 1, 196, 0, 0) + with 
tik_instance.for_range(0,16) as i: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[i * 3328], input_1_1_local_L1[i * 3136 + eeb0 * 16 * 3136], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 13) + with tik_instance.for_range(0,16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196,0,0) + with tik_instance.for_range(0, 196) as i: + tik_instance.data_move(res[eeb1, i + 196 * eeb0, 0, 0],input_1_2_fractal_L1_local_UB[256 * i], 0, 1, 16, 0, 0) + + if input_shape == ((32, 16, 56, 56, 16), 'float16', (1, 1), (1, 1)): + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 56 + l1_w = 56 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (196 * 256 * 2,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + with tik_instance.for_range(0,2) as eeb0: + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,eeb0 * 8,0,0,0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0,4) as eeb1: + with tik_instance.for_range(0,2) as eeb2: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb2 * 196 * 256], input_1_1_local_L1[(eeb2 + eeb1 * 2) * 56 * 56 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 196) + with tik_instance.for_range(0,2) as eeb2: + with tik_instance.for_range(0,196) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1 * 2 + eeb2, i + block_index * 196, 0, 0],input_1_1_fractal_L1_local_UB[256 * i + eeb2 * 196 * 256], 0, 1, 16, 0, 0) + + if input_shape == ((32, 16, 56, 56, 16), 'float16', (1, 1), (2, 2)): + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 56 + l1_w = 56 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0,32,block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") + with tik_instance.for_range(0,2) as eeb0: + tik_instance.data_move(input_1_1_local_L1, input_x[block_index,eeb0 * 8,0,0,0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0,8) as eeb1: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], input_1_1_local_L1[eeb1 * 56 * 56 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, 
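+                        # Remaining load3dv1 arguments: jump_stride, repeat_mode and the
+                        # repeat count. The count of 49 equals 28*28/16 fractal rows, which
+                        # is consistent with a stride-2 walk over the 56x56 feature map in
+                        # this branch.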
+ jump_stride, + repeat_mode, + 49) + with tik_instance.for_range(0,8) as eeb1: + with tik_instance.for_range(0,49) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0],input_1_1_fractal_L1_local_UB[256 * i + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) + return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py new file mode 100644 index 00000000000..ad9eb696f88 --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py @@ -0,0 +1,444 @@ +# -*- coding:utf-8 -*- +""" +copyright 2020 Huawei Technologies Co., Ltd + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License == distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +matmul +""" +from __future__ import absolute_import +import te.lang.cce +import te.platform.cce_params as cce +from te.platform.fusion_manager import fusion_manager +from te import tvm +from topi import generic +from topi.cce import util + +from impl.matmul_vector import matmul_vector_cce + +from te import tik +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +# General limitation of the size for input shape: 2**31 +SHAPE_SIZE_LIMIT = 2147483648 +NoneType = type(None) + +matmul_cube_dense_left_op_info = TBERegOp("CusMatMulCubeDenseLeft") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("matmulcubedenseleft.so") \ + .compute_cost(10) \ + .kernel_name("CusMatMulCubeDenseLeft") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .input(2, "x3", False, "optional", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \ + .get_op_info() + +# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals, +def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): + """ + Check the given input if legal + + Parameters: + shape_a: list or tuple + Shape of the first tensor a with rank > 1 + shape_b: list or tuple + Shape of the second tensor b with the same type with a, + and shape_a, shape_b must be 2 dims + shape_bias: list or tuple + Shape of bias, only support the input data format with ND + src_dtype: str + The data type of input, support "float32", "float16" + trans_a: bool + If True, shape_a == transposed before multiplication + trans_b: bool + If True, shape_b == transposed before multiplication + + Returns None + """ + shape_len = len(shape_a) + src_dtype = src_dtype.lower() + k_block_size = cce.BLOCK_REDUCE + + check_list = ("float16") + + if src_dtype not in check_list: + raise RuntimeError("matmul_cce only support %s while src_dtype == %s" + % (",".join(check_list), src_dtype)) + if shape_len != len(shape_b): + raise RuntimeError("length of a and b are not equal") + + if shape_len != 2: + raise RuntimeError( + "length of shape must be 2, more than 2 
dimensions should use batch_matmul now!") + + is_gevm = True if shape_a[-2] == 1 or shape_a[-1] == 1 else False + is_gemv = True if shape_b[-2] == 1 or shape_b[-1] == 1 else False + + if trans_a: + m_shape = shape_a[shape_len - 1] + km_shape = shape_a[shape_len - 2] + else: + m_shape = shape_a[shape_len - 2] + km_shape = shape_a[shape_len - 1] + + if trans_b: + kn_shape = shape_b[shape_len - 1] + n_shape = shape_b[shape_len - 2] + else: + kn_shape = shape_b[shape_len - 2] + n_shape = shape_b[shape_len - 1] + + if m_shape == 1: + if n_shape == 1: + raise RuntimeError("input shape M and N can't both be 1") + + if km_shape != kn_shape: + print(km_shape, kn_shape) + raise RuntimeError("reduce axis not same") + + if m_shape % cce.BLOCK_IN != 0 and m_shape != 1: + raise RuntimeError( + "input shape M should be 1 or multiple of %d" % cce.BLOCK_IN) + + if m_shape != 1: + if n_shape == 1: + if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + raise RuntimeError("input shape K1 should be multiple of %d" + % (cce.BLOCK_IN*cce.BLOCK_IN)) + elif km_shape%k_block_size != 0: + raise RuntimeError( + "input shape K1 should be multiple of %d" % cce.BLOCK_IN) + else: + if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + raise RuntimeError("input shape K1 should be multiple of %d" + % (cce.BLOCK_IN*cce.BLOCK_IN)) + + if n_shape % cce.BLOCK_IN != 0 and n_shape != 1: + raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN) + + if len(shape_bias): + if len(shape_bias) == 1: + if is_gevm or is_gemv: + if shape_bias[0] != m_shape*n_shape: + raise RuntimeError("broadcast case shape bias for gemv must be equal m*n") + else: + if shape_bias[0] != n_shape: + raise RuntimeError("broadcast bias shape must be equal to shape n") + elif len(shape_bias) == shape_len: + if [i for i in shape_bias[-2:]] != [m_shape, n_shape]: + raise RuntimeError("non broadcast bias shape must be same as output shape") + else: + raise RuntimeError("unsupport input shape now for batch bias case") + +def _get_bias(shape_bias): + bias_length = shape_bias[0] + if bias_length % 16 ==0: + return shape_bias + else: + bias_length = (bias_length // 16)*16 + 16 + shape_bias = [] + shape_bias.append(bias_length) + return shape_bias + +def _get_input_shape(shape_x): + dim_a = shape_x[0] + dim_b = shape_x[1] + res = [] + if dim_a % 16 !=0: + dim_a = (dim_a // 16)*16 + 16 + res.append(dim_a) + else: + res.append(dim_a) + + if dim_b % 16 !=0: + dim_b = (dim_b // 16)*16 + 16 + res.append(dim_b) + else: + res.append(dim_b) + return res + +def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + shape_a = input_x1.get("shape") + shape_b = input_x2.get("shape") + print("shape_a: ", shape_a) + print("shape_b: ", shape_b) + src_dtype = input_x1.get("dtype") + util.check_kernel_name(kernel_name) + util.check_shape_rule(shape_a) + util.check_shape_rule(shape_b) + util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) + util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) + if bias is not None and bool(bias): + shape_bias = bias.get("shape") + try: + trans_a_f = bool(1-trans_a) + if src_dtype == "float32" or src_dtype == "int32": + if len(shape_a) != 2 and len(shape_b) != 2: + return False + if trans_b: + if shape_b[0] == 1: + return False + else: + if shape_b[1] == 1: + return False + if trans_a: + if trans_b: + if shape_a[0] != shape_b[1]: + return False + elif shape_a[0] != shape_b[0]: + return False + elif trans_b: + if shape_a[1] != shape_b[1]: + return False + elif shape_a[1] != 
shape_b[0]: + return False + + if trans_a_f and trans_b and shape_b[1] == 1: + return False + + if src_dtype == "float16": + if len(shape_a) != 2 and len(shape_b) != 2: + return False + + if trans_a: + m_shape = shape_a[1] + k_shape = shape_a[0] + else: + m_shape = shape_a[0] + k_shape = shape_a[1] + + if trans_b: + n_shape = shape_b[0] + k_b_shape = shape_b[1] + else: + n_shape = shape_b[1] + k_b_shape = shape_b[0] + + if k_shape != k_b_shape: + return False + + if m_shape == 1 or n_shape == 1: + if k_shape % 256 != 0: + return False + + except RuntimeError as e: + return False + + return True + +# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements +# @util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str) +@op_info_register(matmul_cube_dense_left_op_info) +def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + """ + calculating matrix multiplication with bias, C = A*B + bias, support input + data with fractal format. + + Parameters: + shape_a: list or tuple + Shape of the first tensor a with rank > 1 + shape_b: list or tuple + Shape of the second tensor b with the same type with a, + and shape_a, shape_b must be 2 dims + src_dtype: str + The data type of input, support "float32", "float16" + dst_dtype: str + The data type of output, support "float32", "float16" + trans_a: bool + If True, shape_a == transposed before multiplication + trans_b: bool + If True, shape_b == transposed before multiplication + is_fractal: bool + If True, the input data format of a and b must be fractal format + shape_bias: list or tuple + Shape of bias, only support the input data format with ND + + Returns + ------- + None + """ + print("!!!!come into zzt~~~~~~~!!!!") + shape_a = input_x1.get("ori_shape") + shape_b = input_x2.get("ori_shape") + shape_output = output_y.get("ori_shape") + print("============") + print(input_x1.get("format"), input_x2.get("format")) + print(shape_a, shape_b) + print("============") + if input_x2.get("format") == "FRACTAL_Z": + n,c,h,w = shape_b + c0 = 16 + c1 = c // c0 + if c1 == 0: + c1 = 1 + shape_b = [n, c1 * h * w * c0] + shape_a = [n,n] + + if input_x1.get("format") == "FRACTAL_Z": + n,c,h,w = shape_a + c0 = 16 + c1 = c // c0 + if c1 == 0: + c1 = 1 + shape_a = [n, c1 * h * w * c0] + shape_b = [c1 * h * w * c0, c1 * h * w * c0] + + if input_x2.get("format") == "FRACTAL_NZ": + shape_a = [shape_b[0], shape_b[0]] + shape_b = shape_b + + if input_x1.get("format") == "FRACTAL_NZ": + shape_a = shape_a + shape_b = [shape_a[1], shape_a[1]] + + shape_a = list(shape_a) + shape_b = list(shape_b) + + shape_a = _get_input_shape(shape_a) + shape_b = _get_input_shape(shape_b) + + util.check_kernel_name(kernel_name) + util.check_shape_rule(shape_a) + util.check_shape_rule(shape_b) + util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) + util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) + + shape_a = [shape_a[1], shape_a[0]] + trans_a = bool(1-trans_a) + + shape_b = [shape_b[1], shape_b[0]] + trans_b = bool(1-trans_b) + + shape_bias = () + if bias is not None and bool(bias): + shape_bias = bias.get("shape") + shape_bias = list(shape_bias) + shape_bias = _get_bias(shape_bias) + + src_dtype = input_x1.get("dtype").lower() + dst_dtype = output_y.get("dtype").lower() + _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b) + + m_shape = shape_a[len(shape_a) - 2] + km_shape = shape_a[len(shape_a) - 1] + kn_shape = shape_b[len(shape_a) - 2] + n_shape = 
shape_b[len(shape_a) - 1] + + if src_dtype == "float16": + block_reduce = cce.BLOCK_REDUCE + + block_in = cce.BLOCK_IN + block_out = cce.BLOCK_OUT + + if trans_a and km_shape == 1: + block_in = cce.BLOCK_VECTOR + + if not trans_a and m_shape == 1: + block_in = cce.BLOCK_VECTOR + + if trans_b and kn_shape == 1: + block_out = cce.BLOCK_VECTOR + + if not trans_b and n_shape == 1: + block_out = cce.BLOCK_VECTOR + + if trans_a: + shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) + else: + shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) + + if trans_b: + shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) + else: + shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) + shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) + format_a = "FRACTAL_NZ" + shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) + format_b = "FRACTAL_NZ" + + print("=======================================") + print(shape_a_temp, shape_b_temp) + print(format_a, format_b) + print("=======================================") + tensor_bias = None + tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', + dtype=src_dtype) + tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', + dtype=src_dtype) + + if len(shape_bias) > 0: + tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', + dtype=dst_dtype) + + if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 63: + if util.get_product_version() == util.VERSION_MINI: + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + else: + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + + input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm) + input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm) + resMatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm) + with tik_instance.for_range(0,32,block_num=32) as block_index: + resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf, name = "resMatmul_local_UB") + resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc, name = "resMatmul_local_UB") + input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca, name = "input_1_local_L1_local_L0A") + input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf, name = "input_2_local_L1") + input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf, name = "input_1_local_L1") + input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B") + core_m_idx = block_index % 8 + core_n_idx = block_index // 8 + with tik_instance.if_scope(core_m_idx != 7): + tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128, 55 * 16, 0) + tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, 32, 128, 55 * 16, 0) + with tik_instance.for_range(0, 8) as cc12: + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256], 0, 8, 8, 0, False) + with tik_instance.for_range(0, 2) as cc6: + with tik_instance.for_range(0, 8) as cc121: + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096], 
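+                        # load2dv1 copies 16x16 fp16 fractal blocks from L1 into L0B; the
+                        # trailing True asks the hardware to transpose each block into the
+                        # layout this kernel expects for the right-hand operand. The mmad
+                        # that follows accumulates a 128x128 by 128x256 tile product in L0C.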
input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True) + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 128, 128, 256, 0) + tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0, 1) + tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], resMatmul_local_UB, 0, 16, 256 // 2 , 0, 55 * 16 * 2 // 2) + with tik_instance.else_scope(): + tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112, 56 * 16, 0) + tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, 32, 112, 56 * 16, 0) + with tik_instance.for_range(0, 7) as cc10: + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256], 0, 7, 7, 0, False) + with tik_instance.for_range(0, 2) as cc5: + with tik_instance.for_range(0, 7) as cc101: + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096], input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True) + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 112, 112, 256, 0) + tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 112, 0, 0, 1) + tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], resMatmul_local_UB, 0, 16, 224 // 2 , 0, 56 * 16 * 2 // 2) + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resMatmul]) + return tik_instance + else: + print("come into tbe, shape is error!") + result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a, + format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias) + + with tvm.target.cce(): + schedule = generic.auto_schedule(result) + + tensor_list = [tensor_a, tensor_b, result] + if len(shape_bias) > 0: + tensor_list = [tensor_a, tensor_b, tensor_bias, result] + + config = {"print_ir": False, + "name": kernel_name, + "tensor_list": tensor_list} + + te.lang.cce.cce_build_code(schedule, config) diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py new file mode 100644 index 00000000000..ade39a6ddb7 --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +""" +copyright 2020 Huawei Technologies Co., Ltd + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License == distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +matmul +""" +from __future__ import absolute_import +import te.lang.cce +import te.platform.cce_params as cce +from te.platform.fusion_manager import fusion_manager +from te import tvm +from topi import generic +from topi.cce import util +from impl.matmul_vector import matmul_vector_cce +from te import tik +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("matmulcubedenseright.so") \ + .compute_cost(10) \ + .kernel_name("CusMatMulCubeDenseRight") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .input(2, "x3", False, "required", "all") \ + .input(3, "x4", False, "optional", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, DataType.F32_FracNZ) \ + .get_op_info() + +@op_info_register(matmul_cube_dense_right_op_info) +def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + shape_a_temp = (128, 63, 16, 16) + shape_b_temp = (128, 128, 16, 16) + shape_output = output_y.get("shape") + matrix_max_shape = (1,) + support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape),] + shape_a_input = input_x1.get("shape") + shape_b_input = input_x2.get("shape") + matrix_max_input = input_x3.get("shape") + input_shape = (tuple(shape_a_input), tuple(shape_b_input), tuple(matrix_max_input)) +if input_shape not in support_shape: + raise RuntimeError("input_shape %s is not supported" % str(input_shape)) + + if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 128: + if util.get_product_version() == util.VERSION_MINI: + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + else: + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm) + input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm) + input_x3 = tik_instance.Tensor("float32", [1,], name="matrix_max", scope=tik.scope_gm) + resMatmul = tik_instance.Tensor("float32", shape_output, name="output", scope=tik.scope_gm) + with tik_instance.for_range(0, 32, block_num=32) as block_index: + core_m_idx = block_index // 16 + core_n_idx = block_index % 16 + matrix_max_scalar = tik_instance.Scalar("float32") + matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope = tik.scope_ubuf, name = "matrix_max_local_UB") + tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0, 0) + matrix_max_scalar.set_as(matrix_max_local_UB[0]) + + resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf, name = "resMatmul_local_UB") + resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_ubuf, name = "resMatmul_local_UB1") + + resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc, name = "resMatmul_local_UB_local_L0C") + resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc, name = "resMatmul_local_UB_local_L0C1") + + input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca, name = "input_1_local_L1_local_L0A") + input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), 
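+            # Per-core working set: UB/L0C tiles for the fp32 result, an L0A tile for the
+            # left operand and L1 staging buffers for both operands; the *1-suffixed copies
+            # declared below handle the smaller tail tile. matrix_max_scalar, read from
+            # input_x3 above, is later multiplied into the result with vmuls.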
scope=tik.scope_cbuf, name = "input_2_local_L1") + input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, name = "input_2_local_L11") + + input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf, name = "input_1_local_L1") + input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf, name = "input_1_local_L11") + + input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B") + input_2_local_L1_local_L0B1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B1") + + with tik_instance.if_scope(core_m_idx == 0): + with tik_instance.for_range(0, 2) as cc1: + tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) + tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256, 752, 0) + with tik_instance.for_range(0, 8) as cc10: + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True) + with tik_instance.for_range(0, 16) as cc101: + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False) + + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 256, 128, 128, 0) + tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0) + tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255,1,1,8,8) + tik_instance.vmuls(64, resMatmul_local_UB[255*64], resMatmul_local_UB[255*64], matrix_max_scalar, 255,1,1,8,8) + tik_instance.vmuls(64, resMatmul_local_UB[510*64], resMatmul_local_UB[510*64], matrix_max_scalar, 2,1,1,8,8) + + tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB, 0, 8, 512, 0, 1504) + with tik_instance.else_scope(): + tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) + tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096], 0, 8, 256, 752, 0) + with tik_instance.for_range(0, 8) as cc10: + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True) + with tik_instance.for_range(0, 16) as cc101: + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False) + + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 256, 128, 128, 0) + tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0) + tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255,1,1,8,8) + tik_instance.vmuls(64, resMatmul_local_UB[255*64], resMatmul_local_UB[255*64], matrix_max_scalar, 255,1,1,8,8) + tik_instance.vmuls(64, resMatmul_local_UB[510*64], resMatmul_local_UB[510*64], matrix_max_scalar, 2,1,1,8,8) + + tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB, 0, 8, 512, 0, 1504) + + tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) + tik_instance.data_move(input_1_local_L11, input_x1[core_n_idx * 129024 + 12288], 0, 8, 240, 768, 0) + + with tik_instance.for_range(0, 8) as cc102: + tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048], input_2_local_L11[cc102 * 256], 
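+                    # Tail tile of the else branch: the last 240 output rows do not fill a
+                    # full 256-row tile, so they use the separate *_L11/*_L0B1/*_L0C1
+                    # buffers, and the mmad below runs with m=240 (15 fractal blocks
+                    # instead of 16).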
0, 8, 8, 0, True) + with tik_instance.for_range(0, 16) as cc103: + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048], input_1_local_L11[cc103 * 256], 0, 8, 15, 0, False) + + tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B1, 240, 128, 128, 0) + tik_instance.data_move(resMatmul_local_UB1, resMatmul_local_UB_local_L0C1, 0, 1, 120, 0, 0) + + tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar, 255,1,1,8,8) + tik_instance.vmuls(64, resMatmul_local_UB1[255*64], resMatmul_local_UB1[255*64], matrix_max_scalar, 225,1,1,8,8) + + tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288], resMatmul_local_UB1, 0, 8, 480, 0, 1536) + + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul]) + return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py new file mode 100644 index 00000000000..955e6889fbf --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py @@ -0,0 +1,517 @@ +# -*- coding:utf-8 -*- +""" +copyright 2020 Huawei Technologies Co., Ltd + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License == distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +matmul +""" +from __future__ import absolute_import +import te.platform.cce_params as cce +from te import tvm +from topi.cce import util +from te import tik +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +# General limitation of the size for input shape: 2**31 +SHAPE_SIZE_LIMIT = 2147483648 +NoneType = type(None) + +matmul_cube_fracz_left_cast_op_info = TBERegOp("CusMatMulCubeFraczLeftCast") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("matmulcubefraczleftcast.so") \ + .compute_cost(10) \ + .kernel_name("CusMatMulCubeFraczLeftCast") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .input(2, "x3", False, "optional", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_Default, DataType.F32_FracZ, DataType.F16_Default, DataType.F16_FracZ) \ + .get_op_info() + +# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals, +def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): + """ + Check the given input if legal + + Parameters: + shape_a: list or tuple + Shape of the first tensor a with rank > 1 + shape_b: list or tuple + Shape of the second tensor b with the same type with a, + and shape_a, shape_b must be 2 dims + shape_bias: list or tuple + Shape of bias, only support the input data format with ND +src_dtype: str + The data type of input, support "float32", "float16" + trans_a: bool + If True, shape_a == transposed before multiplication + trans_b: bool + If True, shape_b == transposed before multiplication + + Returns None + """ + shape_len = len(shape_a) + src_dtype = src_dtype.lower() + k_block_size = cce.BLOCK_REDUCE + + check_list = ("float16") + + if src_dtype not in check_list: + raise RuntimeError("matmul_cce only support %s while src_dtype == %s" + % (",".join(check_list), src_dtype)) + if shape_len != len(shape_b): + raise RuntimeError("length of a and b are not equal") + + if shape_len != 2: + raise RuntimeError( + "length of shape must be 2, more than 2 dimensions should use batch_matmul now!") + + is_gevm = True if shape_a[-2] == 1 or shape_a[-1] == 1 else False + is_gemv = True if shape_b[-2] == 1 or shape_b[-1] == 1 else False + + if trans_a: + m_shape = shape_a[shape_len - 1] + km_shape = shape_a[shape_len - 2] + else: + m_shape = shape_a[shape_len - 2] + km_shape = shape_a[shape_len - 1] + + if trans_b: + kn_shape = shape_b[shape_len - 1] + n_shape = shape_b[shape_len - 2] + else: + kn_shape = shape_b[shape_len - 2] + n_shape = shape_b[shape_len - 1] + + if m_shape == 1: + if n_shape == 1: + raise RuntimeError("input shape M and N can't both be 1") + + if km_shape != kn_shape: + print(km_shape, kn_shape) + raise RuntimeError("reduce axis not same") + + if m_shape % cce.BLOCK_IN != 0 and m_shape != 1: + raise RuntimeError( + "input shape M should be 1 or multiple of %d" % cce.BLOCK_IN) + + if m_shape != 1: + if n_shape == 1: + if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: + raise RuntimeError("input shape K1 should be multiple of %d" + % (cce.BLOCK_IN * cce.BLOCK_IN)) + elif km_shape % k_block_size != 0: + raise RuntimeError( + "input shape K1 should be multiple of %d" % cce.BLOCK_IN) + else: + if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: + raise RuntimeError("input shape K1 should be multiple of %d" + % (cce.BLOCK_IN * cce.BLOCK_IN)) + + if n_shape % cce.BLOCK_IN != 0 and n_shape != 1: + raise RuntimeError("input shape N should be 1 
or multiple of %d" % cce.BLOCK_IN) + + if len(shape_bias): + if len(shape_bias) == 1: + if is_gevm or is_gemv: + if shape_bias[0] != m_shape * n_shape: + raise RuntimeError("broadcast case shape bias for gemv must be equal m*n") + else: + if shape_bias[0] != n_shape: + raise RuntimeError("broadcast bias shape must be equal to shape n") + elif len(shape_bias) == shape_len: + if [i for i in shape_bias[-2:]] != [m_shape, n_shape]: + raise RuntimeError("non broadcast bias shape must be same as output shape") + else: + raise RuntimeError("unsupport input shape now for batch bias case") + +def _get_bias(shape_bias): + bias_length = shape_bias[0] + if bias_length % 16 == 0: + return shape_bias + else: + bias_length = (bias_length // 16) * 16 + 16 + shape_bias = [] + shape_bias.append(bias_length) + return shape_bias + +def _get_input_shape(shape_x): + dim_a = shape_x[0] + dim_b = shape_x[1] + res = [] + if dim_a % 16 != 0: + dim_a = (dim_a // 16) * 16 + 16 + res.append(dim_a) + else: + res.append(dim_a) + + if dim_b % 16 != 0: + dim_b = (dim_b // 16) * 16 + 16 + res.append(dim_b) + else: + res.append(dim_b) + return res + +def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + shape_a = input_x1.get("shape") + shape_b = input_x2.get("shape") + print("shape_a: ", shape_a) + print("shape_b: ", shape_b) + src_dtype = input_x1.get("dtype") + util.check_kernel_name(kernel_name) + util.check_shape_rule(shape_a) + util.check_shape_rule(shape_b) + util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) + util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) + if bias is not None and bool(bias): + shape_bias = bias.get("shape") + try: + trans_a_f = bool(1 - trans_a) + if src_dtype == "float32" or src_dtype == "int32": + if len(shape_a) != 2 and len(shape_b) != 2: + return False + if trans_b: + if shape_b[0] == 1: + return False + else: + if shape_b[1] == 1: + return False + if trans_a: + if trans_b: + if shape_a[0] != shape_b[1]: + return False + elif shape_a[0] != shape_b[0]: + return False + elif trans_b: + if shape_a[1] != shape_b[1]: + return False + elif shape_a[1] != shape_b[0]: + return False + + if trans_a_f and trans_b and shape_b[1] == 1: + return False + + if src_dtype == "float16": + if len(shape_a) != 2 and len(shape_b) != 2: + return False + + if trans_a: + m_shape = shape_a[1] + k_shape = shape_a[0] + else: + m_shape = shape_a[0] + k_shape = shape_a[1] + + if trans_b: + n_shape = shape_b[0] + k_b_shape = shape_b[1] + else: + n_shape = shape_b[1] + k_b_shape = shape_b[0] + + if k_shape != k_b_shape: + return False + + if m_shape == 1 or n_shape == 1: + if k_shape % 256 != 0: + return False + + except RuntimeError as e: + return False + + return True + +# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements +@op_info_register(matmul_cube_fracz_left_cast_op_info) +def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, + kernel_name="CusMatMulCubeFraczLeftCast"): + """ + calculating matrix multiplication with bias, C = A*B + bias, support input + data with fractal format. 
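+    In this variant the float32 FracZ operand (x2) is cast to float16 on chip (see the
+    vconv calls in cus_cube_matmul_cast) before the cube matmul, and the result is written
+    back as float16 in FracZ format.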
+ + Parameters: + shape_a: list or tuple + Shape of the first tensor a with rank > 1 + shape_b: list or tuple + Shape of the second tensor b with the same type with a, + and shape_a, shape_b must be 2 dims + src_dtype: str + The data type of input, support "float32", "float16" + dst_dtype: str + The data type of output, support "float32", "float16" + trans_a: bool + If True, shape_a == transposed before multiplication + trans_b: bool + If True, shape_b == transposed before multiplication + is_fractal: bool + If True, the input data format of a and b must be fractal format + shape_bias: list or tuple + Shape of bias, only support the input data format with ND + + Returns + ------- + None + """ + shape_a = input_x1.get("ori_shape") + shape_b = input_x2.get("ori_shape") + print("============") + print(input_x1.get("format"), input_x2.get("format")) + print(shape_a, shape_b) + print("============") + if input_x2.get("format") == "FRACTAL_Z": + n, c, h, w = shape_b + c0 = 16 + c1 = c // c0 + if c1 == 0: + c1 = 1 + shape_b = [n, c1 * h * w * c0] + shape_a = [n, n] + + if input_x1.get("format") == "FRACTAL_Z": + n, c, h, w = shape_a + c0 = 16 + c1 = c // c0 + if c1 == 0: + c1 = 1 + shape_a = [n, c1 * h * w * c0] + shape_b = [c1 * h * w * c0, c1 * h * w * c0] + + if input_x2.get("format") == "FRACTAL_NZ": + shape_a = [shape_b[0], shape_b[0]] + shape_b = shape_b + + if input_x1.get("format") == "FRACTAL_NZ": + shape_a = shape_a + shape_b = [shape_a[1], shape_a[1]] + + shape_a = list(shape_a) + shape_b = list(shape_b) + + shape_a = _get_input_shape(shape_a) + shape_b = _get_input_shape(shape_b) + + util.check_kernel_name(kernel_name) + util.check_shape_rule(shape_a) + util.check_shape_rule(shape_b) + util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) + util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) + + shape_a = [shape_a[1], shape_a[0]] + trans_a = bool(1 - trans_a) + + shape_b = [shape_b[1], shape_b[0]] + trans_b = bool(1 - trans_b) + + shape_bias = () + if bias is not None and bool(bias): + shape_bias = bias.get("shape") + shape_bias = list(shape_bias) + shape_bias = _get_bias(shape_bias) + + src_dtype = input_x1.get("dtype").lower() + _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b) + + m_shape = shape_a[len(shape_a) - 2] + km_shape = shape_a[len(shape_a) - 1] + kn_shape = shape_b[len(shape_a) - 2] + n_shape = shape_b[len(shape_a) - 1] + + if src_dtype == "float16": + block_reduce = cce.BLOCK_REDUCE + + block_in = cce.BLOCK_IN + block_out = cce.BLOCK_OUT + + if trans_a and km_shape == 1: + block_in = cce.BLOCK_VECTOR + + if not trans_a and m_shape == 1: + block_in = cce.BLOCK_VECTOR + + if trans_b and kn_shape == 1: + block_out = cce.BLOCK_VECTOR + + if not trans_b and n_shape == 1: + block_out = cce.BLOCK_VECTOR + + if trans_a: + shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) + else: + shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) + + if trans_b: + shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) + else: + shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) + shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) + shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) + + if util.get_product_version() == util.VERSION_MINI: + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + else: + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + input_x1 = 
tik_instance.Tensor(input_x1.get("dtype"), shape_a_temp, name="left_matrix", scope=tik.scope_gm) + input_x2 = tik_instance.Tensor(input_x2.get("dtype"), shape_b_temp, name="right_matrix", scope=tik.scope_gm) + res_matmul = tik_instance.Tensor(output_y.get("dtype"), output_y.get("shape"), name="output", scope=tik.scope_gm) + DIAG_SIZE = 128 + mo_tile, ko_tile, no_tile, diag_opt = get_cus_tile_info(input_x1, input_x2, DIAG_SIZE) + cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, res_matmul, + mo_tile=mo_tile, ko_tile=ko_tile, no_tile=no_tile, + diag_opt=diag_opt, diag_size=DIAG_SIZE) + diag_opt=diag_opt, diag_size=DIAG_SIZE) + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[res_matmul]) + return tik_instance + +def get_cus_tile_info(input_x1, input_x2, diag_size): + tile_map = { + ((32, 32, 16, 16), (128, 32, 16, 16)): (8, 8, 16), + ((8, 8, 16, 16), (72, 8, 16, 16)): (8, 8, 4), + ((32, 32, 16, 16), (288, 32, 16, 16)): (8, 8, 12), + ((128, 128, 16, 16), (32, 128, 16, 16)): (8, 8, 16), + ((16, 16, 16, 16), (144, 16, 16, 16)): (8, 8, 9), + ((64, 64, 16, 16), (16, 64, 16, 16)): (8, 8, 4), + ((16, 16, 16, 16), (64, 16, 16, 16)): (8, 8, 4), + ((32, 32, 16, 16), (8, 32, 16, 16)): (8, 8, 1), + ((128, 128, 16, 16), (64, 128, 16, 16)): (8, 8, 16), + ((16, 16, 16, 16), (4, 16, 16, 16)): (8, 8, 1), + ((16, 16, 16, 16), (32, 16, 16, 16)): (8, 8, 2), + ((64, 64, 16, 16), (32, 64, 16, 16)): (8, 8, 8), + ((32, 32, 16, 16), (64, 32, 16, 16)): (8, 8, 8), + ((32, 32, 16, 16), (16, 32, 16, 16)): (8, 8, 2), + ((8, 8, 16, 16), (32, 8, 16, 16)): (8, 8, 1), + ((8, 8, 16, 16), (16, 8, 16, 16)): (4, 8, 1), + ((4, 4, 16, 16), (16, 4, 16, 16)): (2, 4, 1), + ((4, 4, 16, 16), (4, 4, 16, 16)): (1, 4, 1), + ((4, 4, 16, 16), (36, 4, 16, 16)): (2, 4, 3), + ((4, 4, 16, 16), (49, 4, 16, 16)): (1, 4, 7) + } + shape_info = (tuple(input_x1.shape), tuple(input_x2.shape)) + diag_opt = False + if input_x1.shape[0]*input_x1.shape[3] > diag_size: + diag_opt = True + if shape_info not in tile_map: + raise ValueError("shape %s is not supported" % str(shape_info)) + mo_tile, ko_tile, no_tile = tile_map[shape_info] + return mo_tile, ko_tile, no_tile, diag_opt + +def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, + res, mo_tile, ko_tile, no_tile, diag_opt=False, diag_size=128): + ko, mo, mi, ki = input_x1.shape + no, ko, ki, ni = input_x2.shape + c0 = input_x1.shape[-1] + diag_outer = diag_size // c0 + maxblocknum = 32 + fp32_size = 4 + fp16_size = 2 + blocksize = 32 + vectorfp32_size = 64 + if [input_x1.shape[-1], input_x1.shape[-2], input_x2.shape[-1], input_x2.shape[-2]] != [c0, c0, c0, c0]: + raise ValueError("shape of input_x1 or input_x2 is not supported!") + if not trans_a or not trans_b: + raise ValueError("only trans_a=False and trans_b=False be supported!") + + core_m_num = mo // mo_tile + loop_n_num = no // no_tile + if loop_n_num * core_m_num <= maxblocknum: + core_n_num = loop_n_num + else: + core_n_num = maxblocknum // core_m_num + if core_n_num > 0 and loop_n_num % core_n_num == 0: + loop_n_num = loop_n_num // core_n_num + else: + raise ValueError("Does not support this scenario!") + block_num = core_m_num * core_n_num + + loop_k_num = ko // ko_tile + if diag_opt: + loop_k_num = diag_outer // ko_tile + # double buffer: + thread_num_k = 2 + loop_k_num *= thread_num_k + ko_tile_inner = ko_tile // thread_num_k + with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx: + core_m = block_idx // core_n_num + core_n = block_idx 
% core_n_num + with tik_instance.for_range(0, loop_n_num) as cc_n: + res_L0C = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0], + name="resMatmul_L0C", scope=tik.scope_cc) + with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k: + # input_x2 -> input_x2_ub -(fp322fp16)-> input_x2_cast_ub -> input_x2_L1 + input_x2_ub = tik_instance.Tensor("float32", [no_tile, ko_tile_inner, c0, c0], name="input_x2_ub", + scope=tik.scope_ubuf) + if diag_opt: + k_idx = core_m * mo_tile + thread_idx_k * ko_tile_inner + else: + k_idx = thread_idx_k * ko_tile_inner + tik_instance.data_move(input_x2_ub, + input_x2[(core_n * loop_n_num + cc_n) * no_tile, + k_idx, 0, 0], + 0, no_tile, ko_tile_inner * c0 * c0 * fp32_size // blocksize, + (ko - ko_tile_inner) * c0 * c0 * fp32_size // blocksize, 0) + input_x2_cast_ub = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], + name="input_x2_cast_ub", scope=tik.scope_ubuf) + repeate_num = no_tile * ko_tile_inner * c0 * c0 // vectorfp32_size + repeate_times_max = 255 + count = 0 + while repeate_num > repeate_times_max: + tik_instance.vconv(vectorfp32_size, 'none', + input_x2_cast_ub[count * repeate_times_max * vectorfp32_size], + input_x2_ub[count * repeate_times_max * vectorfp32_size], + repeate_times_max, + 1, 1, 4, 8) + repeate_num -= repeate_times_max + count += 1 + tik_instance.vconv(vectorfp32_size, 'none', + input_x2_cast_ub[count * repeate_times_max * vectorfp32_size], + input_x2_ub[count * repeate_times_max * vectorfp32_size], repeate_num, + 1, 1, 4, 8) + input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], + name="input_x2_L1", scope=tik.scope_cbuf) + tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1, + no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0) + # input_x1 -> input_x1_L1 + input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0], + name="input_x1_L1", scope=tik.scope_cbuf) + tik_instance.data_move(input_x1_L1, + input_x1[k_idx, + core_m * mo_tile, 0, 0], + 0, ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize, + (mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0) + # input_x2_L1 -> input_x2_L0B + input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0], + name="input_x2_L0B", scope=tik.scope_cb) + with tik_instance.for_range(0, ko_tile_inner) as cc2: + tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 0, no_tile, + ko_tile_inner, + 0, True) + # input_x1_L1 -> input_x1_L0A + input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0], + name="input_x1_L0A", scope=tik.scope_ca) + with tik_instance.for_range(0, mo_tile) as cc1: + tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0, ko_tile_inner, + mo_tile, 0, False) + with tik_instance.if_scope(thread_idx_k == 0): + tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, + ko_tile_inner * c0, no_tile * c0, 0) + with tik_instance.else_scope(): + tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, + ko_tile_inner * c0, no_tile * c0, 1) + res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0], + name="resMatmul_ub", scope=tik.scope_ubuf) + tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1) + tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0], + res_ub, 0, no_tile, + mo_tile * c0 * c0 * fp16_size // blocksize, 0, + (mo - mo_tile) * c0 * c0 * fp16_size // blocksize) diff 
--git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py new file mode 100644 index 00000000000..321ec602f74 --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +""" +copyright 2020 Huawei Technologies Co., Ltd + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License == distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +matmul +""" +from __future__ import absolute_import +import te.lang.cce +import te.platform.cce_params as cce +from te.platform.fusion_manager import fusion_manager +from te import tvm +from topi import generic +from topi.cce import util +from te import tik +from impl.matmul_vector import matmul_vector_cce +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +# General limitation of the size for input shape: 2**31 +SHAPE_SIZE_LIMIT = 2147483648 +NoneType = type(None) + +cus_matmul_cube_fracz_right_mul_op_info = TBERegOp("CusMatMulCubeFraczRightMul") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("matmulcubefraczrightmul.so") \ + .compute_cost(10) \ + .kernel_name("CusMatMulCubeFraczRightMul") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .input(2, "x3", False, "required", "all") \ + .input(3, "x4", False, "optional", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, DataType.F32_FracZ) \ + .get_op_info() + + +@op_info_register(cus_matmul_cube_fracz_right_mul_op_info) +def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + if util.get_product_version() == util.VERSION_MINI: + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + else: + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + + input_x1_shape = input_x1.get("shape") + input_x1_dtype = input_x1.get("dtype").lower() + input_x2_shape = input_x2.get("shape") + input_x2_dtype = input_x2.get("dtype").lower() + input_x3_shape = input_x3.get("shape") + input_x3_dtype = input_x3.get("dtype").lower() + output_shape = output_y.get("shape") + Supported = [((72, 8, 16, 16),"float16", (72, 72, 16, 16), "float16", (1,), "float32"), + ((32, 8, 16, 16),"float16", (32, 32, 16, 16), "float16", (1,), "float32"), + ((8, 32, 16, 16),"float16", (8, 8, 16, 16), "float16", (1,), "float32"), + ((4, 4, 16, 16),"float16", (4, 4, 16, 16), "float16", (1,), "float32"), + ((4, 16, 16, 16), 'float16', (4, 4, 16, 16), 'float16', (1,), 'float32'), + ((49, 4, 16, 16), 'float16', (49, 49, 16, 16), 'float16', (1,), 'float32'), + ((36, 4, 16, 16), 'float16', (36, 36, 16, 16), 'float16', (1,), 'float32'), + ((64, 16, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'), + ((32, 64, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'), + ((32, 16, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 
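+                 # Whitelist of supported (x1 shape, x1 dtype, x2 shape, x2 dtype, x3 shape,
+                 # x3 dtype) combinations for this hand-tuned kernel; any other input raises
+                 # RuntimeError below instead of running with an untested tiling.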
'float32'), + ((16, 32, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32'), + ((16, 8, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32'), + ((16, 4, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32'), + ((288, 32, 16, 16), 'float16', (288, 288, 16, 16), 'float16', (1,), 'float32'), + ((144, 16, 16, 16), 'float16', (144, 144, 16, 16), 'float16', (1,), 'float32'), + ((128, 32, 16, 16), 'float16', (128, 128, 16, 16), 'float16', (1,), 'float32'), + ((64, 128, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'), + ((32, 128, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'), + ((64, 32, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'), + ((16, 64, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32')] + input_shape = (tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype) + if input_shape not in Supported: + raise RuntimeError("input_shape %s is not supported" % str(input_shape)) + + input_x1 = tik_instance.Tensor("float16", input_x1_shape, name="left_matrix", scope=tik.scope_gm) + input_x2 = tik_instance.Tensor("float16", input_x2_shape, name="right_matrix", scope=tik.scope_gm) + input_x3 = tik_instance.Tensor("float32", input_x3_shape, name="matrix_max", scope=tik.scope_gm) + resMatmul = tik_instance.Tensor("float32", output_shape, name="output", scope=tik.scope_gm) + cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, resMatmul) + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul]) + return tik_instance + +def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, + res): + diag_size = 128 + ko, mo, mi, ki = input_x1.shape + no, ko, ki, ni = input_x2.shape + c0 = input_x1.shape[-1] + diag_outer = diag_size // c0 + if [input_x1.shape[-1], input_x1.shape[-2], input_x2.shape[-1], input_x2.shape[-2]] != [c0, c0, c0, c0]: + raise ValueError("shape of input_x1 or input_x2 is not supported!") + + def get_cus_tile_info(input_x1, input_x2, input_x3): + input_shape = (tuple(input_x1.shape), input_x1.dtype, tuple(input_x2.shape), input_x2.dtype, + tuple(input_x3.shape), input_x3.dtype) + tile_map = { + # no diag opt: + ((8, 32, 16, 16), "float16", (8, 8, 16, 16), "float16", (1,), "float32"): (4, 8, 2, 8, 4), + ((4, 4, 16, 16), "float16", (4, 4, 16, 16), "float16", (1,), "float32"): (1, 4, 1, 4, 4), + ((4, 16, 16, 16), 'float16', (4, 4, 16, 16), 'float16', (1,), 'float32'): (1, 4, 2, 16, 2), + ((49, 4, 16, 16), 'float16', (49, 49, 16, 16), 'float16', (1,), 'float32'): (1, 7, 7, 4, 7), + ((36, 4, 16, 16), 'float16', (36, 36, 16, 16), 'float16', (1,), 'float32'): (2, 6, 3, 2, 12), + # diag opt: + ((288, 32, 16, 16), 'float16', (288, 288, 16, 16), 'float16', (1,), 'float32'): (16, 8, 8, 2, 12), + } + maxblocknum = 32 + diag_opt = False + if input_x2.shape[0] * input_x2.shape[3] > diag_size and input_x2.shape[0] % diag_outer == 0: + diag_opt = True + if input_shape in tile_map: + mo_tile_, ko_tile_, no_tile_, core_m_num_, core_n_num_ = tile_map[input_shape] + elif diag_opt: + ko_tile_ = diag_outer + no_tile_ = ko_tile_ + core_n_num_ = no // no_tile_ + core_m_num_max = maxblocknum // core_n_num_ + mo_tile_ = -1 + core_m_num_ = -1 + for i in range(core_m_num_max, 0, -1): + if mo % i == 0: + core_m_num_ = i + mo_tile_ = mo // i + break + if mo_tile_ == -1: + raise ValueError("no valid tile be found!") + while mo_tile_ > 16: + mo_tile_ = mo_tile_ // 2 + else: + 
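+            # No tile configuration is known for this shape: fail fast here rather than
+            # guess a tiling that might overflow L0A/L0B or leave AI cores idle.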
raise ValueError("please add tile config to the tile_map") + print("shape: %s, tile: %s" % (input_shape, str((mo_tile_, ko_tile_, no_tile_, core_m_num_, core_n_num_, + diag_opt)))) + return mo_tile_, ko_tile_, no_tile_, core_m_num_, core_n_num_, diag_opt + + mo_tile, ko_tile, no_tile, core_m_num, core_n_num, diag_opt = get_cus_tile_info(input_x1, input_x2, input_x3) + fp32_size = 4 + fp16_size = 2 + blocksize = 32 + vectorfp32_size = 64 + loop_n_num_total = no // no_tile + loop_m_num_total = mo // mo_tile + if loop_n_num_total % core_n_num != 0 or loop_m_num_total % core_m_num != 0: + raise ValueError("Does not support this scenario!") + loop_n_num = loop_n_num_total // core_n_num + loop_m_num = loop_m_num_total // core_m_num + block_num = core_n_num * core_m_num + loop_k_num = ko // ko_tile + if diag_opt: + loop_k_num = diag_outer // ko_tile + # double buffer: + thread_num_k = 2 + if ko_tile % 2 == 0: + loop_k_num *= thread_num_k + ko_tile_inner = ko_tile // thread_num_k + else: + ko_tile_inner = ko_tile + ko_tile *= thread_num_k + with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx: + core_m = block_idx // core_n_num + core_n = block_idx % core_n_num + with tik_instance.for_range(0, loop_m_num) as cc_m: + with tik_instance.for_range(0, loop_n_num) as cc_n: + res_L0C = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0], + name="resMatmul_L0C", scope=tik.scope_cc) + with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k: + if diag_opt: + k_idx = (core_n*loop_n_num + cc_n) * no_tile + thread_idx_k * ko_tile_inner + else: + k_idx = thread_idx_k * ko_tile_inner + # input_x1 -> input_x1_L1 + input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0], + name="input_x1_L1", scope=tik.scope_cbuf) + tik_instance.data_move(input_x1_L1, + input_x1[k_idx, + (core_m * loop_m_num + cc_m) * mo_tile, 0, 0], + 0, ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize, + (mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0) + # input_x2 -> input_x2_L1 + input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], + name="input_x2_L1", scope=tik.scope_cbuf) + tik_instance.data_move(input_x2_L1, + input_x2[(core_n*loop_n_num + cc_n) * no_tile, + k_idx, 0, 0], + 0, no_tile, ko_tile_inner * c0 * c0 * fp16_size // blocksize, + (ko - ko_tile_inner) * c0 * c0 * fp16_size // blocksize, 0) + # input_x1_L1 -> input_x1_L0A + input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0], + name="input_x1_L0A", scope=tik.scope_ca) + with tik_instance.for_range(0, mo_tile) as cc1: + tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0, ko_tile_inner, + mo_tile, 0, False) + # input_x2_L1 -> input_x2_L0B + input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0], + name="input_x2_L0B", scope=tik.scope_cb) + with tik_instance.for_range(0, ko_tile_inner) as cc2: + tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 0, no_tile, + ko_tile_inner, + 0, True) + with tik_instance.if_scope(thread_idx_k == 0): + tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, + ko_tile_inner * c0, no_tile * c0, 0) + with tik_instance.else_scope(): + tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, + ko_tile_inner * c0, no_tile * c0, 1) + res_ub = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0], + name="resMatmul_ub", scope=tik.scope_ubuf) + tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * 
mo_tile, 0, 0) + + input_3_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf, name="input_3_local_UB") + tik_instance.data_move(input_3_local_UB, input_x3, 0, 1, 1, 0, 0) + matrix_max_scalar = tik_instance.Scalar("float32") + matrix_max_scalar.set_as(input_3_local_UB[0]) + repeate_num = no_tile * mo_tile * c0 * c0 // vectorfp32_size + repeate_times_max = 255 + count = 0 + while repeate_num > repeate_times_max: + tik_instance.vmuls(vectorfp32_size, + res_ub[count * repeate_times_max * vectorfp32_size], + res_ub[count * repeate_times_max * vectorfp32_size], + matrix_max_scalar, repeate_times_max, 1, 1, 8, 8) + repeate_num -= repeate_times_max + count += 1 + tik_instance.vmuls(vectorfp32_size, + res_ub[count * repeate_times_max * vectorfp32_size], + res_ub[count * repeate_times_max * vectorfp32_size], + matrix_max_scalar, repeate_num, 1, 1, 8, 8) + + tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, + (core_m * loop_m_num + cc_m) * mo_tile, 0, 0], + res_ub, 0, no_tile, + mo_tile * c0 * c0 * fp32_size // blocksize, 0, + (mo - mo_tile) * c0 * c0 * fp32_size // blocksize) diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py new file mode 100644 index 00000000000..4bbfcf7f334 --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +""" +copyright 2020 Huawei Technologies Co., Ltd + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License == distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +matmul +""" +from __future__ import absolute_import +import te.lang.cce +import te.platform.cce_params as cce +from te import tvm +from topi import generic +from topi.cce import util +from impl.matmul_vector import matmul_vector_cce +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +# General limitation of the size for input shape: 2**31 +SHAPE_SIZE_LIMIT = 2147483648 +NoneType = type(None) + +matmul_cube_op_info = TBERegOp("CusMatMulCube") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("matmulcube.so") \ + .compute_cost(10) \ + .kernel_name("CusMatMulCube") \ + .partial_flag(True) \ + .attr("transpose_a", "required", "bool", "all")\ + .attr("transpose_b", "required", "bool", "all")\ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .input(2, "x3", False, "optional", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_FracNZ) \ + .get_op_info() + +# pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals, +def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): + """ + Check the given input if legal + + Parameters: + shape_a: list or tuple + Shape of the first tensor a with rank > 1 + shape_b: list or tuple + Shape of the second tensor b with the same type with a, + and shape_a, shape_b must be 2 dims + shape_bias: list or tuple + Shape of bias, only support the input data format with ND + src_dtype: str + The data type of input, support "float32", "float16" + trans_a: bool + If True, shape_a == transposed before multiplication + trans_b: bool + If True, shape_b == transposed before multiplication + + Returns None + """ + shape_len = len(shape_a) + src_dtype = src_dtype.lower() + k_block_size = cce.BLOCK_REDUCE + + check_list = ("float16") + + if src_dtype not in check_list: + raise RuntimeError("matmul_cce only support %s while src_dtype == %s" + % (",".join(check_list), src_dtype)) + if shape_len != len(shape_b): + raise RuntimeError("length of a and b are not equal") + + if shape_len != 2: + raise RuntimeError( + "length of shape must be 2, more than 2 dimensions should use batch_matmul now!") + + is_gevm = True if shape_a[-2] == 1 or shape_a[-1] == 1 else False + is_gemv = True if shape_b[-2] == 1 or shape_b[-1] == 1 else False + + if trans_a: + m_shape = shape_a[shape_len - 1] + km_shape = shape_a[shape_len - 2] + else: + m_shape = shape_a[shape_len - 2] + km_shape = shape_a[shape_len - 1] + + if trans_b: + kn_shape = shape_b[shape_len - 1] + n_shape = shape_b[shape_len - 2] + else: + kn_shape = shape_b[shape_len - 2] + n_shape = shape_b[shape_len - 1] + + if m_shape == 1: + if n_shape == 1: + raise RuntimeError("input shape M and N can't both be 1") + + if km_shape != kn_shape: + raise RuntimeError("reduce axis not same") + + if m_shape % cce.BLOCK_IN != 0 and m_shape != 1: + raise RuntimeError( + "input shape M should be 1 or multiple of %d" % cce.BLOCK_IN) + + if m_shape != 1: + if n_shape == 1: + if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + raise RuntimeError("input shape K1 should be multiple of %d" + % (cce.BLOCK_IN*cce.BLOCK_IN)) + elif km_shape%k_block_size != 0: + raise RuntimeError( + "input shape K1 should be multiple of %d" % cce.BLOCK_IN) + else: + if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + raise RuntimeError("input shape K1 should be multiple of %d" + % (cce.BLOCK_IN*cce.BLOCK_IN)) + + if n_shape 
% cce.BLOCK_IN != 0 and n_shape != 1: + raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN) + + if len(shape_bias): + if len(shape_bias) == 1: + if is_gevm or is_gemv: + if shape_bias[0] != m_shape*n_shape: + raise RuntimeError("broadcast case shape bias for gemv must be equal m*n") + else: + if shape_bias[0] != n_shape: + raise RuntimeError("broadcast bias shape must be equal to shape n") + elif len(shape_bias) == shape_len: + if [i for i in shape_bias[-2:]] != [m_shape, n_shape]: + raise RuntimeError("non broadcast bias shape must be same as output shape") + else: + raise RuntimeError("unsupport input shape now for batch bias case") + +def _get_bias(shape_bias): + bias_length = shape_bias[0] + if bias_length % 16 ==0: + return shape_bias + else: + bias_length = (bias_length // 16)*16 + 16 + shape_bias = [] + shape_bias.append(bias_length) + return shape_bias + +def _get_input_shape(shape_x): + dim_a = shape_x[0] + dim_b = shape_x[1] + res = [] + if dim_a % 16 !=0: + dim_a = (dim_a // 16)*16 + 16 + res.append(dim_a) + else: + res.append(dim_a) + + if dim_b % 16 !=0: + dim_b = (dim_b // 16)*16 + 16 + res.append(dim_b) + else: + res.append(dim_b) + return res + +def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + shape_a = input_x1.get("shape") + shape_b = input_x2.get("shape") + print("shape_a: ", shape_a) + print("shape_b: ", shape_b) + src_dtype = input_x1.get("dtype") + util.check_kernel_name(kernel_name) + util.check_shape_rule(shape_a) + util.check_shape_rule(shape_b) + util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) + util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) + if bias is not None and bool(bias): + shape_bias = bias.get("shape") + try: + trans_a_f = bool(1-trans_a) + if src_dtype == "float32" or src_dtype == "int32": + if len(shape_a) != 2 and len(shape_b) != 2: + return False + if trans_b: + if shape_b[0] == 1: + return False + else: + if shape_b[1] == 1: + return False + if trans_a: + if trans_b: + if shape_a[0] != shape_b[1]: + return False + elif shape_a[0] != shape_b[0]: + return False + elif trans_b: + if shape_a[1] != shape_b[1]: + return False + elif shape_a[1] != shape_b[0]: + return False + + if trans_a_f and trans_b and shape_b[1] == 1: + return False + + if src_dtype == "float16": + if len(shape_a) != 2 and len(shape_b) != 2: + return False + + if trans_a: + m_shape = shape_a[1] + k_shape = shape_a[0] + else: + m_shape = shape_a[0] + k_shape = shape_a[1] + + if trans_b: + n_shape = shape_b[0] + k_b_shape = shape_b[1] + else: + n_shape = shape_b[1] + k_b_shape = shape_b[0] + + if k_shape != k_b_shape: + return False + + if m_shape == 1 or n_shape == 1: + if k_shape % 256 != 0: + return False + + except RuntimeError as e: + return False + + return True + +# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements +@op_info_register(matmul_cube_op_info) +def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + """ + calculating matrix multiplication with bias, C = A*B + bias, support input + data with fractal format. 
+ + Parameters: + shape_a: list or tuple + Shape of the first tensor a with rank > 1 + shape_b: list or tuple + Shape of the second tensor b with the same type with a, + and shape_a, shape_b must be 2 dims + src_dtype: str + The data type of input, support "float32", "float16" + dst_dtype: str + The data type of output, support "float32", "float16" + trans_a: bool + If True, shape_a == transposed before multiplication + trans_b: bool + If True, shape_b == transposed before multiplication + is_fractal: bool + If True, the input data format of a and b must be fractal format + shape_bias: list or tuple + Shape of bias, only support the input data format with ND + + Returns + ------- + None + """ + shape_a = input_x1.get("ori_shape") + shape_b = input_x2.get("ori_shape") + + if shape_a is not None: + if len(shape_a) < 2: + shape_a = input_x1.get("shape") + + if shape_b is not None: + if len(shape_b) < 2: + shape_b = input_x2.get("shape") + + shape_a = list(shape_a) + shape_b = list(shape_b) + + if input_x1.get("format") == "FRACTAL_NZ": + shape_a = _get_input_shape(shape_a) + shape_b = _get_input_shape(shape_b) + + util.check_kernel_name(kernel_name) + util.check_shape_rule(shape_a) + util.check_shape_rule(shape_b) + util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) + util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) + + if input_x1.get("format") == "FRACTAL_NZ": + shape_a = [shape_a[1], shape_a[0]] + trans_a = bool(1-trans_a) + + if input_x2.get("format") == "FRACTAL_NZ": + shape_b = [shape_b[1], shape_b[0]] + trans_b = bool(1-trans_b) + + shape_bias = () + if bias is not None and bool(bias): + shape_bias = bias.get("shape") + shape_bias = list(shape_bias) + shape_bias = _get_bias(shape_bias) + + src_dtype = input_x1.get("dtype").lower() + dst_dtype = output_y.get("dtype").lower() + if src_dtype == "float32" or src_dtype == "int32": + matmul_vector_cce(shape_a, shape_b, src_dtype, trans_a, trans_b, shape_bias, kernel_name) + return + _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b) + m_shape = shape_a[len(shape_a) - 2] + km_shape = shape_a[len(shape_a) - 1] + kn_shape = shape_b[len(shape_a) - 2] + n_shape = shape_b[len(shape_a) - 1] + + if src_dtype == "float16": + block_reduce = cce.BLOCK_REDUCE + + block_in = cce.BLOCK_IN + block_out = cce.BLOCK_OUT + + if trans_a and km_shape == 1: + block_in = cce.BLOCK_VECTOR + + if not trans_a and m_shape == 1: + block_in = cce.BLOCK_VECTOR + + if trans_b and kn_shape == 1: + block_out = cce.BLOCK_VECTOR + + if not trans_b and n_shape == 1: + block_out = cce.BLOCK_VECTOR + + if trans_a: + shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) + else: + shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) + + if trans_b: + shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) + else: + shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) + + if input_x1.get("format") == "FORMAT_FRACTAL_Z": + shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) + format_a = "fractal" + elif input_x1.get("format") == "FRACTAL_NZ": + shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) + format_a = "FRACTAL_NZ" + else: + shape_a_temp = (shape_a[len(shape_a) - 2], shape_a[len(shape_a) - 1]) + format_a = "ND" + + if input_x2.get("format") == "FORMAT_FRACTAL_Z": + shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) + format_b = 
"fractal" + elif input_x2.get("format") == "FRACTAL_NZ": + shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) + format_b = "FRACTAL_NZ" + else: + shape_b_temp = (shape_b[len(shape_b) - 2], shape_b[len(shape_b) - 1]) + format_b = "ND" + + tensor_bias = None + tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', + dtype=src_dtype) + tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', + dtype=src_dtype) + + if len(shape_bias) > 0: + tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', + dtype=dst_dtype) + result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a, + format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias) + + with tvm.target.cce(): + schedule = generic.auto_schedule(result) + + tensor_list = [tensor_a, tensor_b, result] + if len(shape_bias) > 0: + tensor_list = [tensor_a, tensor_b, tensor_bias, result] + + config = {"print_ir": False, + "name": kernel_name, + "tensor_list": tensor_list} + + te.lang.cce.cce_build_code(schedule, config) diff --git a/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py b/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py new file mode 100644 index 00000000000..14c4b590cee --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py @@ -0,0 +1,75 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""CusMatrixCombine""" +from te import tik +from topi.cce import util +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +cus_matrix_combine_op_info = TBERegOp("CusMatrixCombine") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("matrixcombine.so") \ + .compute_cost(10) \ + .kernel_name("CusMatrixCombine") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + +@op_info_register(cus_matrix_combine_op_info) +def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"): + input_x_shape = input_x.get("shape") + output_shape = output.get("shape") + split_dim = 128 + + if util.get_product_version() == util.VERSION_MINI: + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + else: + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) + + blocks = 32 + matrix_dim = input_x_shape[0] * input_x_shape[1] + if input_x_shape[0] == 1 and input_x_shape[1] == 64 : + tiling_dim = 2 + bs = 1 + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[0, block_index * tiling_dim, 0], 0, 1, 16, 0, 0) + tik_instance.data_move(res[block_index * tiling_dim, 0], input_x_ub, 0, 1, 16, 0, 0) + else: + tiling_dim = 4 + bs = input_x_shape[0] + with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", scope=tik.scope_ubuf) + zero = tik_instance.Scalar("float32") + zero.set_as(0.0) + with tik_instance.for_range(0, bs) as i: + repeat_real = tiling_dim * matrix_dim // 64 + if repeat_real <= 255: + tik_instance.vector_dup(64, input_x_ub, zero, repeat_real, 1, 8) + else: + repeat_1 = 255 + repeat_2 = repeat_real - 255 + tik_instance.vector_dup(64, input_x_ub, zero, repeat_1, 1, 8) + tik_instance.vector_dup(64, input_x_ub[255 * 64], zero, repeat_2, 1, 8) + with tik_instance.for_range(0, tiling_dim) as j: + tik_instance.data_move(input_x_ub[j, split_dim * i], input_x[i, block_index * tiling_dim + j, 0], 0, 1, 16, 0, 0) + tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub, 0, 1, tiling_dim * matrix_dim *4 // 32, 0, 0) + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) + return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py b/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py new file mode 100644 index 00000000000..838bb355e51 --- /dev/null +++ b/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py @@ -0,0 +1,238 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""CusTranspose02314""" +from te import tik +from topi.cce import util +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +cus_transpose02314_op_info = TBERegOp("CusTranspose02314") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("transpose02314.so") \ + .compute_cost(10) \ + .kernel_name("CusTranspose02314") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_Default) \ + .get_op_info() + +@op_info_register(cus_transpose02314_op_info) +def CusTranspose02314(input_x, output, kernel_name="transpose021354"): + input_x_shape = input_x.get("shape") + output_shape = output.get("shape") + perm = (0,2,3,1,4) + input_x_shape = tuple(input_x_shape) + support_shape = [ (32,128,7,7,16), + (32,32,7,7,16), + (32,32,14,14,16), + (32,64,14,14,16), + (32,16,14,14,16), + (32,16,28,28,16), + (32,32,28,28,16), + (32,8,28,28,16), + (32,8,56,56,16), + (32,16,56,56,16), + (32,4,56,56,16), + (32,4,112,112,16)] + if input_x_shape not in support_shape: + raise RuntimeError("input_shape %s is not supported" % str(input_x_shape)) + + if util.get_product_version() == util.VERSION_MINI: + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + else: + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + + input_x = tik_instance.Tensor("float16", input_x_shape, name="input_x", scope=tik.scope_gm) + res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm) + + dtype = "float16" + if tuple(input_x_shape) == (32,4,112,112,16): + with tik_instance.for_range(0, 32, block_num=32) as block_idx: + with tik_instance.for_range(0, 14) as cc1_db: + with tik_instance.for_range(0, 2, thread_num=2) as db_idx: + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) + zero = tik_instance.Scalar(dtype="float16", init_value=0) + tik_instance.data_move(input_1_local_UB, input_x[block_idx * 802816 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, 12096, 0) + with tik_instance.for_range(0, 448) as cc7: + with tik_instance.for_range(0, 4) as cc8: + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32,4,56,56,16): + with tik_instance.for_range(0, 32, block_num=32) as block_idx: + zero = tik_instance.Scalar(dtype="float16", init_value=0) + with tik_instance.for_range(0, 3) as cc1_db: + with tik_instance.for_range(0, 2, thread_num=2) as db_idx: + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) + 
tik_instance.data_move(input_1_local_UB, input_x[block_idx * 200704 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, 2688, 0) + with tik_instance.for_range(0, 448) as cc7: + with tik_instance.for_range(0, 4) as cc8: + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) + + input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf) + T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 43008], 0, 4, 448, 2688, 0) + with tik_instance.for_range(0, 448) as cc72: + with tik_instance.for_range(0, 4) as cc82: + tik_instance.vadds(16, T_transpose_local_UB2[cc72 * 64 + cc82 * 16], input_1_local_UB2[7168 * cc82 + cc72 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32,16,56,56,16): + with tik_instance.for_range(0, 32, block_num=32) as block_idx: + zero = tik_instance.Scalar(dtype="float16", init_value=0) + with tik_instance.for_range(0, 14) as cc1_db: + with tik_instance.for_range(0, 2, thread_num=2) as db_idx: + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, input_x[block_idx * 802816 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 3024, 0) + with tik_instance.for_range(0, 112) as cc7: + with tik_instance.for_range(0, 16) as cc8: + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32,8,56,56,16): + with tik_instance.for_range(0, 32, block_num=32) as block_idx: + zero = tik_instance.Scalar(dtype="float16", init_value=0) + with tik_instance.for_range(0, 7) as cc1_db: + with tik_instance.for_range(0, 2, thread_num=2) as db_idx: + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 7168 + 3584 * db_idx], 0, 8, 224, 2912, 0) + with tik_instance.for_range(0, 224) as cc7: + with tik_instance.for_range(0, 16) as cc8: + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], input_1_local_UB[3584 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32,8,28,28,16): + with tik_instance.for_range(0, 32, block_num=32) as block_idx: + zero = tik_instance.Scalar(dtype="float16", init_value=0) + with tik_instance.for_range(0, 2) as cc1_db: + with tik_instance.for_range(0, 2, thread_num=2) as db_idx: + input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", scope=tik.scope_ubuf) + 
tik_instance.data_move(input_1_local_UB, input_x[block_idx * 100352 + cc1_db * 6272 + 3136 * db_idx], 0, 8, 196, 588, 0) + with tik_instance.for_range(0, 196) as cc7: + with tik_instance.for_range(0, 8) as cc8: + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], input_1_local_UB[3136 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 100352 + cc1_db * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0) + elif tuple(input_x_shape) == (32,32,28,28,16): + with tik_instance.for_range(0, 32, block_num=32) as block_idx: + zero = tik_instance.Scalar(dtype="float16", init_value=0) + with tik_instance.for_range(0, 7) as cc1_db: + with tik_instance.for_range(0, 2, thread_num=2) as db_idx: + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 1792 + 896 * db_idx], 0, 32, 56, 728, 0) + with tik_instance.for_range(0, 56) as cc7: + with tik_instance.for_range(0, 32) as cc8: + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 512 + cc8 * 16], input_1_local_UB[896 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32,16,28,28,16): + with tik_instance.for_range(0, 32, block_num=32) as block_idx: + zero = tik_instance.Scalar(dtype="float16", init_value=0) + with tik_instance.for_range(0, 3) as cc1_db: + with tik_instance.for_range(0, 2, thread_num=2) as db_idx: + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, input_x[block_idx * 200704 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 672, 0) + with tik_instance.for_range(0, 112) as cc7: + with tik_instance.for_range(0, 16) as cc8: + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) + + input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf) + T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 10752], 0, 16, 112, 672, 0) + with tik_instance.for_range(0, 112) as cc7: + with tik_instance.for_range(0, 16) as cc8: + tik_instance.vadds(16, T_transpose_local_UB2[cc7 * 256 + cc8 * 16], input_1_local_UB2[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0) + + elif tuple(input_x_shape) == (32,16,14,14,16): + with tik_instance.for_range(0, 32, block_num=32) as block_idx: + zero = tik_instance.Scalar(dtype="float16", init_value=0) + with tik_instance.for_range(0, 2, thread_num=2) as db_idx: + input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, 
input_x[block_idx * 50176 + 1568 * db_idx], 0, 16, 98, 98, 0)
+                with tik_instance.for_range(0, 98) as cc7:
+                    with tik_instance.for_range(0, 16) as cc8:
+                        tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1568 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
+                tik_instance.data_move(res[block_idx * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0)
+    elif tuple(input_x_shape) == (32, 128, 7, 7, 16) and tuple(perm) == (0,2,3,1,4) and dtype == "float16":
+        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
+            with tik_instance.for_range(0, 7, thread_num=2) as cc1:
+                input_x_ub = tik_instance.Tensor(dtype, [1, 128, 1, 7, 16], name="input_1_local_UB",
+                                                 scope=tik.scope_ubuf)
+                transpose_ub = tik_instance.Tensor(dtype, [1, 1, 7, 128, 16], name="transpose_local_UB",
+                                                   scope=tik.scope_ubuf)
+                tik_instance.data_move(input_x_ub, input_x[block_idx, 0, cc1, 0, 0], 0, 128, 7, 42, 0)
+                with tik_instance.for_range(0, 7) as cc7:
+                    with tik_instance.for_range(0, 128) as cc8:
+                        tik_instance.vadds(16, transpose_ub[0, 0, cc7, cc8, 0], input_x_ub[0, cc8, 0, cc7, 0], 0,
+                                           1, 1, 1, 0, 0)
+                tik_instance.data_move(res[block_idx * 100352 + 14336 * cc1], transpose_ub, 0, 1, 896, 0, 0)
+
+    elif tuple(input_x_shape) == (32, 32, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
+        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
+            input_x_ub = tik_instance.Tensor(dtype, [1, 32, 7, 7, 16], name="input_1_local_UB",
+                                             scope=tik.scope_ubuf)
+            transpose_ub = tik_instance.Tensor(dtype, [1, 7, 7, 32, 16], name="transpose_local_UB",
+                                               scope=tik.scope_ubuf)
+            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, 0, 0, 0], 0, 1, 1568, 0, 0)
+            with tik_instance.for_range(0, 7) as cc1:
+                with tik_instance.for_range(0, 7) as cc2:
+                    with tik_instance.for_range(0, 32) as cc3:
+                        tik_instance.vadds(16, transpose_ub[0, cc1, cc2, cc3, 0], input_x_ub[0, cc3, cc1, cc2, 0], 0,
+                                           1, 1, 1, 0, 0)
+            tik_instance.data_move(res[block_idx * 25088], transpose_ub, 0, 1, 1568, 0, 0)
+
+    elif tuple(input_x_shape) == (32, 32, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
+        def _inner_compute(split_index):
+            input_x_ub = tik_instance.Tensor(dtype, [1, 32, 2, 14, 16], name="input_1_local_UB",
+                                             scope=tik.scope_ubuf)
+            transpose_ub = tik_instance.Tensor(dtype, [1, 2, 14, 32, 16], name="transpose_local_UB",
+                                               scope=tik.scope_ubuf)
+            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, split_index * 2, 0, 0], 0, 32, 28, 168, 0)
+            with tik_instance.for_range(0, 2) as cc2:
+                with tik_instance.for_range(0, 14) as cc3:
+                    with tik_instance.for_range(0, 32) as cc4:
+                        tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0],
+                                           0, 1, 1, 1, 0, 0)
+            tik_instance.data_move(res[block_idx * 100352 + split_index * 2 * 7168], transpose_ub, 0, 1, 896, 0, 0)
+        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
+            with tik_instance.for_range(0, 6, thread_num=2) as cc1:
+                _inner_compute(cc1)
+            _inner_compute(6)
+    elif tuple(input_x_shape) == (32,64,14,14,16) and tuple(perm) == (0,2,3,1,4) and dtype == "float16":
+        def _inner_compute(split_index, block_idx):
+            input_x_ub = tik_instance.Tensor(dtype, [1, 64, 2, 14, 16], name="input_1_local_UB",
+                                             scope=tik.scope_ubuf)
+            transpose_ub = tik_instance.Tensor(dtype, [1, 2, 14, 64, 16], name="transpose_local_UB",
+                                               scope=tik.scope_ubuf)
+            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, split_index * 2, 0, 0], 0, 64, 28, 168, 0)
+            with tik_instance.for_range(0, 2) as 
cc2: + with tik_instance.for_range(0, 14) as cc3: + with tik_instance.for_range(0, 64) as cc4: + tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0], + 0, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 200704 + split_index * 2 * 14336], transpose_ub, 0, 1, 1792, 0, 0) + with tik_instance.for_range(0, 32, block_num=32) as block_idx: + with tik_instance.for_range(0, 6, thread_num=2) as cc1: + _inner_compute(cc1, block_idx) + _inner_compute(6, block_idx) + + tik_instance.BuildCCE(kernel_name, inputs=[input_x], outputs=[res]) + return tik_instance diff --git a/mindspore/ops/_op_impl/custom_op/batch_matmul_impl.py b/mindspore/ops/_op_impl/custom_op/batch_matmul_impl.py deleted file mode 100644 index e2afa96a7d6..00000000000 --- a/mindspore/ops/_op_impl/custom_op/batch_matmul_impl.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""batch_matmul_impl""" -from mindspore.ops.op_info_register import op_info_register - - -@op_info_register("""{ - "op_name": "CusBatchMatMul", - "imply_type": "TBE", - "fusion_type": "OPAQUE", - "async_flag": false, - "binfile_name": "batchmatmul.so", - "compute_cost": 10, - "kernel_name": "CusBatchMatMul", - "partial_flag": true, - "attr": [ - ], - "inputs": [ - { - "index": 0, - "dtype": [ - "float32" - ], - "format": [ - "DefaultFormat" - ], - "name": "x1", - "need_compile": false, - "param_type": "required", - "shape": "all" - }, - { - "index": 1, - "dtype": [ - "float32" - ], - "format": [ - "DefaultFormat" - ], - "name": "x2", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ], - "outputs": [ - { - "index": 0, - "dtype": [ - "float32" - ], - "format": [ - "DefaultFormat" - ], - "name": "y", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ] -}""") -def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"): - """CusBatchMatMul""" - return diff --git a/mindspore/ops/_op_impl/custom_op/cholesky_trsm.py b/mindspore/ops/_op_impl/custom_op/cholesky_trsm.py deleted file mode 100644 index 5c38dfc25d4..00000000000 --- a/mindspore/ops/_op_impl/custom_op/cholesky_trsm.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""CusCholeskyTrsm""" -from mindspore.ops.op_info_register import op_info_register - - -@op_info_register("""{ - "op_name": "CusCholeskyTrsm", - "imply_type": "TBE", - "fusion_type": "OPAQUE", - "async_flag": false, - "binfile_name": "choleskytrsm.so", - "compute_cost": 10, - "kernel_name": "CusCholeskyTrsm", - "partial_flag": true, - "attr": [ - - ], - "inputs": [ - { - "index": 0, - "dtype": [ - "float32" - ], - "format": [ - "DefaultFormat" - ], - "name": "x1", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ], - "outputs": [ - { - "index": 0, - "dtype": [ - "float32" - ], - "format": [ - "DefaultFormat" - ], - "name": "y", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ] -}""") -def CusCholeskyTrsm(input_x, output, kernel_name): - """CusCholeskyTrsm""" - return diff --git a/mindspore/ops/_op_impl/custom_op/fused_abs_max1.py b/mindspore/ops/_op_impl/custom_op/fused_abs_max1.py deleted file mode 100644 index b9a0d452738..00000000000 --- a/mindspore/ops/_op_impl/custom_op/fused_abs_max1.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""CusFusedAbsMax1""" -from mindspore.ops.op_info_register import op_info_register - - -@op_info_register("""{ - "op_name": "CusFusedAbsMax1", - "imply_type": "TBE", - "fusion_type": "OPAQUE", - "async_flag": false, - "binfile_name": "fusedabsmax1.so", - "compute_cost": 10, - "kernel_name": "CusFusedAbsMax1", - "partial_flag": true, - "attr": [ - { - "name": "origin_shape", - "param_type": "required", - "type": "listInt", - "value": "all" - } - ], - "inputs": [ - { - "index": 0, - "dtype": [ - "float32" - ], - "format": [ - "DefaultFormat" - ], - "name": "x1", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ], - "outputs": [ - { - "index": 0, - "dtype": [ - "float32" - ], - "format": [ - "DefaultFormat" - ], - "name": "y", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ] -}""") -def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_max1"): - """CusFusedAbsMax1""" - return diff --git a/mindspore/ops/_op_impl/custom_op/img2col_impl.py b/mindspore/ops/_op_impl/custom_op/img2col_impl.py deleted file mode 100644 index 5137d4d7e70..00000000000 --- a/mindspore/ops/_op_impl/custom_op/img2col_impl.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""CusImg2ColNC1HWC0""" -from mindspore.ops.op_info_register import op_info_register - - -@op_info_register("""{ - "op_name": "CusImg2ColNC1HWC0", - "imply_type": "TBE", - "fusion_type": "OPAQUE", - "async_flag": false, - "binfile_name": "img2colnc1hwc0.so", - "compute_cost": 10, - "kernel_name": "CusImg2ColNC1HWC0", - "partial_flag": true, - "attr": [ - { - "name": "ksizes", - "param_type": "required", - "type": "listInt", - "value": "all" - }, - { - "name": "strides", - "param_type": "required", - "type": "listInt", - "value": "all" - }, - { - "name": "dilates", - "param_type": "required", - "type": "listInt", - "value": "all" - }, - { - "name": "padding", - "param_type": "required", - "type": "str", - "value": "all" - } - ], - "inputs": [ - { - "index": 0, - "dtype": [ - "float16" - ], - "format": [ - "NC1HWC0" - ], - "name": "x1", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ], - "outputs": [ - { - "index": 0, - "dtype": [ - "float16" - ], - "format": [ - "FRACTAL_NZ" - ], - "name": "y", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ] -}""") -def CusImg2ColNC1HWC0(input_x, output, ksizes, strides, dilates, padding, kernel_name="img2col"): - """CusImg2ColNC1HWC0""" - return diff --git a/mindspore/ops/_op_impl/custom_op/matmul_cube_dense_left.py b/mindspore/ops/_op_impl/custom_op/matmul_cube_dense_left.py deleted file mode 100644 index 300410eb4a3..00000000000 --- a/mindspore/ops/_op_impl/custom_op/matmul_cube_dense_left.py +++ /dev/null @@ -1,101 +0,0 @@ -# -*- coding:utf-8 -*- -""" -copyright 2020 Huawei Technologies Co., Ltd - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License == distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -matmul -""" -from __future__ import absolute_import - -from mindspore.ops.op_info_register import op_info_register -from topi.cce import util - -# General limitation of the size for input shape: 2**31 -SHAPE_SIZE_LIMIT = 2147483648 -NoneType = type(None) - - -@op_info_register("""{ - "op_name": "CusMatMulCubeDenseLeft", - "imply_type": "TBE", - "fusion_type": "OPAQUE", - "async_flag": false, - "binfile_name": "matmulcubedenseleft.so", - "compute_cost": 10, - "kernel_name": "CusMatMulCubeDenseLeft", - "partial_flag": true, - "attr": [ - ], - "inputs": [ - { - "index": 0, - "dtype": [ - "float16" - ], - "format": [ - "DefaultFormat" - ], - "name": "x1", - "need_compile": false, - "param_type": "required", - "shape": "all" - }, - { - "index": 1, - "dtype": [ - "float16" - ], - "format": [ - "FRACTAL_NZ" - ], - "name": "x2", - "need_compile": false, - "param_type": "required", - "shape": "all" - }, - { - "index": 2, - "dtype": [ - "float16" - ], - "format": [ - "DefaultFormat" - ], - "name": "x3", - "need_compile": false, - "param_type": "optional", - "shape": "all" - } - ], - "outputs": [ - { - "index": 0, - "dtype": [ - "float16" - ], - "format": [ - "FRACTAL_NZ" - ], - "name": "y", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ] -}""") -@util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str) -def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, - kernel_name="matmulcube"): - """CusMatMulCubeDenseLeft""" - return diff --git a/mindspore/ops/_op_impl/custom_op/matmul_cube_fracz_left_cast_impl.py b/mindspore/ops/_op_impl/custom_op/matmul_cube_fracz_left_cast_impl.py deleted file mode 100644 index 3da1593dfd3..00000000000 --- a/mindspore/ops/_op_impl/custom_op/matmul_cube_fracz_left_cast_impl.py +++ /dev/null @@ -1,102 +0,0 @@ -# -*- coding:utf-8 -*- -""" -copyright 2020 Huawei Technologies Co., Ltd - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License == distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -matmul -""" -from __future__ import absolute_import - -from mindspore.ops.op_info_register import op_info_register -from topi.cce import util - -# General limitation of the size for input shape: 2**31 -SHAPE_SIZE_LIMIT = 2147483648 -NoneType = type(None) - - -@op_info_register("""{ - "op_name": "CusMatMulCubeFraczLeftCast", - "imply_type": "TBE", - "fusion_type": "OPAQUE", - "async_flag": false, - "binfile_name": "matmulcubefraczleftcast.so", - "compute_cost": 10, - "kernel_name": "CusMatMulCubeFraczLeftCast", - "partial_flag": true, - "attr": [ - ], - "inputs": [ - { - "index": 0, - "dtype": [ - "float16" - ], - "format": [ - "DefaultFormat" - ], - "name": "x1", - "need_compile": false, - "param_type": "required", - "shape": "all" - }, - { - "index": 1, - "dtype": [ - "float32" - ], - "format": [ - "FracZ" - ], - "name": "x2", - "need_compile": false, - "param_type": "required", - "shape": "all" - }, - { - "index": 2, - "dtype": [ - "float16" - ], - "format": [ - "DefaultFormat" - ], - "name": "x3", - "need_compile": false, - "param_type": "optional", - "shape": "all" - } - ], - "outputs": [ - { - "index": 0, - "dtype": [ - "float16" - ], - "format": [ - "FracZ" - ], - "name": "y", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ] -}""") -# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements -@util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str) -def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, - kernel_name="CusMatMulCubeFraczLeftCast"): - """CusMatMulCubeFraczLeftCast""" - return diff --git a/mindspore/ops/_op_impl/custom_op/matmul_cube_fracz_right_mul_impl.py b/mindspore/ops/_op_impl/custom_op/matmul_cube_fracz_right_mul_impl.py deleted file mode 100644 index 7fc2ba35d16..00000000000 --- a/mindspore/ops/_op_impl/custom_op/matmul_cube_fracz_right_mul_impl.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- -""" -copyright 2020 Huawei Technologies Co., Ltd - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License == distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -matmul -""" -from __future__ import absolute_import - -from mindspore.ops.op_info_register import op_info_register - -# General limitation of the size for input shape: 2**31 -SHAPE_SIZE_LIMIT = 2147483648 -NoneType = type(None) - - -@op_info_register("""{ - "op_name": "CusMatMulCubeFraczRightMul", - "imply_type": "TBE", - "fusion_type": "OPAQUE", - "async_flag": false, - "binfile_name": "matmulcubefraczrightmul.so", - "compute_cost": 10, - "kernel_name": "CusMatMulCubeFraczRightMul", - "partial_flag": true, - "attr": [ - ], - "inputs": [ - { - "index": 0, - "dtype": [ - "float16" - ], - "format": [ - "FracZ" - ], - "name": "x1", - "need_compile": false, - "param_type": "required", - "shape": "all" - }, - { - "index": 1, - "dtype": [ - "float16" - ], - "format": [ - "DefaultFormat" - ], - "name": "x2", - "need_compile": false, - "param_type": "required", - "shape": "all" - }, - { - "index": 2, - "dtype": [ - "float32" - ], - "format": [ - "DefaultFormat" - ], - "name": "x3", - "need_compile": false, - "param_type": "required", - "shape": "all" - }, - { - "index": 3, - "dtype": [ - "float16" - ], - "format": [ - "DefaultFormat" - ], - "name": "x4", - "need_compile": false, - "param_type": "optional", - "shape": "all" - } - ], - "outputs": [ - { - "index": 0, - "dtype": [ - "float32" - ], - "format": [ - "FracZ" - ], - "name": "y", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ] -}""") -def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, - kernel_name="matmulcube"): - """CusMatMulCubeFraczRightMul""" - return diff --git a/mindspore/ops/_op_impl/custom_op/matmul_cube_impl.py b/mindspore/ops/_op_impl/custom_op/matmul_cube_impl.py deleted file mode 100644 index 7c2d81e1d67..00000000000 --- a/mindspore/ops/_op_impl/custom_op/matmul_cube_impl.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- -""" -copyright 2020 Huawei Technologies Co., Ltd - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License == distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -matmul -""" -from __future__ import absolute_import - -from mindspore.ops.op_info_register import op_info_register -from topi.cce import util - -# General limitation of the size for input shape: 2**31 -SHAPE_SIZE_LIMIT = 2147483648 -NoneType = type(None) - - -@op_info_register("""{ - "op_name": "CusMatMulCube", - "imply_type": "TBE", - "fusion_type": "OPAQUE", - "async_flag": false, - "binfile_name": "matmulcube.so", - "compute_cost": 10, - "kernel_name": "CusMatMulCube", - "partial_flag": true, - "attr": [ - { - "name": "transpose_a", - "param_type": "required", - "type": "bool", - "value": "all" - }, - { - "name": "transpose_b", - "param_type": "required", - "type": "bool", - "value": "all" - } - ], - "inputs": [ - { - "index": 0, - "dtype": [ - "float16" - ], - "format": [ - "FRACTAL_NZ" - ], - "name": "x1", - "need_compile": false, - "param_type": "required", - "shape": "all" - }, - { - "index": 1, - "dtype": [ - "float16" - ], - "format": [ - "FRACTAL_NZ" - ], - "name": "x2", - "need_compile": false, - "param_type": "required", - "shape": "all" - }, - { - "index": 2, - "dtype": [ - "float16" - ], - "format": [ - "DefaultFormat" - ], - "name": "x3", - "need_compile": false, - "param_type": "optional", - "shape": "all" - } - ], - "outputs": [ - { - "index": 0, - "dtype": [ - "float32" - ], - "format": [ - "FRACTAL_NZ" - ], - "name": "y", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ] -}""") -# pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements -@util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str) -def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): - """CusMatMulCube""" - return diff --git a/mindspore/ops/_op_impl/custom_op/matrix_combine_impl.py b/mindspore/ops/_op_impl/custom_op/matrix_combine_impl.py deleted file mode 100644 index 32045e7ccbd..00000000000 --- a/mindspore/ops/_op_impl/custom_op/matrix_combine_impl.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""CusMatrixCombine""" -from mindspore.ops.op_info_register import op_info_register - - -@op_info_register("""{ - "op_name": "CusMatrixCombine", - "imply_type": "TBE", - "fusion_type": "OPAQUE", - "async_flag": false, - "binfile_name": "matrixcombine.so", - "compute_cost": 10, - "kernel_name": "CusMatrixCombine", - "partial_flag": true, - "attr": [ - ], - "inputs": [ - { - "index": 0, - "dtype": [ - "float32" - ], - "format": [ - "DefaultFormat" - ], - "name": "x1", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ], - "outputs": [ - { - "index": 0, - "dtype": [ - "float32" - ], - "format": [ - "DefaultFormat" - ], - "name": "y", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ] -}""") -def CusMatrixCombine(input_x, output, kernel_name="matrix_combine"): - """CusMatrixCombine""" - return diff --git a/mindspore/ops/_op_impl/custom_op/transpose02314_impl.py b/mindspore/ops/_op_impl/custom_op/transpose02314_impl.py deleted file mode 100644 index c5aebe523d5..00000000000 --- a/mindspore/ops/_op_impl/custom_op/transpose02314_impl.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""CusTranspose02314""" -from mindspore.ops.op_info_register import op_info_register - - -@op_info_register("""{ - "op_name": "CusTranspose02314", - "imply_type": "TBE", - "fusion_type": "OPAQUE", - "async_flag": false, - "binfile_name": "transpose02314.so", - "compute_cost": 10, - "kernel_name": "CusTranspose02314", - "partial_flag": true, - "attr": [ - ], - "inputs": [ - { - "index": 0, - "dtype": [ - "float16" - ], - "format": [ - "NC1HWC0" - ], - "name": "x1", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ], - "outputs": [ - { - "index": 0, - "dtype": [ - "float16" - ], - "format": [ - "DefaultFormat" - ], - "name": "y", - "need_compile": false, - "param_type": "required", - "shape": "all" - } - ] -}""") -def CusTranspose02314(input_x, output, kernel_name="transpose021354"): - """CusTranspose02314""" - return diff --git a/mindspore/ops/operations/__init__.py b/mindspore/ops/operations/__init__.py index e933fa97013..32edce64e0a 100644 --- a/mindspore/ops/operations/__init__.py +++ b/mindspore/ops/operations/__init__.py @@ -70,6 +70,7 @@ from .nn_ops import (LSTM, SGD, Adam, ApplyMomentum, BatchNorm, from .other_ops import Assign, IOU, BoundingBoxDecode, BoundingBoxEncode, CheckValid, MakeRefKey, CheckBprop from . 
import _quant_ops from ._quant_ops import * +from .thor_ops import * __all__ = [ 'TensorAdd', @@ -261,5 +262,6 @@ __all__ = [ "SquareSumAll" ] +__all__.extend(thor_ops.__all__) __all__.extend(_quant_ops.__all__) __all__.sort() diff --git a/mindspore/ops/operations/thor_ops.py b/mindspore/ops/operations/thor_ops.py index 23593a26305..225aff68fa1 100644 --- a/mindspore/ops/operations/thor_ops.py +++ b/mindspore/ops/operations/thor_ops.py @@ -16,16 +16,29 @@ import mindspore as ms from mindspore.ops import prim_attr_register, PrimitiveWithInfer from mindspore.ops.composite import multitype_ops as C +import numpy as np +__all__ = ["CusBatchMatMul", + "CusCholeskyTrsm", + "CusFusedAbsMax1", + "CusImg2Col", + "CusMatMulCubeDenseLeft", + "CusMatMulCubeFraczRightMul", + "CusMatMulCube", + "CusMatrixCombine", + "CusTranspose02314", + "CusMatMulCubeDenseRight", + "CusMatMulCubeFraczLeftCast", + ] class CusBatchMatMul(PrimitiveWithInfer): - """CusMatMulCube definition""" + """CusBatchMatMul definition""" @prim_attr_register def __init__(self): - """init CusMatMulCube""" + """init CusBatchMatMul""" self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) - + from mindspore.ops._op_impl._custom_op.matrix_combine_impl import CusMatrixCombine def get_bprop(self): def bprop(x1, x2, out, dout): return (C.zeros_like(x1), C.zeros_like(x2)) @@ -46,7 +59,7 @@ class CusCholeskyTrsm(PrimitiveWithInfer): def __init__(self): """init CusCholeskyTrsm""" self.init_prim_io_names(inputs=['x1'], outputs=['y']) - + from mindspore.ops._op_impl._custom_op.cholesky_trsm_impl import CusCholeskyTrsm def infer_shape(self, data1_shape): ll = [] m, _ = data1_shape @@ -61,14 +74,14 @@ class CusCholeskyTrsm(PrimitiveWithInfer): class CusFusedAbsMax1(PrimitiveWithInfer): - """CusCholeskyTrsm definition""" + """CusFusedAbsMax1 definition""" @prim_attr_register def __init__(self, origin_shape=[-1, -1]): - """init CusCholeskyTrsm""" + """init CusFusedAbsMax1""" self.init_prim_io_names(inputs=['x1'], outputs=['y']) self.origin_shape = origin_shape - + from mindspore.ops._op_impl._custom_op.fused_abs_max1_impl import CusFusedAbsMax1 def get_bprop(self): def bprop(x, out, dout): return (C.zeros_like(x),) @@ -98,7 +111,7 @@ class CusImg2Col(PrimitiveWithInfer): self.strides = strides self.dilates = dilates self.mode = mode - + from mindspore.ops._op_impl._custom_op.img2col_impl import CusImg2Col def get_bprop(self): def bprop(x, out, dout): return (C.zeros_like(x),) @@ -126,9 +139,9 @@ class CusMatMulCubeDenseLeft(PrimitiveWithInfer): @prim_attr_register def __init__(self): - """init CusMatMulCube""" + """init CusMatMulCubeDenseLeft""" self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) - + from mindspore.ops._op_impl._custom_op.matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft def get_bprop(self): def bprop(x1, x2, out, dout): return (C.zeros_like(x1), C.zeros_like(x2)) @@ -149,7 +162,7 @@ class CusMatMulCubeFraczRightMul(PrimitiveWithInfer): def __init__(self): """init CusMatMulCubeFraczRightMul""" self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) - + from mindspore.ops._op_impl._custom_op.matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul def get_bprop(self): def bprop(x1, x2, x3, out, dout): return (C.zeros_like(x1), C.zeros_like(x2), C.zeros_like(x3)) @@ -172,7 +185,7 @@ class CusMatMulCube(PrimitiveWithInfer): self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) self.transpose_a = transpose_a self.transpose_b = transpose_b - + from 
mindspore.ops._op_impl._custom_op.matmul_cube_impl import CusMatMulCube def get_bprop(self): def bprop(x1, x2, out, dout): return (C.zeros_like(x1), C.zeros_like(x2)) @@ -199,13 +212,13 @@ class CusMatMulCube(PrimitiveWithInfer): class CusMatrixCombine(PrimitiveWithInfer): - """CusMatMulCube definition""" + """CusMatrixCombine definition""" @prim_attr_register def __init__(self): - """init CusMatMulCube""" + """init CusMatrixCombine""" self.init_prim_io_names(inputs=['x'], outputs=['y']) - + from mindspore.ops._op_impl._custom_op.matrix_combine_impl import CusMatrixCombine def get_bprop(self): def bprop(x, out, dout): return (C.zeros_like(x),) @@ -229,7 +242,7 @@ class CusTranspose02314(PrimitiveWithInfer): def __init__(self): """init CusTranspose02314""" self.init_prim_io_names(inputs=['x1'], outputs=['y']) - + from mindspore.ops._op_impl._custom_op.transpose_02314_impl import CusTranspose02314 def get_bprop(self): def bprop(x, out, dout): return (C.zeros_like(x),) @@ -246,3 +259,41 @@ class CusTranspose02314(PrimitiveWithInfer): def infer_dtype(self, data1_dtype): return data1_dtype + +class CusMatMulCubeDenseRight(PrimitiveWithInfer): + """CusMatMulCubeDenseRight definition""" + @prim_attr_register + def __init__(self): + """init CusMatMulCubeDenseRight""" + self.init_prim_io_names(inputs=['x1', 'x2','x3'], outputs=['y']) + from mindspore.ops._op_impl._custom_op.matmul_cube_dense_right_impl import CusMatMulCubeDenseRight + + def get_bprop(self): + def bprop(x1, x2, x3, out, dout): + return (C.zeros_like(x1),C.zeros_like(x2), C.zeros_like(x3)) + return bprop + + def infer_shape(self, data1_shape, data2_shape, data3_shape): + return data1_shape + + def infer_dtype(self, data1_dtype, data2_dtype, data3_dtype): + return ms.common.dtype.tensor_type(getattr(ms, "float32")) + +class CusMatMulCubeFraczLeftCast(PrimitiveWithInfer): + """CusMatMulCubeFraczLeftCast definition""" + @prim_attr_register + def __init__(self): + """init CusMatMulCubeFraczLeftCast""" + self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) + from mindspore.ops._op_impl._custom_op.matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast + + def get_bprop(self): + def bprop(x1, x2, out, dout): + return (C.zeros_like(x1),C.zeros_like(x2)) + return bprop + + def infer_shape(self, data1_shape, data2_shape): + return data2_shape + + def infer_dtype(self, data1_dtype, data2_dtype): + return ms.common.dtype.tensor_type(getattr(ms, "float16")) From b57c0839ab835eba068bba23b5ec3a0a09b83839 Mon Sep 17 00:00:00 2001 From: z00478463 Date: Mon, 25 May 2020 19:33:17 +0800 Subject: [PATCH 2/8] for impl of 2nd-order and format --- mindspore/ops/_op_impl/_custom_op/__init__.py | 2 +- mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py | 2 +- mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py | 7 +++++-- .../_op_impl/_custom_op/matmul_cube_dense_right_impl.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/mindspore/ops/_op_impl/_custom_op/__init__.py b/mindspore/ops/_op_impl/_custom_op/__init__.py index c2f5675d38b..d22ac306350 100644 --- a/mindspore/ops/_op_impl/_custom_op/__init__.py +++ b/mindspore/ops/_op_impl/_custom_op/__init__.py @@ -24,4 +24,4 @@ from .matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast from .matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul from .matmul_cube_impl import CusMatMulCube from .matrix_combine_impl import CusMatrixCombine -from .transpose_02314_impl import CusTranspose02314 +from .transpose02314_impl import CusTranspose02314 
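
The thor_ops.py additions in PATCH 1/8 above all follow the same primitive-definition pattern; the minimal sketch below restates that pattern outside the diff for readability. It is an illustrative example only, not part of the patch: CusExampleOp and the commented-out impl import are hypothetical placeholders, while prim_attr_register, PrimitiveWithInfer, init_prim_io_names, the zeros_like backprop, and the tensor_type-based dtype inference are taken directly from the classes shown above.

import mindspore as ms
from mindspore.ops import prim_attr_register, PrimitiveWithInfer
from mindspore.ops.composite import multitype_ops as C


class CusExampleOp(PrimitiveWithInfer):
    """Hypothetical two-input op written in the same style as the THOR ops above."""

    @prim_attr_register
    def __init__(self):
        # Name the primitive's inputs/outputs. The real ops also import their TBE
        # implementation here (e.g. from mindspore.ops._op_impl._custom_op...),
        # presumably so the kernel registration runs when the primitive is built.
        self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y'])

    def get_bprop(self):
        # The patch defines the gradient of each of these THOR ops as zeros_like
        # of its inputs.
        def bprop(x1, x2, out, dout):
            return (C.zeros_like(x1), C.zeros_like(x2))
        return bprop

    def infer_shape(self, data1_shape, data2_shape):
        # Output shape mirrors the first input, as in the new CusMatMulCubeDenseRight.
        return data1_shape

    def infer_dtype(self, data1_dtype, data2_dtype):
        # Same dtype construction CusMatMulCubeDenseRight uses above.
        return ms.common.dtype.tensor_type(getattr(ms, "float32"))
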
diff --git a/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py b/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py index 99c4b651c25..9092b8a9ff7 100644 --- a/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py @@ -101,4 +101,4 @@ def CusCholeskyTrsm(input_x,output, kernel_name): tik_instance.data_move(res[block_index,0,0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim,0,0) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) - return tik_instance + return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py index 0a53313d59d..2092c658c22 100644 --- a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py @@ -42,7 +42,8 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs if len(input_x_shape) > 2: - if (input_x_shape[0] == 1 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 4 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 4): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + if (input_x_shape[0] == 1 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 4 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 4): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 for val in input_x_shape: @@ -131,7 +132,8 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 4 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 8 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 8): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) + elif (input_x_shape[0] == 4 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 8 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 8): + input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 for val in input_x_shape: @@ -608,6 +610,7 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 for val in input_x_shape: + total_elements *= val blocks = 32 each_block_element = total_elements // blocks with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py index ade39a6ddb7..d0522824cac 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py @@ -54,7 +54,7 @@ def 
CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={} shape_b_input = input_x2.get("shape") matrix_max_input = input_x3.get("shape") input_shape = (tuple(shape_a_input), tuple(shape_b_input), tuple(matrix_max_input)) -if input_shape not in support_shape: + if input_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 128: From 648501daa07d5c5b4b5ba2906ec62f67522ecb13 Mon Sep 17 00:00:00 2001 From: z00478463 Date: Mon, 25 May 2020 19:56:18 +0800 Subject: [PATCH 3/8] for format --- .../ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py | 2 +- .../ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py | 1 - mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py index ad9eb696f88..a3517dce75b 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py @@ -352,7 +352,7 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F if trans_b and kn_shape == 1: block_out = cce.BLOCK_VECTOR - if not trans_b and n_shape == 1: + if not trans_b and n_shape == 1: block_out = cce.BLOCK_VECTOR if trans_a: diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py index 955e6889fbf..aeef521d1ea 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py @@ -370,7 +370,6 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, res_matmul, mo_tile=mo_tile, ko_tile=ko_tile, no_tile=no_tile, diag_opt=diag_opt, diag_size=DIAG_SIZE) - diag_opt=diag_opt, diag_size=DIAG_SIZE) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[res_matmul]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py b/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py index 838bb355e51..76fe9625a43 100644 --- a/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py @@ -229,7 +229,7 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0], 0, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + split_index * 2 * 14336], transpose_ub, 0, 1, 1792, 0, 0) - with tik_instance.for_range(0, 32, block_num=32) as block_idx: + with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 6, thread_num=2) as cc1: _inner_compute(cc1, block_idx) _inner_compute(6, block_idx) From 2d0ee05454eccafcd35f303b69575a36d6c33330 Mon Sep 17 00:00:00 2001 From: z00478463 Date: Tue, 26 May 2020 09:14:41 +0800 Subject: [PATCH 4/8] for pylint 2nd --- mindspore/ops/_op_impl/_custom_op/__init__.py | 11 - .../_op_impl/_custom_op/batch_matmul_impl.py | 167 +- .../_op_impl/_custom_op/cholesky_trsm_impl.py | 66 +- .../_custom_op/fused_abs_max1_impl.py | 864 +++++---- .../ops/_op_impl/_custom_op/img2col_impl.py 
| 1641 +++++++++-------- .../_custom_op/matmul_cube_dense_left_impl.py | 182 +- .../matmul_cube_dense_right_impl.py | 155 +- .../matmul_cube_fracz_left_cast_impl.py | 158 +- .../matmul_cube_fracz_right_mul_impl.py | 64 +- .../_op_impl/_custom_op/matmul_cube_impl.py | 95 +- .../_custom_op/matrix_combine_impl.py | 43 +- .../_custom_op/transpose02314_impl.py | 206 ++- 12 files changed, 2058 insertions(+), 1594 deletions(-) diff --git a/mindspore/ops/_op_impl/_custom_op/__init__.py b/mindspore/ops/_op_impl/_custom_op/__init__.py index d22ac306350..5fe583a60fc 100644 --- a/mindspore/ops/_op_impl/_custom_op/__init__.py +++ b/mindspore/ops/_op_impl/_custom_op/__init__.py @@ -14,14 +14,3 @@ # ============================================================================ """custom ops""" -from .batch_matmul_impl import CusBatchMatMul -from .cholesky_trsm_impl import CusCholeskyTrsm -from .fused_abs_max1_impl import CusFusedAbsMax1 -from .img2col_impl import CusImg2Col -from .matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft -from .matmul_cube_dense_right_impl import CusMatMulCubeDenseRight -from .matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast -from .matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul -from .matmul_cube_impl import CusMatMulCube -from .matrix_combine_impl import CusMatrixCombine -from .transpose02314_impl import CusTranspose02314 diff --git a/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py b/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py index ddbc26a03de..d8395c1e81c 100644 --- a/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py @@ -14,29 +14,31 @@ # ============================================================================ """batch_matmul_impl""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType - + cus_batchmatmul_op_info = TBERegOp("CusBatchMatMul") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("batchmatmul.so") \ - .compute_cost(10) \ - .kernel_name("CusBatchMatMul") \ - .partial_flag(True) \ - .input(0, "x1", False, "required", "all") \ - .input(1, "x2", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ - .get_op_info() - + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("batchmatmul.so") \ + .compute_cost(10) \ + .kernel_name("CusBatchMatMul") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + + def _get_flattern_shape(shape): flattern_shape = 1 for dim in shape: flattern_shape *= dim return (flattern_shape,) - + + def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index): input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf) t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf) @@ -66,12 +68,13 @@ def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_ matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8) tik_instance.data_move(res[res_index + thread_idx2 * 64], matmul_hybrid_f_t_local_UB, 
0, 1, 8, 0, 0) - + + def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index): input_1_local_UB = tik_instance.Tensor(dtype, [64], name="input_1_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input1[input1_index], 0, 1, 8, 0, 0) with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2: - input_2_local_UB = tik_instance.Tensor(dtype, [32*64], name="input_2_local_UB", + input_2_local_UB = tik_instance.Tensor(dtype, [32 * 64], name="input_2_local_UB", scope=tik.scope_ubuf) t_1_local_UB = input_2_local_UB matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [32], name="matmul_hybrid_f_t_local_UB", @@ -83,6 +86,8 @@ def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, inpu 1, 1, 1, 8) tik_instance.data_move(res[res_index + thread_idx2 * 32], matmul_hybrid_f_t_local_UB, 0, 1, 4, 0, 0) + + @op_info_register(cus_batchmatmul_op_info) def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"): if util.get_product_version() == util.VERSION_MINI: @@ -97,51 +102,54 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr dtype, input_x2.get("dtype").lower())) input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b) support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True), - ((36, 128, 128), (36, 128, 128), "float32", False, True), - ((5, 128, 128), (5, 128, 128), "float32", False, True), - ((18, 128, 128), (18, 128, 128), "float32", False, True), - ((16, 128, 128), (16, 128, 128), "float32", False, True), - ((9, 128, 128), (9, 128, 128), "float32", False, True), - ((1, 64, 64), (1, 64, 64), "float32", False, True), - ((1, 128, 128), (1, 128, 128), "float32", False, True), - ((4, 128, 128), (4, 128, 128), "float32", False, True), - ((2, 128, 128), (2, 128, 128), "float32", False, True)] + ((36, 128, 128), (36, 128, 128), "float32", False, True), + ((5, 128, 128), (5, 128, 128), "float32", False, True), + ((18, 128, 128), (18, 128, 128), "float32", False, True), + ((16, 128, 128), (16, 128, 128), "float32", False, True), + ((9, 128, 128), (9, 128, 128), "float32", False, True), + ((1, 64, 64), (1, 64, 64), "float32", False, True), + ((1, 128, 128), (1, 128, 128), "float32", False, True), + ((4, 128, 128), (4, 128, 128), "float32", False, True), + ((2, 128, 128), (2, 128, 128), "float32", False, True)] if input_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) - - + # if not transpose_a and transpose_b: batch, m, k = x1_shape _, n, _ = x2_shape - + input1_shape = _get_flattern_shape(x1_shape) input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm) input2_shape = _get_flattern_shape(x2_shape) input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm) - + output_shape = x1_shape res_shape = _get_flattern_shape(output_shape) res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm) - + if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True): with tik_instance.for_range(0, 18, block_num=18) as block_idx: with tik_instance.for_range(0, 2) as cc0: with tik_instance.for_range(0, 128, thread_num=2) as cc1: - input1_index = block_idx * 32768 + cc0*16384 + cc1 * 128 - input2_index = block_idx * 32768 + cc0*16384 - res_index = block_idx*32768 + cc0*16384 + cc1*128 + input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128 + input2_index = 
block_idx * 32768 + cc0 * 16384 + res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128 _inner_matmul_new(tik_instance, dtype, - input1, input1_index, - input2, input2_index, - res, res_index) + input1, input1_index, + input2, input2_index, + res, res_index) if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True): with tik_instance.for_range(0, 30, block_num=30) as block_idx: with tik_instance.for_range(0, 11) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as thread_idx: with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)): - input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf) - t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input1[(block_idx//6)*16384 + (block_idx % 6)*2816 + cc1_db * 256 + thread_idx*128], 0, 1, 16, 0, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", + scope=tik.scope_ubuf) + t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, input1[ + (block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 + thread_idx * 128], 0, 1, + 16, 0, 0) with tik_instance.for_range(0, 2) as vec_i: tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64], 0, 64, 1, 1, 16, 0) @@ -150,58 +158,61 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr scope=tik.scope_ubuf) t_1_local_UB = input_2_local_UB bisec_last_axis_local_UB = input_2_local_UB - matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64], name="matmul_hybrid_f_t_local_UB", + matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64], + name="matmul_hybrid_f_t_local_UB", scope=tik.scope_ubuf) matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(dtype, [64], name="matmul_hybrid_f_t_local_UB_dst_tmp", scope=tik.scope_ubuf) tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8) - tik_instance.data_move(input_2_local_UB, input2[(block_idx//6) * 16384 + thread_idx2*8192], 0, 1, + tik_instance.data_move(input_2_local_UB, + input2[(block_idx // 6) * 16384 + thread_idx2 * 8192], 0, 1, 1024, 0, 0) tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB, 128, 1, 1, 1, 8, 8, 8) tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64], 64, 1, 1, 1, 16, 16, 16) tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8) with tik_instance.for_range(0, 64) as cc6: - tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6], bisec_last_axis_local_UB[cc6*128], + tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6], + bisec_last_axis_local_UB[cc6 * 128], 1, 1, 1, 8) tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp, matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8) - tik_instance.data_move(res[(block_idx//6)*16384 + (block_idx%6)*2816 + cc1_db*256 + - thread_idx*128 + thread_idx2*64], - matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0) - + tik_instance.data_move( + res[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 + + thread_idx * 128 + thread_idx2 * 64], + matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0) + if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True): with tik_instance.for_range(0, 18, block_num=18) as block_idx: with tik_instance.for_range(0, 128, thread_num=2) as cc0: input1_index = block_idx * 
16384 + cc0 * 128 input2_index = block_idx * 16384 - res_index = block_idx*16384 + cc0*128 + res_index = block_idx * 16384 + cc0 * 128 _inner_matmul_new(tik_instance, dtype, - input1, input1_index, - input2, input2_index, - res, res_index) - - + input1, input1_index, + input2, input2_index, + res, res_index) + if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True): with tik_instance.for_range(0, 27, block_num=27) as block_idx: with tik_instance.for_range(0, 42, thread_num=2) as cc0: - input1_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + cc0 * 128 - input2_index = (block_idx//3) * 16384 - res_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + cc0*128 - _inner_matmul_new(tik_instance, dtype, - input1, input1_index, - input2, input2_index, - res, res_index) - with tik_instance.if_scope((block_idx % 3) < 2): - input1_index = (block_idx//3) * 16384 + (block_idx % 3)*5504 + 42*128 + input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128 input2_index = (block_idx // 3) * 16384 - res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42*128 + res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128 _inner_matmul_new(tik_instance, dtype, - input1, input1_index, - input2, input2_index, - res, res_index) - + input1, input1_index, + input2, input2_index, + res, res_index) + with tik_instance.if_scope((block_idx % 3) < 2): + input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128 + input2_index = (block_idx // 3) * 16384 + res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128 + _inner_matmul_new(tik_instance, dtype, + input1, input1_index, + input2, input2_index, + res, res_index) + if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True): with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 2, thread_num=2) as cc0: @@ -209,35 +220,35 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr input2_index = 0 res_index = block_idx * 128 + cc0 * 64 _inner_matmul_new_1_64_32_64(tik_instance, dtype, - input1, input1_index, - input2, input2_index, - res, res_index) - + input1, input1_index, + input2, input2_index, + res, res_index) + input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True), ((2, 128, 128), (2, 128, 128), "float32", False, True), ((4, 128, 128), (4, 128, 128), "float32", False, True), ((8, 128, 128), (8, 128, 128), "float32", False, True), ((16, 128, 128), (16, 128, 128), "float32", False, True) - ] + ] if input_shape in input_shape_list: block_num = 32 input1_unit_size = 128 - input2_unint_size = 128*128 + input2_unint_size = 128 * 128 with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx: block_process_ele_num = (batch * m * k) // block_num - loop_time = (batch*m*k)//block_num//input1_unit_size + loop_time = (batch * m * k) // block_num // input1_unit_size thread_num = 2 with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0: - input1_index = block_idx*block_process_ele_num + cc0*input1_unit_size + input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size if batch > 1: - input2_index = block_idx//(block_num//batch) * input2_unint_size + input2_index = block_idx // (block_num // batch) * input2_unint_size else: input2_index = 0 - res_index = block_idx*block_process_ele_num + cc0*input1_unit_size + res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size _inner_matmul_new(tik_instance, dtype, input1, 
input1_index, input2, input2_index, res, res_index) - + tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py b/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py index 9092b8a9ff7..50830fe0f69 100644 --- a/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py @@ -13,24 +13,25 @@ # limitations under the License. # ============================================================================ """CusCholeskyTrsm""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType cus_cholesky_trsm_op_info = TBERegOp("CusCholeskyTrsm") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("choleskytrsm.so") \ - .compute_cost(10) \ - .kernel_name("CusCholeskyTrsm") \ - .partial_flag(True) \ - .input(0, "x1", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .get_op_info() + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("choleskytrsm.so") \ + .compute_cost(10) \ + .kernel_name("CusCholeskyTrsm") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + @op_info_register(cus_cholesky_trsm_op_info) -def CusCholeskyTrsm(input_x,output, kernel_name): +def CusCholeskyTrsm(input_x, output, kernel_name): input_x_shape = input_x.get("shape") output_shape = output.get("shape") split_dim = 128 @@ -47,34 +48,36 @@ def CusCholeskyTrsm(input_x,output, kernel_name): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (split_dim,split_dim), name="input_x_ub", scope=tik.scope_ubuf) - temp_ub = tik_instance.Tensor("float32", (split_dim,split_dim), name="temp_ub", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="input_x_ub", scope=tik.scope_ubuf) + temp_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="temp_ub", scope=tik.scope_ubuf) assist_1_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_1_ub", scope=tik.scope_ubuf) assist_2_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_2_ub", scope=tik.scope_ubuf) - with tik_instance.for_range(0,split_dim) as i: - tik_instance.data_move(input_x_ub[i,0], input_x[block_index * split_dim + i, block_index * split_dim], 0, 1, vector_repeat_times * 8, 0, 0) - scalar1 = tik_instance.Scalar("float32", init_value = -0.5) + with tik_instance.for_range(0, split_dim) as i: + tik_instance.data_move(input_x_ub[i, 0], input_x[block_index * split_dim + i, block_index * split_dim], 0, + 1, vector_repeat_times * 8, 0, 0) + scalar1 = tik_instance.Scalar("float32", init_value=-0.5) with tik_instance.for_range(0, split_dim) as i: - scalar2= tik_instance.Scalar("float32") - tik_instance.vln(64, assist_1_ub[0], input_x_ub[i,0], vector_repeat_times, 1, 1, 8, 8) + scalar2 = tik_instance.Scalar("float32") + tik_instance.vln(64, 
assist_1_ub[0], input_x_ub[i, 0], vector_repeat_times, 1, 1, 8, 8) tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1, vector_repeat_times, 1, 1, 8, 8) tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0], vector_repeat_times, 1, 1, 8, 8) scalar2.set_as(assist_1_ub[i]) - tik_instance.vmuls(64, input_x_ub[i,0], input_x_ub[i,0], scalar2, vector_repeat_times, 1, 1, 8, 8) + tik_instance.vmuls(64, input_x_ub[i, 0], input_x_ub[i, 0], scalar2, vector_repeat_times, 1, 1, 8, 8) with tik_instance.for_range(i + 1, split_dim) as j: - scalar3= tik_instance.Scalar("float32") + scalar3 = tik_instance.Scalar("float32") scalar3.set_as(input_x_ub[i, j]) - tik_instance.vmuls(64,temp_ub[j, 0], input_x_ub[i, 0], scalar3, vector_repeat_times, 1, 1, 8, 8) - tik_instance.vsub(64,input_x_ub[i+1,0], input_x_ub[i+1,0], temp_ub[i+1,0], (split_dim-1-i) * vector_repeat_times, 1, 1, 1, 8, 8, 8) + tik_instance.vmuls(64, temp_ub[j, 0], input_x_ub[i, 0], scalar3, vector_repeat_times, 1, 1, 8, 8) + tik_instance.vsub(64, input_x_ub[i + 1, 0], input_x_ub[i + 1, 0], temp_ub[i + 1, 0], + (split_dim - 1 - i) * vector_repeat_times, 1, 1, 1, 8, 8, 8) zero = tik_instance.Scalar("float32") zero.set_as(0.0) one = tik_instance.Scalar("float32") one.set_as(1.0) with tik_instance.for_range(0, split_dim) as i: - tik_instance.vector_dup(64, temp_ub[i,0], zero, vector_repeat_times, 1, 8) + tik_instance.vector_dup(64, temp_ub[i, 0], zero, vector_repeat_times, 1, 8) temp_ub.__setitem__(i * split_dim + i, one) chol_diag_element_final = tik_instance.Scalar("float32") @@ -89,16 +92,19 @@ def CusCholeskyTrsm(input_x,output, kernel_name): with tik_instance.for_range(0, i) as j: chol_diag_element_loop = tik_instance.Scalar("float32") chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j]) - tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop, vector_repeat_times,1,1,8,8) - tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times,1,1,1,8,8,8) + tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop, + vector_repeat_times, 1, 1, 8, 8) + tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times, 1, 1, 1, 8, 8, 8) temp_scalar = tik_instance.Scalar("float32") temp_scalar.set_as(input_x_ub[index, index]) chol_diag_element = tik_instance.Scalar("float32") chol_diag_element.set_as(1.0 / temp_scalar) - tik_instance.vsub(64,temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times,1,1,1,8,8,8) - tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element,vector_repeat_times,1,1,8,8) + tik_instance.vsub(64, temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times, 1, 1, 1, 8, 8, + 8) + tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element, vector_repeat_times, 1, 1, + 8, 8) - tik_instance.data_move(res[block_index,0,0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim,0,0) + tik_instance.data_move(res[block_index, 0, 0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim, 0, 0) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py index 2092c658c22..0c47ce78b15 100644 --- a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py @@ -13,25 +13,26 @@ # limitations under the License. 
# ============================================================================ """CusFusedAbsMax1""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType cus_fused_abs_max1_op_info = TBERegOp("CusFusedAbsMax1") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("fusedabsmax1.so") \ - .compute_cost(10) \ - .kernel_name("CusFusedAbsMax1") \ - .partial_flag(True) \ - .attr("origin_shape", "required", "listInt", "all") \ - .input(0, "x1", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .get_op_info() + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("fusedabsmax1.so") \ + .compute_cost(10) \ + .kernel_name("CusFusedAbsMax1") \ + .partial_flag(True) \ + .attr("origin_shape", "required", "listInt", "all") \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + @op_info_register(cus_fused_abs_max1_op_info) -def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs_max1"): +def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_max1"): input_x_shape = input_x.get("shape") output_shape = output.get("shape") @@ -40,9 +41,9 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) - if len(input_x_shape) > 2: - if (input_x_shape[0] == 1 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 4 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 4): + if (input_x_shape[0] == 1 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 4 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 4): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -50,28 +51,39 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 - tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time,1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, 
input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 16 and input_x_shape[1] == 8): - if origin_shape[0] == 147 and (input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128) : + elif (input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 16 and input_x_shape[1] == 8): + if origin_shape[0] == 147 and ( + input_x_shape[0] == 2 and input_x_shape[1] == 128 and input_x_shape[2] == 128): assert origin_shape[0] == 147 assert origin_shape[1] == 147 phase_1 = 16384 @@ -80,9 +92,11 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs each_block_element = phase_1 // blocks + 64 input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[512 * block_index], 0, 1, 512 // 8, 0, 0) line_id = block_index % 19 tik_instance.data_move(input_x_ub[512], input_x[16384 + 
128 * line_id], 0, 1, 8, 0, 0) @@ -92,16 +106,22 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, + 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) else: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -111,28 +131,38 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, 
input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, + 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 4 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 8 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 8): + elif (input_x_shape[0] == 4 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 8 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 8): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -140,10 +170,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, 
input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) @@ -151,36 +184,50 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 32 and input_x_shape[1] == 16) or (input_x_shape[0] == 16 and input_x_shape[1] == 32): - if (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[0] == 1000: + elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 32 and input_x_shape[1] == 16) or ( + input_x_shape[0] == 16 and input_x_shape[1] == 32): + if (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[ + 0] == 1000: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) blocks = 32 each_block_element = 7 * 128 * 128 // 32 + 4 * 128 phase_1 = 7 * 128 * 128 // 32 - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", 
(each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, + 0) move_idx = block_index % 8 - tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, 128 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, + 128 // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) vmask = 1000 - 7 * 128 - 64 - with tik_instance.for_range(0, 4) as loop_idx : - tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + with tik_instance.for_range(0, 4) as loop_idx: + tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], + input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub[512], input_x_ub[2048], 24, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) @@ -189,38 +236,52 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) with tik_instance.for_range(0, 4) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, + 8, 8) + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) 
+ tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, + 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[0] == 1001: + elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[ + 0] == 1001: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) blocks = 32 each_block_element = 7 * 128 * 128 // 32 + 4 * 128 phase_1 = 7 * 128 * 128 // 32 - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, + 0) + tik_instance.data_move(input_x_ub[phase_1], input_x[114688 + block_index * 384], 0, 1, 384 // 8, 0, + 0) move_idx = block_index % 9 - tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, 128 // 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1 + 384], input_x[114688 + 96 * 128 + move_idx * 128], 0, 1, + 128 // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) vmask = 1001 - 7 * 128 - 64 with tik_instance.for_range(0, 4) as loop_idx: - tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(vmask, input_x_ub[3584 + 128 * loop_idx], input_x_ub[3584 + 128 * loop_idx], + input_x_ub[3584 + 128 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub[512], input_x_ub[2048], 24, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) @@ -228,17 +289,24 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) with tik_instance.for_range(0, 4) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[3584 + 128 * loop_idx], 1, 1, 1, 1, 8, + 8, 8) + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) 
tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, + 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) else: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -248,10 +316,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) @@ -260,18 +331,26 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, 
broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, + 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, + 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 16 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 16 and input_x_shape[1] == 64) or (input_x_shape[0] == 64 and input_x_shape[1] == 16): + elif (input_x_shape[0] == 16 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 16 and input_x_shape[1] == 64) or ( + input_x_shape[0] == 64 and input_x_shape[1] == 16): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -279,10 +358,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) @@ -292,16 +374,22 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, 
input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 5 and input_x_shape[1] == 128 and input_x_shape[2] == 128 and origin_shape[0] == 576: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -311,33 +399,43 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs each_block_element = total_elements // blocks phase_1 = 2048 phase_2 = 128 - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[phase_1 * block_index], 0, 1, phase_1 // 8, 0, 0) tik_instance.data_move(input_x_ub[phase_1], input_x[65536 + phase_2 * block_index * 2], 0, 1, 8, 0, 0) - tik_instance.data_move(input_x_ub[phase_1 + 64], input_x[65536 + 128 + phase_2 * block_index * 2], 0, 1, 8, 0, 0) + tik_instance.data_move(input_x_ub[phase_1 + 64], input_x[65536 + 128 + phase_2 * block_index * 2], 0, 1, + 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) - tik_instance.vmax(64, input_x_ub[2048], input_x_ub[2048], input_x_ub[2048+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[2048], input_x_ub[2048], input_x_ub[2048 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, 
input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 9 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 72 and input_x_shape[1] == 8): + elif (input_x_shape[0] == 9 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 72 and input_x_shape[1] == 8): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -345,10 +443,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), 
name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) @@ -357,20 +458,26 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096+256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096+128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[4096], input_x_ub[4096], input_x_ub[4096 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 18 and input_x_shape[1] == 128 and input_x_shape[2] == 128: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -380,10 +487,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with 
tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) @@ -393,23 +503,30 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[8192], input_x_ub[8192], input_x_ub[8192 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, 
broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 36 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or (input_x_shape[0] == 144 and input_x_shape[1] == 16): + elif (input_x_shape[0] == 36 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( + input_x_shape[0] == 144 and input_x_shape[1] == 16): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -417,14 +534,18 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time_1 = 255 repeat_time_2 = each_block_element // 64 - 255 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, + 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) @@ -433,22 +554,32 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384], 
input_x_ub[16384], input_x_ub[16384 + 1024], 16, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 512], 8, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 256], 4, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 128], 2, 1, 1, 1, 8, 8, + 8) + tik_instance.vmax(64, input_x_ub[16384], input_x_ub[16384], input_x_ub[16384 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 128 and input_x_shape[1] == 63: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -458,32 +589,47 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time_1 = 255 repeat_time_2 = each_block_element // 64 - 255 * 3 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, 
input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 3 * 64], input_x_ub[repeat_time_1 * 3 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_1, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 3 * 64], input_x_ub[repeat_time_1 * 3 * 64], + repeat_time_2, 1, 1, 8, 8) loop_size = each_block_element // 16384 with tik_instance.for_range(0, loop_size) as loop_idx: - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) with tik_instance.for_range(0, loop_size - 1) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, + 8) tail_element = each_block_element - 16384 * loop_size repeats = tail_element // 64 - with tik_instance.for_range(0, repeats) as i : - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * loop_size + i * 64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with 
tik_instance.for_range(0, repeats) as i: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * loop_size + i * 64], 1, 1, 1, 1, 8, + 8, 8) + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, input_x_ub[64 + cc0 * 64], data_temp, 1, 1, 8) @@ -494,7 +640,8 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[128 + 64], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub[64], input_x_ub[64], input_x_ub[64 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.data_move(res[block_index, 0], input_x_ub[64], 0, 1, 8, 0, 0) - elif (input_x_shape[0] == 32 and input_x_shape[1] == 128) or (input_x_shape[0] == 128 and input_x_shape[1] == 32): + elif (input_x_shape[0] == 32 and input_x_shape[1] == 128) or ( + input_x_shape[0] == 128 and input_x_shape[1] == 32): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 @@ -502,37 +649,57 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time_1 = 255 repeat_time_2 = each_block_element // 64 - 255 * 2 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_2, 1, 1, 8, 8) loop_size = each_block_element // 16384 with tik_instance.for_range(0, loop_size) as loop_idx: - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], 
input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 8192], 128, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 4096], 64, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 2048], 32, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 1024], 16, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 512], 8, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[16384 * loop_idx], input_x_ub[16384 * loop_idx], + input_x_ub[16384 * loop_idx + 64], 1, 1, 1, 1, 8, 8, 8) with tik_instance.for_range(0, loop_size - 1) as loop_idx: - tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384 * (loop_idx + 1)], 1, 1, 1, 1, 8, 8, + 8) + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 
8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 288 and input_x_shape[1] == 32: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -542,20 +709,23 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: assist_ub = tik_instance.Tensor("float32", (64,), name="assist_ub", scope=tik.scope_ubuf) zero = tik_instance.Scalar("float32") zero.set_as(0) tik_instance.vector_dup(64, assist_ub, zero, 1, 1, 8) input_x_ub = tik_instance.Tensor("float32", (32768,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) repeat_time_1 = 255 repeat_time_2 = 32768 // 64 - 255 * 2 - + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 0], 0, 1, 4096, 0, 0) tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_2, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) @@ -567,11 +737,14 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, 0) + + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, + 0) tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_2, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) @@ -583,7 +756,8 @@ def CusFusedAbsMax1(input_x, output, origin_shape = 
None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 65536], 0, 1, 1024, 0, 0) + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 65536], 0, 1, 1024, 0, + 0) tik_instance.vabs(64, input_x_ub, input_x_ub, 128, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) @@ -593,17 +767,23 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - - with tik_instance.for_range(0,64) as cc0: + + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(assist_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 64 and input_x_shape[1] == 128: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -613,20 +793,23 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: assist_ub = tik_instance.Tensor("float32", (64,), name="assist_ub", scope=tik.scope_ubuf) zero = tik_instance.Scalar("float32") zero.set_as(0) tik_instance.vector_dup(64, assist_ub, zero, 1, 1, 8) input_x_ub = tik_instance.Tensor("float32", (32768,), 
name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) repeat_time_1 = 255 repeat_time_2 = 32768 // 64 - 255 * 2 tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 0], 0, 1, 4096, 0, 0) tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_2, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) @@ -639,10 +822,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, 0) + tik_instance.data_move(input_x_ub[0], input_x[each_block_element * block_index + 32768], 0, 1, 4096, 0, + 0) tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_1, 1, + 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 2 * 64], input_x_ub[repeat_time_1 * 2 * 64], + repeat_time_2, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[16384], 255, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub[16320], input_x_ub[16320], input_x_ub[32704], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) @@ -655,16 +841,22 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, assist_ub, assist_ub, input_x_ub, 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(assist_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - 
tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif (input_x_shape[0] == 64 and input_x_shape[1] == 32) or (input_x_shape[0] == 32 and input_x_shape[1] == 64): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -674,14 +866,18 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time_1 = 255 repeat_time_2 = each_block_element // 64 - 255 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time_1, 1, 1, 8, 8) - tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, 1, 8, 8) + tik_instance.vabs(64, input_x_ub[repeat_time_1 * 64], input_x_ub[repeat_time_1 * 64], repeat_time_2, 1, + 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[8192], 128, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[4096], 64, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[2048], 32, 1, 1, 1, 8, 8, 8) @@ -690,16 +886,22 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, 
broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 36 and input_x_shape[1] == 4: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -709,10 +911,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) @@ -722,16 +927,22 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - 
tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 4 and input_x_shape[1] == 4: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -741,24 +952,33 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 
8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 49 and input_x_shape[1] == 4: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -768,10 +988,13 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, 24, 1, 1, 8, 8) @@ -781,20 +1004,26 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[256], 4, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[128], 2, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024+256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024+128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024+64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 256], 4, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 128], 2, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, input_x_ub[1024], input_x_ub[1024], input_x_ub[1024 + 64], 1, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, 
broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif input_x_shape[0] == 1 and input_x_shape[1] == 64 and input_x_shape[2] == 64: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) @@ -804,31 +1033,40 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs total_elements *= val blocks = 32 each_block_element = total_elements // blocks - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", scope=tik.scope_ubuf) - broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, each_block_element // 8, 0, 0) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (each_block_element,), name="input_x_ub", + scope=tik.scope_ubuf) + broadcast_0_local_UB = tik_instance.Tensor("float32", (4096,), name="broadcast_0_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_x_ub, input_x[each_block_element * block_index], 0, 1, + each_block_element // 8, 0, 0) repeat_time = each_block_element // 64 tik_instance.vabs(64, input_x_ub, input_x_ub, repeat_time, 1, 1, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[64], 1, 1, 1, 1, 8, 8, 8) - with tik_instance.for_range(0,64) as cc0: + with tik_instance.for_range(0, 64) as cc0: data_temp = tik_instance.Scalar("float32") data_temp.set_as(input_x_ub[cc0]) tik_instance.vector_dup(64, broadcast_0_local_UB[cc0 * 64], data_temp, 1, 1, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, 8, 8, 8) - tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, 8, 8, 8) - 
tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[2048], 32, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[1024], 16, 1, 1, + 1, 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[512], 8, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[256], 4, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[128], 2, 1, 1, 1, + 8, 8, 8) + tik_instance.vmax(64, broadcast_0_local_UB, broadcast_0_local_UB, broadcast_0_local_UB[64], 1, 1, 1, 1, + 8, 8, 8) tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) - + else: raise RuntimeError("UnSupportedShape") elif len(input_x_shape) == 2 and (input_x_shape[0] == 32 and input_x_shape[1] == 64): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) - input_x_ub = tik_instance.Tensor("float32", (32*64,), name="input_x_ub", scope=tik.scope_ubuf) + input_x_ub = tik_instance.Tensor("float32", (32 * 64,), name="input_x_ub", scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x, 0, 1, 256, 0, 0) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[1024], 16, 1, 1, 1, 8, 8, 8) tik_instance.vmax(64, input_x_ub, input_x_ub, input_x_ub[512], 8, 1, 1, 1, 8, 8, 8) @@ -838,6 +1076,6 @@ def CusFusedAbsMax1(input_x, output, origin_shape = None, kernel_name="fused_abs tik_instance.data_move(res[0], input_x_ub, 0, 1, 1, 0, 0) else: raise RuntimeError("UnSupportedShape") - + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/img2col_impl.py b/mindspore/ops/_op_impl/_custom_op/img2col_impl.py index 0d69240dc43..8c1fd1262f5 100644 --- a/mindspore/ops/_op_impl/_custom_op/img2col_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/img2col_impl.py @@ -13,26 +13,27 @@ # limitations under the License. 
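A minimal host-side sketch of what the CusFusedAbsMax1 kernel above computes, assuming the flattened input splits evenly across the 32 AI-core blocks and each block length is a power of two (the function name and host-side framing here are illustrative, not part of this patch): the ladder of 64-lane vmax calls at offsets 2048, 1024, 512, 256, 128 and 64 folds the work buffer in halves, so each block ends up holding the maximum of |x| over its slice.

import numpy as np

def fused_abs_max1_reference(x, blocks=32):
    """Per-block abs-max, modelling the on-device vmax halving ladder."""
    buf = np.abs(np.asarray(x, dtype=np.float32)).reshape(blocks, -1)
    while buf.shape[1] > 1:
        half = buf.shape[1] // 2
        # fold the upper half into the lower half, like vmax(dst, dst, dst[offset])
        buf = np.maximum(buf[:, :half], buf[:, half:])
    return buf[:, 0]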
# ============================================================================ """CusImg2ColNC1HWC0""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType - + cus_img2col_info = TBERegOp("CusImg2Col") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("img2col.so") \ - .compute_cost(10) \ - .kernel_name("CusImg2Col") \ - .partial_flag(True) \ - .attr("ksizes", "required", "listInt", "all") \ - .attr("strides", "required", "listInt", "all") \ - .attr("dilates", "required", "listInt", "all") \ - .attr("mode", "required", "str", "all") \ - .input(0, "x1", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_5HD, DataType.F16_FracNZ) \ - .get_op_info() - + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("img2col.so") \ + .compute_cost(10) \ + .kernel_name("CusImg2Col") \ + .partial_flag(True) \ + .attr("ksizes", "required", "listInt", "all") \ + .attr("strides", "required", "listInt", "all") \ + .attr("dilates", "required", "listInt", "all") \ + .attr("mode", "required", "str", "all") \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_FracNZ) \ + .get_op_info() + + @op_info_register(cus_img2col_info) def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img2col"): input_x_shape = input_x.get("shape") @@ -43,7 +44,7 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img _, filter_h, filter_w, _ = ksizes _, stride_h, stride_w, _ = strides _, dilation_filter_h, dilation_filter_w, _ = dilates - + input_shape = (tuple(input_x_shape), input_x_dtype, (filter_h, filter_w), (stride_h, stride_w)) supported_shape = [((32, 32, 14, 14, 16), 'float16', (3, 3), (2, 2)), ((32, 1, 224, 224, 16), 'float16', (7, 7), (2, 2)), @@ -63,102 +64,106 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img ((32, 8, 28, 28, 16), 'float16', (1, 1), (1, 1)), ((32, 32, 28, 28, 16), 'float16', (1, 1), (1, 1)), ((32, 16, 14, 14, 16), 'float16', (1, 1), (1, 1)), - ((32, 16, 56, 56, 16), 'float16', (1, 1), (1, 1)),] - - + ((32, 16, 56, 56, 16), 'float16', (1, 1), (1, 1)), ] + if input_shape not in supported_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) - + output_tmp = [N * int(H // stride_h) * int(W // stride_w), filter_h * filter_w * C] output_shape = [output_tmp[1] // 16, output_tmp[0] // 16, 16, 16] if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) - + input_x = tik_instance.Tensor("float16", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm) - + if input_shape == ((32, 1, 224, 224, 16), 'float16', (7, 7), (2, 2)): - pad = [3,3,3,3] + pad = [3, 3, 3, 3] l1_h = 56 l1_w = 224 c1_index = 0 jump_stride = 1 repeat_mode = 1 - - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53760,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - - tik_instance.data_move(input_1_1_local_L1, 
input_x[block_index,0,0,0,0], 0, 1, 12544, 0, 0) - with tik_instance.for_range(0,7) as eeb: - with tik_instance.for_range(0,7) as cc0: + + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53760,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 7) as eeb: + with tik_instance.for_range(0, 7) as cc0: temp = eeb % 2 rep = ((55 - temp - (-3 + eeb)) // 2 + 1) * 7 fetch_filter_w = cc0 fetch_filter_h = eeb left_top_w = -3 left_top_h = -3 - + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, rep) as cc1: - tik_instance.data_move(res[cc0 + eeb * 7,cc1 + 784 * block_index,0,0], input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) - - with tik_instance.for_range(1,3) as eeb0: - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,56*eeb0,0,0], 0, 1, 12544, 0, 0) - with tik_instance.for_range(0,7) as eeb: - with tik_instance.for_range(0,7) as cc0: + tik_instance.data_move(res[cc0 + eeb * 7, cc1 + 784 * block_index, 0, 0], + input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) + + with tik_instance.for_range(1, 3) as eeb0: + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 56 * eeb0, 0, 0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 7) as eeb: + with tik_instance.for_range(0, 7) as cc0: temp = eeb % 2 rep_prefix = ((55 - temp - (-3 + eeb)) // 2 + 1) * 7 rep = 196 fetch_filter_w = cc0 fetch_filter_h = eeb left_top_w = -3 - + left_top_h = 1 + ((55 - temp - (-3 + eeb)) // 2 - 29) * 2 - + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) with tik_instance.for_range(0, rep) as cc1: - tik_instance.data_move(res[cc0 + eeb * 7,cc1 + rep_prefix + (eeb0 - 1) * rep + 784 * block_index, 0,0], input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) - - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,56*3,0,0], 0, 1, 12544, 0, 0) - + tik_instance.data_move( + res[cc0 + eeb * 7, cc1 + rep_prefix + (eeb0 - 1) * rep + 784 * block_index, 0, 0], + input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) + + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 56 * 3, 0, 0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 7) as eeb: with tik_instance.for_range(0, 7) as cc0: temp = eeb % 2 @@ -169,37 +174,40 @@ def 
CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_w = -3 left_top_h = 1 + ((55 - temp - (-3 + eeb)) // 2 - 29) * 2 tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, rep) as cc1: - tik_instance.data_move(res[cc0 + eeb * 7,cc1 + rep_prefix + 784 * block_index,0,0], input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) - + tik_instance.data_move(res[cc0 + eeb * 7, cc1 + rep_prefix + 784 * block_index, 0, 0], + input_1_1_fractal_L1_local_UB[cc1 * 256], 0, 1, 16, 0, 0) + if input_shape == ((32, 4, 56, 56, 16), 'float16', (3, 3), (1, 1)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 56 l1_w = 56 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 12544, 0, 0) with tik_instance.for_range(0, 9) as eeb0: rep = 196 fetch_filter_w = eeb0 % 3 @@ -209,260 +217,288 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img with tik_instance.for_range(0, 4) as eeb1: c1_index = eeb1 tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) with tik_instance.for_range(0, rep) as i: - tik_instance.data_move(res[eeb1 * 9 + eeb0,i + 196 * block_index,0,0], input_1_1_fractal_L1_local_UB[i * 256], 0, 1, 16, 0, 0) + tik_instance.data_move(res[eeb1 * 9 + eeb0, i + 196 * block_index, 0, 0], + input_1_1_fractal_L1_local_UB[i * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 8, 56, 56, 16), 'float16', (3, 3), (2, 2)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 56 l1_w = 56 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (401408,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (112896,), scope=tik.scope_ubuf, name = 
"input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 25088, 0, 0) - with tik_instance.for_range(0,8) as eeb0: - with tik_instance.for_range(0,9) as eeb1: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (401408,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (112896,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0, 8) as eeb0: + with tik_instance.for_range(0, 9) as eeb1: rep = 49 fetch_filter_w = eeb1 % 3 fetch_filter_h = eeb1 // 3 left_top_w = -1 left_top_h = -1 c1_index = eeb0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[49*256*eeb1], input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,9) as eeb1: - with tik_instance.for_range(0,49) as i: - tik_instance.data_move(res[eeb1 + eeb0 * 9,49 * block_index + i,0,0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[49 * 256 * eeb1], input_1_1_local_L1, + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 9) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb1 + eeb0 * 9, 49 * block_index + i, 0, 0], + input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 8, 28, 28, 16), 'float16', (3, 3), (1, 1)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 28 l1_w = 28 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (112896,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 6272, 0, 0) - with tik_instance.for_range(0,8) as eeb0: - with tik_instance.for_range(0,9) as eeb1: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (112896,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 6272, 0, 0) + with tik_instance.for_range(0, 8) as eeb0: + with tik_instance.for_range(0, 9) as eeb1: rep = 49 fetch_filter_w = eeb1 % 3 fetch_filter_h = eeb1 // 3 left_top_w = -1 left_top_h = -1 c1_index = eeb0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[49*256*eeb1], input_1_1_local_L1, - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with 
tik_instance.for_range(0,9) as eeb1: - with tik_instance.for_range(0,49) as i: - tik_instance.data_move(res[eeb1 + eeb0 * 9,49 * block_index + i,0,0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[49 * 256 * eeb1], input_1_1_local_L1, + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 9) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb1 + eeb0 * 9, 49 * block_index + i, 0, 0], + input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 16, 28, 28, 16), 'float16', (3, 3), (2, 2)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 28 l1_w = 28 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: + with tik_instance.for_range(0, 32, block_num=32) as block_index: eeb0 = block_index % 2 eeb1 = block_index // 2 - input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_1_local_L1[i * 12544], input_x[i + 16 * eeb0,eeb1,0,0,0], 0, 1, 784, 0, 0) + input_1_1_local_L1 = tik_instance.Tensor("float16", (200704,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_1_local_L1[i * 12544], input_x[i + 16 * eeb0, eeb1, 0, 0, 0], 0, 1, 784, + 0, 0) - with tik_instance.for_range(0,9) as eeb3: + with tik_instance.for_range(0, 9) as eeb3: rep = 13 fetch_filter_w = eeb3 % 3 fetch_filter_h = eeb3 // 3 left_top_w = -1 left_top_h = -1 c1_index = 0 - with tik_instance.for_range(0,16) as i: + with tik_instance.for_range(0, 16) as i: tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * i], input_1_1_local_L1[12544 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196,0,0) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) with tik_instance.for_range(196 * eeb0, 196 * (eeb0 + 1)) as i: - tik_instance.data_move(res[eeb1 * 9 + eeb3,i,0,0], input_1_2_fractal_L1_local_UB[256 * (i - 196 * 
eeb0)],0,1,16,0,0) + tik_instance.data_move(res[eeb1 * 9 + eeb3, i, 0, 0], + input_1_2_fractal_L1_local_UB[256 * (i - 196 * eeb0)], 0, 1, 16, 0, 0) - if input_shape == ((32, 16, 14, 14, 16), 'float16', (3, 3), (1, 1)): - pad = [1,1,1,1] + if input_shape == ((32, 16, 14, 14, 16), 'float16', (3, 3), (1, 1)): + pad = [1, 1, 1, 1] l1_h = 14 l1_w = 14 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: + with tik_instance.for_range(0, 32, block_num=32) as block_index: eeb0 = block_index % 2 eeb1 = block_index // 2 - input_1_1_local_L1 = tik_instance.Tensor("float16", (50176,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i + 16 * eeb0,eeb1,0,0,0], 0, 1, 196, 0, 0) + input_1_1_local_L1 = tik_instance.Tensor("float16", (50176,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i + 16 * eeb0, eeb1, 0, 0, 0], 0, 1, 196, + 0, 0) - with tik_instance.for_range(0,9) as eeb3: + with tik_instance.for_range(0, 9) as eeb3: rep = 13 fetch_filter_w = eeb3 % 3 fetch_filter_h = eeb3 // 3 left_top_w = -1 left_top_h = -1 c1_index = 0 - with tik_instance.for_range(0,16) as i: + with tik_instance.for_range(0, 16) as i: tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * i], input_1_1_local_L1[3136 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196,0,0) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) with tik_instance.for_range(196 * eeb0, 196 * (eeb0 + 1)) as i: - tik_instance.data_move(res[eeb1 * 9 + eeb3,i,0,0], input_1_2_fractal_L1_local_UB[256 * (i - 196 * eeb0)],0,1,16,0,0) + tik_instance.data_move(res[eeb1 * 9 + eeb3, i, 0, 0], + input_1_2_fractal_L1_local_UB[256 * (i - 196 * eeb0)], 0, 1, 16, 0, 0) if input_shape == ((32, 32, 14, 14, 16), 'float16', (3, 3), (2, 2)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 14 l1_w = 14 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), 
scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,block_index,0,0,0], 0, 1, 196, 0, 0) - with tik_instance.for_range(0,9) as eeb: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i, block_index, 0, 0, 0], 0, 1, 196, 0, 0) + with tik_instance.for_range(0, 9) as eeb: rep = 4 - fetch_filter_w = eeb % 3 + fetch_filter_w = eeb % 3 fetch_filter_h = eeb // 3 left_top_w = -1 left_top_h = -1 c1_index = 0 - with tik_instance.for_range(0,32) as i: + with tik_instance.for_range(0, 32) as i: tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[3136 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], + input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,98) as i: - tik_instance.data_move(res[eeb + block_index * 9, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + with tik_instance.for_range(0, 98) as i: + tik_instance.data_move(res[eeb + block_index * 9, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], + 0, 1, 16, 0, 0) if input_shape == ((32, 64, 14, 14, 16), 'float16', (1, 1), (2, 2)): - pad = [0,0,0,0] + pad = [0, 0, 0, 0] l1_h = 14 l1_w = 14 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,2) 
as eeb0: - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,block_index * 2 + eeb0,0,0,0], 0, 1, 196, 0, 0) - with tik_instance.for_range(0,32) as i: + with tik_instance.for_range(0, 2) as eeb0: + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i, block_index * 2 + eeb0, 0, 0, 0], 0, + 1, 196, 0, 0) + with tik_instance.for_range(0, 32) as i: rep = 4 fetch_filter_w = 0 fetch_filter_h = 0 @@ -470,41 +506,46 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_h = 0 c1_index = 0 tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[3136 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], + input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,98) as i: - tik_instance.data_move(res[eeb0 + block_index * 2, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + with tik_instance.for_range(0, 98) as i: + tik_instance.data_move(res[eeb0 + block_index * 2, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], + 0, 1, 16, 0, 0) if input_shape == ((32, 32, 7, 7, 16), 'float16', (3, 3), (1, 1)): - pad = [1,1,1,1] + pad = [1, 1, 1, 1] l1_h = 7 l1_w = 7 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i,block_index,0,0,0], 0, 1, 49, 0, 0) + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i, block_index, 0, 0, 0], 0, 1, 49, 0, 0) with tik_instance.for_range(0, 9) as eeb: rep = 4 @@ -513,92 +554,106 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_w = -1 left_top_h = -1 c1_index = 0 - with tik_instance.for_range(0,32) as i: + with tik_instance.for_range(0, 32) as i: tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 
* i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], + input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,98) as i: - tik_instance.data_move(res[eeb + block_index * 9, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + with tik_instance.for_range(0, 98) as i: + tik_instance.data_move(res[eeb + block_index * 9, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], + 0, 1, 16, 0, 0) if input_shape == ((32, 128, 7, 7, 16), 'float16', (1, 1), (1, 1)): - pad = [0,0,0,0] + pad = [0, 0, 0, 0] l1_h = 7 l1_w = 7 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,4) as eeb0: - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i,eeb0 + block_index * 4,0,0,0], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,32) as i: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 4) as eeb0: + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i, eeb0 + block_index * 4, 0, 0, 0], 0, + 1, 49, 0, 0) + with tik_instance.for_range(0, 32) as i: rep = 4 fetch_filter_w = 0 fetch_filter_h = 0 left_top_w = 0 left_top_h = 0 c1_index = 0 - with tik_instance.for_range(0,32) as i: + with tik_instance.for_range(0, 32) as i: tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i*1024],0,1,49,0,0) + with 
tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], + input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,98) as i: - tik_instance.data_move(res[eeb0 + block_index * 4, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) + with tik_instance.for_range(0, 98) as i: + tik_instance.data_move(res[eeb0 + block_index * 4, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], + 0, 1, 16, 0, 0) if input_shape == ((32, 64, 14, 14, 16), 'float16', (1, 1), (1, 1)): - pad = [0,0,0,0] + pad = [0, 0, 0, 0] l1_h = 14 l1_w = 14 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_2_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, name = "input_1_2_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,block_index * 2,0,0,0], 0, 1, 196, 0, 0) - tik_instance.data_move(input_1_2_local_L1[i * 3136], input_x[i,block_index * 2 + 1,0,0,0], 0, 1, 196, 0, 0) - with tik_instance.for_range(0,2) as eeb1: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_2_local_L1 = tik_instance.Tensor("float16", (100352,), scope=tik.scope_cbuf, + name="input_1_2_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i, block_index * 2, 0, 0, 0], 0, 1, 196, 0, + 0) + tik_instance.data_move(input_1_2_local_L1[i * 3136], input_x[i, block_index * 2 + 1, 0, 0, 0], 0, 1, + 196, 0, 0) + with tik_instance.for_range(0, 2) as eeb1: with tik_instance.for_range(eeb1 * 16, (eeb1 + 1) * 16) as i: rep = 13 fetch_filter_w = 0 @@ -606,30 +661,33 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_w = 0 left_top_h = 0 c1_index = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - eeb1 * 16)], input_1_1_local_L1[3136 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - eeb1 * 16)], + input_1_1_local_L1[3136 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + 
tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) with tik_instance.for_range(eeb1 * 196, (eeb1 + 1) * 196) as i: - tik_instance.data_move(res[block_index * 2, i, 0, 0], input_1_2_fractal_L1_local_UB[256*(i - eeb1 * 196)], 0,1,16,0,0) - - with tik_instance.for_range(0,2) as eeb1: + tik_instance.data_move(res[block_index * 2, i, 0, 0], + input_1_2_fractal_L1_local_UB[256 * (i - eeb1 * 196)], 0, 1, 16, 0, 0) + + with tik_instance.for_range(0, 2) as eeb1: with tik_instance.for_range(eeb1 * 16, (eeb1 + 1) * 16) as i: rep = 13 fetch_filter_w = 0 @@ -637,42 +695,48 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_w = 0 left_top_h = 0 c1_index = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - eeb1 * 16)], input_1_2_local_L1[3136 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - eeb1 * 16)], + input_1_2_local_L1[3136 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) with tik_instance.for_range(eeb1 * 196, (eeb1 + 1) * 196) as i: - tik_instance.data_move(res[block_index * 2 + 1, i, 0, 0], input_1_2_fractal_L1_local_UB[256*(i - eeb1 * 196)], 0,1,16,0,0) - + tik_instance.data_move(res[block_index * 2 + 1, i, 0, 0], + input_1_2_fractal_L1_local_UB[256 * (i - eeb1 * 196)], 0, 1, 16, 0, 0) + if input_shape == ((32, 32, 28, 28, 16), 'float16', (1, 1), (2, 2)): - pad = [0,0,0,0] + pad = [0, 0, 0, 0] l1_h = 28 l1_w = 28 jump_stride = 1 repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (401408,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 12544], input_x[i,block_index,0,0,0], 0, 1, 784, 0, 0) - with tik_instance.for_range(0,16) as i: + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (401408,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (53248,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (50176,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 12544], input_x[i, block_index, 0, 0, 0], 0, 1, 784, 0, 0) + with tik_instance.for_range(0, 16) as 
i: rep = 13 fetch_filter_w = 0 fetch_filter_h = 0 @@ -680,29 +744,31 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_h = 0 c1_index = 0 tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * i], input_1_1_local_L1[12544 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) - with tik_instance.for_range(0,196) as i: - tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[256*i], 0,1,16,0,0) - - with tik_instance.for_range(16,32) as i: + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) + with tik_instance.for_range(0, 196) as i: + tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], 0, 1, 16, 0, + 0) + + with tik_instance.for_range(16, 32) as i: rep = 13 fetch_filter_w = 0 fetch_filter_h = 0 @@ -710,342 +776,375 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img left_top_h = 0 c1_index = 0 tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[3328 * (i - 16)], input_1_1_local_L1[12544 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - rep) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i*3328],0,1,196,0,0) - with tik_instance.for_range(196,392) as i: - tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[256*(i-196)], 0,1,16,0,0) - + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + rep) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) + with tik_instance.for_range(196, 392) as i: + tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[256 * (i - 196)], 0, 1, + 16, 0, 0) + if input_shape == ((32, 32, 7, 7, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 7 - l1_w = 7 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32, block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088, ), 
scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i, block_index, 0, 0, 0], 0, 1, 49, 0, 0) - - with tik_instance.for_range(0,32) as i: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 * i], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 4) - - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) - with tik_instance.for_range(0,98) as i: - tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[i * 256], 0, 1, 16, 0, 0) - + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 7 + l1_w = 7 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (32768,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (25088,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 784], input_x[i, block_index, 0, 0, 0], 0, 1, 49, 0, 0) + + with tik_instance.for_range(0, 32) as i: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[1024 * i], input_1_1_local_L1[784 * i], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 4) + + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 49 * 16], + input_1_1_fractal_L1_local_UB[i * 1024], 0, 1, 49, 0, 0) + with tik_instance.for_range(0, 98) as i: + tik_instance.data_move(res[block_index, i, 0, 0], input_1_2_fractal_L1_local_UB[i * 256], 0, 1, 16, 0, + 0) + if input_shape == ((32, 4, 56, 56, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 56 - l1_w = 56 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (12544 * 32 // 2, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (100352 // 2,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 12544, 0, 0) - with tik_instance.for_range(0, 4) as eeb: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1[eeb * 56 * 56 * 16], - pad, - l1_h, 
- l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 196) - with tik_instance.for_range(0, 196) as rep: - tik_instance.data_move(res[eeb, rep + block_index * 196, 0, 0], input_1_1_fractal_L1_local_UB[rep * 256], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 56 + l1_w = 56 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (12544 * 32 // 2,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (100352 // 2,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 12544, 0, 0) + with tik_instance.for_range(0, 4) as eeb: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB, input_1_1_local_L1[eeb * 56 * 56 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 196) + with tik_instance.for_range(0, 196) as rep: + tik_instance.data_move(res[eeb, rep + block_index * 196, 0, 0], + input_1_1_fractal_L1_local_UB[rep * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 8, 28, 28, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 28 - l1_w = 28 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (6272 * 32 // 2, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 6272, 0, 0) - with tik_instance.for_range(0, 1) as eeb0: - with tik_instance.for_range(0, 8) as eeb1: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], input_1_1_local_L1[(eeb1 + eeb0 * 8) * 28 * 28 * 16], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 49) - with tik_instance.for_range(0, 8) as eeb1: - with tik_instance.for_range(0, 49) as i: - tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 28 + l1_w = 28 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (6272 * 32 // 2,), 
scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 6272, 0, 0) + with tik_instance.for_range(0, 1) as eeb0: + with tik_instance.for_range(0, 8) as eeb1: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], + input_1_1_local_L1[(eeb1 + eeb0 * 8) * 28 * 28 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 49) + with tik_instance.for_range(0, 8) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], + input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 32, 28, 28, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 28 - l1_w = 28 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2, ), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,0,0,0,0], 0, 1, 25088, 0, 0) - with tik_instance.for_range(0, 4) as eeb0: - with tik_instance.for_range(0, 8) as eeb1: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], input_1_1_local_L1[(eeb1 + eeb0 * 8) * 28 * 28 * 16], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 49) - with tik_instance.for_range(0, 8) as eeb1: - with tik_instance.for_range(0, 49) as i: - tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 28 + l1_w = 28 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, 0, 0, 0, 0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0, 4) as eeb0: + with tik_instance.for_range(0, 8) as eeb1: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], + input_1_1_local_L1[(eeb1 + eeb0 * 8) * 28 * 28 * 16], + pad, + l1_h, + l1_w, + c1_index, + 
fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 49) + with tik_instance.for_range(0, 8) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], + input_1_1_fractal_L1_local_UB[i * 256 + eeb1 * 49 * 256], 0, 1, 16, 0, 0) if input_shape == ((32, 16, 14, 14, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 14 - l1_w = 14 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - eeb0 = block_index % 2 - eeb1 = block_index // 2 - input_1_1_local_L1 = tik_instance.Tensor("float16", (196 * 32 * 16,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (106496 // 2,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (196 * 16 * 16,), scope=tik.scope_ubuf, name = "input_1_2_fractal_L1_local_UB") - with tik_instance.for_range(0,32) as i: - tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i,eeb1,0,0,0], 0, 1, 196, 0, 0) - with tik_instance.for_range(0,16) as i: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[i * 3328], input_1_1_local_L1[i * 3136 + eeb0 * 16 * 3136], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 13) - with tik_instance.for_range(0,16) as i: - tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196,0,0) - with tik_instance.for_range(0, 196) as i: - tik_instance.data_move(res[eeb1, i + 196 * eeb0, 0, 0],input_1_2_fractal_L1_local_UB[256 * i], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 14 + l1_w = 14 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + eeb0 = block_index % 2 + eeb1 = block_index // 2 + input_1_1_local_L1 = tik_instance.Tensor("float16", (196 * 32 * 16,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (106496 // 2,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + input_1_2_fractal_L1_local_UB = tik_instance.Tensor("float16", (196 * 16 * 16,), scope=tik.scope_ubuf, + name="input_1_2_fractal_L1_local_UB") + with tik_instance.for_range(0, 32) as i: + tik_instance.data_move(input_1_1_local_L1[i * 3136], input_x[i, eeb1, 0, 0, 0], 0, 1, 196, 0, 0) + with tik_instance.for_range(0, 16) as i: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[i * 3328], + input_1_1_local_L1[i * 3136 + eeb0 * 16 * 3136], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, 
+ 13) + with tik_instance.for_range(0, 16) as i: + tik_instance.data_move(input_1_2_fractal_L1_local_UB[i * 196 * 16], + input_1_1_fractal_L1_local_UB[i * 3328], 0, 1, 196, 0, 0) + with tik_instance.for_range(0, 196) as i: + tik_instance.data_move(res[eeb1, i + 196 * eeb0, 0, 0], input_1_2_fractal_L1_local_UB[256 * i], 0, 1, + 16, 0, 0) if input_shape == ((32, 16, 56, 56, 16), 'float16', (1, 1), (1, 1)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 - padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 56 - l1_w = 56 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (196 * 256 * 2,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - with tik_instance.for_range(0,2) as eeb0: - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,eeb0 * 8,0,0,0], 0, 1, 25088, 0, 0) - with tik_instance.for_range(0,4) as eeb1: - with tik_instance.for_range(0,2) as eeb2: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb2 * 196 * 256], input_1_1_local_L1[(eeb2 + eeb1 * 2) * 56 * 56 * 16], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 196) - with tik_instance.for_range(0,2) as eeb2: - with tik_instance.for_range(0,196) as i: - tik_instance.data_move(res[eeb0 * 8 + eeb1 * 2 + eeb2, i + block_index * 196, 0, 0],input_1_1_fractal_L1_local_UB[256 * i + eeb2 * 196 * 256], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 56 + l1_w = 56 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (196 * 256 * 2,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + with tik_instance.for_range(0, 2) as eeb0: + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, eeb0 * 8, 0, 0, 0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0, 4) as eeb1: + with tik_instance.for_range(0, 2) as eeb2: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb2 * 196 * 256], + input_1_1_local_L1[(eeb2 + eeb1 * 2) * 56 * 56 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 196) + with tik_instance.for_range(0, 2) as eeb2: + with tik_instance.for_range(0, 196) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1 * 2 + eeb2, i + block_index * 196, 0, 0], + input_1_1_fractal_L1_local_UB[256 * i + eeb2 * 196 * 256], 0, 1, 16, + 0, 0) if input_shape == ((32, 16, 56, 56, 16), 'float16', (1, 1), (2, 2)): - if padding == 'SAME': - padding_left = 0 - padding_right = 0 - padding_top = 0 
- padding_bottom = 0 - pad = [padding_left, padding_right, padding_top, padding_bottom] - l1_h = 56 - l1_w = 56 - c1_index = 0 - jump_stride = 1 - repeat_mode = 1 - with tik_instance.for_range(0,32,block_num=32) as block_index: - input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, name = "input_1_1_local_L1") - input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, name = "input_1_1_fractal_L1_local_UB") - with tik_instance.for_range(0,2) as eeb0: - tik_instance.data_move(input_1_1_local_L1, input_x[block_index,eeb0 * 8,0,0,0], 0, 1, 25088, 0, 0) - with tik_instance.for_range(0,8) as eeb1: - fetch_filter_w = 0 - fetch_filter_h = 0 - left_top_h = 0 - left_top_w = 0 - tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], input_1_1_local_L1[eeb1 * 56 * 56 * 16], - pad, - l1_h, - l1_w, - c1_index, - fetch_filter_w, - fetch_filter_h, - left_top_w, - left_top_h, - stride_w, - stride_h, - filter_w, - filter_h, - dilation_filter_w, - dilation_filter_h, - jump_stride, - repeat_mode, - 49) - with tik_instance.for_range(0,8) as eeb1: - with tik_instance.for_range(0,49) as i: - tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0],input_1_1_fractal_L1_local_UB[256 * i + eeb1 * 49 * 256], 0, 1, 16, 0, 0) + if padding == 'SAME': + padding_left = 0 + padding_right = 0 + padding_top = 0 + padding_bottom = 0 + pad = [padding_left, padding_right, padding_top, padding_bottom] + l1_h = 56 + l1_w = 56 + c1_index = 0 + jump_stride = 1 + repeat_mode = 1 + with tik_instance.for_range(0, 32, block_num=32) as block_index: + input_1_1_local_L1 = tik_instance.Tensor("float16", (25088 * 32 // 2,), scope=tik.scope_cbuf, + name="input_1_1_local_L1") + input_1_1_fractal_L1_local_UB = tik_instance.Tensor("float16", (49 * 256 * 8,), scope=tik.scope_ubuf, + name="input_1_1_fractal_L1_local_UB") + with tik_instance.for_range(0, 2) as eeb0: + tik_instance.data_move(input_1_1_local_L1, input_x[block_index, eeb0 * 8, 0, 0, 0], 0, 1, 25088, 0, 0) + with tik_instance.for_range(0, 8) as eeb1: + fetch_filter_w = 0 + fetch_filter_h = 0 + left_top_h = 0 + left_top_w = 0 + tik_instance.load3dv1(input_1_1_fractal_L1_local_UB[eeb1 * 49 * 256], + input_1_1_local_L1[eeb1 * 56 * 56 * 16], + pad, + l1_h, + l1_w, + c1_index, + fetch_filter_w, + fetch_filter_h, + left_top_w, + left_top_h, + stride_w, + stride_h, + filter_w, + filter_h, + dilation_filter_w, + dilation_filter_h, + jump_stride, + repeat_mode, + 49) + with tik_instance.for_range(0, 8) as eeb1: + with tik_instance.for_range(0, 49) as i: + tik_instance.data_move(res[eeb0 * 8 + eeb1, i + block_index * 49, 0, 0], + input_1_1_fractal_L1_local_UB[256 * i + eeb1 * 49 * 256], 0, 1, 16, 0, 0) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py index a3517dce75b..0458363a6db 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py @@ -17,17 +17,15 @@ limitations under the License. 
matmul """ from __future__ import absolute_import + import te.lang.cce import te.platform.cce_params as cce -from te.platform.fusion_manager import fusion_manager +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +from te import tik from te import tvm from topi import generic from topi.cce import util -from impl.matmul_vector import matmul_vector_cce - -from te import tik -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType # General limitation of the size for input shape: 2**31 SHAPE_SIZE_LIMIT = 2147483648 NoneType = type(None) @@ -46,6 +44,7 @@ matmul_cube_dense_left_op_info = TBERegOp("CusMatMulCubeDenseLeft") \ .dtype_format(DataType.F16_Default, DataType.F16_FracNZ, DataType.F16_Default, DataType.F16_FracNZ) \ .get_op_info() + # pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals, def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): """ @@ -115,16 +114,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): if m_shape != 1: if n_shape == 1: - if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: raise RuntimeError("input shape K1 should be multiple of %d" - % (cce.BLOCK_IN*cce.BLOCK_IN)) - elif km_shape%k_block_size != 0: + % (cce.BLOCK_IN * cce.BLOCK_IN)) + elif km_shape % k_block_size != 0: raise RuntimeError( "input shape K1 should be multiple of %d" % cce.BLOCK_IN) else: - if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: raise RuntimeError("input shape K1 should be multiple of %d" - % (cce.BLOCK_IN*cce.BLOCK_IN)) + % (cce.BLOCK_IN * cce.BLOCK_IN)) if n_shape % cce.BLOCK_IN != 0 and n_shape != 1: raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN) @@ -132,7 +131,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): if len(shape_bias): if len(shape_bias) == 1: if is_gevm or is_gemv: - if shape_bias[0] != m_shape*n_shape: + if shape_bias[0] != m_shape * n_shape: raise RuntimeError("broadcast case shape bias for gemv must be equal m*n") else: if shape_bias[0] != n_shape: @@ -143,33 +142,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): else: raise RuntimeError("unsupport input shape now for batch bias case") + def _get_bias(shape_bias): bias_length = shape_bias[0] - if bias_length % 16 ==0: + if bias_length % 16 == 0: return shape_bias else: - bias_length = (bias_length // 16)*16 + 16 + bias_length = (bias_length // 16) * 16 + 16 shape_bias = [] shape_bias.append(bias_length) return shape_bias + def _get_input_shape(shape_x): dim_a = shape_x[0] dim_b = shape_x[1] res = [] - if dim_a % 16 !=0: - dim_a = (dim_a // 16)*16 + 16 + if dim_a % 16 != 0: + dim_a = (dim_a // 16) * 16 + 16 res.append(dim_a) else: res.append(dim_a) - if dim_b % 16 !=0: - dim_b = (dim_b // 16)*16 + 16 + if dim_b % 16 != 0: + dim_b = (dim_b // 16) * 16 + 16 res.append(dim_b) else: res.append(dim_b) return res + def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): shape_a = input_x1.get("shape") shape_b = input_x2.get("shape") @@ -184,7 +186,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t if bias is not None and bool(bias): shape_bias = bias.get("shape") try: - trans_a_f = bool(1-trans_a) + trans_a_f = bool(1 - trans_a) if src_dtype == "float32" or src_dtype == 
"int32": if len(shape_a) != 2 and len(shape_b) != 2: return False @@ -205,44 +207,46 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t return False elif shape_a[1] != shape_b[0]: return False - + if trans_a_f and trans_b and shape_b[1] == 1: return False - + if src_dtype == "float16": if len(shape_a) != 2 and len(shape_b) != 2: return False - + if trans_a: m_shape = shape_a[1] k_shape = shape_a[0] else: m_shape = shape_a[0] k_shape = shape_a[1] - + if trans_b: n_shape = shape_b[0] k_b_shape = shape_b[1] else: n_shape = shape_b[1] k_b_shape = shape_b[0] - + if k_shape != k_b_shape: return False - + if m_shape == 1 or n_shape == 1: if k_shape % 256 != 0: return False - + except RuntimeError as e: return False - + return True - + + # pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements # @util.check_input_type(dict, dict, (dict, NoneType), dict, bool, bool, str) @op_info_register(matmul_cube_dense_left_op_info) -def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): +def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, + kernel_name="matmulcube"): """ calculating matrix multiplication with bias, C = A*B + bias, support input data with fractal format. @@ -279,87 +283,87 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F print(shape_a, shape_b) print("============") if input_x2.get("format") == "FRACTAL_Z": - n,c,h,w = shape_b + n, c, h, w = shape_b c0 = 16 c1 = c // c0 if c1 == 0: c1 = 1 shape_b = [n, c1 * h * w * c0] - shape_a = [n,n] + shape_a = [n, n] if input_x1.get("format") == "FRACTAL_Z": - n,c,h,w = shape_a + n, c, h, w = shape_a c0 = 16 c1 = c // c0 if c1 == 0: c1 = 1 shape_a = [n, c1 * h * w * c0] shape_b = [c1 * h * w * c0, c1 * h * w * c0] - + if input_x2.get("format") == "FRACTAL_NZ": shape_a = [shape_b[0], shape_b[0]] shape_b = shape_b - + if input_x1.get("format") == "FRACTAL_NZ": shape_a = shape_a shape_b = [shape_a[1], shape_a[1]] - + shape_a = list(shape_a) shape_b = list(shape_b) - + shape_a = _get_input_shape(shape_a) shape_b = _get_input_shape(shape_b) - + util.check_kernel_name(kernel_name) util.check_shape_rule(shape_a) util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) - + shape_a = [shape_a[1], shape_a[0]] - trans_a = bool(1-trans_a) - + trans_a = bool(1 - trans_a) + shape_b = [shape_b[1], shape_b[0]] - trans_b = bool(1-trans_b) - + trans_b = bool(1 - trans_b) + shape_bias = () if bias is not None and bool(bias): shape_bias = bias.get("shape") shape_bias = list(shape_bias) shape_bias = _get_bias(shape_bias) - + src_dtype = input_x1.get("dtype").lower() dst_dtype = output_y.get("dtype").lower() _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b) - + m_shape = shape_a[len(shape_a) - 2] km_shape = shape_a[len(shape_a) - 1] kn_shape = shape_b[len(shape_a) - 2] n_shape = shape_b[len(shape_a) - 1] - + if src_dtype == "float16": block_reduce = cce.BLOCK_REDUCE - + block_in = cce.BLOCK_IN block_out = cce.BLOCK_OUT - + if trans_a and km_shape == 1: block_in = cce.BLOCK_VECTOR - + if not trans_a and m_shape == 1: block_in = cce.BLOCK_VECTOR - + if trans_b and kn_shape == 1: block_out = cce.BLOCK_VECTOR - + if not trans_b and n_shape == 1: block_out = cce.BLOCK_VECTOR - + if trans_a: shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, 
block_in) else: shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) - + if trans_b: shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) else: @@ -368,7 +372,7 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F format_a = "FRACTAL_NZ" shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) format_b = "FRACTAL_NZ" - + print("=======================================") print(shape_a_temp, shape_b_temp) print(format_a, format_b) @@ -378,67 +382,85 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F dtype=src_dtype) tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype) - + if len(shape_bias) > 0: tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', dtype=dst_dtype) - + if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 63: if util.get_product_version() == util.VERSION_MINI: - tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) + tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: - tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) - + tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) + input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm) input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm) resMatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm) - with tik_instance.for_range(0,32,block_num=32) as block_index: - resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf, name = "resMatmul_local_UB") - resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc, name = "resMatmul_local_UB") - input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca, name = "input_1_local_L1_local_L0A") - input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf, name = "input_2_local_L1") - input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf, name = "input_1_local_L1") - input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B") + with tik_instance.for_range(0, 32, block_num=32) as block_index: + resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf, + name="resMatmul_local_UB") + resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc, + name="resMatmul_local_UB") + input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca, + name="input_1_local_L1_local_L0A") + input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf, + name="input_2_local_L1") + input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf, + name="input_1_local_L1") + input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb, + name="input_2_local_L1_local_L0B") core_m_idx = block_index % 8 core_n_idx = block_index // 8 with tik_instance.if_scope(core_m_idx != 7): - tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128, 55 * 16, 0) - tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, 32, 128, 55 * 16, 0) + tik_instance.data_move(input_1_local_L1, 
input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128, + 55 * 16, 0) + tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, + 32, 128, 55 * 16, 0) with tik_instance.for_range(0, 8) as cc12: - tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256], 0, 8, 8, 0, False) + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256], 0, 8, + 8, 0, False) with tik_instance.for_range(0, 2) as cc6: with tik_instance.for_range(0, 8) as cc121: - tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096], input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True) - tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 128, 128, 256, 0) + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096], + input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True) + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, + input_2_local_L1_local_L0B, 128, 128, 256, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0, 1) - tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], resMatmul_local_UB, 0, 16, 256 // 2 , 0, 55 * 16 * 2 // 2) + tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], + resMatmul_local_UB, 0, 16, 256 // 2, 0, 55 * 16 * 2 // 2) with tik_instance.else_scope(): - tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112, 56 * 16, 0) - tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, 32, 112, 56 * 16, 0) + tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112, + 56 * 16, 0) + tik_instance.data_move(input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, + 32, 112, 56 * 16, 0) with tik_instance.for_range(0, 7) as cc10: - tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256], 0, 7, 7, 0, False) + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256], 0, 7, + 7, 0, False) with tik_instance.for_range(0, 2) as cc5: with tik_instance.for_range(0, 7) as cc101: - tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096], input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True) - tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 112, 112, 256, 0) + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096], + input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True) + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, + input_2_local_L1_local_L0B, 112, 112, 256, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 112, 0, 0, 1) - tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], resMatmul_local_UB, 0, 16, 224 // 2 , 0, 56 * 16 * 2 // 2) + tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], + resMatmul_local_UB, 0, 16, 224 // 2, 0, 56 * 16 * 2 // 2) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resMatmul]) return tik_instance else: print("come into tbe, shape is error!") result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a, 
format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias) - + with tvm.target.cce(): schedule = generic.auto_schedule(result) - + tensor_list = [tensor_a, tensor_b, result] if len(shape_bias) > 0: tensor_list = [tensor_a, tensor_b, tensor_bias, result] - + config = {"print_ir": False, "name": kernel_name, "tensor_list": tensor_list} - + te.lang.cce.cce_build_code(schedule, config) diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py index d0522824cac..5cae9afda01 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py @@ -18,15 +18,10 @@ limitations under the License. matmul """ from __future__ import absolute_import -import te.lang.cce -import te.platform.cce_params as cce -from te.platform.fusion_manager import fusion_manager -from te import tvm -from topi import generic -from topi.cce import util -from impl.matmul_vector import matmul_vector_cce -from te import tik + from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +from te import tik +from topi.cce import util matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \ .fusion_type("OPAQUE") \ @@ -40,23 +35,26 @@ matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \ .input(2, "x3", False, "required", "all") \ .input(3, "x4", False, "optional", "all") \ .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, DataType.F32_FracNZ) \ + .dtype_format(DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, + DataType.F32_FracNZ) \ .get_op_info() - + + @op_info_register(matmul_cube_dense_right_op_info) -def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): +def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, + kernel_name="matmulcube"): shape_a_temp = (128, 63, 16, 16) shape_b_temp = (128, 128, 16, 16) shape_output = output_y.get("shape") matrix_max_shape = (1,) - support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape),] + support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape), ] shape_a_input = input_x1.get("shape") shape_b_input = input_x2.get("shape") matrix_max_input = input_x3.get("shape") input_shape = (tuple(shape_a_input), tuple(shape_b_input), tuple(matrix_max_input)) if input_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) - + if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 128: if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) @@ -64,79 +62,110 @@ def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={} tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm) input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm) - input_x3 = tik_instance.Tensor("float32", [1,], name="matrix_max", scope=tik.scope_gm) + input_x3 = tik_instance.Tensor("float32", [1, ], name="matrix_max", scope=tik.scope_gm) resMatmul = tik_instance.Tensor("float32", shape_output, name="output", scope=tik.scope_gm) with 
tik_instance.for_range(0, 32, block_num=32) as block_index: core_m_idx = block_index // 16 core_n_idx = block_index % 16 matrix_max_scalar = tik_instance.Scalar("float32") - matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope = tik.scope_ubuf, name = "matrix_max_local_UB") + matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf, name="matrix_max_local_UB") tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0, 0) matrix_max_scalar.set_as(matrix_max_local_UB[0]) - - resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf, name = "resMatmul_local_UB") - resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_ubuf, name = "resMatmul_local_UB1") - resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc, name = "resMatmul_local_UB_local_L0C") - resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc, name = "resMatmul_local_UB_local_L0C1") + resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf, + name="resMatmul_local_UB") + resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_ubuf, + name="resMatmul_local_UB1") + + resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc, + name="resMatmul_local_UB_local_L0C") + resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc, + name="resMatmul_local_UB_local_L0C1") + + input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca, + name="input_1_local_L1_local_L0A") + input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, + name="input_2_local_L1") + input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, + name="input_2_local_L11") + + input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf, + name="input_1_local_L1") + input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf, + name="input_1_local_L11") + + input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, + name="input_2_local_L1_local_L0B") + input_2_local_L1_local_L0B1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, + name="input_2_local_L1_local_L0B1") - input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca, name = "input_1_local_L1_local_L0A") - input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, name = "input_2_local_L1") - input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf, name = "input_2_local_L11") - - input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf, name = "input_1_local_L1") - input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf, name = "input_1_local_L11") - - input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B") - input_2_local_L1_local_L0B1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb, name = "input_2_local_L1_local_L0B1") - with tik_instance.if_scope(core_m_idx == 0): with tik_instance.for_range(0, 2) as cc1: - tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) - 
tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256, 752, 0) + tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, + 128, 1920, 0) + tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256, 752, + 0) with tik_instance.for_range(0, 8) as cc10: - tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True) + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, + 8, 8, 0, True) with tik_instance.for_range(0, 16) as cc101: - tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False) - - tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 256, 128, 128, 0) + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], + 0, 8, 16, 0, False) + + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, + input_2_local_L1_local_L0B, 256, 128, 128, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0) - tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255,1,1,8,8) - tik_instance.vmuls(64, resMatmul_local_UB[255*64], resMatmul_local_UB[255*64], matrix_max_scalar, 255,1,1,8,8) - tik_instance.vmuls(64, resMatmul_local_UB[510*64], resMatmul_local_UB[510*64], matrix_max_scalar, 2,1,1,8,8) - - tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB, 0, 8, 512, 0, 1504) + tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8) + tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64], + matrix_max_scalar, 255, 1, 1, 8, 8) + tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64], + matrix_max_scalar, 2, 1, 1, 8, 8) + + tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB, 0, 8, 512, + 0, 1504) with tik_instance.else_scope(): - tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) + tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, + 1920, 0) tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096], 0, 8, 256, 752, 0) with tik_instance.for_range(0, 8) as cc10: - tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True) + tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, + 8, 0, True) with tik_instance.for_range(0, 16) as cc101: - tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False) - - tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 256, 128, 128, 0) + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, + 16, 0, False) + + tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, + 256, 128, 128, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0) - tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255,1,1,8,8) - tik_instance.vmuls(64, resMatmul_local_UB[255*64], resMatmul_local_UB[255*64], 
matrix_max_scalar, 255,1,1,8,8) - tik_instance.vmuls(64, resMatmul_local_UB[510*64], resMatmul_local_UB[510*64], matrix_max_scalar, 2,1,1,8,8) - - tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB, 0, 8, 512, 0, 1504) - - tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) + tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8) + tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64], matrix_max_scalar, + 255, 1, 1, 8, 8) + tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64], matrix_max_scalar, 2, + 1, 1, 8, 8) + + tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB, 0, 8, 512, 0, + 1504) + + tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, + 1920, 0) tik_instance.data_move(input_1_local_L11, input_x1[core_n_idx * 129024 + 12288], 0, 8, 240, 768, 0) - + with tik_instance.for_range(0, 8) as cc102: - tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048], input_2_local_L11[cc102 * 256], 0, 8, 8, 0, True) + tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048], input_2_local_L11[cc102 * 256], 0, + 8, 8, 0, True) with tik_instance.for_range(0, 16) as cc103: - tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048], input_1_local_L11[cc103 * 256], 0, 8, 15, 0, False) - - tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B1, 240, 128, 128, 0) + tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048], input_1_local_L11[cc103 * 256], 0, + 8, 15, 0, False) + + tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A, + input_2_local_L1_local_L0B1, 240, 128, 128, 0) tik_instance.data_move(resMatmul_local_UB1, resMatmul_local_UB_local_L0C1, 0, 1, 120, 0, 0) - - tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar, 255,1,1,8,8) - tik_instance.vmuls(64, resMatmul_local_UB1[255*64], resMatmul_local_UB1[255*64], matrix_max_scalar, 225,1,1,8,8) - + + tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar, 255, 1, 1, 8, 8) + tik_instance.vmuls(64, resMatmul_local_UB1[255 * 64], resMatmul_local_UB1[255 * 64], matrix_max_scalar, + 225, 1, 1, 8, 8) + tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288], resMatmul_local_UB1, 0, 8, 480, 0, 1536) - + tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py index aeef521d1ea..ebff84d889d 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py @@ -17,11 +17,12 @@ limitations under the License. 
matmul """ from __future__ import absolute_import + import te.platform.cce_params as cce -from te import tvm -from topi.cce import util -from te import tik from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +from te import tik +from topi.cce import util + # General limitation of the size for input shape: 2**31 SHAPE_SIZE_LIMIT = 2147483648 NoneType = type(None) @@ -40,6 +41,7 @@ matmul_cube_fracz_left_cast_op_info = TBERegOp("CusMatMulCubeFraczLeftCast") \ .dtype_format(DataType.F16_Default, DataType.F32_FracZ, DataType.F16_Default, DataType.F16_FracZ) \ .get_op_info() + # pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals, def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): """ @@ -137,6 +139,7 @@ src_dtype: str else: raise RuntimeError("unsupport input shape now for batch bias case") + def _get_bias(shape_bias): bias_length = shape_bias[0] if bias_length % 16 == 0: @@ -147,6 +150,7 @@ def _get_bias(shape_bias): shape_bias.append(bias_length) return shape_bias + def _get_input_shape(shape_x): dim_a = shape_x[0] dim_b = shape_x[1] @@ -164,6 +168,7 @@ def _get_input_shape(shape_x): res.append(dim_b) return res + def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): shape_a = input_x1.get("shape") shape_b = input_x2.get("shape") @@ -199,40 +204,41 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t return False elif shape_a[1] != shape_b[0]: return False - + if trans_a_f and trans_b and shape_b[1] == 1: return False - + if src_dtype == "float16": if len(shape_a) != 2 and len(shape_b) != 2: return False - + if trans_a: m_shape = shape_a[1] k_shape = shape_a[0] else: m_shape = shape_a[0] k_shape = shape_a[1] - + if trans_b: n_shape = shape_b[0] k_b_shape = shape_b[1] else: n_shape = shape_b[1] k_b_shape = shape_b[0] - + if k_shape != k_b_shape: return False - + if m_shape == 1 or n_shape == 1: if k_shape % 256 != 0: return False - + except RuntimeError as e: return False - + return True - + + # pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements @op_info_register(matmul_cube_fracz_left_cast_op_info) def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, @@ -278,7 +284,7 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans c1 = 1 shape_b = [n, c1 * h * w * c0] shape_a = [n, n] - + if input_x1.get("format") == "FRACTAL_Z": n, c, h, w = shape_a c0 = 16 @@ -291,26 +297,26 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans if input_x2.get("format") == "FRACTAL_NZ": shape_a = [shape_b[0], shape_b[0]] shape_b = shape_b - + if input_x1.get("format") == "FRACTAL_NZ": shape_a = shape_a shape_b = [shape_a[1], shape_a[1]] shape_a = list(shape_a) shape_b = list(shape_b) - + shape_a = _get_input_shape(shape_a) shape_b = _get_input_shape(shape_b) - + util.check_kernel_name(kernel_name) util.check_shape_rule(shape_a) util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) - + shape_a = [shape_a[1], shape_a[0]] trans_a = bool(1 - trans_a) - + shape_b = [shape_b[1], shape_b[0]] trans_b = bool(1 - trans_b) @@ -319,45 +325,45 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans shape_bias = bias.get("shape") shape_bias = list(shape_bias) shape_bias = 
_get_bias(shape_bias) - + src_dtype = input_x1.get("dtype").lower() _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b) - + m_shape = shape_a[len(shape_a) - 2] km_shape = shape_a[len(shape_a) - 1] kn_shape = shape_b[len(shape_a) - 2] n_shape = shape_b[len(shape_a) - 1] - + if src_dtype == "float16": block_reduce = cce.BLOCK_REDUCE - + block_in = cce.BLOCK_IN block_out = cce.BLOCK_OUT - + if trans_a and km_shape == 1: block_in = cce.BLOCK_VECTOR - + if not trans_a and m_shape == 1: block_in = cce.BLOCK_VECTOR - + if trans_b and kn_shape == 1: block_out = cce.BLOCK_VECTOR - + if not trans_b and n_shape == 1: block_out = cce.BLOCK_VECTOR - + if trans_a: shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) else: shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) - + if trans_b: shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) else: shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) - + if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: @@ -372,7 +378,8 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans diag_opt=diag_opt, diag_size=DIAG_SIZE) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[res_matmul]) return tik_instance - + + def get_cus_tile_info(input_x1, input_x2, diag_size): tile_map = { ((32, 32, 16, 16), (128, 32, 16, 16)): (8, 8, 16), @@ -381,10 +388,10 @@ def get_cus_tile_info(input_x1, input_x2, diag_size): ((128, 128, 16, 16), (32, 128, 16, 16)): (8, 8, 16), ((16, 16, 16, 16), (144, 16, 16, 16)): (8, 8, 9), ((64, 64, 16, 16), (16, 64, 16, 16)): (8, 8, 4), - ((16, 16, 16, 16), (64, 16, 16, 16)): (8, 8, 4), - ((32, 32, 16, 16), (8, 32, 16, 16)): (8, 8, 1), + ((16, 16, 16, 16), (64, 16, 16, 16)): (8, 8, 4), + ((32, 32, 16, 16), (8, 32, 16, 16)): (8, 8, 1), ((128, 128, 16, 16), (64, 128, 16, 16)): (8, 8, 16), - ((16, 16, 16, 16), (4, 16, 16, 16)): (8, 8, 1), + ((16, 16, 16, 16), (4, 16, 16, 16)): (8, 8, 1), ((16, 16, 16, 16), (32, 16, 16, 16)): (8, 8, 2), ((64, 64, 16, 16), (32, 64, 16, 16)): (8, 8, 8), ((32, 32, 16, 16), (64, 32, 16, 16)): (8, 8, 8), @@ -398,13 +405,14 @@ def get_cus_tile_info(input_x1, input_x2, diag_size): } shape_info = (tuple(input_x1.shape), tuple(input_x2.shape)) diag_opt = False - if input_x1.shape[0]*input_x1.shape[3] > diag_size: + if input_x1.shape[0] * input_x1.shape[3] > diag_size: diag_opt = True if shape_info not in tile_map: raise ValueError("shape %s is not supported" % str(shape_info)) mo_tile, ko_tile, no_tile = tile_map[shape_info] return mo_tile, ko_tile, no_tile, diag_opt + def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, res, mo_tile, ko_tile, no_tile, diag_opt=False, diag_size=128): ko, mo, mi, ki = input_x1.shape @@ -420,7 +428,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, raise ValueError("shape of input_x1 or input_x2 is not supported!") if not trans_a or not trans_b: raise ValueError("only trans_a=False and trans_b=False be supported!") - + core_m_num = mo // mo_tile loop_n_num = no // no_tile if loop_n_num * core_m_num <= maxblocknum: @@ -432,7 +440,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, 
trans_b, else: raise ValueError("Does not support this scenario!") block_num = core_m_num * core_n_num - + loop_k_num = ko // ko_tile if diag_opt: loop_k_num = diag_outer // ko_tile @@ -445,7 +453,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, core_n = block_idx % core_n_num with tik_instance.for_range(0, loop_n_num) as cc_n: res_L0C = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0], - name="resMatmul_L0C", scope=tik.scope_cc) + name="resMatmul_L0C", scope=tik.scope_cc) with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k: # input_x2 -> input_x2_ub -(fp322fp16)-> input_x2_cast_ub -> input_x2_L1 input_x2_ub = tik_instance.Tensor("float32", [no_tile, ko_tile_inner, c0, c0], name="input_x2_ub", @@ -476,41 +484,41 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, input_x2_cast_ub[count * repeate_times_max * vectorfp32_size], input_x2_ub[count * repeate_times_max * vectorfp32_size], repeate_num, 1, 1, 4, 8) - input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], - name="input_x2_L1", scope=tik.scope_cbuf) - tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1, - no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0) - # input_x1 -> input_x1_L1 - input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0], - name="input_x1_L1", scope=tik.scope_cbuf) - tik_instance.data_move(input_x1_L1, - input_x1[k_idx, - core_m * mo_tile, 0, 0], - 0, ko_tile_inner, mo_tile * c0 * c0 * fp16_size // blocksize, - (mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0) - # input_x2_L1 -> input_x2_L0B - input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0], - name="input_x2_L0B", scope=tik.scope_cb) - with tik_instance.for_range(0, ko_tile_inner) as cc2: - tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 0, no_tile, - ko_tile_inner, - 0, True) - # input_x1_L1 -> input_x1_L0A - input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0], - name="input_x1_L0A", scope=tik.scope_ca) - with tik_instance.for_range(0, mo_tile) as cc1: - tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0, ko_tile_inner, - mo_tile, 0, False) - with tik_instance.if_scope(thread_idx_k == 0): - tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, - ko_tile_inner * c0, no_tile * c0, 0) - with tik_instance.else_scope(): - tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, - ko_tile_inner * c0, no_tile * c0, 1) - res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0], - name="resMatmul_ub", scope=tik.scope_ubuf) - tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1) - tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0], - res_ub, 0, no_tile, - mo_tile * c0 * c0 * fp16_size // blocksize, 0, - (mo - mo_tile) * c0 * c0 * fp16_size // blocksize) + input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], + name="input_x2_L1", scope=tik.scope_cbuf) + tik_instance.data_move(input_x2_L1, input_x2_cast_ub, 0, 1, + no_tile * ko_tile_inner * c0 * c0 * fp16_size // blocksize, 0, 0) + # input_x1 -> input_x1_L1 + input_x1_L1 = tik_instance.Tensor(input_x1.dtype, [ko_tile_inner, mo_tile, c0, c0], + name="input_x1_L1", scope=tik.scope_cbuf) + tik_instance.data_move(input_x1_L1, + input_x1[k_idx, + core_m * mo_tile, 0, 0], + 0, ko_tile_inner, mo_tile * c0 * c0 * 
fp16_size // blocksize, + (mo - mo_tile) * c0 * c0 * fp16_size // blocksize, 0) + # input_x2_L1 -> input_x2_L0B + input_x2_L0B = tik_instance.Tensor("float16", [ko_tile_inner, no_tile, c0, c0], + name="input_x2_L0B", scope=tik.scope_cb) + with tik_instance.for_range(0, ko_tile_inner) as cc2: + tik_instance.load2dv1(input_x2_L0B[cc2, 0, 0, 0], input_x2_L1[0, cc2, 0, 0], 0, no_tile, + ko_tile_inner, + 0, True) + # input_x1_L1 -> input_x1_L0A + input_x1_L0A = tik_instance.Tensor(input_x1.dtype, [mo_tile, ko_tile_inner, c0, c0], + name="input_x1_L0A", scope=tik.scope_ca) + with tik_instance.for_range(0, mo_tile) as cc1: + tik_instance.load2dv1(input_x1_L0A[cc1, 0, 0, 0], input_x1_L1[0, cc1, 0, 0], 0, ko_tile_inner, + mo_tile, 0, False) + with tik_instance.if_scope(thread_idx_k == 0): + tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, + ko_tile_inner * c0, no_tile * c0, 0) + with tik_instance.else_scope(): + tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, + ko_tile_inner * c0, no_tile * c0, 1) + res_ub = tik_instance.Tensor(input_x1.dtype, [no_tile, mo_tile, c0, c0], + name="resMatmul_ub", scope=tik.scope_ubuf) + tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0, 1) + tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, core_m * mo_tile, 0, 0], + res_ub, 0, no_tile, + mo_tile * c0 * c0 * fp16_size // blocksize, 0, + (mo - mo_tile) * c0 * c0 * fp16_size // blocksize) diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py index 321ec602f74..b5f8ee9d828 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py @@ -18,37 +18,35 @@ limitations under the License. 
matmul """ from __future__ import absolute_import -import te.lang.cce -import te.platform.cce_params as cce -from te.platform.fusion_manager import fusion_manager -from te import tvm -from topi import generic -from topi.cce import util -from te import tik -from impl.matmul_vector import matmul_vector_cce + from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +from te import tik +from topi.cce import util + # General limitation of the size for input shape: 2**31 SHAPE_SIZE_LIMIT = 2147483648 NoneType = type(None) cus_matmul_cube_fracz_right_mul_op_info = TBERegOp("CusMatMulCubeFraczRightMul") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("matmulcubefraczrightmul.so") \ - .compute_cost(10) \ - .kernel_name("CusMatMulCubeFraczRightMul") \ - .partial_flag(True) \ - .input(0, "x1", False, "required", "all") \ - .input(1, "x2", False, "required", "all") \ - .input(2, "x3", False, "required", "all") \ - .input(3, "x4", False, "optional", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, DataType.F32_FracZ) \ - .get_op_info() + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("matmulcubefraczrightmul.so") \ + .compute_cost(10) \ + .kernel_name("CusMatMulCubeFraczRightMul") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .input(1, "x2", False, "required", "all") \ + .input(2, "x3", False, "required", "all") \ + .input(3, "x4", False, "optional", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_FracZ, DataType.F16_Default, DataType.F32_Default, DataType.F16_Default, + DataType.F32_FracZ) \ + .get_op_info() @op_info_register(cus_matmul_cube_fracz_right_mul_op_info) -def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): +def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, + kernel_name="matmulcube"): if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: @@ -61,10 +59,10 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y input_x3_shape = input_x3.get("shape") input_x3_dtype = input_x3.get("dtype").lower() output_shape = output_y.get("shape") - Supported = [((72, 8, 16, 16),"float16", (72, 72, 16, 16), "float16", (1,), "float32"), - ((32, 8, 16, 16),"float16", (32, 32, 16, 16), "float16", (1,), "float32"), - ((8, 32, 16, 16),"float16", (8, 8, 16, 16), "float16", (1,), "float32"), - ((4, 4, 16, 16),"float16", (4, 4, 16, 16), "float16", (1,), "float32"), + Supported = [((72, 8, 16, 16), "float16", (72, 72, 16, 16), "float16", (1,), "float32"), + ((32, 8, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"), + ((8, 32, 16, 16), "float16", (8, 8, 16, 16), "float16", (1,), "float32"), + ((4, 4, 16, 16), "float16", (4, 4, 16, 16), "float16", (1,), "float32"), ((4, 16, 16, 16), 'float16', (4, 4, 16, 16), 'float16', (1,), 'float32'), ((49, 4, 16, 16), 'float16', (49, 49, 16, 16), 'float16', (1,), 'float32'), ((36, 4, 16, 16), 'float16', (36, 36, 16, 16), 'float16', (1,), 'float32'), @@ -81,7 +79,8 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y ((32, 128, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'), ((64, 32, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'), 
((16, 64, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32')] - input_shape = (tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype) + input_shape = ( + tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype) if input_shape not in Supported: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) @@ -93,6 +92,7 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul]) return tik_instance + def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, res): diag_size = 128 @@ -176,7 +176,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, name="resMatmul_L0C", scope=tik.scope_cc) with tik_instance.for_range(0, loop_k_num, thread_num=thread_num_k) as thread_idx_k: if diag_opt: - k_idx = (core_n*loop_n_num + cc_n) * no_tile + thread_idx_k * ko_tile_inner + k_idx = (core_n * loop_n_num + cc_n) * no_tile + thread_idx_k * ko_tile_inner else: k_idx = thread_idx_k * ko_tile_inner # input_x1 -> input_x1_L1 @@ -191,7 +191,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, input_x2_L1 = tik_instance.Tensor("float16", [no_tile, ko_tile_inner, c0, c0], name="input_x2_L1", scope=tik.scope_cbuf) tik_instance.data_move(input_x2_L1, - input_x2[(core_n*loop_n_num + cc_n) * no_tile, + input_x2[(core_n * loop_n_num + cc_n) * no_tile, k_idx, 0, 0], 0, no_tile, ko_tile_inner * c0 * c0 * fp16_size // blocksize, (ko - ko_tile_inner) * c0 * c0 * fp16_size // blocksize, 0) @@ -215,9 +215,9 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, tik_instance.mmad(res_L0C, input_x1_L0A, input_x2_L0B, mo_tile * c0, ko_tile_inner * c0, no_tile * c0, 1) res_ub = tik_instance.Tensor("float32", [no_tile, mo_tile, c0, c0], - name="resMatmul_ub", scope=tik.scope_ubuf) + name="resMatmul_ub", scope=tik.scope_ubuf) tik_instance.data_move(res_ub, res_L0C, 0, 1, no_tile * mo_tile, 0, 0) - + input_3_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf, name="input_3_local_UB") tik_instance.data_move(input_3_local_UB, input_x3, 0, 1, 1, 0, 0) matrix_max_scalar = tik_instance.Scalar("float32") @@ -236,7 +236,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, res_ub[count * repeate_times_max * vectorfp32_size], res_ub[count * repeate_times_max * vectorfp32_size], matrix_max_scalar, repeate_num, 1, 1, 8, 8) - + tik_instance.data_move(res[(core_n * loop_n_num + cc_n) * no_tile, (core_m * loop_m_num + cc_m) * mo_tile, 0, 0], res_ub, 0, no_tile, diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py index 4bbfcf7f334..dfa83c4fb7f 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py @@ -18,13 +18,15 @@ limitations under the License. 
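CusMatMulCubeFraczRightMul only compiles for the shape/dtype combinations enumerated in `Supported` and rejects anything else with a RuntimeError. A hedged sketch of that membership test factored into a reusable helper (`check_shape_supported` is a hypothetical name, not an API added by this patch):

def check_shape_supported(supported, *tensor_descs):
    """tensor_descs: TBE-style dicts carrying 'shape' and 'dtype', in whitelist order."""
    key = tuple(item for desc in tensor_descs
                for item in (tuple(desc["shape"]), desc["dtype"].lower()))
    if key not in supported:
        raise RuntimeError("input_shape %s is not supported" % str(key))
    return key

Called as check_shape_supported(Supported, input_x1, input_x2, input_x3), it should build the same flattened key the code above constructs by hand.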
matmul """ from __future__ import absolute_import + import te.lang.cce import te.platform.cce_params as cce +from impl.matmul_vector import matmul_vector_cce +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tvm from topi import generic from topi.cce import util -from impl.matmul_vector import matmul_vector_cce -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + # General limitation of the size for input shape: 2**31 SHAPE_SIZE_LIMIT = 2147483648 NoneType = type(None) @@ -36,8 +38,8 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \ .compute_cost(10) \ .kernel_name("CusMatMulCube") \ .partial_flag(True) \ - .attr("transpose_a", "required", "bool", "all")\ - .attr("transpose_b", "required", "bool", "all")\ + .attr("transpose_a", "required", "bool", "all") \ + .attr("transpose_b", "required", "bool", "all") \ .input(0, "x1", False, "required", "all") \ .input(1, "x2", False, "required", "all") \ .input(2, "x3", False, "optional", "all") \ @@ -45,6 +47,7 @@ matmul_cube_op_info = TBERegOp("CusMatMulCube") \ .dtype_format(DataType.F16_FracNZ, DataType.F16_FracNZ, DataType.F16_Default, DataType.F32_FracNZ) \ .get_op_info() + # pylint: disable=locally-disabled,too-many-arguments,too-many-branches, too-many-statements, too-many-locals, def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): """ @@ -113,16 +116,16 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): if m_shape != 1: if n_shape == 1: - if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: raise RuntimeError("input shape K1 should be multiple of %d" - % (cce.BLOCK_IN*cce.BLOCK_IN)) - elif km_shape%k_block_size != 0: + % (cce.BLOCK_IN * cce.BLOCK_IN)) + elif km_shape % k_block_size != 0: raise RuntimeError( "input shape K1 should be multiple of %d" % cce.BLOCK_IN) else: - if km_shape % (cce.BLOCK_IN*cce.BLOCK_IN) != 0: + if km_shape % (cce.BLOCK_IN * cce.BLOCK_IN) != 0: raise RuntimeError("input shape K1 should be multiple of %d" - % (cce.BLOCK_IN*cce.BLOCK_IN)) + % (cce.BLOCK_IN * cce.BLOCK_IN)) if n_shape % cce.BLOCK_IN != 0 and n_shape != 1: raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN) @@ -130,7 +133,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): if len(shape_bias): if len(shape_bias) == 1: if is_gevm or is_gemv: - if shape_bias[0] != m_shape*n_shape: + if shape_bias[0] != m_shape * n_shape: raise RuntimeError("broadcast case shape bias for gemv must be equal m*n") else: if shape_bias[0] != n_shape: @@ -141,33 +144,36 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): else: raise RuntimeError("unsupport input shape now for batch bias case") + def _get_bias(shape_bias): bias_length = shape_bias[0] - if bias_length % 16 ==0: + if bias_length % 16 == 0: return shape_bias else: - bias_length = (bias_length // 16)*16 + 16 + bias_length = (bias_length // 16) * 16 + 16 shape_bias = [] shape_bias.append(bias_length) return shape_bias + def _get_input_shape(shape_x): dim_a = shape_x[0] dim_b = shape_x[1] res = [] - if dim_a % 16 !=0: - dim_a = (dim_a // 16)*16 + 16 + if dim_a % 16 != 0: + dim_a = (dim_a // 16) * 16 + 16 res.append(dim_a) else: res.append(dim_a) - if dim_b % 16 !=0: - dim_b = (dim_b // 16)*16 + 16 + if dim_b % 16 != 0: + dim_b = (dim_b // 16) * 16 + 16 res.append(dim_b) else: res.append(dim_b) return res + def check_supported(input_x1, input_x2, bias=None, 
output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): shape_a = input_x1.get("shape") shape_b = input_x2.get("shape") @@ -182,7 +188,7 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t if bias is not None and bool(bias): shape_bias = bias.get("shape") try: - trans_a_f = bool(1-trans_a) + trans_a_f = bool(1 - trans_a) if src_dtype == "float32" or src_dtype == "int32": if len(shape_a) != 2 and len(shape_b) != 2: return False @@ -203,10 +209,10 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t return False elif shape_a[1] != shape_b[0]: return False - + if trans_a_f and trans_b and shape_b[1] == 1: return False - + if src_dtype == "float16": if len(shape_a) != 2 and len(shape_b) != 2: return False @@ -217,26 +223,27 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t else: m_shape = shape_a[0] k_shape = shape_a[1] - + if trans_b: n_shape = shape_b[0] k_b_shape = shape_b[1] else: n_shape = shape_b[1] k_b_shape = shape_b[0] - + if k_shape != k_b_shape: return False - + if m_shape == 1 or n_shape == 1: if k_shape % 256 != 0: return False - + except RuntimeError as e: return False - + return True - + + # pylint: disable=locally-disabled,too-many-arguments, too-many-locals, too-many-statements @op_info_register(matmul_cube_op_info) def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): @@ -269,18 +276,18 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra """ shape_a = input_x1.get("ori_shape") shape_b = input_x2.get("ori_shape") - + if shape_a is not None: if len(shape_a) < 2: shape_a = input_x1.get("shape") - + if shape_b is not None: if len(shape_b) < 2: shape_b = input_x2.get("shape") - + shape_a = list(shape_a) shape_b = list(shape_b) - + if input_x1.get("format") == "FRACTAL_NZ": shape_a = _get_input_shape(shape_a) shape_b = _get_input_shape(shape_b) @@ -290,21 +297,21 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) - + if input_x1.get("format") == "FRACTAL_NZ": shape_a = [shape_a[1], shape_a[0]] - trans_a = bool(1-trans_a) - + trans_a = bool(1 - trans_a) + if input_x2.get("format") == "FRACTAL_NZ": shape_b = [shape_b[1], shape_b[0]] - trans_b = bool(1-trans_b) - + trans_b = bool(1 - trans_b) + shape_bias = () if bias is not None and bool(bias): shape_bias = bias.get("shape") shape_bias = list(shape_bias) shape_bias = _get_bias(shape_bias) - + src_dtype = input_x1.get("dtype").lower() dst_dtype = output_y.get("dtype").lower() if src_dtype == "float32" or src_dtype == "int32": @@ -338,12 +345,12 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) else: shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) - + if trans_b: shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) else: shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) - + if input_x1.get("format") == "FORMAT_FRACTAL_Z": shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) format_a = "fractal" @@ -353,7 +360,7 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra else: shape_a_temp 
= (shape_a[len(shape_a) - 2], shape_a[len(shape_a) - 1]) format_a = "ND" - + if input_x2.get("format") == "FORMAT_FRACTAL_Z": shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) format_b = "fractal" @@ -363,28 +370,28 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra else: shape_b_temp = (shape_b[len(shape_b) - 2], shape_b[len(shape_b) - 1]) format_b = "ND" - + tensor_bias = None tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', dtype=src_dtype) tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype) - + if len(shape_bias) > 0: tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', dtype=dst_dtype) result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a, format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias) - + with tvm.target.cce(): schedule = generic.auto_schedule(result) - + tensor_list = [tensor_a, tensor_b, result] if len(shape_bias) > 0: tensor_list = [tensor_a, tensor_b, tensor_bias, result] - + config = {"print_ir": False, "name": kernel_name, "tensor_list": tensor_list} - + te.lang.cce.cce_build_code(schedule, config) diff --git a/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py b/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py index 14c4b590cee..b001e0ba33d 100644 --- a/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py @@ -13,24 +13,25 @@ # limitations under the License. # ============================================================================ """CusMatrixCombine""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType cus_matrix_combine_op_info = TBERegOp("CusMatrixCombine") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("matrixcombine.so") \ - .compute_cost(10) \ - .kernel_name("CusMatrixCombine") \ - .partial_flag(True) \ - .input(0, "x1", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F32_Default, DataType.F32_Default) \ - .get_op_info() + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("matrixcombine.so") \ + .compute_cost(10) \ + .kernel_name("CusMatrixCombine") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F32_Default, DataType.F32_Default) \ + .get_op_info() + @op_info_register(cus_matrix_combine_op_info) -def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"): +def CusMatrixCombine(input_x, output, kernel_name="matrix_combine"): input_x_shape = input_x.get("shape") output_shape = output.get("shape") split_dim = 128 @@ -45,18 +46,20 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"): blocks = 32 matrix_dim = input_x_shape[0] * input_x_shape[1] - if input_x_shape[0] == 1 and input_x_shape[1] == 64 : + if input_x_shape[0] == 1 and input_x_shape[1] == 64: tiling_dim = 2 bs = 1 - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, 
input_x[0, block_index * tiling_dim, 0], 0, 1, 16, 0, 0) tik_instance.data_move(res[block_index * tiling_dim, 0], input_x_ub, 0, 1, 16, 0, 0) else: tiling_dim = 4 bs = input_x_shape[0] - with tik_instance.for_range(0,blocks,block_num=blocks) as block_index: - input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", scope=tik.scope_ubuf) + with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: + input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub", + scope=tik.scope_ubuf) zero = tik_instance.Scalar("float32") zero.set_as(0.0) with tik_instance.for_range(0, bs) as i: @@ -69,7 +72,9 @@ def CusMatrixCombine(input_x, output,kernel_name="matrix_combine"): tik_instance.vector_dup(64, input_x_ub, zero, repeat_1, 1, 8) tik_instance.vector_dup(64, input_x_ub[255 * 64], zero, repeat_2, 1, 8) with tik_instance.for_range(0, tiling_dim) as j: - tik_instance.data_move(input_x_ub[j, split_dim * i], input_x[i, block_index * tiling_dim + j, 0], 0, 1, 16, 0, 0) - tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub, 0, 1, tiling_dim * matrix_dim *4 // 32, 0, 0) + tik_instance.data_move(input_x_ub[j, split_dim * i], input_x[i, block_index * tiling_dim + j, 0], 0, + 1, 16, 0, 0) + tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub, 0, 1, + tiling_dim * matrix_dim * 4 // 32, 0, 0) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) return tik_instance diff --git a/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py b/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py index 76fe9625a43..f341efe4b73 100644 --- a/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py @@ -13,40 +13,41 @@ # limitations under the License. 
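CusMatrixCombine above copies each 128x128 tile of the input onto the diagonal of the output and zero-fills the remainder with vector_dup. A hedged NumPy reference of that layout, assuming the input is a stack of square blocks shaped (bs, split_dim, split_dim) (reference only, not the TIK kernel):

import numpy as np

def matrix_combine_reference(blocks):
    """blocks: (bs, d, d) array -> (bs*d, bs*d) block-diagonal matrix."""
    bs, d, _ = blocks.shape
    out = np.zeros((bs * d, bs * d), dtype=blocks.dtype)
    for i in range(bs):
        out[i * d:(i + 1) * d, i * d:(i + 1) * d] = blocks[i]
    return out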
# ============================================================================ """CusTranspose02314""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from topi.cce import util -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType cus_transpose02314_op_info = TBERegOp("CusTranspose02314") \ - .fusion_type("OPAQUE") \ - .async_flag(False) \ - .binfile_name("transpose02314.so") \ - .compute_cost(10) \ - .kernel_name("CusTranspose02314") \ - .partial_flag(True) \ - .input(0, "x1", False, "required", "all") \ - .output(0, "y", False, "required", "all") \ - .dtype_format(DataType.F16_5HD, DataType.F16_Default) \ - .get_op_info() + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("transpose02314.so") \ + .compute_cost(10) \ + .kernel_name("CusTranspose02314") \ + .partial_flag(True) \ + .input(0, "x1", False, "required", "all") \ + .output(0, "y", False, "required", "all") \ + .dtype_format(DataType.F16_5HD, DataType.F16_Default) \ + .get_op_info() + @op_info_register(cus_transpose02314_op_info) def CusTranspose02314(input_x, output, kernel_name="transpose021354"): input_x_shape = input_x.get("shape") output_shape = output.get("shape") - perm = (0,2,3,1,4) + perm = (0, 2, 3, 1, 4) input_x_shape = tuple(input_x_shape) - support_shape = [ (32,128,7,7,16), - (32,32,7,7,16), - (32,32,14,14,16), - (32,64,14,14,16), - (32,16,14,14,16), - (32,16,28,28,16), - (32,32,28,28,16), - (32,8,28,28,16), - (32,8,56,56,16), - (32,16,56,56,16), - (32,4,56,56,16), - (32,4,112,112,16)] + support_shape = [(32, 128, 7, 7, 16), + (32, 32, 7, 7, 16), + (32, 32, 14, 14, 16), + (32, 64, 14, 14, 16), + (32, 16, 14, 14, 16), + (32, 16, 28, 28, 16), + (32, 32, 28, 28, 16), + (32, 8, 28, 28, 16), + (32, 8, 56, 56, 16), + (32, 16, 56, 56, 16), + (32, 4, 56, 56, 16), + (32, 4, 112, 112, 16)] if input_x_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_x_shape)) @@ -59,125 +60,172 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm) dtype = "float16" - if tuple(input_x_shape) == (32,4,112,112,16): + if tuple(input_x_shape) == (32, 4, 112, 112, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 14) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) zero = tik_instance.Scalar(dtype="float16", init_value=0) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 802816 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, 12096, 0) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 802816 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, + 12096, 0) with tik_instance.for_range(0, 448) as cc7: with tik_instance.for_range(0, 4) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) - elif 
tuple(input_x_shape) == (32,4,56,56,16): + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], + input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32, 4, 56, 56, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 3) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 200704 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, 2688, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 200704 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, + 2688, 0) with tik_instance.for_range(0, 448) as cc7: with tik_instance.for_range(0, 4) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], + input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf) - T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", scope=tik.scope_ubuf) + T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", + scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 43008], 0, 4, 448, 2688, 0) with tik_instance.for_range(0, 448) as cc72: with tik_instance.for_range(0, 4) as cc82: - tik_instance.vadds(16, T_transpose_local_UB2[cc72 * 64 + cc82 * 16], input_1_local_UB2[7168 * cc82 + cc72 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.vadds(16, T_transpose_local_UB2[cc72 * 64 + cc82 * 16], + input_1_local_UB2[7168 * cc82 + cc72 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,16,56,56,16): + elif tuple(input_x_shape) == (32, 16, 56, 56, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 14) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 802816 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 3024, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB 
= tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 802816 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, + 3024, 0) with tik_instance.for_range(0, 112) as cc7: with tik_instance.for_range(0, 16) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,8,56,56,16): + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], + input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32, 8, 56, 56, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 7) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 7168 + 3584 * db_idx], 0, 8, 224, 2912, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 401408 + cc1_db * 7168 + 3584 * db_idx], 0, 8, 224, 2912, + 0) with tik_instance.for_range(0, 224) as cc7: with tik_instance.for_range(0, 16) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], input_1_local_UB[3584 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,8,28,28,16): + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], + input_1_local_UB[3584 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32, 8, 28, 28, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 2) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 100352 + cc1_db * 6272 + 3136 * db_idx], 0, 8, 196, 588, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 100352 + cc1_db * 6272 + 3136 * db_idx], 0, 8, 196, 588, + 0) with tik_instance.for_range(0, 196) as cc7: with tik_instance.for_range(0, 8) as cc8: - 
tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], input_1_local_UB[3136 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 100352 + cc1_db * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0) - elif tuple(input_x_shape) == (32,32,28,28,16): + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], + input_1_local_UB[3136 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 100352 + cc1_db * 50176 + 25088 * db_idx], + T_transpose_local_UB, 0, 1, 1568, 0, 0) + elif tuple(input_x_shape) == (32, 32, 28, 28, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 7) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 1792 + 896 * db_idx], 0, 32, 56, 728, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 1792 + 896 * db_idx], + 0, 32, 56, 728, 0) with tik_instance.for_range(0, 56) as cc7: with tik_instance.for_range(0, 32) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 512 + cc8 * 16], input_1_local_UB[896 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,16,28,28,16): + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 512 + cc8 * 16], + input_1_local_UB[896 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) + elif tuple(input_x_shape) == (32, 16, 28, 28, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 3) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: - input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) - tik_instance.data_move(input_1_local_UB, input_x[block_idx * 200704 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 672, 0) + input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", + scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", + scope=tik.scope_ubuf) + tik_instance.data_move(input_1_local_UB, + input_x[block_idx * 200704 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 672, + 0) with tik_instance.for_range(0, 112) as cc7: with tik_instance.for_range(0, 16) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) - tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], + 
input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], + T_transpose_local_UB, 0, 1, 1792, 0, 0) input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf) - T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", scope=tik.scope_ubuf) + T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", + scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 10752], 0, 16, 112, 672, 0) with tik_instance.for_range(0, 112) as cc7: with tik_instance.for_range(0, 16) as cc8: - tik_instance.vadds(16, T_transpose_local_UB2[cc7 * 256 + cc8 * 16], input_1_local_UB2[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.vadds(16, T_transpose_local_UB2[cc7 * 256 + cc8 * 16], + input_1_local_UB2[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0) - elif tuple(input_x_shape) == (32,16,14,14,16): + elif tuple(input_x_shape) == (32, 16, 14, 14, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 2, thread_num=2) as db_idx: input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", scope=tik.scope_ubuf) - T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", scope=tik.scope_ubuf) + T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", + scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input_x[block_idx * 50176 + 1568 * db_idx], 0, 16, 98, 98, 0) with tik_instance.for_range(0, 98) as cc7: with tik_instance.for_range(0, 16) as cc8: - tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1568 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) + tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], + input_1_local_UB[1568 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0) - elif tuple(input_x_shape) == (32, 128, 7, 7, 16) and tuple(perm) == (0,2,3,1,4) and dtype == "float16": + elif tuple(input_x_shape) == (32, 128, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16": with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 7, thread_num=2) as cc1: input_x_ub = tik_instance.Tensor(dtype, [1, 128, 1, 7, 16], name="input_1_local_UB", scope=tik.scope_ubuf) transpose_ub = tik_instance.Tensor(dtype, [1, 1, 7, 128, 16], name="transpose_local_UB", - scope=tik.scope_ubuf) + scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[block_idx, 0, cc1, 0, 0], 0, 128, 7, 42, 0) with tik_instance.for_range(0, 7) as cc7: with tik_instance.for_range(0, 128) as cc8: @@ -193,7 +241,7 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[block_idx, 0, 0, 0, 0], 0, 1, 1568, 0, 0) with tik_instance.for_range(0, 7) as cc1: - with tik_instance.for_range(0, 7) as cc2: + with tik_instance.for_range(0, 7) as cc2: with tik_instance.for_range(0, 32) as cc3: tik_instance.vadds(16, transpose_ub[0, cc1, cc2, cc3, 0], input_x_ub[0, cc3, cc1, cc2, 0], 0, 1, 1, 1, 0, 0) @@ -212,11 +260,12 @@ def CusTranspose02314(input_x, output, 
kernel_name="transpose021354"): tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0], 0, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 100352 + split_index * 2 * 7168], transpose_ub, 0, 1, 896, 0, 0) + with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 6, thread_num=2) as cc1: - _inner_+compute(cc1) + _inner_ + compute(cc1) _inner_compute(6) - elif tuple(input_x_shape) == (32,64,14,14,16) and tuple(perm) == (0,2,3,1,4) and dtype == "float16": + elif tuple(input_x_shape) == (32, 64, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16": def _inner_compute(split_index, block_idx): input_x_ub = tik_instance.Tensor(dtype, [1, 64, 2, 14, 16], name="input_1_local_UB", scope=tik.scope_ubuf) @@ -229,6 +278,7 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"): tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0], 0, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + split_index * 2 * 14336], transpose_ub, 0, 1, 1792, 0, 0) + with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 6, thread_num=2) as cc1: _inner_compute(cc1, block_idx) From c59e6667f1697b553db303eaab0614c8fad45177 Mon Sep 17 00:00:00 2001 From: z00478463 Date: Tue, 26 May 2020 09:26:13 +0800 Subject: [PATCH 5/8] for pylint 3rd --- example/resnet50_imagenet2012_THOR/config.py | 8 ---- .../run_distribute_train.sh | 3 +- mindspore/ops/operations/thor_ops.py | 40 ++++++++++--------- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/example/resnet50_imagenet2012_THOR/config.py b/example/resnet50_imagenet2012_THOR/config.py index 6c664891f76..b9df4947aa3 100644 --- a/example/resnet50_imagenet2012_THOR/config.py +++ b/example/resnet50_imagenet2012_THOR/config.py @@ -31,15 +31,7 @@ config = ed({ "save_checkpoint_steps": 5004, "keep_checkpoint_max": 20, "save_checkpoint_path": "./", - "lr_init": 0.01, - "lr_end": 0.00001, - "lr_max": 0.1, - "warmup_epochs": 0, - "lr_decay_mode": "cosine", "label_smooth": 1, "label_smooth_factor": 0.1, - "lr": 0.1, - "T_max": 90, - "eta_min": 0, "frequency": 278 }) diff --git a/example/resnet50_imagenet2012_THOR/run_distribute_train.sh b/example/resnet50_imagenet2012_THOR/run_distribute_train.sh index ae05c45dfe1..e39034a9127 100644 --- a/example/resnet50_imagenet2012_THOR/run_distribute_train.sh +++ b/example/resnet50_imagenet2012_THOR/run_distribute_train.sh @@ -45,8 +45,7 @@ do mkdir ./train_parallel$i cp *.py ./train_parallel$i cp *.sh ./train_parallel$i - cp -r second_order ./train_parallel$i/second_order - cp -r test_ops ./train_parallel$i/test_ops + cp -r model ./train_parallel$i cd ./train_parallel$i || exit echo "start training for rank $RANK_ID, device $DEVICE_ID" diff --git a/mindspore/ops/operations/thor_ops.py b/mindspore/ops/operations/thor_ops.py index 225aff68fa1..e48180c3f6c 100644 --- a/mindspore/ops/operations/thor_ops.py +++ b/mindspore/ops/operations/thor_ops.py @@ -16,7 +16,6 @@ import mindspore as ms from mindspore.ops import prim_attr_register, PrimitiveWithInfer from mindspore.ops.composite import multitype_ops as C -import numpy as np __all__ = ["CusBatchMatMul", "CusCholeskyTrsm", @@ -31,6 +30,7 @@ __all__ = ["CusBatchMatMul", "CusMatMulCubeFraczLeftCast", ] + class CusBatchMatMul(PrimitiveWithInfer): """CusBatchMatMul definition""" @@ -38,7 +38,7 @@ class CusBatchMatMul(PrimitiveWithInfer): def __init__(self): """init CusBatchMatMul""" 
self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) - from mindspore.ops._op_impl._custom_op.matrix_combine_impl import CusMatrixCombine + def get_bprop(self): def bprop(x1, x2, out, dout): return (C.zeros_like(x1), C.zeros_like(x2)) @@ -59,7 +59,7 @@ class CusCholeskyTrsm(PrimitiveWithInfer): def __init__(self): """init CusCholeskyTrsm""" self.init_prim_io_names(inputs=['x1'], outputs=['y']) - from mindspore.ops._op_impl._custom_op.cholesky_trsm_impl import CusCholeskyTrsm + def infer_shape(self, data1_shape): ll = [] m, _ = data1_shape @@ -81,7 +81,7 @@ class CusFusedAbsMax1(PrimitiveWithInfer): """init CusFusedAbsMax1""" self.init_prim_io_names(inputs=['x1'], outputs=['y']) self.origin_shape = origin_shape - from mindspore.ops._op_impl._custom_op.fused_abs_max1_impl import CusFusedAbsMax1 + def get_bprop(self): def bprop(x, out, dout): return (C.zeros_like(x),) @@ -91,7 +91,7 @@ class CusFusedAbsMax1(PrimitiveWithInfer): def infer_shape(self, data1_shape): ll = [] if len(data1_shape) == 2: - ll = [1,] + ll = [1, ] else: ll = [32, 64] return ll @@ -111,7 +111,7 @@ class CusImg2Col(PrimitiveWithInfer): self.strides = strides self.dilates = dilates self.mode = mode - from mindspore.ops._op_impl._custom_op.img2col_impl import CusImg2Col + def get_bprop(self): def bprop(x, out, dout): return (C.zeros_like(x),) @@ -141,7 +141,7 @@ class CusMatMulCubeDenseLeft(PrimitiveWithInfer): def __init__(self): """init CusMatMulCubeDenseLeft""" self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) - from mindspore.ops._op_impl._custom_op.matmul_cube_dense_left_impl import CusMatMulCubeDenseLeft + def get_bprop(self): def bprop(x1, x2, out, dout): return (C.zeros_like(x1), C.zeros_like(x2)) @@ -162,7 +162,7 @@ class CusMatMulCubeFraczRightMul(PrimitiveWithInfer): def __init__(self): """init CusMatMulCubeFraczRightMul""" self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) - from mindspore.ops._op_impl._custom_op.matmul_cube_fracz_right_mul_impl import CusMatMulCubeFraczRightMul + def get_bprop(self): def bprop(x1, x2, x3, out, dout): return (C.zeros_like(x1), C.zeros_like(x2), C.zeros_like(x3)) @@ -185,7 +185,7 @@ class CusMatMulCube(PrimitiveWithInfer): self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) self.transpose_a = transpose_a self.transpose_b = transpose_b - from mindspore.ops._op_impl._custom_op.matmul_cube_impl import CusMatMulCube + def get_bprop(self): def bprop(x1, x2, out, dout): return (C.zeros_like(x1), C.zeros_like(x2)) @@ -218,7 +218,7 @@ class CusMatrixCombine(PrimitiveWithInfer): def __init__(self): """init CusMatrixCombine""" self.init_prim_io_names(inputs=['x'], outputs=['y']) - from mindspore.ops._op_impl._custom_op.matrix_combine_impl import CusMatrixCombine + def get_bprop(self): def bprop(x, out, dout): return (C.zeros_like(x),) @@ -242,7 +242,7 @@ class CusTranspose02314(PrimitiveWithInfer): def __init__(self): """init CusTranspose02314""" self.init_prim_io_names(inputs=['x1'], outputs=['y']) - from mindspore.ops._op_impl._custom_op.transpose_02314_impl import CusTranspose02314 + def get_bprop(self): def bprop(x, out, dout): return (C.zeros_like(x),) @@ -260,36 +260,40 @@ class CusTranspose02314(PrimitiveWithInfer): def infer_dtype(self, data1_dtype): return data1_dtype + class CusMatMulCubeDenseRight(PrimitiveWithInfer): """CusMatMulCubeDenseRight definition""" + @prim_attr_register def __init__(self): """init CusMatMulCubeDenseRight""" - self.init_prim_io_names(inputs=['x1', 'x2','x3'], outputs=['y']) - from 
mindspore.ops._op_impl._custom_op.matmul_cube_dense_right_impl import CusMatMulCubeDenseRight + self.init_prim_io_names(inputs=['x1', 'x2', 'x3'], outputs=['y']) def get_bprop(self): def bprop(x1, x2, x3, out, dout): - return (C.zeros_like(x1),C.zeros_like(x2), C.zeros_like(x3)) + return (C.zeros_like(x1), C.zeros_like(x2), C.zeros_like(x3)) + return bprop - + def infer_shape(self, data1_shape, data2_shape, data3_shape): return data1_shape - + def infer_dtype(self, data1_dtype, data2_dtype, data3_dtype): return ms.common.dtype.tensor_type(getattr(ms, "float32")) + class CusMatMulCubeFraczLeftCast(PrimitiveWithInfer): """CusMatMulCubeFraczLeftCast definition""" + @prim_attr_register def __init__(self): """init CusMatMulCubeFraczLeftCast""" self.init_prim_io_names(inputs=['x1', 'x2'], outputs=['y']) - from mindspore.ops._op_impl._custom_op.matmul_cube_fracz_left_cast_impl import CusMatMulCubeFraczLeftCast def get_bprop(self): def bprop(x1, x2, out, dout): - return (C.zeros_like(x1),C.zeros_like(x2)) + return (C.zeros_like(x1), C.zeros_like(x2)) + return bprop def infer_shape(self, data1_shape, data2_shape): From f34d06cd0f90b3a6db2b1d960d309e87a9281146 Mon Sep 17 00:00:00 2001 From: z00478463 Date: Tue, 26 May 2020 10:39:42 +0800 Subject: [PATCH 6/8] for pylint 4th --- example/resnet50_imagenet2012_THOR/config.py | 2 +- .../resnet50_imagenet2012_THOR/model/thor.py | 14 ++----- .../model/thor_layer.py | 38 +++++++------------ .../_op_impl/_custom_op/batch_matmul_impl.py | 5 ++- .../_op_impl/_custom_op/cholesky_trsm_impl.py | 1 + .../_custom_op/fused_abs_max1_impl.py | 9 +++-- .../ops/_op_impl/_custom_op/img2col_impl.py | 3 +- .../_custom_op/matmul_cube_dense_left_impl.py | 19 +++++----- .../matmul_cube_dense_right_impl.py | 13 ++++--- .../matmul_cube_fracz_left_cast_impl.py | 16 ++++---- .../matmul_cube_fracz_right_mul_impl.py | 9 +++-- .../_op_impl/_custom_op/matmul_cube_impl.py | 14 +++---- .../_custom_op/transpose02314_impl.py | 3 +- mindspore/ops/operations/thor_ops.py | 2 +- 14 files changed, 73 insertions(+), 75 deletions(-) diff --git a/example/resnet50_imagenet2012_THOR/config.py b/example/resnet50_imagenet2012_THOR/config.py index b9df4947aa3..fc01287cc85 100644 --- a/example/resnet50_imagenet2012_THOR/config.py +++ b/example/resnet50_imagenet2012_THOR/config.py @@ -33,5 +33,5 @@ config = ed({ "save_checkpoint_path": "./", "label_smooth": 1, "label_smooth_factor": 0.1, - "frequency": 278 + "frequency": 834 }) diff --git a/example/resnet50_imagenet2012_THOR/model/thor.py b/example/resnet50_imagenet2012_THOR/model/thor.py index 44c0fd45dba..d414f238515 100644 --- a/example/resnet50_imagenet2012_THOR/model/thor.py +++ b/example/resnet50_imagenet2012_THOR/model/thor.py @@ -22,12 +22,6 @@ from mindspore.nn.optim.optimizer import Optimizer from mindspore.ops import functional as F, composite as C, operations as P from mindspore.parallel._utils import _get_device_num, _get_mirror_mean -from cus_ops.cus_matmul_cube_dense_right import CusMatMulCubeDenseRight -from cus_ops.cus_matmul_cube_fracz_left_cast import CusMatMulCubeFraczLeftCast -from cus_ops.cus_matmul_cube_dense_left import CusMatMulCubeDenseLeft -from cus_ops.cus_matmul_cube_fracz_right_mul import CusMatMulCubeFraczRightMul -from model.grad_reducer_thor import DistributedGradReducerThor - momentum_opt = C.MultitypeFuncGraph("momentum_opt") @@ -68,10 +62,10 @@ class THOR(Optimizer): self.matrix_G = ParameterTuple(matrix_G) self.A_inv_max = ParameterTuple(A_inv_max) self.G_inv_max = ParameterTuple(G_inv_max) - self.cube_matmul_left = 
CusMatMulCubeFraczLeftCast() - self.cube_matmul_left_fc = CusMatMulCubeDenseLeft() - self.cube_matmul_right_fc = CusMatMulCubeDenseRight() - self.cube_matmul_right_mul = CusMatMulCubeFraczRightMul() + self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast() + self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft() + self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight() + self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul() self.transpose = P.Transpose() self.shape = P.Shape() self.reshape = P.Reshape() diff --git a/example/resnet50_imagenet2012_THOR/model/thor_layer.py b/example/resnet50_imagenet2012_THOR/model/thor_layer.py index 8097d729ea1..fea74605b68 100644 --- a/example/resnet50_imagenet2012_THOR/model/thor_layer.py +++ b/example/resnet50_imagenet2012_THOR/model/thor_layer.py @@ -23,19 +23,9 @@ from mindspore.common.tensor import Tensor from mindspore.nn.cell import Cell from mindspore.nn.layer.activation import get_activation from mindspore.ops import operations as P - -from cus_ops.cus_batch_matmul import CusBatchMatMul -from cus_ops.cus_cholesky_trsm import CusCholeskyTrsm -from cus_ops.cus_fused_abs_max1 import CusFusedAbsMax1 -from cus_ops.cus_img2col import CusImg2Col -from cus_ops.cus_matmul_cube import CusMatMulCube -from cus_ops.cus_matrix_combine import CusMatrixCombine -from cus_ops.cus_transpose02314 import CusTranspose02314 - import numpy as np C0 = 16 - def caculate_device_shape(matrix_dim, channel, is_A): ll = (0) if is_A: @@ -153,11 +143,11 @@ class Conv2d_Thor(_Conv): group=self.group ) - self.img2col = CusImg2Col(ksizes=ksizes, strides=strides) - self.cube_matmul = CusMatMulCube(transpose_a=True) - self.matrix_combine = CusMatrixCombine() - self.cholesky = CusCholeskyTrsm() - self.transpose02314 = CusTranspose02314() + self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides) + self.cube_matmul = P.CusMatMulCube(transpose_a=True) + self.matrix_combine = P.CusMatrixCombine() + self.cholesky = P.CusCholeskyTrsm() + self.transpose02314 = P.CusTranspose02314() self.matrix_A_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1] self.matrix_G_dim = self.out_channels self.matrix_A_device_shape, self.matrix_A_device_dim = caculate_device_shape(self.matrix_A_dim, @@ -190,7 +180,7 @@ class Conv2d_Thor(_Conv): self.mul = P.Mul() self.cast = P.Cast() self.damping = Tensor(damping) - self.vector_matmul = CusBatchMatMul() + self.vector_matmul = P.CusBatchMatMul() self.diag_block_dim = 128 self.channels_slice_flag = False if self.in_channels % C0 != 0: @@ -221,8 +211,8 @@ class Conv2d_Thor(_Conv): self.dampingA = Tensor(np.identity(dampingA_dim), mstype.float32) self.dampingG = Tensor(np.identity(dampingG_dim), mstype.float32) - self.fused_abs_max1 = CusFusedAbsMax1([self.matrix_A_dim, self.matrix_A_dim]) - self.fused_abs_max2 = CusFusedAbsMax1() + self.fused_abs_max1 = P.CusFusedAbsMax1([self.matrix_A_dim, self.matrix_A_dim]) + self.fused_abs_max2 = P.CusFusedAbsMax1() self.log = P.Log() self.exp = P.Exp() self.sqrt = P.Sqrt() @@ -375,9 +365,9 @@ class Dense_Thor(Cell): self.fake_G = Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)) self.matmul = P.MatMul(transpose_b=True) - self.cube_matmul = CusMatMulCube(transpose_a=True) - self.matrix_combine = CusMatrixCombine() - self.cholesky = CusCholeskyTrsm() + self.cube_matmul = P.CusMatMulCube(transpose_a=True) + self.matrix_combine = P.CusMatrixCombine() + self.cholesky = P.CusCholeskyTrsm() self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() @@ -386,7 +376,7 @@ class 
Dense_Thor(Cell): self.cast = P.Cast() self.damping = Tensor(damping) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) - self.vector_matmul = CusBatchMatMul() + self.vector_matmul = P.CusBatchMatMul() self.pad = P.Pad(((0, 24), (0, 24))) self.pad1 = P.Pad(((0, 8), (0, 8))) self.slice = P.Slice() @@ -396,8 +386,8 @@ class Dense_Thor(Cell): self.axis = 0 self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) - self.fused_abs_max1 = CusFusedAbsMax1([1000, 1000]) - self.fused_abs_max2 = CusFusedAbsMax1() + self.fused_abs_max1 = P.CusFusedAbsMax1([1000, 1000]) + self.fused_abs_max2 = P.CusFusedAbsMax1() self.log = P.Log() self.exp = P.Exp() self.dampingA = Tensor(np.identity(2048), mstype.float32) diff --git a/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py b/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py index d8395c1e81c..97982c53cf5 100644 --- a/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/batch_matmul_impl.py @@ -33,6 +33,7 @@ cus_batchmatmul_op_info = TBERegOp("CusBatchMatMul") \ def _get_flattern_shape(shape): + """_get_flattern_shape""" flattern_shape = 1 for dim in shape: flattern_shape *= dim @@ -40,6 +41,7 @@ def _get_flattern_shape(shape): def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index): + """_inner_matmul_new""" input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf) t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input1[input1_index], 0, 1, 16, 0, 0) @@ -71,6 +73,7 @@ def _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_ def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index): + """_inner_matmul_new_1_64_32_64""" input_1_local_UB = tik_instance.Tensor(dtype, [64], name="input_1_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input1[input1_index], 0, 1, 8, 0, 0) with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2: @@ -90,6 +93,7 @@ def _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, inpu @op_info_register(cus_batchmatmul_op_info) def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"): + """CusBatchMatMul""" if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: @@ -116,7 +120,6 @@ def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=Tr # if not transpose_a and transpose_b: batch, m, k = x1_shape - _, n, _ = x2_shape input1_shape = _get_flattern_shape(x1_shape) input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm) diff --git a/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py b/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py index 50830fe0f69..71dd1ccb2d5 100644 --- a/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/cholesky_trsm_impl.py @@ -32,6 +32,7 @@ cus_cholesky_trsm_op_info = TBERegOp("CusCholeskyTrsm") \ @op_info_register(cus_cholesky_trsm_op_info) def CusCholeskyTrsm(input_x, output, kernel_name): + """CusCholeskyTrsm""" input_x_shape = input_x.get("shape") output_shape = output.get("shape") 
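For this CusCholeskyTrsm hunk, the op name suggests a per-block Cholesky factorisation followed by a triangular solve against the identity, which is how a symmetric positive-definite block can later be inverted. A hedged NumPy reference of that math only (the TIK kernel itself tiles the work over split_dim-sized blocks):

import numpy as np

def cholesky_trsm_reference(a_block):
    """a_block: symmetric positive-definite (d, d) array -> inverse of its Cholesky factor."""
    l = np.linalg.cholesky(a_block)  # a_block = l @ l.T
    l_inv = np.linalg.solve(l, np.eye(a_block.shape[0], dtype=a_block.dtype))
    # the full inverse can then be recovered as l_inv.T @ l_inv
    return l_inv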
split_dim = 128 diff --git a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py index 0c47ce78b15..f4b8d44063b 100644 --- a/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/fused_abs_max1_impl.py @@ -33,6 +33,7 @@ cus_fused_abs_max1_op_info = TBERegOp("CusFusedAbsMax1") \ @op_info_register(cus_fused_abs_max1_op_info) def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_max1"): + """CusFusedAbsMax1""" input_x_shape = input_x.get("shape") output_shape = output.get("shape") @@ -203,9 +204,9 @@ def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_m tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( input_x_shape[0] == 32 and input_x_shape[1] == 16) or ( - input_x_shape[0] == 16 and input_x_shape[1] == 32): + input_x_shape[0] == 16 and input_x_shape[1] == 32): if (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[ - 0] == 1000: + 0] == 1000: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) blocks = 32 @@ -257,7 +258,7 @@ def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_m tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif (input_x_shape[0] == 8 and input_x_shape[1] == 128 and input_x_shape[2] == 128) and origin_shape[ - 0] == 1001: + 0] == 1001: input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) blocks = 32 @@ -350,7 +351,7 @@ def CusFusedAbsMax1(input_x, output, origin_shape=None, kernel_name="fused_abs_m tik_instance.data_move(res[block_index, 0], broadcast_0_local_UB, 0, 1, 8, 0, 0) elif (input_x_shape[0] == 16 and input_x_shape[1] == 128 and input_x_shape[2] == 128) or ( input_x_shape[0] == 16 and input_x_shape[1] == 64) or ( - input_x_shape[0] == 64 and input_x_shape[1] == 16): + input_x_shape[0] == 64 and input_x_shape[1] == 16): input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) total_elements = 1 diff --git a/mindspore/ops/_op_impl/_custom_op/img2col_impl.py b/mindspore/ops/_op_impl/_custom_op/img2col_impl.py index 8c1fd1262f5..433e3355650 100644 --- a/mindspore/ops/_op_impl/_custom_op/img2col_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/img2col_impl.py @@ -36,6 +36,7 @@ cus_img2col_info = TBERegOp("CusImg2Col") \ @op_info_register(cus_img2col_info) def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img2col"): + """CusImg2Col""" input_x_shape = input_x.get("shape") input_x_dtype = input_x.get("dtype") N, C1, H, W, C0 = input_x_shape @@ -64,7 +65,7 @@ def CusImg2Col(input_x, output, ksizes, strides, dilates, mode, kernel_name="img ((32, 8, 28, 28, 16), 'float16', (1, 1), (1, 1)), ((32, 32, 28, 28, 16), 'float16', (1, 1), (1, 1)), ((32, 16, 14, 14, 16), 'float16', (1, 1), (1, 1)), - ((32, 16, 56, 56, 16), 'float16', (1, 1), (1, 1)), ] + ((32, 16, 56, 56, 16), 'float16', (1, 1), (1, 1)),] if input_shape not in supported_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) diff 
--git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py index 0458363a6db..2d70263bc12 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py @@ -17,10 +17,9 @@ limitations under the License. matmul """ from __future__ import absolute_import - +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType import te.lang.cce import te.platform.cce_params as cce -from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType from te import tik from te import tvm from topi import generic @@ -128,7 +127,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): if n_shape % cce.BLOCK_IN != 0 and n_shape != 1: raise RuntimeError("input shape N should be 1 or multiple of %d" % cce.BLOCK_IN) - if len(shape_bias): + if len(shape_bias) != 0: if len(shape_bias) == 1: if is_gevm or is_gemv: if shape_bias[0] != m_shape * n_shape: @@ -145,16 +144,19 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): def _get_bias(shape_bias): bias_length = shape_bias[0] + shb = [] if bias_length % 16 == 0: - return shape_bias + shb = shape_bias else: bias_length = (bias_length // 16) * 16 + 16 shape_bias = [] shape_bias.append(bias_length) - return shape_bias + shb = shape_bias + return shb def _get_input_shape(shape_x): + """_get_input_shape""" dim_a = shape_x[0] dim_b = shape_x[1] res = [] @@ -173,6 +175,7 @@ def _get_input_shape(shape_x): def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + """check_supported""" shape_a = input_x1.get("shape") shape_b = input_x2.get("shape") print("shape_a: ", shape_a) @@ -183,8 +186,6 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) - if bias is not None and bool(bias): - shape_bias = bias.get("shape") try: trans_a_f = bool(1 - trans_a) if src_dtype == "float32" or src_dtype == "int32": @@ -250,7 +251,7 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F """ calculating matrix multiplication with bias, C = A*B + bias, support input data with fractal format. - + Parameters: shape_a: list or tuple Shape of the first tensor a with rank > 1 @@ -269,7 +270,7 @@ def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=F If True, the input data format of a and b must be fractal format shape_bias: list or tuple Shape of bias, only support the input data format with ND - + Returns ------- None diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py index 5cae9afda01..4a1982738d6 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_right_impl.py @@ -2,19 +2,19 @@ # -*- coding:utf-8 -*- """ copyright 2020 Huawei Technologies Co., Ltd - + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
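The `_get_bias`/`_get_input_shape` helpers in the dense-left hunk above both round a dimension up to the next multiple of 16 so it fits the 16x16 cube tiles. A one-line hedged sketch of that rule (`round_up_to_16` is a hypothetical name):

def round_up_to_16(dim):
    """Round dim up to the next multiple of 16, as _get_bias/_get_input_shape do."""
    return dim if dim % 16 == 0 else (dim // 16) * 16 + 16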
You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License == distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - + matmul """ from __future__ import absolute_import @@ -43,11 +43,12 @@ matmul_cube_dense_right_op_info = TBERegOp("CusMatMulCubeDenseRight") \ @op_info_register(matmul_cube_dense_right_op_info) def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + """CusMatMulCubeDenseRight""" shape_a_temp = (128, 63, 16, 16) shape_b_temp = (128, 128, 16, 16) shape_output = output_y.get("shape") matrix_max_shape = (1,) - support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape), ] + support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape),] shape_a_input = input_x1.get("shape") shape_b_input = input_x2.get("shape") matrix_max_input = input_x3.get("shape") @@ -62,7 +63,7 @@ def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={} tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm) input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm) - input_x3 = tik_instance.Tensor("float32", [1, ], name="matrix_max", scope=tik.scope_gm) + input_x3 = tik_instance.Tensor("float32", [1,], name="matrix_max", scope=tik.scope_gm) resMatmul = tik_instance.Tensor("float32", shape_output, name="output", scope=tik.scope_gm) with tik_instance.for_range(0, 32, block_num=32) as block_index: core_m_idx = block_index // 16 diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py index ebff84d889d..817aeb91d43 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py @@ -17,9 +17,8 @@ limitations under the License. 
matmul """ from __future__ import absolute_import - -import te.platform.cce_params as cce from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +import te.platform.cce_params as cce from te import tik from topi.cce import util @@ -141,6 +140,7 @@ src_dtype: str def _get_bias(shape_bias): + """_get_bias""" bias_length = shape_bias[0] if bias_length % 16 == 0: return shape_bias @@ -152,6 +152,7 @@ def _get_bias(shape_bias): def _get_input_shape(shape_x): + """_get_input_shape""" dim_a = shape_x[0] dim_b = shape_x[1] res = [] @@ -170,6 +171,7 @@ def _get_input_shape(shape_x): def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + """check_supported""" shape_a = input_x1.get("shape") shape_b = input_x2.get("shape") print("shape_a: ", shape_a) @@ -180,8 +182,6 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) - if bias is not None and bool(bias): - shape_bias = bias.get("shape") try: trans_a_f = bool(1 - trans_a) if src_dtype == "float32" or src_dtype == "int32": @@ -265,7 +265,7 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans If True, the input data format of a and b must be fractal format shape_bias: list or tuple Shape of bias, only support the input data format with ND - + Returns ------- None @@ -381,6 +381,7 @@ def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans def get_cus_tile_info(input_x1, input_x2, diag_size): + """get_cus_tile_info""" tile_map = { ((32, 32, 16, 16), (128, 32, 16, 16)): (8, 8, 16), ((8, 8, 16, 16), (72, 8, 16, 16)): (8, 8, 4), @@ -415,8 +416,9 @@ def get_cus_tile_info(input_x1, input_x2, diag_size): def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, res, mo_tile, ko_tile, no_tile, diag_opt=False, diag_size=128): - ko, mo, mi, ki = input_x1.shape - no, ko, ki, ni = input_x2.shape + """cus_cube_matmul_cast""" + ko, mo, _, _ = input_x1.shape + no, ko, ki, _ = input_x2.shape c0 = input_x1.shape[-1] diag_outer = diag_size // c0 maxblocknum = 32 diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py index b5f8ee9d828..e30b19ef6f9 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py @@ -47,6 +47,7 @@ cus_matmul_cube_fracz_right_mul_op_info = TBERegOp("CusMatMulCubeFraczRightMul") @op_info_register(cus_matmul_cube_fracz_right_mul_op_info) def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + """CusMatMulCubeFraczRightMul""" if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: @@ -80,7 +81,7 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y ((64, 32, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'), ((16, 64, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32')] input_shape = ( - tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype) + tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype) if 
input_shape not in Supported: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) @@ -95,15 +96,17 @@ def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, res): + """cus_cube_matmul_right_mul""" diag_size = 128 - ko, mo, mi, ki = input_x1.shape - no, ko, ki, ni = input_x2.shape + ko, mo, _, _ = input_x1.shape + no, ko, ki, _ = input_x2.shape c0 = input_x1.shape[-1] diag_outer = diag_size // c0 if [input_x1.shape[-1], input_x1.shape[-2], input_x2.shape[-1], input_x2.shape[-2]] != [c0, c0, c0, c0]: raise ValueError("shape of input_x1 or input_x2 is not supported!") def get_cus_tile_info(input_x1, input_x2, input_x3): + """get_cus_tile_info""" input_shape = (tuple(input_x1.shape), input_x1.dtype, tuple(input_x2.shape), input_x2.dtype, tuple(input_x3.shape), input_x3.dtype) tile_map = { diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py index dfa83c4fb7f..603ed287f6e 100644 --- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py +++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_impl.py @@ -18,11 +18,10 @@ limitations under the License. matmul """ from __future__ import absolute_import - -import te.lang.cce -import te.platform.cce_params as cce from impl.matmul_vector import matmul_vector_cce from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType +import te.lang.cce +import te.platform.cce_params as cce from te import tvm from topi import generic from topi.cce import util @@ -146,6 +145,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b): def _get_bias(shape_bias): + """_get_bias""" bias_length = shape_bias[0] if bias_length % 16 == 0: return shape_bias @@ -157,6 +157,7 @@ def _get_bias(shape_bias): def _get_input_shape(shape_x): + """_get_input_shape""" dim_a = shape_x[0] dim_b = shape_x[1] res = [] @@ -175,6 +176,7 @@ def _get_input_shape(shape_x): def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): + """check_supported""" shape_a = input_x1.get("shape") shape_b = input_x2.get("shape") print("shape_a: ", shape_a) @@ -185,8 +187,6 @@ def check_supported(input_x1, input_x2, bias=None, output_y={}, trans_a=False, t util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) - if bias is not None and bool(bias): - shape_bias = bias.get("shape") try: trans_a_f = bool(1 - trans_a) if src_dtype == "float32" or src_dtype == "int32": @@ -250,7 +250,7 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra """ calculating matrix multiplication with bias, C = A*B + bias, support input data with fractal format. 
-    
+
     Parameters:
     shape_a: list or tuple
         Shape of the first tensor a with rank > 1
@@ -269,7 +269,7 @@ def CusMatMulCube(input_x1, input_x2, bias=None, output_y={}, trans_a=False, tra
         If True, the input data format of a and b must be fractal format
     shape_bias: list or tuple
         Shape of bias, only support the input data format with ND
-    
+
     Returns
     -------
     None
diff --git a/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py b/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py
index f341efe4b73..141e2c1d51b 100644
--- a/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py
+++ b/mindspore/ops/_op_impl/_custom_op/transpose02314_impl.py
@@ -32,6 +32,7 @@ cus_transpose02314_op_info = TBERegOp("CusTranspose02314") \
 
 @op_info_register(cus_transpose02314_op_info)
 def CusTranspose02314(input_x, output, kernel_name="transpose021354"):
+    """CusTranspose02314"""
     input_x_shape = input_x.get("shape")
     output_shape = output.get("shape")
     perm = (0, 2, 3, 1, 4)
@@ -263,7 +264,7 @@ def CusTranspose02314(input_x, output, kernel_name="transpose021354"):
 
         with tik_instance.for_range(0, 32, block_num=32) as block_idx:
             with tik_instance.for_range(0, 6, thread_num=2) as cc1:
-                _inner_ + compute(cc1)
+                _inner_compute(cc1)
             _inner_compute(6)
     elif tuple(input_x_shape) == (32, 64, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
         def _inner_compute(split_index, block_idx):
diff --git a/mindspore/ops/operations/thor_ops.py b/mindspore/ops/operations/thor_ops.py
index e48180c3f6c..5e6ff4b9599 100644
--- a/mindspore/ops/operations/thor_ops.py
+++ b/mindspore/ops/operations/thor_ops.py
@@ -91,7 +91,7 @@ class CusFusedAbsMax1(PrimitiveWithInfer):
     def infer_shape(self, data1_shape):
         ll = []
         if len(data1_shape) == 2:
-            ll = [1, ]
+            ll = [1,]
         else:
             ll = [32, 64]
         return ll

From 3d3a745a4d43ad01d79fb871357bffacba8dcb94 Mon Sep 17 00:00:00 2001
From: z00478463
Date: Tue, 26 May 2020 11:03:41 +0800
Subject: [PATCH 7/8] for pylint 5th

---
 .../ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py | 2 +-
 .../ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py
index 817aeb91d43..9a30da37847 100644
--- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py
+++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_left_cast_impl.py
@@ -418,7 +418,7 @@ def cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b,
                          res, mo_tile, ko_tile, no_tile, diag_opt=False, diag_size=128):
     """cus_cube_matmul_cast"""
     ko, mo, _, _ = input_x1.shape
-    no, ko, ki, _ = input_x2.shape
+    no, ko, _, _ = input_x2.shape
     c0 = input_x1.shape[-1]
     diag_outer = diag_size // c0
     maxblocknum = 32
diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py
index e30b19ef6f9..79fab2c3cd2 100644
--- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py
+++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_fracz_right_mul_impl.py
@@ -99,7 +99,7 @@ def cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3,
     """cus_cube_matmul_right_mul"""
     diag_size = 128
     ko, mo, _, _ = input_x1.shape
-    no, ko, ki, _ = input_x2.shape
+    no, ko, _, _ = input_x2.shape
     c0 = input_x1.shape[-1]
     diag_outer = diag_size // c0
     if [input_x1.shape[-1], input_x1.shape[-2], input_x2.shape[-1],
             input_x2.shape[-2]] != [c0, c0, c0, c0]:

From 5c639635e5530d4c15c6b7478a932cf1d7fbffd9 Mon Sep 17 00:00:00 2001
From: z00478463
Date: Tue, 26 May 2020 11:36:51 +0800
Subject: [PATCH 8/8] for pylint nth

---
 mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py | 1 +
 mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py         | 1 +
 2 files changed, 2 insertions(+)

diff --git a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py
index 2d70263bc12..e5c380369d0 100644
--- a/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py
+++ b/mindspore/ops/_op_impl/_custom_op/matmul_cube_dense_left_impl.py
@@ -143,6 +143,7 @@ def _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b):
 
 
 def _get_bias(shape_bias):
+    """_get_bias"""
     bias_length = shape_bias[0]
     shb = []
     if bias_length % 16 == 0:
diff --git a/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py b/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py
index b001e0ba33d..0a3f41386b9 100644
--- a/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py
+++ b/mindspore/ops/_op_impl/_custom_op/matrix_combine_impl.py
@@ -32,6 +32,7 @@ cus_matrix_combine_op_info = TBERegOp("CusMatrixCombine") \
 
 @op_info_register(cus_matrix_combine_op_info)
 def CusMatrixCombine(input_x, output, kernel_name="matrix_combine"):
+    """CusMatrixCombine"""
     input_x_shape = input_x.get("shape")
     output_shape = output.get("shape")
     split_dim = 128
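Two small conventions recur across the kernels touched by this series: a 1-D bias is padded up to the 16-element cube block before tiling (`_get_bias`), and each kernel gates itself on an explicit whitelist of fractal shapes, raising `RuntimeError` for anything it was not tuned for (`check_supported`, `CusImg2Col`, `CusMatMulCubeFraczRightMul`). The sketch below shows both patterns as plain Python, runnable outside the TBE/TIK build environment; the entries in `SUPPORTED_SHAPES` are illustrative placeholders, not the shapes registered in the patches.

```python
def pad_bias_length(shape_bias):
    """Pad a 1-D bias shape up to a multiple of 16, the cube block size (mirrors _get_bias)."""
    bias_length = shape_bias[0]
    if bias_length % 16 == 0:
        return list(shape_bias)
    # Round up to the next multiple of 16, e.g. 1000 -> 1008.
    return [(bias_length // 16) * 16 + 16]


# Hypothetical whitelist; the real kernels enumerate the exact 5-D/fractal shapes they tile for.
SUPPORTED_SHAPES = {
    ((32, 128, 16, 16), "float16"),
    ((64, 32, 16, 16), "float16"),
}


def check_supported(shape, dtype):
    """Shape gate in the style of the kernels above: unsupported inputs fail fast."""
    if (tuple(shape), dtype) not in SUPPORTED_SHAPES:
        raise RuntimeError("input_shape %s is not supported" % str(shape))
    return True


if __name__ == "__main__":
    print(pad_bias_length([1000]))                        # [1008]
    print(pad_bias_length([1024]))                        # [1024]
    print(check_supported((32, 128, 16, 16), "float16"))  # True
```

Failing fast in `check_supported` keeps the hand-tuned tiling tables (such as `tile_map` in `get_cus_tile_info`) authoritative: any shape without a measured tiling is rejected at registration time rather than compiled into a slow or incorrect kernel.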