forked from mindspore-Ecosystem/mindspore
!46976 [MS][Lite]add ascend customized kernel module
Merge pull request !46976 from zhaizhiqiang/master
This commit is contained in:
commit
ef1df784ba
|
@ -71,3 +71,6 @@
|
||||||
"mindspore/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_winograd_fp32.cc" "knownConditionTrueFalse"
|
"mindspore/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_winograd_fp32.cc" "knownConditionTrueFalse"
|
||||||
"mindspore/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_winograd_fp32.cc" "shadowVariable"
|
"mindspore/mindspore/lite/src/litert/kernel/cpu/fp32/convolution_winograd_fp32.cc" "shadowVariable"
|
||||||
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_utils.cc" "knownConditionTrueFalse"
|
"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_utils.cc" "knownConditionTrueFalse"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/op_proto/add_dsl.cc" "syntaxError"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/op_proto/matmul_tik.cc" "syntaxError"
|
||||||
|
|
||||||
|
|
|
@ -86,3 +86,23 @@
|
||||||
"mindspore/mindspore/lite/src/litert/delegate/nnapi/nnapi_implementation.cc" "build/include_order"
|
"mindspore/mindspore/lite/src/litert/delegate/nnapi/nnapi_implementation.cc" "build/include_order"
|
||||||
"mindspore/mindspore/lite/src/extendrt/cxx_api/model/model_impl.cc" "whitespace/parens"
|
"mindspore/mindspore/lite/src/extendrt/cxx_api/model/model_impl.cc" "whitespace/parens"
|
||||||
"mindspore/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental/HPC-generator/gemm_mask_avx512/" "runtime/int"
|
"mindspore/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/experimental/HPC-generator/gemm_mask_avx512/" "runtime/int"
|
||||||
|
# ascend samples
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/aicpu/sample/" "build/include_subdir"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/" "build/include_subdir"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/" "build/include_subdir"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/aicpu/sample/" "runtime/references"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/" "runtime/references"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/" "runtime/references"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/aicpu/sample/" "whitespace/comments"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/" "whitespace/comments"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/" "whitespace/comments"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/aicpu/sample/" "legal/copyright"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/" "legal/copyright"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/" "legal/copyright"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/aicpu/sample/" "whitespace/ending_newline"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/" "whitespace/ending_newline"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/" "whitespace/ending_newline"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/aicpu/sample/" "build/include"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/" "build/include"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/" "build/include"
|
||||||
|
|
||||||
|
|
|
@ -186,3 +186,10 @@
|
||||||
"mindspore/mindspore/lite/python/api/tensor.py" "protected-access"
|
"mindspore/mindspore/lite/python/api/tensor.py" "protected-access"
|
||||||
"mindspore/mindspore/lite/test" "missing-docstring"
|
"mindspore/mindspore/lite/test" "missing-docstring"
|
||||||
"mindspore/mindspore/lite/test" "unused-variable"
|
"mindspore/mindspore/lite/test" "unused-variable"
|
||||||
|
# ascend samples
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/" "wrong-import-order"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/" "wrong-import-order"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/" "bad-whitespace"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/" "bad-whitespace"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_dsl/sample/" "bad-continuation"
|
||||||
|
"mindspore/mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/" "bad-continuation"
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
# Minimum CMake version needed by this installer project.
cmake_minimum_required(VERSION 3.12)
# Top-level project that packages the MindSpore Ascend customized kernels.
project(MS_ASCEND_CUSTOM_KERNEL_INSTALLER)
|
|
@ -0,0 +1,15 @@
|
||||||
|
Build the Ascend customized kernel.
|
||||||
|
For more details, please refer to https://gitee.com/ascend/samples.git.
|
||||||
|
|
||||||
|
## build
|
||||||
|
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake ../
|
||||||
|
make
|
||||||
|
|
||||||
|
## install
|
||||||
|
|
||||||
|
./ms_ascend_custom_kernel_installer.run
|
||||||
|
|
||||||
|
After installation, you can use the converter tool to convert models with the customized kernel in an Ascend development environment.
|
|
@ -0,0 +1,9 @@
|
||||||
|
[ReshapeCust]
|
||||||
|
opInfo.engine=DNN_VM_AICPU
|
||||||
|
opInfo.flagPartial=False
|
||||||
|
opInfo.computeCost=100
|
||||||
|
opInfo.flagAsync=False
|
||||||
|
opInfo.opKernelLib=CUSTAICPUKernel
|
||||||
|
opInfo.kernelSo=libcust_aicpu_kernels.so
|
||||||
|
opInfo.functionName=RunCpuKernel
|
||||||
|
opInfo.workspaceSize=1024
|
|
@ -0,0 +1,41 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
|
||||||
|
* Description: implement of sample
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "reshape_cust_kernels.h"
|
||||||
|
#include <string.h>
|
||||||
|
#include "cpu_types.h"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
const char *RESHAPE_CUST = "ReshapeCust";
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace aicpu {
|
||||||
|
// Reshape only reinterprets shape metadata, so the kernel is a raw byte
// copy from input 0 to output 0.
// Returns 0 on success, (uint32_t)-1 on any missing tensor/data or when
// the output buffer is too small to hold the input bytes.
uint32_t ReshapeCustCpuKernel::Compute(CpuKernelContext &ctx) {
  Tensor *input_tensor = ctx.Input(0);
  if (input_tensor == nullptr) {
    return static_cast<uint32_t>(-1);
  }

  Tensor *output_tensor = ctx.Output(0);
  if (output_tensor == nullptr) {
    return static_cast<uint32_t>(-1);
  }
  auto input_data = input_tensor->GetData();
  if (input_data == nullptr) {
    return static_cast<uint32_t>(-1);
  }

  auto output_data = output_tensor->GetData();
  if (output_data == nullptr) {
    return static_cast<uint32_t>(-1);
  }

  uint64_t data_size = input_tensor->GetDataSize();
  // Guard against overflowing the output buffer: a reshape must never copy
  // more bytes than the destination tensor owns.
  if (data_size > output_tensor->GetDataSize()) {
    return static_cast<uint32_t>(-1);
  }
  (void)memcpy(output_data, input_data, data_size);
  return 0;
}
|
||||||
|
|
||||||
|
REGISTER_CPU_KERNEL(RESHAPE_CUST, ReshapeCustCpuKernel);
|
||||||
|
} // namespace aicpu
|
|
@ -0,0 +1,28 @@
|
||||||
|
/* Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// NOTE: guard renamed from _AICPU_RESHAPE_CUST_KERNELS_H_ — identifiers
// beginning with an underscore followed by an uppercase letter are reserved
// to the implementation in C++.
#ifndef AICPU_RESHAPE_CUST_KERNELS_H_
#define AICPU_RESHAPE_CUST_KERNELS_H_

#include "cpu_kernel.h"

namespace aicpu {
// Custom AICPU kernel implementing Reshape as a plain data copy from
// input 0 to output 0 (shape is metadata only).
class ReshapeCustCpuKernel : public CpuKernel {
 public:
  ~ReshapeCustCpuKernel() = default;
  // Copies the input tensor's raw bytes into the output tensor.
  // Returns 0 on success, (uint32_t)-1 on failure.
  uint32_t Compute(CpuKernelContext &ctx) override;
};
}  // namespace aicpu
#endif  // AICPU_RESHAPE_CUST_KERNELS_H_
|
|
@ -0,0 +1,2 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Install the MindSpore Ascend customized kernel.
|
|
@ -0,0 +1,19 @@
|
||||||
|
# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
set(CMAKE_CXX_COMPILER g++)
set(CMAKE_C_COMPILER gcc)

# Collect every op-proto source file in this directory.
aux_source_directory(. SRCS)

if("x${SRCS}" STREQUAL "x")
    # No sources found: create a stub target so dependent targets still build.
    add_custom_target(${OP_PROTO_TARGET}
        COMMAND mkdir -p ${OP_PROTO_TARGET_OUT_DIR}
        COMMAND echo "no source to make lib${OP_PROTO_TARGET}.so")
    # CMake's return() takes no value argument; `return(0)` was incorrect.
    return()
endif()

set(LIBRARY_OUTPUT_PATH ${OP_PROTO_TARGET_OUT_DIR})

message(STATUS "OP_PROTO_TARGET=${OP_PROTO_TARGET}")
add_library(${OP_PROTO_TARGET} SHARED ${SRCS})

# Link against the GE graph library shipped with the Ascend toolkit.
target_link_libraries(${OP_PROTO_TARGET} ${ASCEND_INC}/../lib64/libgraph.so)
|
@ -0,0 +1,94 @@
|
||||||
|
/**
|
||||||
|
* Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
|
||||||
|
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the Apache License Version 2.0.You may not use this file except in compliance with the License.
|
||||||
|
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* Apache License for more details at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* @file add_dsl.cpp
|
||||||
|
*
|
||||||
|
* @brief
|
||||||
|
*
|
||||||
|
* @version 1.0
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
#include "./add_dsl.h"
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace ge {
|
||||||
|
|
||||||
|
// Infers the broadcast output shape, dtype and format for the binary Add.
// dtype/format are copied from input_name1. Returns false when the two
// input shapes cannot be broadcast against each other.
bool InferShapeAndTypeAdd(Operator &op, const string &input_name1, const string &input_name2,
                          const string &output_name) {
  // vOutputDesc.push_back(op.GetInputDesc(0));
  TensorDesc vOutputDesc = op.GetOutputDescByName(output_name.c_str());

  DataType input_dtype = op.GetInputDescByName(input_name1.c_str()).GetDataType();
  Format input_format = op.GetInputDescByName(input_name1.c_str()).GetFormat();
  // Swap so that dimsX always holds the higher-rank shape.
  ge::Shape shapeX = op.GetInputDescByName(input_name1.c_str()).GetShape();
  ge::Shape shapeY = op.GetInputDescByName(input_name2.c_str()).GetShape();
  std::vector<int64_t> dimsX = shapeX.GetDims();
  std::vector<int64_t> dimsY = shapeY.GetDims();
  if (dimsX.size() < dimsY.size()) {
    std::vector<int64_t> dimsTmp = dimsX;
    dimsX = dimsY;
    dimsY = dimsTmp;
  }

  // Left-pad the lower-rank shape with 1s so both ranks match.
  if (dimsX.size() != dimsY.size()) {
    int dec = dimsX.size() - dimsY.size();
    for (int i = 0; i < dec; i++) {
      dimsY.insert(dimsY.begin(), (int64_t)1);
    }
  }

  // Compute the broadcast output dimensions (NumPy-style rule: dims must
  // be equal, or one of them must be 1).
  std::vector<int64_t> dimVec;
  for (size_t i = 0; i < dimsX.size(); i++) {
    if ((dimsX[i] != dimsY[i]) && (dimsX[i] != 1) && (dimsY[i] != 1)) {
      return false;  // incompatible: dims differ and neither is 1
    }

    int64_t dims = dimsX[i] > dimsY[i] ? dimsX[i] : dimsY[i];
    dimVec.push_back(dims);
  }
  ge::Shape outputShape = ge::Shape(dimVec);

  vOutputDesc.SetShape(outputShape);
  vOutputDesc.SetDataType(input_dtype);
  vOutputDesc.SetFormat(input_format);
  op.UpdateOutputDesc(output_name.c_str(), vOutputDesc);

  return true;
}
|
||||||
|
|
||||||
|
//----------------Add-------------------
|
||||||
|
// Verifier: both inputs of AddDsl must share the same element dtype.
IMPLEMT_VERIFIER(AddDsl, AddVerify) {
  if (op.GetInputDescByName("x1").GetDataType() != op.GetInputDescByName("x2").GetDataType()) {
    return GRAPH_FAILED;
  }
  return GRAPH_SUCCESS;
}

// Obtains the processing function of the output tensor description.
IMPLEMT_COMMON_INFERFUNC(AddInferShape) {
  // Broadcast x1/x2 into y; false means the shapes are incompatible.
  if (InferShapeAndTypeAdd(op, "x1", "x2", "y")) {
    return GRAPH_SUCCESS;
  }
  return GRAPH_FAILED;
}

// Registered inferfunction
COMMON_INFER_FUNC_REG(AddDsl, AddInferShape);

// Registered verify function
VERIFY_FUNC_REG(AddDsl, AddVerify);
|
||||||
|
//----------------Add-------------------
|
||||||
|
} // namespace ge
|
|
@ -0,0 +1,35 @@
|
||||||
|
/**
|
||||||
|
* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
|
||||||
|
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the Apache License Version 2.0.You may not use this file except in compliance with the License.
|
||||||
|
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* Apache License for more details at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* @file add_dsl.h
|
||||||
|
*
|
||||||
|
* @brief
|
||||||
|
*
|
||||||
|
* @version 1.0
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef GE_OPS_OP_PROTO_ADDDSL_H_
#define GE_OPS_OP_PROTO_ADDDSL_H_
#include "graph/operator_reg.h"
namespace ge {
// AddDsl: element-wise broadcast addition y = x1 + x2.
// Both inputs and the output accept the same wide set of element types.
REG_OP(AddDsl)
    .INPUT(x1, TensorType({DT_FLOAT, DT_INT32, DT_INT64, DT_FLOAT16, DT_INT16, DT_INT8, DT_UINT8, DT_DOUBLE,
                           DT_COMPLEX128, DT_COMPLEX64, DT_STRING}))
    .INPUT(x2, TensorType({DT_FLOAT, DT_INT32, DT_INT64, DT_FLOAT16, DT_INT16, DT_INT8, DT_UINT8, DT_DOUBLE,
                           DT_COMPLEX128, DT_COMPLEX64, DT_STRING}))
    .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32, DT_INT64, DT_FLOAT16, DT_INT16, DT_INT8, DT_UINT8, DT_DOUBLE,
                           DT_COMPLEX128, DT_COMPLEX64, DT_STRING}))
    .OP_END_FACTORY_REG(AddDsl)
}  // namespace ge

#endif  // GE_OPS_OP_PROTO_ADDDSL_H_
|
|
@ -0,0 +1,117 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding:utf-8 -*-
|
||||||
|
"""
|
||||||
|
Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the Apache License Version 2.0.You may not use this file
|
||||||
|
except in compliance with the License.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
Apache License for more details at
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
add
|
||||||
|
"""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
|
||||||
|
import tbe.dsl as tbe
|
||||||
|
from functools import reduce
|
||||||
|
from tbe import tvm
|
||||||
|
from tbe.common.register import register_op_compute
|
||||||
|
from tbe.common.utils import para_check
|
||||||
|
from tbe.common.utils import shape_util
|
||||||
|
|
||||||
|
# General limitation of the reduce size for input shape: 2**31
|
||||||
|
SHAPE_SIZE_LIMIT = 2147483648
|
||||||
|
|
||||||
|
|
||||||
|
# pylint: disable=locally-disabled,too-many-arguments,unused-argument
|
||||||
|
@register_op_compute("Add", op_mode="dynamic", support_fusion=True)
def add_compute(input_x, input_y, output_z, kernel_name="add"):
    """
    calculating data's add, c = a + b

    Parameters
    ----------
    input_x: TVM tensor
        the placeholder of first input data
    input_y: TVM tensor
        the placeholder of second input data
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
        (unused here; kept for op-framework signature compatibility)
    kernel_name: str
        cce kernel name, default value is add

    Returns
    -------
    res : output of the data's add
    """
    shape_x = shape_util.shape_to_list(input_x.shape)
    shape_y = shape_util.shape_to_list(input_y.shape)

    # Broadcast both shapes to a common shape before the element-wise add.
    shape_x, shape_y, shape_max = shape_util.broadcast_shapes(shape_x, shape_y,
                                                              param_name_input1="input_x",
                                                              param_name_input2="input_y")
    # Reject tensors whose total element count exceeds the framework limit.
    shape_size = reduce(lambda x, y: x * y, shape_max[:])
    if shape_size > SHAPE_SIZE_LIMIT:
        raise RuntimeError("the shape is too large to calculate")

    input_x = tbe.broadcast(input_x, shape_max)
    input_y = tbe.broadcast(input_y, shape_max)
    res = tbe.vadd(input_x, input_y)

    return res
|
||||||
|
|
||||||
|
|
||||||
|
@para_check.check_op_params(para_check.REQUIRED_INPUT, para_check.REQUIRED_INPUT,
                            para_check.REQUIRED_OUTPUT, para_check.KERNEL_NAME)
def add_dsl(input_x, input_y, output_z, kernel_name="add_dsl"):
    """
    algorithm: add
    calculating data's add, c = a + b

    Parameters
    ----------
    input_x : dict
        shape and dtype of first input, only support float16, float32, int32
    input_y : dict
        shape and dtype of second input, only support float16, float32, int32
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is add

    Returns
    -------
    None
    """
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")

    # Only these element types are supported by this DSL kernel.
    check_tuple = ("float16", "float32", "int32")
    input_data_type = input_x.get("dtype").lower()
    para_check.check_dtype(input_data_type, check_tuple, param_name="input_x")

    shape_x, shape_y, shape_max = shape_util.broadcast_shapes(shape_x, shape_y,
                                                              param_name_input1="input_x",
                                                              param_name_input2="input_y")

    # Drop a shared trailing unit dimension (keeping at least one dim) so
    # the schedule does not carry a redundant axis.
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    data_x = tvm.placeholder(shape_x, name="data_1", dtype=input_data_type)
    data_y = tvm.placeholder(shape_y, name="data_2", dtype=input_data_type)

    res = add_compute(data_x, data_y, output_z, kernel_name)

    # Auto-generate the schedule and build the CCE kernel binary.
    with tvm.target.cce():
        schedule = tbe.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": (data_x, data_y, res)}
    tbe.build(schedule, config)
|
|
@ -0,0 +1,18 @@
|
||||||
|
[AddDsl]
|
||||||
|
input0.name=x1
|
||||||
|
input0.dtype=float16,float16,float16,float16,float,float,float,float,int32,int32,int32,int32
|
||||||
|
input0.shape=all
|
||||||
|
input0.paramType=required
|
||||||
|
input0.format=NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND
|
||||||
|
input1.name=x2
|
||||||
|
input1.dtype=float16,float16,float16,float16,float,float,float,float,int32,int32,int32,int32
|
||||||
|
input1.shape=all
|
||||||
|
input1.paramType=required
|
||||||
|
input1.format=NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND
|
||||||
|
output0.name=y
|
||||||
|
output0.dtype=float16,float16,float16,float16,float,float,float,float,int32,int32,int32,int32
|
||||||
|
output0.shape=all
|
||||||
|
output0.paramType=required
|
||||||
|
output0.format=NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND
|
||||||
|
opFile.value=add_dsl
|
||||||
|
opInterface.value=add_dsl
|
|
@ -0,0 +1,18 @@
|
||||||
|
[AddDsl]
|
||||||
|
input0.name=x1
|
||||||
|
input0.dtype=float16,float16,float16,float16,float,float,float,float,int32,int32,int32,int32
|
||||||
|
input0.shape=all
|
||||||
|
input0.paramType=required
|
||||||
|
input0.format=NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND
|
||||||
|
input1.name=x2
|
||||||
|
input1.dtype=float16,float16,float16,float16,float,float,float,float,int32,int32,int32,int32
|
||||||
|
input1.shape=all
|
||||||
|
input1.paramType=required
|
||||||
|
input1.format=NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND
|
||||||
|
output0.name=y
|
||||||
|
output0.dtype=float16,float16,float16,float16,float,float,float,float,int32,int32,int32,int32
|
||||||
|
output0.shape=all
|
||||||
|
output0.paramType=required
|
||||||
|
output0.format=NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND
|
||||||
|
opFile.value=add_dsl
|
||||||
|
opInterface.value=add_dsl
|
|
@ -0,0 +1,18 @@
|
||||||
|
[AddDsl]
|
||||||
|
input0.name=x1
|
||||||
|
input0.dtype=float16,float16,float16,float16,float,float,float,float,int32,int32,int32,int32
|
||||||
|
input0.shape=all
|
||||||
|
input0.paramType=required
|
||||||
|
input0.format=NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND
|
||||||
|
input1.name=x2
|
||||||
|
input1.dtype=float16,float16,float16,float16,float,float,float,float,int32,int32,int32,int32
|
||||||
|
input1.shape=all
|
||||||
|
input1.paramType=required
|
||||||
|
input1.format=NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND
|
||||||
|
output0.name=y
|
||||||
|
output0.dtype=float16,float16,float16,float16,float,float,float,float,int32,int32,int32,int32
|
||||||
|
output0.shape=all
|
||||||
|
output0.paramType=required
|
||||||
|
output0.format=NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND
|
||||||
|
opFile.value=add_dsl
|
||||||
|
opInterface.value=add_dsl
|
|
@ -0,0 +1,19 @@
|
||||||
|
# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
set(CMAKE_CXX_COMPILER g++)
set(CMAKE_C_COMPILER gcc)

# Collect every op-proto source file in this directory.
aux_source_directory(. SRCS)

if("x${SRCS}" STREQUAL "x")
    # No sources found: create a stub target so dependent targets still build.
    add_custom_target(${OP_PROTO_TARGET}
        COMMAND mkdir -p ${OP_PROTO_TARGET_OUT_DIR}
        COMMAND echo "no source to make lib${OP_PROTO_TARGET}.so")
    # CMake's return() takes no value argument; `return(0)` was incorrect.
    return()
endif()

set(LIBRARY_OUTPUT_PATH ${OP_PROTO_TARGET_OUT_DIR})

message(STATUS "OP_PROTO_TARGET=${OP_PROTO_TARGET}")
add_library(${OP_PROTO_TARGET} SHARED ${SRCS})

# Link against the GE graph library shipped with the Ascend toolkit.
target_link_libraries(${OP_PROTO_TARGET} ${ASCEND_INC}/../lib64/libgraph.so)
|
|
@ -0,0 +1,42 @@
|
||||||
|
#include "matmul_tik.h"
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace ge {
|
||||||
|
|
||||||
|
// Verifier for MatmulTik. Currently a placeholder that always succeeds.
// NOTE(review): the original body built a `support_list` of dtypes
// (fp16/fp32/int32/int8/uint8) but never checked the inputs against it;
// that dead code (flagged as unused-variable by static analysis) is removed
// here. Add real dtype validation against this set if verification is wanted.
IMPLEMT_VERIFIER(MatmulTik, MatmulTikVerify) {
  return GRAPH_SUCCESS;
}
|
||||||
|
|
||||||
|
// Obtains the processing function of the output tensor description.
|
||||||
|
// Infers MatmulTik's output description: y has shape [x1.dim0, x2.dim1]
// (plain 2-D matmul rule) and the same element dtype as x1.
IMPLEMT_COMMON_INFERFUNC(MatmulTikInferShape) {
  TensorDesc tensordesc_output = op.GetOutputDescByName("y");
  ge::TensorDesc inputTensorDescX = op.GetInputDescByName("x1");
  ge::TensorDesc inputTensorDescY = op.GetInputDescByName("x2");
  ge::Shape shapeX = inputTensorDescX.GetShape();
  ge::Shape shapeY = inputTensorDescY.GetShape();
  DataType dtype = inputTensorDescX.GetDataType();
  // Output shape is [M, N]: rows of x1, columns of x2.
  std::vector<int64_t> dimVector;
  dimVector.push_back(shapeX.GetDim(0));
  dimVector.push_back(shapeY.GetDim(1));
  ge::Shape outputShape(dimVector);
  tensordesc_output.SetShape(outputShape);
  tensordesc_output.SetDataType(dtype);
  (void)op.UpdateOutputDesc("y", tensordesc_output);
  return GRAPH_SUCCESS;
}
|
||||||
|
|
||||||
|
// Registered inferfunction
|
||||||
|
COMMON_INFER_FUNC_REG(MatmulTik, MatmulTikInferShape);
|
||||||
|
|
||||||
|
// Registered verify function
|
||||||
|
VERIFY_FUNC_REG(MatmulTik, MatmulTikVerify);
|
||||||
|
} // namespace ge
|
|
@ -0,0 +1,14 @@
|
||||||
|
#ifndef GE_OP_MATMULTIK_H
#define GE_OP_MATMULTIK_H

#include "graph/operator_reg.h"

namespace ge {
// MatmulTik: 2-D matrix multiplication y = x1 * x2 implemented with TIK.
REG_OP(MatmulTik)
    .INPUT(x1, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
    .INPUT(x2, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
    .OP_END_FACTORY_REG(MatmulTik)
}  // namespace ge

#endif  // GE_OP_MATMULTIK_H
|
210
mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/tbe/impl/matmul_tik.py
Executable file
210
mindspore/lite/tools/kernel_builder/ascend/tbe_tik/sample/tbe/impl/matmul_tik.py
Executable file
|
@ -0,0 +1,210 @@
|
||||||
|
"""
|
||||||
|
Copyright 2020 Huawei Technologies Co., Ltd. All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
|
||||||
|
matmul_tik
|
||||||
|
"""
|
||||||
|
|
||||||
|
from tbe import tik
|
||||||
|
from tbe.common.platform import get_soc_spec
|
||||||
|
|
||||||
|
# Size in bytes of each supported element dtype, keyed by dtype name.
# Grouped by element width for easy scanning.
DTYPE_SIZE = {
    'bool': 1, 'uint8': 1, 'int8': 1,
    'uint16': 2, 'int16': 2, 'float16': 2,
    'int24': 3,
    'uint32': 4, 'int32': 4, 'float32': 4,
    'int48': 6,
    'int64': 8, 'uint64': 8, 'float64': 8,
}
|
||||||
|
|
||||||
|
|
||||||
|
def MK_TO_K1MK0(tik_instance, mk_input_tensor, k1mk0_tensor, dtype, k1, m, k0):
    """data move mk to k1mk0"""
    # Staging buffer in UB sized for the full (k1, m, k0) fractal layout.
    src_ub = tik_instance.Tensor(dtype, (k1, m, k0), name='src_ub', scope=tik.scope_ubuf)

    # data_move(m, k) ---> (k1, m, k0)
    # For slice i, gather m bursts of k0 elements each; the source stride
    # skips the remaining (k1 - 1) k0-wide slices of every row.
    # Burst/stride units are 32-byte blocks, hence the // 32.
    with tik_instance.for_range(0, k1) as i:
        tik_instance.data_move(src_ub[i * m * k0:], mk_input_tensor[i * k0:], 0, m, k0 * DTYPE_SIZE[dtype] // 32,
                               (k1 - 1) * k0 * DTYPE_SIZE[dtype] // 32, 0)

    # One contiguous burst writes the re-laid-out data back out.
    tik_instance.data_move(k1mk0_tensor, src_ub, 0, 1, k1 * m * k0 * DTYPE_SIZE[dtype] // 32, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def KN_TO_K1NK0(tik_instance, kn_input_tensor, k1nk0_tensor, dtype, k1, n, k0):
    """data move kn to k1nk0"""

    # For each k0-row band of the (k, n) input: load it to UB, transpose it
    # with vec_trans_scatter into (n, k0) layout, and store it back.
    with tik_instance.for_range(0, k1) as index:
        k1nk0_ub = tik_instance.Tensor(dtype, (n, k0), tik.scope_ubuf, "k1nk0_ub")
        src_ub = tik_instance.Tensor(dtype, (k0, n), tik.scope_ubuf, "src_ub")
        burst_len = k0 * n * DTYPE_SIZE[dtype] // 32  # 32-byte block units
        tik_instance.data_move(src_ub, kn_input_tensor[index * k0 * n], 0, 1, burst_len, 0, 0)
        # vec_trans_scatter operates on 16x16 element tiles; the literal 16s
        # here assume k0 == 16 — NOTE(review): confirm for int8 paths (k0=32).
        dst_list = [k1nk0_ub[16 * i] for i in range(16)]
        src_list = [src_ub[n * i] for i in range(16)]
        rep_times = n // k0
        dst_rep_stride = k0
        src_rep_stride = 1
        tik_instance.vec_trans_scatter(False, False, dst_list, src_list, rep_times, dst_rep_stride, src_rep_stride)
        tik_instance.data_move(k1nk0_tensor[index * k0 * n], k1nk0_ub, 0, 1, burst_len, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def N1MN0_TO_MN(tik_instance, mn_output_tensor, n1mn0_tensor, dtype, n1, m, n0):
    """data move mn to n1mn0"""
    # Staging buffer in UB sized for the flattened (m, n) result.
    src_ub = tik_instance.Tensor(dtype, (m, n1 * n0), name='src_ub', scope=tik.scope_ubuf)

    # data_move(n1, m, n0) ---> (m, n)
    # For slice i, scatter m bursts of n0 elements into column offset i*n0;
    # the destination stride skips the other (n1 - 1) n0-wide slices per row.
    with tik_instance.for_range(0, n1) as i:
        tik_instance.data_move(src_ub[i * n0:], n1mn0_tensor[i * m * n0:], 0, m,
                               n0 * DTYPE_SIZE[dtype] // 32, 0, (n1 - 1) * n0 * DTYPE_SIZE[dtype] // 32)

    # One contiguous burst writes the row-major result to the output tensor.
    tik_instance.data_move(mn_output_tensor, src_ub, 0, 1, m * n1 * n0 * DTYPE_SIZE[dtype] // 32, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def matmul_tik_compute(params, kernel_name):
    """
    matmul tik compute

    Builds a tiled (M, K) x (K, N) matmul kernel: inputs are re-laid-out to
    fractal K1MK0 / K1NK0 formats in workspace, multiplied tile-by-tile in
    L0C, then the N1MN0 result is converted back to row-major (M, N).

    @param params: matmul data (dict or object with M/K/N, data_type,
                   *_tiling_size, *_cycle_times, *_thread_num fields)
    @param kernel_name: kernel name
    @return: tik instance
    """
    tik_instance = tik.Tik()
    if not isinstance(params, dict):
        params = params.__dict__  # accept an attribute-style params object
    m_size, k_size, n_size = params['M'], params['K'], params['N']
    data_type = params["data_type"]
    m_tiling_size = int(params["m_tiling_size"])
    n_tiling_size = int(params["n_tiling_size"])
    k_tiling_size = int(params['k_tiling_size'])

    m_cycle_times = params["m_cycle_times"]
    n_cycle_times = params["n_cycle_times"]
    k_cycle_times = params["k_cycle_times"]

    # Determine the output type: fp16 inputs accumulate to fp32 except on
    # SoCs that only support fp16 accumulation; integer inputs go to int32.
    # K0 is the fractal inner-K width (16 for fp16, 32 for int8).
    if data_type == "float16":
        if get_soc_spec("SOC_VERSION") in ["SD3403", "OPTG", "Hi3796CV300CS", "TsnsC"]:
            C_loc_out_type = "float16"
        else:
            C_loc_out_type = "float32"
        K0 = 16
    else:
        C_loc_out_type = "int32"
        K0 = 32
    block_size = 16

    n_thread_num = params['n_thread_num']
    m_thread_num = params['m_thread_num']
    k_thread_num = params['k_thread_num']

    # Global-memory inputs in plain row-major (M, K) and (K, N) layouts.
    mk_gm_input = tik_instance.Tensor(data_type, (m_size, k_size), name="mk_input_gm", scope=tik.scope_gm)
    kn_gm_input = tik_instance.Tensor(data_type, (k_size, n_size), name="kn_input_gm", scope=tik.scope_gm)

    # Workspace tensors holding the fractal re-layouts of A and B.
    k1mk0_workspace = tik_instance.Tensor(data_type, (k_size // K0, m_size, K0), name="k1mk0_workspace",
                                          scope=tik.scope_gm, is_workspace=True)

    k1nk0_workspace = tik_instance.Tensor(data_type, (k_size // K0, n_size, K0), name="k1nk0_workspace",
                                          scope=tik.scope_gm, is_workspace=True)

    mn_gm_output = tik_instance.Tensor(C_loc_out_type, (m_size, n_size), tik.scope_gm, name="mn_output_gm")
    nmk0_workspace = tik_instance.Tensor(C_loc_out_type, (n_size // block_size, m_size, block_size),
                                         name="nmk0_workspace", scope=tik.scope_gm, is_workspace=True)

    # Convert both inputs to the fractal layouts the matmul unit expects.
    MK_TO_K1MK0(tik_instance, mk_gm_input, k1mk0_workspace, data_type, k_size // K0, m_size, K0)
    KN_TO_K1NK0(tik_instance, kn_gm_input, k1nk0_workspace, data_type, k_size // K0, n_size, K0)

    # Tiling is realized through the for_range() loop.
    # Outer split: 2 cores, each handling half of the N cycles.
    with tik_instance.for_range(0, 2, block_num = 1) as core_id:
        with tik_instance.for_range(0, n_cycle_times // 2, thread_num=n_thread_num) as n_idx:
            with tik_instance.for_range(0, m_cycle_times, thread_num=m_thread_num) as m_idx:
                # L0C accumulator for one (m_tile, n_tile) output block.
                dst_l0c = tik_instance.Tensor(C_loc_out_type, [n_tiling_size // 16, m_tiling_size, 16], name='dst_l0c',
                                              scope=tik.scope_cbuf_out)
                with tik_instance.for_range(0, k_cycle_times,
                                            thread_num=k_thread_num) as k_idx:
                    # Calculation result data transfer.
                    inputa_l1 = tik_instance.Tensor(params['data_type'], [k_tiling_size // K0, m_tiling_size, K0],
                                                    name="A_tiling_l1", scope=tik.scope_cbuf)
                    tik_instance.data_move(inputa_l1,
                                           k1mk0_workspace[k_idx * k_tiling_size // K0, m_idx * m_tiling_size, :],
                                           0, k_tiling_size // K0, m_tiling_size, m_size - m_tiling_size, 0)
                    inputb_l1 = tik_instance.Tensor(params["data_type"], [k_tiling_size // K0, n_tiling_size, K0],
                                                    name="B_tiling_l1", scope=tik.scope_cbuf)
                    # data_move strides are limited to 65535 blocks; fall back
                    # to a per-K0-slice copy when the stride would overflow.
                    if n_size - n_tiling_size > 65535:
                        with tik_instance.for_range(0, k_tiling_size // K0) \
                                as dma_k_idx:
                            tik_instance.data_move(inputb_l1[dma_k_idx, :, :],
                                                   k1nk0_workspace[k_idx * k_tiling_size // K0 + dma_k_idx,
                                                   (core_id * n_cycle_times // 2 + n_idx) * n_tiling_size, :],
                                                   0, 1, n_tiling_size, 0, 0)
                    else:
                        tik_instance.data_move(inputb_l1, k1nk0_workspace[k_idx * k_tiling_size // K0,
                                               (core_id * n_cycle_times // 2 + n_idx) * n_tiling_size, :],
                                               0, k_tiling_size // K0, n_tiling_size, n_size - n_tiling_size, 0)
                    # Call matmul API to matrix multiplication calculation.
                    # First K tile initializes L0C; later tiles accumulate.
                    with tik_instance.if_scope(k_idx == 0):
                        tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1, m_tiling_size, k_tiling_size, n_tiling_size,
                                            init_l1out=True)
                    with tik_instance.else_scope():
                        tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1, m_tiling_size, k_tiling_size, n_tiling_size,
                                            init_l1out=False)
                # Drain the finished L0C tile into the N1MN0 workspace.
                tik_instance.fixpipe(nmk0_workspace[n_tiling_size // 16 * (core_id * n_cycle_times // 2 + n_idx),
                                     m_idx * m_tiling_size, :], dst_l0c, n_tiling_size // 16, m_tiling_size * 16 *
                                     DTYPE_SIZE[C_loc_out_type]//32,
                                     (m_size - m_tiling_size) * 16 * DTYPE_SIZE[C_loc_out_type] // 32, 0)

    # Convert the fractal result back to a row-major (M, N) output.
    N1MN0_TO_MN(tik_instance, mn_gm_output, nmk0_workspace, C_loc_out_type, n_size // K0, m_size, K0)

    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[mk_gm_input, kn_gm_input], outputs=[mn_gm_output])
    return tik_instance
|
||||||
|
|
||||||
|
|
||||||
|
def matmul_tik(input_x1, input_x2, output_y=None, kernel_name="simple_matmul"):
    """
    matmul_tik main func: entry point for the TIK matmul custom op.

    Parameters
    ----------
    input_x1: dict describing input tensor 1; must provide "ori_shape"
        (M x K) and "dtype"
    input_x2: dict describing input tensor 2; must provide "ori_shape"
        (K x N)
    output_y: dict describing the output tensor; forwarded unchanged to
        the compute step via params["output_y"]
    kernel_name: name of the generated CCE kernel

    Returns
    -------
    the tik_instance built by matmul_tik_compute
    """
    shape_a = input_x1.get("ori_shape")
    shape_b = input_x2.get("ori_shape")
    m = shape_a[0]
    k = shape_a[1]
    n = shape_b[1]
    data_type = input_x1.get("dtype").lower()
    # Fixed tiling configuration consumed by matmul_tik_compute.
    # NOTE(review): tiling sizes are hard-coded for this sample kernel;
    # they presumably assume shapes divisible by the tile sizes — confirm
    # against matmul_tik_compute before generalizing.
    params = {
        'M': m,
        'K': k,
        'N': n,
        'data_type': data_type,
        'm_tiling_size': 16,
        'm_cycle_times': 1,
        'm_thread_num': 1,
        'n_tiling_size': 64,
        'n_cycle_times': 16,
        'n_thread_num': 1,
        'k_tiling_size': 32,
        'k_cycle_times': 2,
        'k_thread_num': 2,
        'output_y': output_y
    }
    return matmul_tik_compute(params, kernel_name)
|
|
@ -0,0 +1,20 @@
|
||||||
|
[MatmulTik]
|
||||||
|
input0.name=x1
|
||||||
|
input0.dtype=int8,uint8,float16
|
||||||
|
input0.shape=all
|
||||||
|
input0.needCompile=false
|
||||||
|
input0.paramType=required
|
||||||
|
input0.format=ND,ND,ND
|
||||||
|
input1.name=x2
|
||||||
|
input1.dtype=int8,int8,float16
|
||||||
|
input1.shape=all
|
||||||
|
input1.needCompile=false
|
||||||
|
input1.paramType=required
|
||||||
|
input1.format=ND,ND,ND
|
||||||
|
output0.name=y
|
||||||
|
output0.dtype=int32,int32,float
|
||||||
|
output0.shape=all
|
||||||
|
output0.paramType=required
|
||||||
|
output0.format=ND,ND,ND
|
||||||
|
opFile.value=matmul_tik
|
||||||
|
opInterface.value=matmul_tik
|
|
@ -0,0 +1,20 @@
|
||||||
|
[MatmulTik]
|
||||||
|
input0.name=x1
|
||||||
|
input0.dtype=int8,uint8,float16
|
||||||
|
input0.shape=all
|
||||||
|
input0.needCompile=false
|
||||||
|
input0.paramType=required
|
||||||
|
input0.format=ND,ND,ND
|
||||||
|
input1.name=x2
|
||||||
|
input1.dtype=int8,int8,float16
|
||||||
|
input1.shape=all
|
||||||
|
input1.needCompile=false
|
||||||
|
input1.paramType=required
|
||||||
|
input1.format=ND,ND,ND
|
||||||
|
output0.name=y
|
||||||
|
output0.dtype=int32,int32,float
|
||||||
|
output0.shape=all
|
||||||
|
output0.paramType=required
|
||||||
|
output0.format=ND,ND,ND
|
||||||
|
opFile.value=matmul_tik
|
||||||
|
opInterface.value=matmul_tik
|
|
@ -0,0 +1,20 @@
|
||||||
|
[MatmulTik]
|
||||||
|
input0.name=x1
|
||||||
|
input0.dtype=int8,uint8,float16
|
||||||
|
input0.shape=all
|
||||||
|
input0.needCompile=false
|
||||||
|
input0.paramType=required
|
||||||
|
input0.format=ND,ND,ND
|
||||||
|
input1.name=x2
|
||||||
|
input1.dtype=int8,int8,float16
|
||||||
|
input1.shape=all
|
||||||
|
input1.needCompile=false
|
||||||
|
input1.paramType=required
|
||||||
|
input1.format=ND,ND,ND
|
||||||
|
output0.name=y
|
||||||
|
output0.dtype=int32,int32,float
|
||||||
|
output0.shape=all
|
||||||
|
output0.paramType=required
|
||||||
|
output0.format=ND,ND,ND
|
||||||
|
opFile.value=matmul_tik
|
||||||
|
opInterface.value=matmul_tik
|
Loading…
Reference in New Issue