!26853 Mindspore support aicpu ops compile.

Merge pull request !26853 from linqingke/aicpu
2021-12-03 01:21:01 +00:00 · 2021-12-03 01:21:01 +00:00 · 973b8ffec9
parent 0c716e476c fa0e9ae42e
commit 973b8ffec9
22 changed files with 2379 additions and 0 deletions
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@ -13,6 +13,11 @@ set(FBS_FILES
        )
 ms_build_flatbuffers(FBS_FILES ${CMAKE_CURRENT_SOURCE_DIR}../../schema generated_fbs_files ${SERVER_FLATBUFFER_OUTPUT})

+if(ENABLE_D)
+    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/backend/kernel_compiler/aicpu/aicpu_ops)
+    add_subdirectory(backend/kernel_compiler/aicpu/aicpu_ops)
+endif()
+
 if(ENABLE_CPU)
    if(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "aarch64")
        set(PLATFORM_ARM64 "on")
--- a/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
+++ b/mindspore/ccsrc/backend/kernel_compiler/CMakeLists.txt
@ -18,6 +18,10 @@ if(ENABLE_D)
        "rts/*.cc"
        "hccl/*.cc"
    )
+    file(GLOB_RECURSE AICPU_OPS_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+        "aicpu/aicpu_ops/*.cc"
+    )
+    list(REMOVE_ITEM D_SRC_LIST ${AICPU_OPS_SRC})
    add_compile_definitions(ENABLE_D)
 endif()

--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/CMakeLists.txt
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/CMakeLists.txt
@ -0,0 +1,64 @@
+set(NORMAL_CMAKE_C_COMPILER ${CMAKE_C_COMPILER})
+set(NORMAL_CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER})
+if(DEFINED ENV{ASCEND_CUSTOM_PATH})
+    set(TOOLCHAIN_PATH $ENV{ASCEND_CUSTOM_PATH}/toolkit/toolchain)
+else()
+    set(TOOLCHAIN_PATH /usr/local/Ascend/toolkit/toolchain)
+endif()
+set(CMAKE_C_COMPILER ${TOOLCHAIN_PATH}/hcc/bin/aarch64-target-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PATH}/hcc/bin/aarch64-target-linux-gnu-g++)
+
+if(EXISTS ${CMAKE_C_COMPILER} AND EXISTS ${CMAKE_CXX_COMPILER})
+    set(AICPU_PROTO_SRC
+        ${CMAKE_CURRENT_SOURCE_DIR}/aicpu_op_proto/aicpu_tensor.proto
+    )
+
+    ms_protobuf_generate(PROTO_SRCS PROTO_HDRS ${AICPU_PROTO_SRC})
+
+    set(AICPU_SRC
+        ${PROTO_SRCS}
+        ${CMAKE_CURRENT_SOURCE_DIR}/common/kernel_base.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/common/kernel_log.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/aicpu_sharder/aicpu_async_event.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/aicpu_sharder/aicpu_context.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/aicpu_sharder/aicpu_pulse.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/aicpu_sharder/aicpu_sharder.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/random_choice_with_mask_kernels.cc
+    )
+
+    add_library(aicpu_kernels SHARED
+        ${AICPU_SRC}
+    )
+
+    target_compile_options(aicpu_kernels PRIVATE
+        -march=armv8-a
+        -O2
+        -fvisibility-inlines-hidden
+        -fvisibility=hidden
+        -fno-strict-aliasing
+        -fno-common
+    )
+
+    target_link_libraries(aicpu_kernels PRIVATE
+        -ldl
+        -shared
+        PUBLIC
+        ${SECUREC_LIBRARY}
+        -Wl,--whole-archive
+        -Wl,--no-whole-archive
+        -Wl,-Bsymbolic
+        -rdynamic
+        mindspore::protobuf
+        -pthread
+    )
+
+    set(INSTALL_LIBRARY_DIR lib)
+    install(TARGETS aicpu_kernels OPTIONAL
+        EXPORT aicpu_kernels-targets
+        LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR}
+    )
+
+endif()
+
+set(CMAKE_C_COMPILER ${NORMAL_CMAKE_C_COMPILER})
+set(CMAKE_CXX_COMPILER ${NORMAL_CMAKE_CXX_COMPILER})
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_op_proto/aicpu_tensor.proto
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_op_proto/aicpu_tensor.proto
@ -0,0 +1,118 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+syntax = "proto3";
+package aicpuops;
+
+message AttrValue {
+
+  message ArrayValue {
+    repeated bytes s = 2;                         //"array(string)"
+    repeated int64 i = 3 [ packed = true ];       //"array(int)"
+    repeated float f = 4 [ packed = true ];       //"array(float)"
+    repeated bool b = 5 [ packed = true ];        //"array(bool)"
+    repeated int32 type = 6 [ packed = true ];    //"array(type)"
+    repeated TensorShape shape = 7;               //"array(shape)"
+    repeated Tensor tensor = 8;                   //"array(tensor)"
+  }
+
+  oneof value {
+    ArrayValue array = 1;
+    bytes s = 2;           //"string"
+    int64 i = 3;           //"int"
+    float f = 4;           //"float"
+    bool b = 5;            //"bool"
+    int32 type = 6;        //"type"
+    TensorShape shape = 7; //"shape"
+    Tensor tensor = 8;     //"tensor"
+  }
+}
+
+message DynamicIdx {
+  int32 idx = 1;
+  int32 num = 2;
+}
+
+message NodeDef {
+  string op = 2;
+  map<string, AttrValue> attrs = 3;
+  repeated Tensor inputs = 4;
+  repeated Tensor outputs = 5;
+  map<string, DynamicIdx> dym_inputs = 6;
+  map<string, DynamicIdx> dym_outputs = 7;
+}
+
+message TensorShape {
+  // One dimension of the tensor.
+  message Dim {
+    // size must >=0
+    int64 size = 1;
+  };
+
+  // group dim info
+  repeated Dim dim = 2;
+
+  // If true, the number of dimensions in the shape is unknown.
+  // If true, "dim.size()" must be 0.
+  bool unknown_rank = 3;
+
+  // data format "NHWC" "NCHW" "NC1HWC0" OR "NONE"
+  int32 data_format = 4;
+};
+
+message Tensor {
+
+  // tensor shape info
+  TensorShape tensor_shape = 1;
+
+  // tensor content data type
+  int32 tensor_type = 2;
+
+  // tensor memory device
+  // data located memory device , "DDR" "HBM" OR "NONE"
+  string mem_device = 3;
+  string name = 4;
+  uint64 data_ptr = 5;
+  uint64 data_size = 6;
+}
+
+enum DataType {
+  MS_FLOAT32 = 0;
+  MS_FLOAT16 = 1;
+  MS_INT8 = 2;
+  MS_INT32 = 3;
+  MS_UINT8 = 4;
+  MS_INT16 = 6;
+  MS_UINT16 = 7;
+  MS_UINT32 = 8;
+  MS_INT64 = 9;
+  MS_UINT64 = 10;
+  MS_FLOAT64 = 11;
+  MS_BOOL = 12;
+  MS_STRING = 13;
+  MS_DUAL_SUB_INT8 = 14;
+  MS_DUAL_SUB_UINT8 = 15;
+  MS_COMPLEX64 = 16;
+  MS_COMPLEX128 = 17;
+  MS_QINT8 = 18;
+  MS_QINT16 = 19;
+  MS_QINT32 = 20;
+  MS_QUINT8 = 21;
+  MS_QUINT16 = 22;
+  MS_RESOURCE = 23;
+  MS_STRING_REF = 24;
+  MS_DUAL = 25;
+  MS_UNKNOWN = 26;
+}
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_async_event.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_async_event.cc
@ -0,0 +1,137 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "aicpu_sharder/aicpu_async_event.h"
+#include <string>
+#include "common/kernel_log.h"
+#include "aicpu_sharder/aicpu_context.h"
+
+namespace aicpu {
+AsyncEventManager &AsyncEventManager::GetInstance() {
+  static AsyncEventManager async_event_manager;
+  return async_event_manager;
+}
+
+void AsyncEventManager::Register(const NotifyFunc &notify) { notify_func_ = notify; }
+
+void AsyncEventManager::NotifyWait(void *notify_param, const uint32_t param_len) {
+  if (notify_func_ != nullptr) {
+    notify_func_(notify_param, param_len);
+  }
+}
+
+bool AsyncEventManager::GenTaskInfoFromCtx(AsyncTaskInfo *task_info) {
+  if (task_info == nullptr) {
+    AICPU_LOGE("AsyncEventManager GenTaskInfoFromCtx failed, task_info is nullptr.");
+    return false;
+  }
+  (void)aicpu::GetTaskAndStreamId(&task_info->task_id, &task_info->stream_id);
+  std::string wait_id_value;
+  std::string ker_wait_id(aicpu::kContextKeyWaitId);
+  auto status = aicpu::GetThreadLocalCtx(ker_wait_id, &wait_id_value);
+  if (status != aicpu::AICPU_ERROR_NONE) {
+    AICPU_LOGE("GetThreadLocalCtx failed, ret=%d, key=%s.", status, ker_wait_id.c_str());
+    return false;
+  }
+  task_info->wait_id = atoi(wait_id_value.c_str());
+  std::string wait_type_value;
+  std::string key_wait_type(aicpu::kContextKeyWaitType);
+  status = aicpu::GetThreadLocalCtx(key_wait_type, &wait_type_value);
+  if (status != aicpu::AICPU_ERROR_NONE) {
+    AICPU_LOGE("GetThreadLocalCtx failed, ret=%d, key=%s.", status, key_wait_type.c_str());
+    return false;
+  }
+  task_info->wait_type = atoi(wait_type_value.c_str());
+  std::string start_tick_value;
+  std::string key_start_tick(aicpu::kContextKeyStartTick);
+  status = aicpu::GetThreadLocalCtx(key_start_tick, &start_tick_value);
+  if (status != aicpu::AICPU_ERROR_NONE) {
+    AICPU_LOGE("GetThreadLocalCtx failed, ret=%d, key=%s.", status, key_start_tick.c_str());
+    return false;
+  }
+  task_info->start_tick = atol(start_tick_value.c_str());
+  status = aicpu::GetOpname(aicpu::GetAicpuThreadIndex(), &task_info->op_name);
+  if (status != aicpu::AICPU_ERROR_NONE) {
+    AICPU_LOGE("GetOpname failed, ret=%d.", status);
+    return false;
+  }
+  return true;
+}
+
+bool AsyncEventManager::RegEventCb(const uint32_t event_id, const uint32_t sub_event_id,
+                                   const EventProcessCallBack &cb) {
+  if (cb == nullptr) {
+    AICPU_LOGE("AsyncEventManager RegEventCb failed, cb is nullptr.");
+    return false;
+  }
+  AsyncTaskInfo task_info;
+  task_info.task_cb = cb;
+  if (!GenTaskInfoFromCtx(&task_info)) {
+    AICPU_LOGE("AsyncEventManager GenTaskInfoFromCtx failed.");
+    return false;
+  }
+  AsyncEventInfo info;
+  info.event_id = event_id;
+  info.sub_event_id = sub_event_id;
+  {
+    std::unique_lock<std::mutex> lk(map_mutex_);
+    auto iter = asyncTaskMap_.find(info);
+    if (iter != asyncTaskMap_.end()) {
+      AICPU_LOGE("AsyncEventManager RegEventCb failed.");
+      return false;
+    }
+    asyncTaskMap_[info] = task_info;
+  }
+
+  AICPU_LOGI(
+    "AsyncEventManager RegEventCb success, event_id[%u], subeventId[%u], taskId[%lu],"
+    " streamId[%u], waitType[%u], waitId[%u], opName[%s], startTick[%lu].",
+    event_id, sub_event_id, task_info.task_id, task_info.stream_id, task_info.wait_type, task_info.wait_id,
+    task_info.op_name.c_str(), task_info.start_tick);
+  return true;
+}
+
+void AsyncEventManager::ProcessEvent(const uint32_t event_id, const uint32_t sub_event_id, void *param) {
+  AICPU_LOGI("AsyncEventManager proc event_id = %d, sub_event_id = %d", event_id, sub_event_id);
+  AsyncEventInfo info;
+  info.event_id = event_id;
+  info.sub_event_id = sub_event_id;
+  EventProcessCallBack taskCb = nullptr;
+  {
+    std::unique_lock<std::mutex> lk(map_mutex_);
+    auto iter = asyncTaskMap_.find(info);
+    if (iter == asyncTaskMap_.end()) {
+      AICPU_LOGW("AsyncEventManager no async task to deal with.");
+      return;
+    }
+    taskCb = iter->second.task_cb;
+    asyncTaskMap_.erase(iter);
+  }
+  if (taskCb != nullptr) {
+    taskCb(param);
+  }
+  AICPU_LOGI("AsyncEventManager proc end!");
+  return;
+}
+}  // namespace aicpu
+
+void AicpuNotifyWait(void *notify_param, const uint32_t param_len) {
+  aicpu::AsyncEventManager::GetInstance().NotifyWait(notify_param, param_len);
+  return;
+}
+
+bool AicpuRegEventCb(const uint32_t event_id, const uint32_t sub_event_id, const aicpu::EventProcessCallBack &cb) {
+  return aicpu::AsyncEventManager::GetInstance().RegEventCb(event_id, sub_event_id, cb);
+}
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_async_event.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_async_event.h
@ -0,0 +1,144 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef AICPU_OPS_AICPU_ASYNC_EVENT_H_
+#define AICPU_OPS_AICPU_ASYNC_EVENT_H_
+
+#include <functional>
+#include <vector>
+#include <string>
+#include <map>
+#include <mutex>
+#include "aicpu_sharder/aicpu_context.h"
+
+namespace aicpu {
+using NotifyFunc = std::function<void(void *param, const uint32_t param_len)>;
+using EventProcessCallBack = std::function<void(void *param)>;
+
+struct AsyncEventInfo {
+  uint32_t event_id;
+  uint32_t sub_event_id;
+
+  bool operator==(const AsyncEventInfo &info) {
+    return (event_id == info.event_id) && (sub_event_id == info.sub_event_id);
+  }
+};
+
+inline bool operator<(const AsyncEventInfo &info1, const AsyncEventInfo &info2) {
+  return (info1.event_id < info2.event_id) ||
+         ((info1.event_id == info2.event_id) && (info1.sub_event_id < info2.sub_event_id));
+}
+
+struct AsyncTaskInfo {
+  uint64_t start_tick;
+  std::string op_name;
+  uint8_t wait_type;
+  uint32_t wait_id;
+  uint64_t task_id;
+  uint32_t stream_id;
+  EventProcessCallBack task_cb;
+};
+
+struct AsyncNotifyInfo {
+  uint8_t wait_type;
+  uint32_t wait_id;
+  uint64_t task_id;
+  uint32_t stream_id;
+  uint32_t ret_code;
+  aicpu::aicpuContext_t ctx;
+};
+
+class AsyncEventManager {
+ public:
+  /**
+   * Get the unique object of this class
+   */
+  static AsyncEventManager &GetInstance();
+
+  /**
+   * Register notify callback function
+   * @param notify wait notify callback function
+   */
+  void Register(const NotifyFunc &notify);
+
+  /**
+   * Notify wait task
+   * @param notify_param notify param info
+   * @param param_len notify_param len
+   */
+  void NotifyWait(void *notify_param, const uint32_t param_len);
+
+  /**
+   * Register Event callback function, async op call
+   * @param eventID EventId
+   * @param sub_event_id queue id
+   * @param cb Event callback function
+   * @return whether register success
+   */
+  bool RegEventCb(const uint32_t event_id, const uint32_t sub_event_id, const EventProcessCallBack &cb);
+
+  /**
+   * Process event
+   * @param event_id EventId
+   * @param sub_event_id queue id
+   * @param param event param
+   */
+  void ProcessEvent(const uint32_t event_id, const uint32_t sub_event_id, void *param = nullptr);
+
+ private:
+  AsyncEventManager() : notify_func_(nullptr) {}
+  ~AsyncEventManager() = default;
+
+  AsyncEventManager(const AsyncEventManager &) = delete;
+  AsyncEventManager &operator=(const AsyncEventManager &) = delete;
+  AsyncEventManager(AsyncEventManager &&) = delete;
+  AsyncEventManager &operator=(AsyncEventManager &&) = delete;
+
+  // generate task info from ctx
+  bool GenTaskInfoFromCtx(AsyncTaskInfo *task_info);
+
+  // wait notify function
+  NotifyFunc notify_func_;
+
+  std::mutex map_mutex_;
+
+  std::map<AsyncEventInfo, AsyncTaskInfo> asyncTaskMap_;
+};
+}  // namespace aicpu
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * Notify wait task
+ * @param notify_param notify info
+ * @param param_len
+ */
+__attribute__((weak)) void AicpuNotifyWait(void *notify_param, const uint32_t param_len);
+
+/**
+ * Register Event callback function, async op call
+ * @param info Registered event information
+ * @param cb Event callback function
+ * @return whether register success
+ */
+__attribute__((weak)) bool AicpuRegEventCb(const uint32_t event_id, const uint32_t sub_event_id,
+                                           const aicpu::EventProcessCallBack &cb);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // AICPU_OPS_AICPU_ASYNC_EVENT_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_context.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_context.cc
@ -0,0 +1,308 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "aicpu_sharder/aicpu_context.h"
+
+#include <map>
+#include <vector>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <utility>
+
+#include "common/kernel_log.h"
+
+namespace {
+// current thread context
+aicpu::aicpuContext_t g_cur_ctx;
+// task monitor context
+std::unique_ptr<std::string[]> g_opsname(nullptr);
+thread_local uint32_t g_thread_index = UINT32_MAX;
+uint32_t g_aicpu_core_cnt = 0;
+thread_local std::map<std::string, std::string> g_thread_local_ctx;
+thread_local aicpu::streamAndTaskId_t g_stream_and_task_id;
+// aicpu run mode
+uint32_t g_run_mode = aicpu::AicpuRunMode::THREAD_MODE;
+
+// context info
+std::mutex default_mutex;
+std::vector<std::map<std::string, std::string>> g_default_thread_ctx;
+std::mutex prof_mutex;
+std::vector<std::map<std::string, std::string>> g_prof_thread_ctx;
+std::mutex debug_mutex;
+std::vector<std::map<std::string, std::string>> g_debug_thread_ctx;
+std::mutex func_map_mutex;
+std::map<uint32_t, std::map<uint32_t, std::function<void(void *)>>> g_func_map;
+
+std::map<std::string, std::string> &GetThreadCtx(aicpu::CtxType type, uint32_t thread_index) {
+  if (type == aicpu::CTX_DEBUG) {
+    std::unique_lock<std::mutex> mutex(default_mutex);
+    if (thread_index >= g_debug_thread_ctx.size()) {
+      g_debug_thread_ctx.resize(thread_index + 1);
+    }
+    return g_debug_thread_ctx[thread_index];
+  } else if (type == aicpu::CTX_PROF) {
+    std::unique_lock<std::mutex> mutex(prof_mutex);
+    if (thread_index >= g_prof_thread_ctx.size()) {
+      g_prof_thread_ctx.resize(thread_index + 1);
+    }
+    return g_prof_thread_ctx[thread_index];
+  } else {
+    std::unique_lock<std::mutex> mutex(debug_mutex);
+    if (thread_index >= g_default_thread_ctx.size()) {
+      g_default_thread_ctx.resize(thread_index + 1);
+    }
+    return g_default_thread_ctx[thread_index];
+  }
+}
+}  // namespace
+
+namespace aicpu {
+status_t aicpuSetContext(aicpuContext_t *ctx) {
+  g_cur_ctx = *ctx;
+  return AICPU_ERROR_NONE;
+}
+
+status_t aicpuGetContext(aicpuContext_t *ctx) {
+  *ctx = g_cur_ctx;
+  return AICPU_ERROR_NONE;
+}
+
+status_t InitTaskMonitorContext(uint32_t aicpu_core_cnt) {
+  if (aicpu_core_cnt == 0) {
+    AICPU_LOGE("invalid aicpu core count[%u]", aicpu_core_cnt);
+    return AICPU_ERROR_FAILED;
+  }
+  g_aicpu_core_cnt = aicpu_core_cnt;
+  AICPU_LOGI("aicpu core count[%u]", aicpu_core_cnt);
+  g_opsname.reset(new (std::nothrow) std::string[aicpu_core_cnt]);
+  if (g_opsname == nullptr) {
+    AICPU_LOGE("malloc ops name memory for task monitor failed");
+    return AICPU_ERROR_FAILED;
+  }
+  for (uint32_t index = 0; index < aicpu_core_cnt; ++index) {
+    g_opsname[index] = "null";
+  }
+  return AICPU_ERROR_NONE;
+}
+
+status_t SetAicpuThreadIndex(uint32_t thread_index) {
+  g_thread_index = thread_index;
+  return AICPU_ERROR_NONE;
+}
+
+uint32_t GetAicpuThreadIndex() { return g_thread_index; }
+
+status_t SetOpname(const std::string &opname) {
+  if (g_opsname != nullptr && g_thread_index < g_aicpu_core_cnt) {
+    AICPU_LOGI("set op name to %s for thread[%u]", opname.c_str(), g_thread_index);
+    g_opsname[g_thread_index] = opname;
+    return AICPU_ERROR_NONE;
+  }
+  // maintenance function, if failed just print event log
+  AICPU_LOGEVENT(
+    "set op name[%s] failed, thread index[%u] should be less than total aicpu core count[%u],"
+    " and ops name array addr[%p] cannot null",
+    opname.c_str(), g_thread_index, g_aicpu_core_cnt, g_opsname.get());
+  return AICPU_ERROR_NONE;
+}
+
+status_t GetOpname(uint32_t thread_index, std::string *opname) {
+  *opname = "null";
+  if (g_opsname != nullptr && thread_index < g_aicpu_core_cnt) {
+    *opname = g_opsname[thread_index];
+    return AICPU_ERROR_NONE;
+  }
+  // maintenance function, if failed just print event log
+  AICPU_LOGEVENT(
+    "get op name failed, thread index[%u] should be less than total aicpu core count[%u],"
+    " and ops name array addr[%p] cannot null",
+    g_thread_index, g_aicpu_core_cnt, g_opsname.get());
+  return AICPU_ERROR_NONE;
+}
+
+status_t SetTaskAndStreamId(uint64_t task_id, uint32_t stream_id) {
+  g_stream_and_task_id.task_id = task_id;
+  g_stream_and_task_id.stream_id = stream_id;
+  AICPU_LOGI("Set task_id:[%lu] and stream_id:[%u] success.", task_id, stream_id);
+  return AICPU_ERROR_NONE;
+}
+
+status_t GetTaskAndStreamId(uint64_t *task_id, uint32_t *stream_id) {
+  *task_id = g_stream_and_task_id.task_id;
+  *stream_id = g_stream_and_task_id.stream_id;
+  AICPU_LOGI("Get task_id:[%lu] and stream_id:[%u] success.", *task_id, *stream_id);
+  return AICPU_ERROR_NONE;
+}
+
+status_t SetAicpuRunMode(uint32_t run_mode) {
+  g_run_mode = run_mode;
+  AICPU_LOGI("Set run_mode:[%u] success.", run_mode);
+  return AICPU_ERROR_NONE;
+}
+
+status_t GetAicpuRunMode(uint32_t *run_mode) {
+  *run_mode = g_run_mode;
+  AICPU_LOGI("Get run_mode:[%u] success.", *run_mode);
+  return AICPU_ERROR_NONE;
+}
+
+status_t SetThreadLocalCtx(const std::string &key, const std::string &value) {
+  if (key.empty()) {
+    AICPU_LOGE("set thread local context failed, key is empty");
+    return AICPU_ERROR_FAILED;
+  }
+  try {
+    g_thread_local_ctx[key] = value;
+  } catch (std::exception &e) {
+    AICPU_LOGE("set thread local context failed, %s", e.what());
+    return AICPU_ERROR_FAILED;
+  }
+  return AICPU_ERROR_NONE;
+}
+
+status_t GetThreadLocalCtx(const std::string &key, std::string *value) {
+  if (key.empty()) {
+    AICPU_LOGE("get thread local context failed, key is empty");
+    return AICPU_ERROR_FAILED;
+  }
+  auto iter = g_thread_local_ctx.find(key);
+  if (iter != g_thread_local_ctx.end()) {
+    *value = iter->second;
+    return AICPU_ERROR_NONE;
+  }
+  AICPU_LOGW("get thread local context failed, no such key[%s]", key.c_str());
+  return AICPU_ERROR_FAILED;
+}
+
+status_t RemoveThreadLocalCtx(const std::string &key) {
+  auto iter = g_thread_local_ctx.find(key);
+  if (iter != g_thread_local_ctx.end()) {
+    g_thread_local_ctx.erase(iter);
+    return AICPU_ERROR_NONE;
+  }
+  AICPU_LOGE("remove thread local context failed, no such key[%s]", key.c_str());
+  return AICPU_ERROR_FAILED;
+}
+
+const std::map<std::string, std::string> &GetAllThreadCtxInfo(aicpu::CtxType type, uint32_t thread_index) {
+  AICPU_LOGI("Get all thread ctx info begin, thread index:%u", thread_index);
+  auto &ctx = GetThreadCtx(type, thread_index);
+  return ctx;
+}
+
+status_t RegisterEventCallback(uint32_t event_id, uint32_t subevent_id, std::function<void(void *)> func) {
+  std::lock_guard<std::mutex> lock(func_map_mutex);
+  std::map<uint32_t, std::function<void(void *)>> &sub_map = g_func_map[event_id];
+  auto it = sub_map.insert(std::make_pair(subevent_id, func));
+  if (it.second == false) {
+    AICPU_LOGE(
+      "register event call function failed, repulicate register callback function by event_id[%u] "
+      "subevent_id[%u]",
+      event_id, subevent_id);
+    return AICPU_ERROR_FAILED;
+  }
+  return AICPU_ERROR_NONE;
+}
+
+status_t DoEventCallback(uint32_t event_id, uint32_t subevent_id, void *param) {
+  std::lock_guard<std::mutex> lock(func_map_mutex);
+  auto iter = g_func_map.find(event_id);
+  if (iter == g_func_map.end()) {
+    AICPU_LOGE("do event callback function failed, cannot find callback function by event_id[%u] subevent_id[%u]",
+               event_id, event_id);
+    return AICPU_ERROR_FAILED;
+  }
+
+  std::map<uint32_t, std::function<void(void *)>> &sub_map = iter->second;
+  auto sub_iter = sub_map.find(subevent_id);
+  if (sub_iter == sub_map.end()) {
+    AICPU_LOGE("do event callback function failed, cannot find callback function by event_id[%u] subevent_id[%u]",
+               event_id, event_id);
+    return AICPU_ERROR_FAILED;
+  }
+  (sub_iter->second)(param);
+  // erase func after call
+  sub_map.erase(sub_iter);
+  return AICPU_ERROR_NONE;
+}
+
+status_t UnRegisterCallback(uint32_t event_id, uint32_t subevent_id) {
+  std::lock_guard<std::mutex> lock(func_map_mutex);
+  auto iter = g_func_map.find(event_id);
+  if (iter == g_func_map.end()) {
+    AICPU_LOGEVENT(
+      "skip unregister event callback function, cannot find callback function by event_id[%u] "
+      "subevent_id[%u]",
+      event_id, event_id);
+    return AICPU_ERROR_NONE;
+  }
+
+  std::map<uint32_t, std::function<void(void *)>> &sub_map = iter->second;
+  auto sub_iter = sub_map.find(subevent_id);
+  if (sub_iter == sub_map.end()) {
+    AICPU_LOGEVENT(
+      "skip unregister event callback function, cannot find callback function by event_id[%u] "
+      "subevent_id[%u]",
+      event_id, event_id);
+    return AICPU_ERROR_NONE;
+  }
+  sub_map.erase(sub_iter);
+  return AICPU_ERROR_NONE;
+}
+}  // namespace aicpu
+
+aicpu::status_t SetThreadCtxInfo(aicpu::CtxType type, const std::string &key, const std::string &value) {
+  if (key.empty()) {
+    AICPU_LOGE("Set thread context failed, context type[%d], key is empty", type);
+    return aicpu::AICPU_ERROR_FAILED;
+  }
+
+  auto &ctx = GetThreadCtx(type, g_thread_index);
+  try {
+    ctx[key] = value;
+  } catch (std::exception &e) {
+    AICPU_LOGE("Set thread context failed, context type[%d], %s", type, e.what());
+    return aicpu::AICPU_ERROR_FAILED;
+  }
+  return aicpu::AICPU_ERROR_NONE;
+}
+
+aicpu::status_t GetThreadCtxInfo(aicpu::CtxType type, const std::string &key, std::string *value) {
+  if (key.empty()) {
+    AICPU_LOGE("Get thread context failed, context type[%d], key is empty", type);
+    return aicpu::AICPU_ERROR_FAILED;
+  }
+
+  auto &ctx = GetThreadCtx(type, g_thread_index);
+  auto iter = ctx.find(key);
+  if (iter != ctx.end()) {
+    *value = iter->second;
+    return aicpu::AICPU_ERROR_NONE;
+  }
+  AICPU_LOGE("Get thread context failed, context type[%d], no such key[%s]", type, key.c_str());
+  return aicpu::AICPU_ERROR_FAILED;
+}
+
+aicpu::status_t RemoveThreadCtxInfo(aicpu::CtxType type, const std::string &key) {
+  auto &ctx = GetThreadCtx(type, g_thread_index);
+  auto iter = ctx.find(key);
+  if (iter != ctx.end()) {
+    ctx.erase(iter);
+    return aicpu::AICPU_ERROR_NONE;
+  }
+  AICPU_LOGE("Remove thread context failed, context type[%d], no such key[%s]", type, key.c_str());
+  return aicpu::AICPU_ERROR_FAILED;
+}
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_context.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_context.h
@ -0,0 +1,231 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef AICPU_OPS_AICPU_CONTEXT_H_
+#define AICPU_OPS_AICPU_CONTEXT_H_
+
+#include <sys/types.h>
+#include <cstdint>
+#include <string>
+#include <map>
+#include <functional>
+#include "common/kernel_util.h"
+
+namespace aicpu {
+typedef struct {
+  uint32_t device_id;  // device id
+  uint32_t tsId;       // ts id
+  pid_t host_pid;      // host pid
+  uint32_t vf_id;      // vf id
+} aicpuContext_t;
+
+enum AicpuRunMode : uint32_t {
+  PROCESS_PCIE_MODE = 0,    // 1910/1980/1951 dc, with host mode
+  PROCESS_SOCKET_MODE = 1,  // MDC
+  THREAD_MODE = 2,          // ctrlcpu/minirc/lhisi
+  INVALID_MODE,
+};
+
+typedef struct {
+  uint32_t stream_id;
+  uint64_t task_id;
+} streamAndTaskId_t;
+
+typedef enum {
+  AICPU_ERROR_NONE = 0,    // success
+  AICPU_ERROR_FAILED = 1,  // failed
+} status_t;
+
+enum CtxType : int32_t { CTX_DEFAULT = 0, CTX_PROF, CTX_DEBUG };
+
+constexpr auto kContextKeyOpName = "opname";
+constexpr auto kContextKeyPhaseOneFlag = "phaseOne";
+constexpr auto kContextKeyWaitType = "waitType";
+constexpr auto kContextKeyWaitId = "waitId";
+constexpr auto kContextKeyStartTick = "startTick";
+constexpr auto kContextKeyDrvSubmitTick = "drvSubmitTick";
+constexpr auto kContextKeyDrvSchedTick = "drvSchedTick";
+constexpr auto kContextKeyKernelType = "kernelType";
+
+/**
+ * set aicpu context for current thread.
+ * @param [in]ctx aicpu context
+ * @return status whether this operation success
+ */
+AICPU_VISIBILITY_API status_t aicpuSetContext(aicpuContext_t *ctx);
+
+/**
+ * get aicpu context from current thread.
+ * @param [out]ctx aicpu context
+ * @return status whether this operation success
+ */
+AICPU_VISIBILITY_API status_t aicpuGetContext(aicpuContext_t *ctx);
+
+/**
+ * init context for task monitor, called in compute process start.
+ * @param [in]aicpu_core_cnt aicpu core number
+ * @return status whether this operation success
+ */
+status_t InitTaskMonitorContext(uint32_t aicpu_core_cnt);
+
+/**
+ * set aicpu thread index for task monitor, called in thread callback function.
+ * @param [in]thread_index aicpu thread index
+ * @return status whether this operation success
+ */
+status_t SetAicpuThreadIndex(uint32_t thread_index);
+
+/**
+ * get aicpu thread index.
+ * @return uint32
+ */
+uint32_t GetAicpuThreadIndex();
+
+/**
+ * set op name for task monitor.
+ * called in libtf_kernels.so(tf op) or libaicpu_processer.so(others) or cpu kernel framework.
+ * @param [in]opname op name
+ * @return status whether this operation success
+ */
+status_t __attribute__((weak)) SetOpname(const std::string &opname);
+
+/**
+ * get op name for task monitor
+ * @param [in]thread_index thread index
+ * @param [out]opname op name
+ * @return status whether this operation success
+ */
+status_t GetOpname(uint32_t thread_index, std::string *opname);
+
+/**
+ * get task and stream id.
+ * @param [in]task_id task id.
+ * @param [in]stream_id stream id.
+ * @return status whether this operation success
+ */
+status_t __attribute__((weak)) GetTaskAndStreamId(uint64_t *task_id, uint32_t *stream_id);
+
+/**
+ * set task and stream id.
+ * @param [in]task_id task id.
+ * @param [in]stream_id stream id.
+ * @return status whether this operation success
+ */
+status_t __attribute__((weak)) SetTaskAndStreamId(uint64_t task_id, uint32_t stream_id);
+
+/**
+ * set thread local context of key
+ * @param [in]key context key
+ * @param [in]value context value
+ * @return status whether this operation success
+ * @note Deprecated from 20201216, Replaced by SetThreadCtxInfo
+ */
+status_t __attribute__((weak)) SetThreadLocalCtx(const std::string &key, const std::string &value);
+
+/**
+ * get thread local context of key
+ * @param [in]key context key
+ * @param [out]value context value
+ * @return status whether this operation success
+ * @note Deprecated from 20201216, Replaced by GetThreadCtxInfo
+ */
+status_t GetThreadLocalCtx(const std::string &key, std::string *value);
+
+/**
+ * remove local context of key
+ * @param [in]key context key
+ * @return status whether this operation success
+ * @note Deprecated from 20201216, Replaced by RemoveThreadCtxInfo
+ */
+status_t RemoveThreadLocalCtx(const std::string &key);
+
+/**
+ * get all thread context info of type
+ * @param [in]type: ctx type
+ * @param [in]thread_index: thread index
+ * @return const std::map<std::string, std::string> &: all thread context info
+ */
+const std::map<std::string, std::string> &GetAllThreadCtxInfo(aicpu::CtxType type, uint32_t thread_index);
+
+/**
+ * set run mode.
+ * @param [in]run_mode: run mode.
+ * @return status whether this operation success
+ */
+status_t __attribute__((weak)) SetAicpuRunMode(uint32_t run_mode);
+
+/**
+ * get run mode.
+ * @param [out]run_mode: run mode.
+ * @return status whether this operation success
+ */
+status_t __attribute__((weak)) GetAicpuRunMode(uint32_t *run_mode);
+
+/**
+ * Register callback function by event_id and subevent_id
+ * @param event_id event id
+ * @param subevent_id subevent id
+ * @param func call back function
+ */
+status_t __attribute__((weak))
+RegisterEventCallback(uint32_t event_id, uint32_t subevent_id, std::function<void(void *)> func);
+
+/**
+ * Do callback function by event_id and subevent_id
+ * @param event_id event id
+ * @param subevent_id subevent id
+ * @param param event param
+ */
+status_t __attribute__((weak)) DoEventCallback(uint32_t event_id, uint32_t subevent_id, void *param);
+
+/**
+ * Unregister callback function by event_id and subevent_id
+ * @param event_id event id
+ * @param subevent_id subevent id
+ */
+status_t __attribute__((weak)) UnRegisterCallback(uint32_t event_id, uint32_t subevent_id);
+}  // namespace aicpu
+
+extern "C" {
+/**
+ * set thread context info of type
+ * @param [in]type: ctx type
+ * @param [in]key: key of context info
+ * @param [in]value: value of context info
+ * @return status whether this operation success
+ */
+AICPU_VISIBILITY_API aicpu::status_t SetThreadCtxInfo(aicpu::CtxType type, const std::string &key,
+                                                      const std::string &value);
+
+/**
+ * get thread context info of type
+ * @param [in]type: ctx type
+ * @param [in]key: key of context info
+ * @param [out]value: value of context info
+ * @return status whether this operation success
+ */
+AICPU_VISIBILITY_API aicpu::status_t GetThreadCtxInfo(aicpu::CtxType type, const std::string &key, std::string *value);
+
+/**
+ * remove thread context info of type
+ * @param [in]type: ctx type
+ * @param [in]key: key of context info
+ * @return status whether this operation success
+ */
+AICPU_VISIBILITY_API aicpu::status_t RemoveThreadCtxInfo(aicpu::CtxType type, const std::string &key);
+}
+
+#endif  // AICPU_OPS_AICPU_CONTEXT_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_pulse.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_pulse.cc
@ -0,0 +1,58 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "aicpu_sharder/aicpu_pulse.h"
+#include <unordered_map>
+#include <mutex>
+#include <string>
+#include "common/kernel_log.h"
+
+namespace {
+static std::unordered_map<std::string, PulseNotifyFunc> pulse_notify_func_map;
+static std::mutex mtx;
+}  // namespace
+
+__attribute__((visibility("default"))) void AicpuPulseNotify() {
+  std::unique_lock<std::mutex> lck(mtx);
+  AICPU_LOGD("Aicpu pulse notify start, notify func num=%zu.", pulse_notify_func_map.size());
+  for (auto &notify_func : pulse_notify_func_map) {
+    AICPU_LOGD("Aicpu pulse notify %s start.", notify_func.first.c_str());
+    notify_func.second();
+    AICPU_LOGD("Aicpu pulse notify %s end.", notify_func.first.c_str());
+  }
+  AICPU_LOGD("Aicpu pulse notify end.");
+}
+
+__attribute__((visibility("default"))) int32_t RegisterPulseNotifyFunc(const char *name, PulseNotifyFunc func) {
+  if (name == nullptr) {
+    AICPU_LOGE("Register pulse notify func failed as param name is null");
+    return -1;
+  }
+
+  if (func == nullptr) {
+    AICPU_LOGE("Register pulse notify func for %s failed as param func is null", name);
+    return -1;
+  }
+
+  std::unique_lock<std::mutex> lck(mtx);
+  auto ret = pulse_notify_func_map.emplace(name, func);
+  if (!ret.second) {
+    AICPU_LOGE("Register pulse notify func for %s failed.", name);
+    return -1;
+  }
+  AICPU_LOGI("Register pulse notify func for %s success.", name);
+  return 0;
+}
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_pulse.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_pulse.h
@ -0,0 +1,46 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef AICPU_OPS_AICPU_PULSE_H_
+#define AICPU_OPS_AICPU_PULSE_H_
+
+#include <cstdint>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*PulseNotifyFunc)(void);
+
+/**
+ * aicpu pulse notify.
+ * timer will call this method per second.
+ */
+void AicpuPulseNotify(void);
+
+/**
+ * Register kernel pulse notify func.
+ * @param name name of kernel lib, must end with '\0' and unique.
+ * @param func pulse notify function.
+ * @return 0:success, other:failed.
+ */
+int32_t RegisterPulseNotifyFunc(const char *name, PulseNotifyFunc func);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AICPU_OPS_AICPU_PULSE_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_sharder.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_sharder.cc
@ -0,0 +1,218 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "aicpu_sharder/aicpu_sharder.h"
+
+#include <semaphore.h>
+#include <unistd.h>
+#include <error.h>
+#include <atomic>
+#include <algorithm>
+#include <cerrno>
+#include <cstring>
+
+#include "common/kernel_log.h"
+
+namespace aicpu {
+#define AICPU_SHARDER_IF_TRUE_RUN(expr, run) \
+  do {                                       \
+    if (expr) {                              \
+      run;                                   \
+    }                                        \
+  } while (0)
+
+void SharderNonBlock::Register(const RunnerBool &schedule, const ClosureBool &do_task, uint32_t cpu_core_num) {
+  schedule_ = schedule;
+  do_task_ = do_task;
+  cpu_core_num_ = cpu_core_num;
+}
+
+bool SharderNonBlock::Enqueue(const Closure &closure, bool submit_topic) {
+  if (schedule_ != nullptr) {
+    return schedule_(closure, submit_topic);
+  }
+  return false;
+}
+
+void SharderNonBlock::Schedule(const Closure &closure) {
+  if (!Enqueue(closure)) {
+    closure();
+  }
+}
+
+uint32_t SharderNonBlock::GetCPUNum() { return cpu_core_num_; }
+
+SharderNonBlock &SharderNonBlock::GetInstance() {
+  static SharderNonBlock sharder_non_block;
+  return sharder_non_block;
+}
+
+int64_t SharderNonBlock::CeilMultiple(int64_t x, int64_t base) {
+  if (base == 0) {
+    return 0;
+  }
+  int64_t ret = x / base;
+  if ((x % base) != 0) {
+    ret++;
+  }
+
+  return ret;
+}
+
+void SharderNonBlock::ParallelFor(int64_t total, int64_t per_unit_size, const SharderWork &work) {
+  AICPU_LOGI("total: %lld, per_unit_size: %lld", total, per_unit_size);
+  if ((total <= 0) || (work == nullptr)) {
+    AICPU_LOGE("invalid param: total<=0 or work is nullptr");
+    return;
+  }
+
+  // work itself
+  if ((schedule_ == nullptr) || (cpu_core_num_ <= 1)) {
+    AICPU_LOGI("work itself all");
+    work(0, total);
+    return;
+  }
+
+  // In order to ensure a smaller scheduling delay, the maximum number of slices is twice the number of CPU cores
+  const int64_t max_shard_num = static_cast<int64_t>(cpu_core_num_) * 2;
+
+  // calculate shard number and block size
+  // i.e., if total is 118, perUintSize is 2, and cpu_core_num_ is 13
+  // then shard_num is 24, block_size is 5
+  int64_t block_size = std::max(int64_t{1}, std::min(total, per_unit_size));
+  int64_t shard_num = CeilMultiple(total, block_size);
+  shard_num = std::min(max_shard_num, shard_num);
+  block_size = CeilMultiple(total, shard_num);
+  shard_num = CeilMultiple(total, block_size);
+  AICPU_LOGI("shard number: %lld, block size: %lld", shard_num, block_size);
+
+  // There is no need to submit an event if shard_num is 1
+  if (shard_num == 1) {
+    AICPU_LOGI("executes on the current thread");
+    work(0, total);
+    return;
+  }
+
+  std::atomic<int64_t> count(shard_num);  // a counter
+  sem_t sem;
+  int32_t sem_init_ret = sem_init(&sem, 0, 0);
+  if (sem_init_ret == -1) {
+    AICPU_LOGE("sem_init error with message: %s", strerror(errno));
+    work(0, total);
+    return;
+  }
+
+  for (int64_t start = 0; start < total; start += block_size) {
+    auto limit = std::min(start + block_size, total);
+    Closure closure = [&sem, &work, &count, start, limit]() {
+      count--;
+      // In order to ensure that user's work function exception does not affect multithread services,
+      // exception capture is needed. Exception type is not cared here, and error log is printed.
+      try {
+        work(start, limit);
+      } catch (...) {
+        AICPU_LOGE("exception occurred in work function with start: %lld, limit: %lld", start, limit);
+      }
+
+      int32_t sem_post_ret = sem_post(&sem);
+      AICPU_SHARDER_IF_TRUE_RUN(sem_post_ret == -1, AICPU_LOGE("sem_post error with message: %s", strerror(errno)));
+    };
+
+    // if enqueue fail, work itself
+    if (!Enqueue(closure, true)) {
+      AICPU_LOGI("Enqueue fail, [%lld, %lld), work itself", start, limit);
+      closure();
+    }
+  }
+
+  if (do_task_ != nullptr) {
+    bool ret = true;
+    while ((count > 0) && ret) {
+      AICPU_LOGI("Main thread do task begin.");
+      ret = do_task_();
+      AICPU_LOGI("Main thread do task end.");
+    }
+  }
+
+  for (int64_t i = 0; i < shard_num; ++i) {
+    int sem_wait_ret = sem_wait(&sem);
+    AICPU_SHARDER_IF_TRUE_RUN(sem_wait_ret == -1, AICPU_LOGE("sem_wait error with message: %s", strerror(errno)));
+  }
+  int32_t sem_des_ret = sem_destroy(&sem);
+  AICPU_SHARDER_IF_TRUE_RUN(sem_des_ret == -1, AICPU_LOGE("sem_destroy error with message: %s", strerror(errno)));
+}
+
+void SharderNonBlock::ParallelForHash(int64_t total, int64_t cpu_nums, const SharderWork &work) {
+  AICPU_LOGI("total: %lld, cpu_nums: %d", total, cpu_nums);
+  if (total <= 0 || work == nullptr) {
+    AICPU_LOGE("invalid param: total<=0 or work is nullptr");
+    return;
+  }
+
+  if ((schedule_ == nullptr) || (cpu_core_num_ <= 1)) {
+    AICPU_LOGE("schedule is nullptr or cpu core num is not enough");
+    return;
+  }
+
+  std::atomic<int64_t> count(cpu_nums);  // a counter
+
+  sem_t sem;
+  int32_t sem_init_ret = sem_init(&sem, 0, 0);
+  if (sem_init_ret == -1) {
+    AICPU_LOGE("sem_init error with message: %s", strerror(errno));
+    return;
+  }
+
+  for (int64_t cur = 0; cur < cpu_nums; cur++) {
+    Closure closure = [&sem, &work, &count, total, cur]() {
+      work(total, cur);
+      count--;
+      int32_t sem_post_ret = sem_post(&sem);
+      AICPU_SHARDER_IF_TRUE_RUN(sem_post_ret == -1, AICPU_LOGE("sem_post error with message: %s", strerror(errno)));
+    };
+
+    // if enqueue fail, work itself
+    if (!Enqueue(closure, true)) {
+      closure();
+    }
+  }
+
+  if (do_task_ != nullptr) {
+    bool ret = true;
+    while ((count > 0) && ret) {
+      ret = do_task_();
+    }
+  }
+
+  for (int64_t i = 0; i < cpu_nums; i++) {
+    int sem_wait_ret = sem_wait(&sem);
+    AICPU_SHARDER_IF_TRUE_RUN(sem_wait_ret == -1, AICPU_LOGE("sem_wait error with message: %s", strerror(errno)));
+  }
+  int32_t sem_des_ret = sem_destroy(&sem);
+  AICPU_SHARDER_IF_TRUE_RUN(sem_des_ret == -1, AICPU_LOGE("sem_destroy error with message: %s", strerror(errno)));
+}
+}  // namespace aicpu
+
+/**
+ * Shards the "total" unit of work refer "perUintSize"
+ */
+void ParallelFor(int64_t total, int64_t per_unit_size, const aicpu::SharderWork &work) {
+  aicpu::SharderNonBlock::GetInstance().ParallelFor(total, per_unit_size, work);
+}
+
+/**
+ * Get CPU number
+ */
+uint32_t GetCPUNum() { return aicpu::SharderNonBlock::GetInstance().GetCPUNum(); }
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_sharder.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/aicpu_sharder/aicpu_sharder.h
@ -0,0 +1,132 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef AICPU_OPS_AICPU_SHARDER_H_
+#define AICPU_OPS_AICPU_SHARDER_H_
+
+#include <functional>
+#include <vector>
+#include "common/kernel_util.h"
+
+namespace aicpu {
+using Closure = std::function<void()>;
+using ClosureBool = std::function<bool()>;
+using RunnerBool = std::function<bool(Closure, bool)>;
+using SharderWork = std::function<void(int64_t, int64_t)>;
+
+class SharderNonBlock {
+ public:
+  /**
+   * Get the unique object of this class
+   */
+  static SharderNonBlock &GetInstance();
+
+  /**
+   * Register schedule callback function, do_task function and cpu core number
+   * called by compute process
+   * @param schedule Schedule callback function
+   * @param do_task Callback function for itself schedule
+   * @param cpu_core_num aicpu core number
+   */
+  void Register(const RunnerBool &schedule, const ClosureBool &do_task, uint32_t cpu_core_num);
+
+  /**
+   * Shards the "total" unit of work refer "perUintSize"
+   * @param total Total unit of work
+   * @param per_unit_size Minimum shard unit
+   * @param work should be a callable taking (int64, int64) arguments.
+                 work(start, limit) computes the work units from [start, limit),
+                 i.e., [start, limit) is a shard.
+   */
+  void ParallelFor(int64_t total, int64_t per_unit_size, const SharderWork &work);
+
+  /**
+   * Shards the unit of work refer for hash
+   * @param total, Total unit of work
+   * @param cpu_nums Number of cpu cores
+   * @param work should be a callable taking (int64, int64) arguments.
+                 work(cur, cpu_nums) computes the work units with input hash with (cpu_nums-1) equals cur,
+                 i.e. specially used by parallel unique op
+   */
+  void ParallelForHash(int64_t total, int64_t cpu_nums, const SharderWork &work);
+
+  /**
+   * Schedule a task use schedule function registered by compute process,
+   * note that the task will actually executed asynchronously
+   * @param closure Closure function with nothrow
+   */
+  void Schedule(const Closure &closure);
+
+  /**
+   * Get CPU number
+   * @param None
+   * @return CPU number
+   */
+  uint32_t GetCPUNum();
+
+ private:
+  SharderNonBlock() : schedule_(nullptr), do_task_(nullptr), cpu_core_num_(0) {}
+  ~SharderNonBlock() = default;
+
+  SharderNonBlock(const SharderNonBlock &) = delete;
+  SharderNonBlock &operator=(const SharderNonBlock &) = delete;
+  SharderNonBlock(SharderNonBlock &&) = delete;
+  SharderNonBlock &operator=(SharderNonBlock &&) = delete;
+
+  /**
+   * Closure function enqueue
+   * @param closure Closure function can be called
+   * @param submit_topic whether submit topic, true means submit topic
+   * @return whether enqueue of closure success
+   */
+  bool Enqueue(const Closure &closure, bool submit_topic = false);
+
+  /**
+   * Calculate how many times, which ceiled, "x" is "base".
+   * i.e., x is 1, base is 2, this function will return 1
+   * @param x An integral
+   * @param base An integral as base when cal multiple
+   * @return ceiled multiple
+   */
+  inline int64_t CeilMultiple(int64_t x, int64_t base);
+
+ private:
+  RunnerBool schedule_;    // enqueue runner
+  ClosureBool do_task_;    // a callback, do task from task queue
+  uint32_t cpu_core_num_;  // aicpu core number
+};                         // SharderNonBlock
+}  // namespace aicpu
+
+extern "C" {
+/**
+ * Shards the "total" unit of work refer "perUintSize"
+ * @param total Total unit of work
+ * @param per_unit_size Minimum shard unit
+ * @param work should be a callable taking (int64, int64) arguments.
+                 work(start, limit) computes the work units from [start, limit),
+                i.e., [start, limit) is a shard.
+ */
+AICPU_VISIBILITY_API void ParallelFor(int64_t total, int64_t per_unit_size, const aicpu::SharderWork &work);
+
+/**
+ * Get CPU number
+ * @param None
+ * @return CPU number
+ */
+AICPU_VISIBILITY_API uint32_t GetCPUNum();
+}
+
+#endif  // AICPU_OPS_AICPU_SHARDER_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/distinct_uniform_int_distribution.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/distinct_uniform_int_distribution.h
@ -0,0 +1,62 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AICPU_OPS_AICPU_DISTINCT_UNIFORM_INT_DISTRIBUTION_H_
+#define AICPU_OPS_AICPU_DISTINCT_UNIFORM_INT_DISTRIBUTION_H_
+
+#include <random>
+#include <unordered_set>
+
+namespace aicpu {
+template <typename IntType = int>
+class distinct_uniform_int_distribution {
+ public:
+  using result_type = IntType;
+
+ private:
+  using set_type = std::unordered_set<result_type>;
+  using distr_type = std::uniform_int_distribution<result_type>;
+
+ public:
+  distinct_uniform_int_distribution(result_type inf, result_type sup)
+      : inf_(inf), sup_(sup), range_(sup_ - inf_ + 1), distr_(inf_, sup_) {}
+  ~distinct_uniform_int_distribution() = default;
+  void reset() {
+    uset_.clear();
+    distr_.reset();
+  }
+
+  template <typename Generator>
+  result_type exec(Generator *engine) {
+    if (!(uset_.size() < range_)) {
+      std::terminate();
+    }
+    result_type res;
+    do {
+      res = distr_(*engine);
+    } while (uset_.count(res) > 0);
+    uset_.insert(res);
+    return res;
+  }
+
+ private:
+  const result_type inf_;
+  const result_type sup_;
+  const size_t range_;
+  distr_type distr_;
+  set_type uset_;
+};
+}  // namespace aicpu
+#endif  // AICPU_OPS_AICPU_DISTINCT_UNIFORM_INT_DISTRIBUTION_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_base.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_base.cc
@ -0,0 +1,255 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <map>
+#include "common/kernel_base.h"
+#include "common/kernel_errcode.h"
+#include "common/tensor.h"
+
+namespace aicpu {
+namespace {
+// max param len limit 10k.
+constexpr uint32_t MAX_PARAM_LEN = 10240;
+// max io address num limit 1024
+constexpr uint32_t MAX_IO_ADDR_NUMPARAM_LEN = 1024;
+}  // namespace
+static const std::map<const ::aicpuops::DataType, size_t> kKernelBaseDataTypeSize = {
+  {aicpuops::MS_BOOL, sizeof(bool)},       {aicpuops::MS_INT8, sizeof(int8_t)},
+  {aicpuops::MS_UINT8, sizeof(uint8_t)},   {aicpuops::MS_INT16, sizeof(int16_t)},
+  {aicpuops::MS_UINT16, sizeof(uint16_t)}, {aicpuops::MS_INT32, sizeof(int32_t)},
+  {aicpuops::MS_UINT32, sizeof(uint32_t)}, {aicpuops::MS_INT64, sizeof(int64_t)},
+  {aicpuops::MS_UINT64, sizeof(uint64_t)}, {aicpuops::MS_FLOAT16, sizeof(float) / 2},
+  {aicpuops::MS_FLOAT32, sizeof(float)},   {aicpuops::MS_FLOAT64, sizeof(double)}};
+
+KernelBase::KernelBase(const std::string &kernel_name)
+    : kernel_name_(kernel_name),
+      extend_param_len_(0),
+      extend_param_base_(nullptr),
+      param_head_(nullptr),
+      unknow_shape_(false) {}
+
+uint32_t KernelBase::ParseParam(void *param) {
+  if (param == nullptr) {
+    AICPU_LOGE("Kernel:%s ParseParam param is null.", kernel_name_.c_str());
+    return AICPU_KERNEL_STATE_PARAM_INVALID;
+  }
+
+  // parse param_len
+  param_head_ = static_cast<AicpuParamHead *>(param);
+  if (param_head_->length < sizeof(AicpuParamHead) || param_head_->length > MAX_PARAM_LEN) {
+    AICPU_LOGE("Kernel:%s param length=%u not in [%zu, %u].", kernel_name_.c_str(), param_head_->length,
+               sizeof(AicpuParamHead), MAX_PARAM_LEN);
+    return AICPU_KERNEL_STATE_PARAM_INVALID;
+  }
+
+  auto param_base = static_cast<uint8_t *>(param);
+  extend_param_base_ = param_base + sizeof(AicpuParamHead);
+  extend_param_len_ = param_head_->length - sizeof(AicpuParamHead);
+
+  if (param_head_->ioAddrNum > 0) {
+    if (param_head_->ioAddrNum > MAX_IO_ADDR_NUMPARAM_LEN) {
+      AICPU_LOGE("Kernel:%s param ioAddrNum=%u is over %u.", kernel_name_.c_str(), param_head_->ioAddrNum,
+                 MAX_IO_ADDR_NUMPARAM_LEN);
+      return AICPU_KERNEL_STATE_PARAM_INVALID;
+    }
+    uint32_t addr_len = param_head_->ioAddrNum * sizeof(uint64_t);
+    if (extend_param_len_ < addr_len) {
+      AICPU_LOGE("Kernel:%s extend param is not enough for io addr, ioAddrNum=%u, extendParamLen=%u.",
+                 kernel_name_.c_str(), param_head_->ioAddrNum, extend_param_len_);
+      return AICPU_KERNEL_STATE_PARAM_INVALID;
+    }
+    auto io_addr_base = reinterpret_cast<uint64_t *>(extend_param_base_);
+    for (uint32_t i = 0; i < param_head_->ioAddrNum; ++i) {
+      io_addrs_.push_back(static_cast<uintptr_t>(io_addr_base[i]));
+    }
+    extend_param_base_ = extend_param_base_ + addr_len;
+    extend_param_len_ -= addr_len;
+  }
+  AICPU_CHK_STATUS_RET(ParseNodeDef())
+  AICPU_CHK_STATUS_RET(ParseExtInfo())
+  if (unknow_shape_) {
+    AICPU_LOGI("Unknown shape op: %s", kernel_name_.c_str());
+    AICPU_CHK_STATUS_RET(UpdateInputShape())
+    AICPU_CHK_STATUS_RET(UpdateOutputShape())
+  }
+  return ParseKernelParam();
+}
+
+uint32_t KernelBase::Compute(void *param) {
+  uint32_t ret = ParseParam(param);
+  if (ret != AICPU_KERNEL_STATE_SUCCESS) {
+    AICPU_LOGE("Kernel:%s ParseParam failed, ret=%u.", kernel_name_.c_str(), ret);
+    return ret;
+  }
+  return DoCompute();
+}
+
+size_t KernelBase::GetDataTypeSize(::aicpuops::DataType data_type) {
+  auto it = kKernelBaseDataTypeSize.find(data_type);
+  if (it == kKernelBaseDataTypeSize.end()) {
+    AICPU_LOGE("don't support input tensor types");
+    return 0;
+  }
+  return it->second;
+}
+
+template <typename T>
+uint32_t KernelBase::ParseExtendParam(T *param_var, std::string param_name) {
+  if (extend_param_len_ < sizeof(T)) {
+    AICPU_LOGE("Kernel:%s extend param is not enough for [%s] addr, need_len=%u, extendParamLen=%u.",
+               kernel_name_.c_str(), param_name.c_str(), sizeof(T), extend_param_len_);
+    return AICPU_KERNEL_STATE_PARAM_INVALID;
+  }
+  T *param = reinterpret_cast<T *>(extend_param_base_);
+  if (param != nullptr) {
+    *param_var = *param;
+    extend_param_base_ += sizeof(T);
+    extend_param_len_ -= sizeof(T);
+    return AICPU_KERNEL_STATE_SUCCESS;
+  }
+  AICPU_LOGE("Kernel:%s extend param for [%s] addr is invalid.", kernel_name_.c_str(), param_name.c_str());
+  return AICPU_KERNEL_STATE_PARAM_INVALID;
+}
+
+uint32_t KernelBase::ParseNodeDef() {
+  uint32_t node_def_len;
+  AICPU_CHK_STATUS_RET(ParseExtendParam(&node_def_len, "node_def_len"))
+
+  if (extend_param_len_ < node_def_len) {
+    AICPU_LOGE("Kernel:%s extend param is not enough for customizeAttr addr, node_def_len=%u, extendParamLen=%u.",
+               kernel_name_.c_str(), node_def_len, extend_param_len_);
+    return AICPU_KERNEL_STATE_PARAM_INVALID;
+  }
+  std::string std_data(reinterpret_cast<char *>(extend_param_base_), node_def_len);
+  if (!node_def_.ParseFromString(std_data)) {
+    AICPU_LOGE("parse %s KernelBase proto failed, nodeDef=%s.", kernel_name_.c_str(), std_data.c_str());
+    return AICPU_KERNEL_STATE_PARAM_INVALID;
+  }
+  extend_param_base_ += node_def_len;
+  extend_param_len_ -= node_def_len;
+  return AICPU_KERNEL_STATE_SUCCESS;
+}
+
+uint32_t KernelBase::ParseExtShapeType(FWKAdapter::ExtInfo *ext_info) {
+  if (ext_info->infoLen != sizeof(int32_t)) {
+    AICPU_LOGE("Kernel:%s parse ext shape type failed as infoLen must be %zu but %u.", kernel_name_.c_str(),
+               sizeof(int32_t), ext_info->infoLen);
+    return AICPU_KERNEL_STATE_PARAM_INVALID;
+  }
+  unknow_shape_ = true;
+  return AICPU_KERNEL_STATE_SUCCESS;
+}
+
+uint32_t KernelBase::ParseExtInputShape(FWKAdapter::ExtInfo *ext_info) {
+  // no overflow
+  auto need_len = node_def_.inputs_size() * sizeof(FWKAdapter::ShapeAndType);
+  if (ext_info->infoLen != need_len) {
+    AICPU_LOGE(
+      "Kernel:%s parse ext input shape failed as infoLen must be "
+      "input_num[%d]*sizeof(ShapeAndType)[%zu], but %u.",
+      kernel_name_.c_str(), node_def_.inputs_size(), sizeof(FWKAdapter::ShapeAndType), ext_info->infoLen);
+    return AICPU_KERNEL_STATE_PARAM_INVALID;
+  }
+  input_shape_and_type_.clear();
+  auto input = reinterpret_cast<FWKAdapter::ShapeAndType *>(ext_info->infoMsg);
+  for (int index = 0; index < node_def_.inputs_size(); ++index) {
+    input_shape_and_type_.emplace_back(&input[index]);
+  }
+  return AICPU_KERNEL_STATE_SUCCESS;
+}
+
+uint32_t KernelBase::ParseExtOutputShape(FWKAdapter::ExtInfo *ext_info) {
+  // no overflow
+  auto need_len = node_def_.outputs_size() * sizeof(FWKAdapter::ShapeAndType);
+  if (ext_info->infoLen != need_len) {
+    AICPU_LOGE(
+      "Kernel:%s parse ext output shape failed as infoLen must be "
+      "output_num[%d]*sizeof(ShapeAndType)[%zu], but %u.",
+      kernel_name_.c_str(), node_def_.outputs_size(), sizeof(FWKAdapter::ShapeAndType), ext_info->infoLen);
+    return AICPU_KERNEL_STATE_PARAM_INVALID;
+  }
+  output_shape_and_type_.clear();
+  auto output = reinterpret_cast<FWKAdapter::ShapeAndType *>(ext_info->infoMsg);
+  for (int index = 0; index < node_def_.outputs_size(); ++index) {
+    output_shape_and_type_.emplace_back(&output[index]);
+  }
+  return AICPU_KERNEL_STATE_SUCCESS;
+}
+
+uint32_t KernelBase::ParseExtInfo() {
+  uint32_t offset = 0;
+  FWKAdapter::ExtInfo *ext_info_ptr = nullptr;
+  char *ext_info_buf = reinterpret_cast<char *>(static_cast<uintptr_t>(param_head_->extInfoAddr));
+  while (offset + sizeof(FWKAdapter::ExtInfo) <= param_head_->extInfoLength) {
+    ext_info_ptr = reinterpret_cast<FWKAdapter::ExtInfo *>(ext_info_buf + offset);
+    if (ext_info_ptr == nullptr) {
+      AICPU_LOGE("Kernel:%s ext_info is nullptr, extInfoLength=%u, extInfoAddr=%p, offset=%zu.", kernel_name_.c_str(),
+                 param_head_->extInfoLength, param_head_->extInfoAddr, offset);
+      return AICPU_KERNEL_STATE_PARAM_INVALID;
+    }
+    switch (ext_info_ptr->infoType) {
+      case FWKAdapter::FWK_ADPT_EXT_SHAPE_TYPE:
+        AICPU_CHK_STATUS_RET(ParseExtShapeType(ext_info_ptr))
+        break;
+      case FWKAdapter::FWK_ADPT_EXT_INPUT_SHAPE:
+        AICPU_CHK_STATUS_RET(ParseExtInputShape(ext_info_ptr))
+        break;
+      case FWKAdapter::FWK_ADPT_EXT_OUTPUT_SHAPE:
+        AICPU_CHK_STATUS_RET(ParseExtOutputShape(ext_info_ptr))
+        break;
+      default:
+        AICPU_LOGI("Kernel:%s ignore infoType=%d, infoLen=%u.", kernel_name_.c_str(), ext_info_ptr->infoType,
+                   ext_info_ptr->infoLen);
+        break;
+    }
+    // not overflow
+    offset += FWKAdapter::kExtInfoHeadSize;
+    offset += ext_info_ptr->infoLen;
+  }
+  return AICPU_KERNEL_STATE_SUCCESS;
+}
+
+uint32_t KernelBase::UpdateInputShape() {
+  for (int i = 0; i < node_def_.inputs_size(); ++i) {
+    aicpuops::Tensor *input_tensor = node_def_.mutable_inputs(i);
+    aicpuops::TensorShape *input_tensor_shape = input_tensor->mutable_tensor_shape();
+    input_tensor_shape->clear_dim();
+    for (uint32_t index = 0; index < FWKAdapter::kMaxShapeDims; ++index) {
+      // LLONG_MIN for dim end flag
+      if (input_shape_and_type_[i]->dims[index] == LLONG_MIN) {
+        break;
+      }
+      input_tensor_shape->add_dim()->set_size(input_shape_and_type_[i]->dims[index]);
+    }
+  }
+  return AICPU_KERNEL_STATE_SUCCESS;
+}
+
+uint32_t KernelBase::UpdateOutputShape() {
+  for (int i = 0; i < node_def_.outputs_size(); ++i) {
+    aicpuops::Tensor *output_tensor = node_def_.mutable_outputs(i);
+    aicpuops::TensorShape *output_tensor_shape = output_tensor->mutable_tensor_shape();
+    output_tensor_shape->clear_dim();
+    for (uint32_t index = 0; index < FWKAdapter::kMaxShapeDims; ++index) {
+      // LLONG_MIN for dim end flag
+      if (output_shape_and_type_[i]->dims[index] == LLONG_MIN) {
+        break;
+      }
+      output_tensor_shape->add_dim()->set_size(output_shape_and_type_[i]->dims[index]);
+    }
+  }
+  return AICPU_KERNEL_STATE_SUCCESS;
+}
+}  // namespace aicpu
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_base.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_base.h
@ -0,0 +1,82 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AICPU_OPS_AICPU_COMMON_KERNEL_BASE_H_
+#define AICPU_OPS_AICPU_COMMON_KERNEL_BASE_H_
+
+#include <cstdint>
+#include <vector>
+#include <string>
+
+#include "common/kernel_util.h"
+#include "aicpu/common/aicpu_task_struct.h"
+#include "securec/include/securec.h"
+#include "common/tensor.h"
+#include "cce/fwk_adpt_struct.h"
+#include "common/kernel_log.h"
+#include "proto/aicpu_tensor.pb.h"
+
+namespace aicpu {
+class KernelBase {
+ public:
+  explicit KernelBase(const std::string &kernel_name);
+
+  ~KernelBase() = default;
+
+  uint32_t Compute(void *param);
+  size_t GetDataTypeSize(::aicpuops::DataType data_type);
+
+ protected:
+  virtual uint32_t ParseKernelParam() = 0;
+  virtual uint32_t DoCompute() = 0;
+
+  template <typename T>
+  uint32_t ParseExtendParam(T *param_var, std::string param_name);
+
+  uint32_t ParseNodeDef();
+
+  uint32_t ParseExtInfo();
+
+  uint32_t ParseExtShapeType(FWKAdapter::ExtInfo *ext_info);
+
+  uint32_t ParseExtInputShape(FWKAdapter::ExtInfo *ext_info);
+
+  uint32_t ParseExtOutputShape(FWKAdapter::ExtInfo *ext_info);
+
+  uint32_t UpdateInputShape();
+
+  uint32_t UpdateOutputShape();
+
+ private:
+  KernelBase(const KernelBase &) = delete;
+  KernelBase &operator=(const KernelBase &) = delete;
+  KernelBase(KernelBase &&) = delete;
+  KernelBase &operator=(KernelBase &&) = delete;
+
+  uint32_t ParseParam(void *param);
+
+ protected:
+  std::string kernel_name_;
+  std::vector<uintptr_t> io_addrs_;
+  uint32_t extend_param_len_;
+  uint8_t *extend_param_base_;
+  AicpuParamHead *param_head_;
+  bool unknow_shape_;
+  aicpuops::NodeDef node_def_;
+  std::vector<FWKAdapter::ShapeAndType *> input_shape_and_type_;
+  std::vector<FWKAdapter::ShapeAndType *> output_shape_and_type_;
+};
+}  // namespace aicpu
+#endif  // AICPU_OPS_AICPU_COMMON_KERNEL_BASE_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_errcode.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_errcode.h
@ -0,0 +1,30 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AICPU_OPS_AICPU_COMMON_KENERL_ERRCODE_H_
+#define AICPU_OPS_AICPU_COMMON_KENERL_ERRCODE_H_
+
+namespace aicpu {
+enum AicpuKernelErrCode {
+  // 0-3 is fixed error code, runtime need interpret 0-3 error codes
+  AICPU_KERNEL_STATE_SUCCESS = 0,
+  AICPU_KERNEL_STATE_PARAM_INVALID = 1,
+  AICPU_KERNEL_STATE_FAILED = 2,
+  AICPU_KERNEL_STATE_EXECUTE_TIMEOUT = 3,
+  AICPU_KERNEL_STATE_INTERNAL_ERROR = 4,
+  AICPU_KERNEL_STATE_END_OF_SEQUENCE = 201,
+};
+}  // namespace aicpu
+#endif  // AICPU_OPS_AICPU_COMMON_KENERL_ERRCODE_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_log.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_log.cc
@ -0,0 +1,29 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "common/kernel_log.h"
+
+namespace aicpu {
+static int log_level = AICPU_LOG_ERROR;
+
+int LogSetLevel(int level) {
+  log_level = level;
+  return log_level;
+}
+
+int LogGetLevel(void) { return log_level; }
+
+bool CheckLogLevel(int log_level_check) { return log_level >= log_level_check; }
+}  // namespace aicpu
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_log.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_log.h
@ -0,0 +1,77 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AICPU_OPS_AICPU_COMMON_KERNEL_LOG_H_
+#define AICPU_OPS_AICPU_COMMON_KERNEL_LOG_H_
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <iostream>
+#include <utility>
+#include "common/kernel_errcode.h"
+
+inline int GetTid(void) {
+  thread_local static int tid = syscall(__NR_gettid);
+  return tid;
+}
+static const int LOG_COUNT = 0;
+
+namespace aicpu {
+#define AICPU_LOG_DEBUG 0
+#define AICPU_LOG_INFO 1
+#define AICPU_LOG_WARN 2
+#define AICPU_LOG_ERROR 3
+#define AICPU_LOG_EVENT 0x10
+
+inline void PrintLog(const int level) { std::cerr << level << std::endl; }
+
+template <typename T, typename... Args>
+inline void PrintLog(const int level, T &&head, Args &&... tail) {
+  std::cerr << std::forward<T>(head) << " ";
+  PrintLog(level, std::forward<Args>(tail)...);
+}
+
+int LogSetLevel(int level);
+
+int LogGetLevel(void);
+
+bool CheckLogLevel(int log_level_check);
+
+#define AICPU_LOGD(fmt, ...) \
+  AICPU_LOG(AICPU_LOG_DEBUG, "%s:%s:%d[tid:%lu]:" #fmt, __FUNCTION__, __FILE__, __LINE__, GetTid(), ##__VA_ARGS__);
+#define AICPU_LOGI(fmt, ...) \
+  AICPU_LOG(AICPU_LOG_INFO, "%s:%s:%d[tid:%lu]:" #fmt, __FUNCTION__, __FILE__, __LINE__, GetTid(), ##__VA_ARGS__);
+#define AICPU_LOGW(fmt, ...) \
+  AICPU_LOG(AICPU_LOG_WARN, "%s:%s:%d[tid:%lu]:" #fmt, __FUNCTION__, __FILE__, __LINE__, GetTid(), ##__VA_ARGS__);
+#define AICPU_LOGE(fmt, ...) \
+  AICPU_LOG(AICPU_LOG_ERROR, "%s:%s:%d[tid:%lu]:" #fmt, __FUNCTION__, __FILE__, __LINE__, GetTid(), ##__VA_ARGS__);
+#define AICPU_LOGEVENT(fmt, ...) \
+  AICPU_LOG(AICPU_LOG_EVENT, "%s:%s:%d[tid:%lu]:" #fmt, __FUNCTION__, __FILE__, __LINE__, GetTid(), ##__VA_ARGS__);
+#define AICPU_LOG(level, fmt, ...)                                              \
+  do {                                                                          \
+    if (aicpu::CheckLogLevel(level)) {                                          \
+      aicpu::PrintLog(level, "[%s:%d]" fmt, __FILE__, __LINE__, ##__VA_ARGS__); \
+    }                                                                           \
+  } while (LOG_COUNT != 0)
+
+#define AICPU_CHK_STATUS_RET(expr...)           \
+  do {                                          \
+    const uint32_t status = (expr);             \
+    if (status != AICPU_KERNEL_STATE_SUCCESS) { \
+      return status;                            \
+    }                                           \
+  } while (0);
+}  // namespace aicpu
+#endif  // AICPU_OPS_AICPU_COMMON_KERNEL_LOG_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_util.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/kernel_util.h
@ -0,0 +1,22 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AICPU_OPS_AICPU_COMMON_KERNEL_UTIL_H_
+#define AICPU_OPS_AICPU_COMMON_KERNEL_UTIL_H_
+
+#ifndef AICPU_VISIBILITY_API
+#define AICPU_VISIBILITY_API __attribute__((visibility("default")))
+#endif
+#endif  // AICPU_OPS_AICPU_COMMON_KERNEL_UTIL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/tensor.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/common/tensor.h
@ -0,0 +1,41 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AICPU_OPS_COMMON_TENSOR_H_
+#define AICPU_OPS_COMMON_TENSOR_H_
+
+#include <atomic>
+#include <memory>
+#include <string>
+#include <vector>
+#include <map>
+
+namespace aicpu {
+namespace ms {
+class Tensor {
+ public:
+  Tensor() = default;
+  ~Tensor() = default;
+  const uint8_t *GetData() const;
+  size_t GetSize() const;
+  void SetData(uint8_t *data, size_t size);
+
+ private:
+  uint8_t *tensor_ptr_;
+  size_t tensor_len_;
+};
+}  // namespace ms
+}  // namespace aicpu
+#endif  // AICPU_OPS_COMMON_TENSOR_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/random_choice_with_mask_kernels.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/random_choice_with_mask_kernels.cc
@ -0,0 +1,280 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./random_choice_with_mask_kernels.h"
+#include <random>
+#include <climits>
+#include <vector>
+#include <algorithm>
+#include <string>
+#include "aicpu_sharder/aicpu_sharder.h"
+#include "proto/aicpu_tensor.pb.h"
+#include "common/distinct_uniform_int_distribution.h"
+#include "common/tensor.h"
+
+namespace aicpu {
+static void ParseOutputCoordinate(std::vector<int64_t> dims_, int32_t output_length, int32_t input_dim_size,
+                                  int32_t input_total_count, const int *tmp_output, int *output) {
+  int it = 0;
+  int column = input_total_count / dims_[0];
+  for (int i = 0; i < output_length; i++) {
+    int32_t tmp_output_number = tmp_output[i];
+    int tmp_column = column;
+    for (int j = 0; j < input_dim_size; j++) {
+      if (j == input_dim_size - 1) {
+        output[it++] = tmp_output_number;
+        continue;
+      }
+      output[it++] = tmp_output_number / column;
+      tmp_output_number = tmp_output_number % column;
+      tmp_column = tmp_column / dims_[j + 1];
+    }
+  }
+}
+
+static void GetOutputLength(bool *padding_flag, int32_t *output_length, int32_t *output_non_zero_length, int32_t count,
+                            int32_t non_zero_num) {
+  if (count == 0) {
+    *padding_flag = false;
+    *output_length = non_zero_num;
+    *output_non_zero_length = non_zero_num;
+  } else if (count > 0 && count <= non_zero_num) {
+    *padding_flag = false;
+    *output_length = count;
+    *output_non_zero_length = count;
+  } else if (count > non_zero_num) {
+    *padding_flag = true;
+    *output_length = count;
+    *output_non_zero_length = non_zero_num;
+  } else {
+    AICPU_LOGI("input count must greater or equal to 0 but instead is %d", count);
+  }
+}
+
+static bool GetInputTotalCount(const std::vector<int64_t> &dims_, int32_t *input_total_count,
+                               const int32_t &input_dim_size) {
+  const int32_t max_inpu_dim = 5;
+  if (input_dim_size < 1 || input_dim_size > max_inpu_dim) {
+    AICPU_LOGE(
+      "input dim size is %d, it must greater or equal to 1 channels "
+      "and less than or equal to 5 channels!",
+      input_dim_size);
+    return false;
+  }
+  for (int32_t i = 0; i < input_dim_size; i++) {
+    *input_total_count *= dims_[i];
+  }
+  if (*input_total_count <= 0) {
+    AICPU_LOGE("input_total_count is %d, please check setting.", *input_total_count);
+    return false;
+  }
+  return true;
+}
+
+static void UpdateOutput(const std::vector<int64_t> &dims_, const int32_t &non_zero_num, const int32_t &count_,
+                         const int32_t &output_length, const int *mask_dim, int32_t *output_coordinate, bool *mask) {
+  for (int32_t i = non_zero_num * dims_.size(); i < static_cast<int32_t>(count_ * dims_.size()); i++) {
+    output_coordinate[i] = 0;
+  }
+  for (int32_t i = 0; i < output_length; i++) {
+    mask[i] = static_cast<bool>(mask_dim[i]);
+  }
+  for (int32_t i = non_zero_num; i < count_; i++) {
+    mask[i] = false;
+  }
+}
+
+static bool GenerateRandomMask(const int32_t &output_length, const int32_t &non_zero_num,
+                               const int32_t &output_non_zero_length, int **input_dim, int **tmp_output,
+                               int **mask_dim) {
+  *tmp_output = reinterpret_cast<int *>(malloc(output_length * sizeof(int)));
+  if (*tmp_output == nullptr) {
+    AICPU_LOGE("malloc memory failed!");
+    free(*input_dim);
+    return false;
+  }
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  aicpu::distinct_uniform_int_distribution<> dis(0, non_zero_num - 1);
+  *mask_dim = reinterpret_cast<int *>(malloc(output_length * sizeof(int)));
+  if (*mask_dim == nullptr) {
+    AICPU_LOGE("malloc memory failed!");
+    free(*input_dim);
+    free(*tmp_output);
+    return false;
+  }
+  if (memset_s(*mask_dim, output_length, 0x00, output_length) != EOK) {
+    AICPU_LOGE("memset_s to mask_dim failed!");
+    free(*input_dim);
+    free(*tmp_output);
+    free(*mask_dim);
+    return false;
+  }
+  if (memset_s(*tmp_output, output_length, 0x00, output_length) != EOK) {
+    AICPU_LOGE("memset_s to tmp_output failed!");
+    free(*input_dim);
+    free(*tmp_output);
+    free(*mask_dim);
+    return false;
+  }
+
+  if (output_non_zero_length > output_length) {
+    AICPU_LOGE("output_non_zero_length size is too long!");
+    free(*input_dim);
+    free(*tmp_output);
+    free(*mask_dim);
+    return false;
+  }
+  for (int32_t i = 0; i < output_non_zero_length; i++) {
+    int32_t mean = dis.exec(&gen);
+    *((*tmp_output) + i) = *((*input_dim) + mean);
+    *((*mask_dim) + i) = 1;
+  }
+  return true;
+}
+
+uint32_t RandomChoiceWithMaskKernel::DoCompute() {
+  auto *input = reinterpret_cast<bool *>(io_addrs_[0]);
+  auto *output_coordinate = reinterpret_cast<int32_t *>(io_addrs_[1]);
+  auto *mask = reinterpret_cast<bool *>(io_addrs_[2]);
+  int32_t input_dim_size = dims_.size();
+  int32_t non_zero_num = 0;
+  int32_t input_total_count = 1;
+
+  bool ret = GetInputTotalCount(dims_, &input_total_count, input_dim_size);
+  if (!ret) {
+    AICPU_LOGE("Get input total count failed!");
+    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
+  }
+
+  int *input_dim = reinterpret_cast<int *>(malloc(input_total_count * sizeof(int)));
+  if (input_dim == nullptr) {
+    AICPU_LOGE("Malloc memory failed!");
+    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
+  }
+  for (int32_t i = 0; i < input_total_count; i++) {
+    if (input[i] != 0) {
+      input_dim[non_zero_num] = i;
+      non_zero_num++;
+    }
+  }
+  bool padding_flag = false;
+  int32_t output_length = 0;
+  int32_t output_non_zero_length = 0;
+  GetOutputLength(&padding_flag, &output_length, &output_non_zero_length, count_, non_zero_num);
+
+  int *tmp_output = nullptr;
+  int *mask_dim = nullptr;
+  ret = GenerateRandomMask(output_length, non_zero_num, output_non_zero_length, &input_dim, &tmp_output, &mask_dim);
+  if (!ret) {
+    AICPU_LOGE("Generate random mask failed!");
+    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
+  }
+
+  if (padding_flag) {
+    int32_t index = 0;
+    for (int32_t i = output_length - 1; i > non_zero_num; i--) {
+      tmp_output[non_zero_num + index] = 0;
+      mask_dim[non_zero_num + index] = 0;
+      index++;
+    }
+  }
+
+  int32_t copy_output_length = 0;
+  if (output_length * input_dim_size >= INT_MAX || output_length * input_dim_size < 0) {
+    AICPU_LOGE("Output size exceed INT_MAX");
+    free(input_dim);
+    free(tmp_output);
+    free(mask_dim);
+    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
+  }
+  copy_output_length = output_length * input_dim_size;
+  int *output = reinterpret_cast<int *>(malloc(copy_output_length * sizeof(int)));
+  if (output == nullptr) {
+    AICPU_LOGE("malloc memory failed!");
+    free(input_dim);
+    free(tmp_output);
+    free(mask_dim);
+    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
+  }
+  if (memset_s(output, copy_output_length, 0x00, copy_output_length) != EOK) {
+    AICPU_LOGE("memset_s memory failed!");
+    free(input_dim);
+    free(mask_dim);
+    free(tmp_output);
+    free(output);
+    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
+  }
+  ParseOutputCoordinate(dims_, output_length, input_dim_size, input_total_count, tmp_output, output);
+
+  int32_t actual_output_length = count_ * dims_.size();
+  copy_output_length = std::min(actual_output_length, copy_output_length);
+  int32_t copy_output_bytes = 0;
+  if (INT_MAX / static_cast<int>(sizeof(int32_t)) < copy_output_length) {
+    AICPU_LOGE("The output length is out of range!");
+    free(input_dim);
+    free(mask_dim);
+    free(tmp_output);
+    free(output);
+    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
+  }
+  copy_output_bytes = copy_output_length * sizeof(int32_t);
+  memcpy_s(output_coordinate, copy_output_bytes, output, copy_output_bytes);
+  UpdateOutput(dims_, non_zero_num, count_, output_length, mask_dim, output_coordinate, mask);
+  AICPU_LOGI("no zero num is %d, output_length is %d ", non_zero_num, output_length);
+  UpdateOutputShapeValue(non_zero_num, output_length);
+  free(input_dim);
+  free(mask_dim);
+  free(tmp_output);
+  free(output);
+  return AICPU_KERNEL_STATE_SUCCESS;
+}
+
+void RandomChoiceWithMaskKernel::UpdateOutputShapeValue(int32_t non_zero_num, int32_t output_length) {
+  if (unknow_shape_) {
+    output_shape_and_type_[0]->dims[0] = non_zero_num;
+    output_shape_and_type_[1]->dims[0] = output_length;
+  }
+}
+uint32_t RandomChoiceWithMaskKernel::ParseKernelParam() {
+  ::google::protobuf::Map<::std::string, ::aicpuops::AttrValue> nodedef_map = node_def_.attrs();
+  aicpuops::AttrValue random_choice_count_attrs = nodedef_map["count"];
+  count_ = random_choice_count_attrs.i();
+  AICPU_LOGI("This op attr count is %d", count_);
+
+  if ((count_ == 0) && (!unknow_shape_)) {
+    AICPU_LOGE("This op attr count is 0, but the shapetype is %d", unknow_shape_);
+    return AICPU_KERNEL_STATE_PARAM_INVALID;
+  }
+
+  size_t inputs_size = node_def_.inputs_size();
+  for (size_t i = 0; i < inputs_size; i++) {
+    aicpuops::Tensor input_tensor = node_def_.inputs(i);
+    aicpuops::TensorShape input_shape = input_tensor.tensor_shape();
+    for (int j = 0; j < input_shape.dim_size(); j++) {
+      dims_.push_back(input_shape.dim(j).size());
+    }
+  }
+
+  return AICPU_KERNEL_STATE_SUCCESS;
+}
+}  // namespace aicpu
+
+extern "C" {
+__attribute__((visibility("default"))) uint32_t RandomChoiceWithMask(void *param) {
+  aicpu::RandomChoiceWithMaskKernel randomChoiceWithMaskKernel;
+  return randomChoiceWithMaskKernel.Compute(param);
+}
+}
--- a/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/random_choice_with_mask_kernels.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_ops/random_choice_with_mask_kernels.h
@ -0,0 +1,36 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AICPU_OPS_AICPU_RANDOM_CHOICE_WITH_MASK_KERNELS_H_
+#define AICPU_OPS_AICPU_RANDOM_CHOICE_WITH_MASK_KERNELS_H_
+
+#include <vector>
+#include "common/kernel_base.h"
+
+namespace aicpu {
+class RandomChoiceWithMaskKernel : public KernelBase {
+ public:
+  RandomChoiceWithMaskKernel() : KernelBase("RandomChoiceWithMask") {}
+  ~RandomChoiceWithMaskKernel() = default;
+
+ protected:
+  int32_t count_ = 0;
+  std::vector<int64_t> dims_;
+  uint32_t DoCompute() override;
+  uint32_t ParseKernelParam() override;
+  void UpdateOutputShapeValue(int32_t non_zero_num, int32_t output_length);
+};
+}  // namespace aicpu
+#endif  // AICPU_OPS_AICPU_RANDOM_CHOICE_WITH_MASK_KERNELS_H_