From d293c5eb26113a8a379e91cc66c7dad90776c604 Mon Sep 17 00:00:00 2001 From: ckey_Dou Date: Tue, 27 Jul 2021 11:13:29 +0800 Subject: [PATCH] using kernel pool to share the compiling results when running on multi cards --- akg | 2 +- .../akg_compiler/akg_process.py | 5 - .../kernel_compiler/akg/akg_kernel_build.cc | 431 +++++++++++++++++- .../kernel_compiler/akg/akg_kernel_build.h | 75 +++ .../backend/kernel_compiler/common_utils.cc | 29 +- .../backend/kernel_compiler/common_utils.h | 3 +- .../runtime/device/gpu/gpu_kernel_build.cc | 3 +- .../runtime/device/gpu/gpu_kernel_runtime.cc | 8 - .../hardware/gpu/gpu_device_context.cc | 9 - 9 files changed, 498 insertions(+), 67 deletions(-) diff --git a/akg b/akg index 4aac4d95750..15b59fb7399 160000 --- a/akg +++ b/akg @@ -1 +1 @@ -Subproject commit 4aac4d95750a87e664f175c0fa946a069f8a0c2a +Subproject commit 15b59fb739944c1903558659a39b34bb632de448 diff --git a/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py b/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py index c6487c9f17c..d3f0bbf1641 100644 --- a/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py +++ b/mindspore/_extends/parallel_compile/akg_compiler/akg_process.py @@ -50,11 +50,6 @@ def _compile_akg_task_gpu(json_strs, attrs): if not res: raise ValueError("Compile error, args: {}! build attrs: {}".format(json_str, attrs)) - pid_path = os.path.realpath("./cuda_meta_" + str(os.getpid())) - if os.path.exists(pid_path): - copy_json(pid_path, os.path.realpath("./cuda_meta_" + str(os.getppid()))) - shutil.rmtree(pid_path) - def _compile_akg_task_ascend(json_strs, attrs): """ diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc index 8b047f153a0..b6d70406d5c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.cc @@ -16,6 +16,11 @@ #include "backend/kernel_compiler/akg/akg_kernel_build.h" +#include +#include +#include +#include + #include #include #include @@ -23,6 +28,7 @@ #include #include #include +#include #include "nlohmann/json.hpp" #include "ir/dtype.h" #include "ir/func_graph.h" @@ -34,9 +40,334 @@ namespace mindspore { namespace kernel { + +#define INIT_SET_FROM_2D_ARRAY(set_var, list_idx) \ + std::set set_var(kernel_lists_[list_idx], kernel_lists_[list_idx] + kernel_lists_[list_idx][kMaxKernelNum_]); + +#define LIST_BEGIN(list_idx) kernel_lists_[list_idx] +#define LIST_END(list_idx) (kernel_lists_[list_idx] + kernel_lists_[list_idx][kMaxKernelNum_]) +#define RESET_LIST_SIZE(list_idx, val) kernel_lists_[list_idx][kMaxKernelNum_] = val + +#define INCREASE_LIST_SIZE(list_idx, val) kernel_lists_[list_idx][kMaxKernelNum_] += val + constexpr int32_t PROCESS_NUM = 16; constexpr int32_t TIME_OUT = 300; +static inline size_t NameToHashID(const std::string &name) { + auto idx = name.find_last_of("_"); + auto hash_id_str = name.substr(idx + 1); + size_t hash_id = 0; + size_t carry = 1; + + for (int i = static_cast(hash_id_str.size() - 1); i >= 0; i--) { + hash_id += static_cast(std::stoi(hash_id_str.substr(static_cast(i), 1))) * carry; + carry *= 10; + } + + return hash_id; +} + +bool AkgKernelPool::LockMng::TryLock() { + // Try to lock 100 times. 
+  // Return false and log errno if the lock cannot be acquired.
+  uint32_t trial = 100;
+
+  int32_t ret = -1;
+  while (trial > 0) {
+    ret = lockf(fd_, F_TLOCK, 0);
+    if (ret == 0 || (errno != EACCES && errno != EAGAIN)) {
+      break;
+    }
+
+    trial--;
+    usleep(5000);
+  }
+
+  if (ret == -1) {
+    MS_LOG(ERROR) << "Failed to acquire the lock, errno:" << strerror(errno) << ".";
+    return false;
+  }
+
+  return true;
+}
+
+void AkgKernelPool::LockMng::Unlock() {
+  auto ret = lockf(fd_, F_ULOCK, 0);
+  if (ret == -1) {
+    MS_LOG(ERROR) << "Failed to release the lock, errno:" << strerror(errno);
+  }
+}
+
+std::string AkgKernelPool::GetCurrentPath() {
+  char cwd[PATH_MAX];
+  char *ret = getcwd(cwd, sizeof(cwd));
+  if (ret == nullptr) {
+    MS_LOG(ERROR) << "Get current working directory failed, errno:" << strerror(errno);
+    return "";
+  }
+
+  char abspath[PATH_MAX];
+  char *res = realpath(cwd, abspath);
+  if (res == nullptr) {
+    MS_LOG(ERROR) << "Change to realpath failed, errno:" << strerror(errno);
+    return "";
+  }
+
+  return std::string(abspath);
+}
+
+void *AkgKernelPool::CreateSharedMem(const std::string &path) {
+  is_creator_ = false;
+
+  auto hash_id = std::hash<std::string>()(path);
+  auto key_id = static_cast<key_t>(hash_id);
+  auto mem_size = sizeof(size_t) * kListNum_ * (kMaxKernelNum_ + 1) + 512;
+
+  {
+    LockMng lock(fd_);
+    if (!lock.locked_) {
+      MS_LOG(ERROR) << "Failed to acquire lock.";
+      return nullptr;
+    }
+
+    // Check whether the shared memory already exists;
+    // remove it if it exists and its attach count (shm_nattch) is 0.
+    struct shmid_ds buf;
+    auto id = shmget(key_id, mem_size, 0);
+    if (id != -1) {
+      auto ret = shmctl(id, IPC_STAT, &buf);
+      if (ret == -1) {
+        MS_LOG(ERROR) << "Failed to get the info of shared memory, errno:" << strerror(errno);
+        return nullptr;
+      }
+
+      if (buf.shm_nattch == 0) {
+        ret = shmctl(id, IPC_RMID, nullptr);
+        if (ret < 0) {
+          MS_LOG(EXCEPTION) << "Release shared_mem failed, errno:" << strerror(errno);
+        }
+      }
+    }
+  }
+
+  LockMng lock(fd_);
+  if (!lock.locked_) {
+    MS_LOG(ERROR) << "Failed to acquire lock.";
+    return nullptr;
+  }
+
+  shm_id_ = shmget(key_id, mem_size, IPC_CREAT | IPC_EXCL | 0600);
+  if (shm_id_ == -1) {
+    if (errno == EEXIST) {
+      shm_id_ = shmget(key_id, mem_size, 0);
+    }
+
+    if (shm_id_ == -1) {
+      MS_LOG(ERROR) << "Create shared_mem failed, errno:" << strerror(errno);
+      return nullptr;
+    }
+  } else {
+    is_creator_ = true;
+  }
+
+  auto local_addr = shmat(shm_id_, nullptr, 0);
+  if (local_addr == reinterpret_cast<void *>(-1)) {
+    MS_LOG(ERROR) << "Attach to shared_mem failed, errno:" << strerror(errno);
+    return nullptr;
+  }
+
+  if (is_creator_) {
+    (void)memset(local_addr, 0, mem_size);
+  }
+
+  return local_addr;
+}
+
+int32_t AkgKernelPool::Init(const std::vector<JsonNodePair> &build_args) {
+  auto cp = GetCurrentPath();
+  if (cp.empty()) {
+    return -1;
+  }
+
+  fd_ = open(kKeyName_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+  if (fd_ == -1) {
+    MS_LOG(ERROR) << "open file <" << kKeyName_ << "> failed, errno:" << strerror(errno);
+    return -1;
+  }
+
+  auto addr = CreateSharedMem(cp);
+  if (addr == nullptr) {
+    return -1;
+  }
+
+  InitKernelLists(addr);
+
+  auto ret = AddKernels(build_args);
+  if (ret != 0) {
+    MS_LOG(ERROR) << "AkgKernelPool AddKernels failed.";
+    return -1;
+  }
+
+  return 0;
+}
+
+AkgKernelPool::~AkgKernelPool() {
+  // Detach shared memory
+  auto ret = shmdt(reinterpret_cast<void *>(kernel_lists_[0]));
+  if (ret < 0) {
+    MS_LOG(EXCEPTION) << "Shared_mem detach failed, errno:" << strerror(errno);
+  }
+
+  // Release shared memory
+  if (is_creator_) {
+    ret = shmctl(shm_id_, IPC_RMID, nullptr);
+    if (ret < 0) {
+      MS_LOG(EXCEPTION) << "Release shared_mem failed, errno:" << strerror(errno);
+    }
+  }
+
+  // Close key file
+  if (fd_ != -1) {
+    (void)close(fd_);
+  }
+}
+
+int32_t AkgKernelPool::AddKernels(const std::vector<JsonNodePair> &build_args) {
+  LockMng lock(fd_);
+  if (!lock.locked_) {
+    MS_LOG(ERROR) << "Failed to acquire lock.";
+    return -1;
+  }
+
+  INIT_SET_FROM_2D_ARRAY(todo_list, kToDoIdx_);
+  INIT_SET_FROM_2D_ARRAY(doing_list, kDoingIdx_);
+  INIT_SET_FROM_2D_ARRAY(done_list, kDoneIdx_);
+
+  for (const auto &[json_generator, anf_node] : build_args) {
+    MS_EXCEPTION_IF_NULL(anf_node);
+    auto kernel_name = json_generator.kernel_name();
+
+    auto hash_id = NameToHashID(kernel_name);
+    if (self_kernel_ids_.count(hash_id) != 0) {
+      MS_LOG(ERROR) << "Duplicated hash_id in list.";
+      return -1;
+    }
+
+    self_kernel_ids_.emplace(hash_id);
+  }
+
+  std::set<size_t> diff_from_todo;
+  std::set<size_t> diff_from_doing;
+  std::set<size_t> diff_from_done;
+
+  // Each unique kernel is added only once, so check whether it already exists in
+  // todo_list, doing_list, or done_list.
+  std::set_difference(self_kernel_ids_.begin(), self_kernel_ids_.end(), todo_list.begin(), todo_list.end(),
+                      std::inserter(diff_from_todo, diff_from_todo.begin()));
+  std::set_difference(diff_from_todo.begin(), diff_from_todo.end(), doing_list.begin(), doing_list.end(),
+                      std::inserter(diff_from_doing, diff_from_doing.begin()));
+  std::set_difference(diff_from_doing.begin(), diff_from_doing.end(), done_list.begin(), done_list.end(),
+                      std::inserter(diff_from_done, diff_from_done.begin()));
+
+  auto new_kernel_size = diff_from_done.size();
+  if (new_kernel_size + todo_list.size() > static_cast<size_t>(kMaxKernelNum_)) {
+    MS_LOG(ERROR) << "The number of new kernels is " << new_kernel_size << ", while the space left in the pool is "
+                  << kMaxKernelNum_ - todo_list.size();
+    return -1;
+  }
+
+  std::copy(diff_from_done.begin(), diff_from_done.end(), LIST_END(kToDoIdx_));
+  INCREASE_LIST_SIZE(kToDoIdx_, new_kernel_size);
+
+  return 0;
+}
+
+int32_t AkgKernelPool::FetchKernels(std::set<size_t> *out) {
+  LockMng lock(fd_);
+  if (!lock.locked_) {
+    MS_LOG(ERROR) << "Failed to acquire lock.";
+    return -1;
+  }
+
+  std::set<size_t> left_in_todo_list;
+
+  // Filter out kernels which belong to other processes.
+  auto FilterBySelfList = [&left_in_todo_list, &out, this](size_t id) {
+    if (this->self_kernel_ids_.count(id) != 0) {
+      out->emplace(id);
+    } else {
+      left_in_todo_list.emplace(id);
+    }
+  };
+
+  std::for_each(LIST_BEGIN(kToDoIdx_), LIST_END(kToDoIdx_), FilterBySelfList);
+
+  std::copy(out->begin(), out->end(), LIST_END(kDoingIdx_));
+  INCREASE_LIST_SIZE(kDoingIdx_, out->size());
+
+  std::copy(left_in_todo_list.begin(), left_in_todo_list.end(), LIST_BEGIN(kToDoIdx_));
+  RESET_LIST_SIZE(kToDoIdx_, left_in_todo_list.size());
+
+  return 0;
+}
+
+int32_t AkgKernelPool::UpdateAndWait(const std::set<size_t> &ids) {
+  if (!ids.empty()) {
+    LockMng lock(fd_);
+    if (!lock.locked_) {
+      MS_LOG(ERROR) << "Failed to acquire lock.";
+      return -1;
+    }
+
+    // Update the state of the finished kernels to `done`.
+    std::copy(ids.begin(), ids.end(), LIST_END(kDoneIdx_));
+    INCREASE_LIST_SIZE(kDoneIdx_, ids.size());
+
+    // Delete the finished kernels from doing_list.
+    std::vector<size_t> left_in_doing_list;
+    INIT_SET_FROM_2D_ARRAY(doing_list, kDoingIdx_);
+    std::set_difference(doing_list.begin(), doing_list.end(), ids.begin(), ids.end(),
+                        std::inserter(left_in_doing_list, left_in_doing_list.begin()));
+
+    std::copy(left_in_doing_list.begin(), left_in_doing_list.end(), LIST_BEGIN(kDoingIdx_));
+    RESET_LIST_SIZE(kDoingIdx_, left_in_doing_list.size());
+  }
+
+  auto ret = Wait();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "AkgKernelPool Wait failed.";
+    return -1;
+  }
+
+  return 0;
+}
+
+int32_t AkgKernelPool::Wait() {
+  // Wait until all the kernels which belong to this process finish compiling.
+  uint32_t trials = 1000;
+
+  while (trials > 0) {
+    {
+      LockMng lock(fd_);
+      if (!lock.locked_) {
+        MS_LOG(ERROR) << "Failed to acquire lock.";
+        return -1;
+      }
+
+      INIT_SET_FROM_2D_ARRAY(done_list, kDoneIdx_);
+
+      if (std::all_of(self_kernel_ids_.begin(), self_kernel_ids_.end(),
+                      [&done_list](size_t id) { return done_list.count(id) != 0; })) {
+        return 0;
+      }
+    }
+
+    usleep(1000000);
+    trials--;
+  }
+
+  MS_LOG(ERROR) << "Timed out while waiting for kernel compiling.";
+  return -1;
+}
+
 std::vector<std::string> AkgKernelBuilder::GetNotCachedKernelJsons(const std::vector<JsonNodePair> &build_args) {
   // Remove cached nodes, gather unique nodes, and collect repeated nodes which need postprocessing.
   std::vector<std::string> jsons;
@@ -66,6 +397,31 @@ std::vector AkgKernelBuilder::GetNotCachedKernelJsons(const std::ve
   return jsons;
 }
 
+std::vector<JsonNodePair> AkgKernelBuilder::GetNotCachedKernels(const std::vector<JsonNodePair> &build_args) {
+  std::unordered_set<std::string> kernel_name_set;
+  std::vector<JsonNodePair> new_build_args;
+  for (const auto &[json_generator, anf_node] : build_args) {
+    MS_EXCEPTION_IF_NULL(anf_node);
+    auto kernel_name = json_generator.kernel_name();
+
+    auto cached_kernel_pack = AkgSearchCache(kernel_name);
+    if (cached_kernel_pack != nullptr) {
+      MS_LOG(DEBUG) << "Use cached kernel, kernel_name[" << kernel_name << "], fullname_with_scope["
+                    << anf_node->fullname_with_scope() << "].";
+      AkgSetKernelMod(cached_kernel_pack, json_generator, anf_node);
+      continue;
+    }
+
+    if (kernel_name_set.count(kernel_name) != 0) {
+      repeat_nodes_.push_back({json_generator, anf_node});
+      continue;
+    }
+    kernel_name_set.insert(kernel_name);
+    new_build_args.push_back({json_generator, anf_node});
+  }
+  return new_build_args;
+}
+
 bool AkgKernelBuilder::InsertToCache(const std::vector<JsonNodePair> &build_args) {
   for (const auto &[json_generator, anf_node] : build_args) {
     auto kernel_name = json_generator.kernel_name();
@@ -97,32 +453,77 @@ bool AkgKernelBuilder::HandleRepeatNodes() {
   return true;
 }
 
+std::vector<std::string> AkgKernelBuilder::GetKernelJsonsByHashId(const std::vector<JsonNodePair> &build_args,
+                                                                  std::set<size_t> fetched_ids) {
+  std::vector<std::string> jsons;
+  for (const auto &[json_generator, anf_node] : build_args) {
+    MS_EXCEPTION_IF_NULL(anf_node);
+    auto kernel_name = json_generator.kernel_name();
+
+    auto hash_id = NameToHashID(kernel_name);
+
+    if (fetched_ids.count(hash_id) == 0) {
+      continue;
+    }
+
+    auto kernel_json = json_generator.kernel_json_str();
+    AkgSaveJsonInfo(kernel_name, kernel_json);
+    jsons.push_back(kernel_json);
+  }
+  return jsons;
+}
+
 bool AkgKernelBuilder::AkgOpParallelBuild(const std::vector<JsonNodePair> &build_args) {
   repeat_nodes_.clear();
-  auto jsons = GetNotCachedKernelJsons(build_args);
-  if (jsons.empty()) {
+  auto new_build_args = GetNotCachedKernels(build_args);
+  if (new_build_args.empty()) {
     return true;
   }
-  auto client = GetClient();
-  MS_EXCEPTION_IF_NULL(client);
-  if (!client->AkgStart(PROCESS_NUM, TIME_OUT)) {
-    MS_LOG(ERROR) << "Akg start failed.";
+  AkgKernelPool kp;
+  auto ret = kp.Init(new_build_args);
+  if (ret != 0) {
+    MS_LOG(ERROR) << "AkgKernelPool init failed.";
     return false;
   }
-  auto attrs = CollectBuildAttrs();
-  if (!attrs.empty() && !client->AkgSendAttr(attrs)) {
-    MS_LOG(ERROR) << "Akg send attr failed.";
+
+  std::set<size_t> fetched_ids;
+  ret = kp.FetchKernels(&fetched_ids);
+  if (ret != 0) {
+    MS_LOG(ERROR) << "AkgKernelPool FetchKernels failed.";
     return false;
   }
- if (!client->AkgSendData(jsons)) { - MS_LOG(ERROR) << "Akg send data failed."; - return false; - } - if (!client->AkgWait()) { - MS_LOG(ERROR) << "Akg compile failed."; + + if (!fetched_ids.empty()) { + auto jsons = GetKernelJsonsByHashId(new_build_args, fetched_ids); + + auto client = GetClient(); + MS_EXCEPTION_IF_NULL(client); + if (!client->AkgStart(PROCESS_NUM, TIME_OUT)) { + MS_LOG(ERROR) << "Akg start failed."; + return false; + } + auto attrs = CollectBuildAttrs(); + if (!attrs.empty() && !client->AkgSendAttr(attrs)) { + MS_LOG(ERROR) << "Akg send attr failed."; + return false; + } + if (!client->AkgSendData(jsons)) { + MS_LOG(ERROR) << "Akg send data failed."; + return false; + } + if (!client->AkgWait()) { + MS_LOG(ERROR) << "Akg compile failed."; + return false; + } + } + + ret = kp.UpdateAndWait(fetched_ids); + if (ret != 0) { + MS_LOG(ERROR) << "AkgKernelPool UpdateAndWait failed."; return false; } + // All unique done here, cache them and set kernel. if (!InsertToCache(build_args)) { MS_LOG(ERROR) << "Insert cache failed."; diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h index c0012ece6ff..9f9958f1464 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_build.h @@ -17,10 +17,13 @@ #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_ #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AKG_AKG_KERNEL_BUILD_H_ +#include + #include #include #include #include +#include #include "ir/anf.h" #include "backend/kernel_compiler/kernel.h" #include "backend/session/kernel_build_client.h" @@ -45,12 +48,84 @@ class AkgKernelBuilder { private: std::vector GetNotCachedKernelJsons(const std::vector &build_args); + std::vector GetNotCachedKernels(const std::vector &build_args); + std::vector GetKernelJsonsByHashId(const std::vector &build_args, + std::set fetched_ids); bool InsertToCache(const std::vector &build_args); bool HandleRepeatNodes(); bool AkgOpParallelBuild(const std::vector &build_args); std::vector repeat_nodes_; std::string CollectBuildAttrs(); }; + +class AkgKernelPool { + public: + class LockMng { + public: + explicit LockMng(int32_t fd) { + fd_ = fd; + locked_ = TryLock(); + } + + virtual ~LockMng() { + if (locked_) { + Unlock(); + } + } + + bool locked_{false}; + + private: + bool TryLock(); + void Unlock(); + + int32_t fd_{-1}; + }; + + public: + AkgKernelPool() = default; + virtual ~AkgKernelPool(); + + int32_t Init(const std::vector &build_args); + int32_t FetchKernels(std::set *out); + int32_t UpdateAndWait(const std::set &ids); + + constexpr inline static size_t kMaxKernelNum_{1000}; + constexpr inline static key_t kSharedMemKey_{0x57565845}; + + // allocate memory for todo_list, doing_list, done_list + constexpr inline static size_t kListNum_{3}; + + constexpr inline static auto kKeyName_ = "./akg_build_tmp.key"; + + constexpr inline static int32_t kToDoIdx_ = 0; + constexpr inline static int32_t kDoingIdx_ = 1; + constexpr inline static int32_t kDoneIdx_ = 2; + + private: + void *CreateSharedMem(const std::string &path); + std::string GetCurrentPath(); + + inline void InitKernelLists(void *addr) { + kernel_lists_[kToDoIdx_] = reinterpret_cast(addr); + kernel_lists_[kDoingIdx_] = kernel_lists_[kToDoIdx_] + kMaxKernelNum_ + 1; + kernel_lists_[kDoneIdx_] = kernel_lists_[kDoingIdx_] + kMaxKernelNum_ + 1; + } + + int32_t AddKernels(const std::vector &kernel_jsons); + int32_t Wait(); + 
+ int32_t shm_id_{-1}; + bool is_creator_{false}; + int32_t fd_{-1}; + + // includes 3 lists: todo_list, doing_list, done_list. + // each list has kMaxKernelNum_ + 1 elements and, the count of elements in each list + // is stored in kernel_lists_[xx][kMaxKernelNum_] + size_t *kernel_lists_[kListNum_]{nullptr, nullptr, nullptr}; + + std::set self_kernel_ids_; +}; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc index edc94673083..7d19cf65a0c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.cc @@ -141,14 +141,8 @@ FusionType GetFusionTypeByName(const std::string &name) { return iter->first; } -void KernelMeta::Initialize(int pid) { - if (pid == -1) { - kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(getpid()) + "/"; - } else { - kernel_meta_path_ = std::string(kGpuKernelMeta) + "_" + std::to_string(pid) + "/"; - } - // remove old kernel cache - RemoveKernelCache(); +void KernelMeta::Initialize() { + kernel_meta_path_ = std::string(kGpuKernelMeta) + "/"; #if defined(_WIN32) || defined(_WIN64) auto ret = mkdir(kernel_meta_path_.c_str()); @@ -161,21 +155,6 @@ void KernelMeta::Initialize(int pid) { initialized_ = true; } -void KernelMeta::RemoveKernelCache() { - DIR *dir = opendir(kernel_meta_path_.c_str()); - if (dir == nullptr) { - return; - } - struct dirent *entry; - while ((entry = readdir(dir)) != nullptr) { - std::string kernel_file = entry->d_name; - std::string kernel_file_realpath = kernel_meta_path_ + kernel_file; - (void)remove(kernel_file_realpath.c_str()); - } - (void)closedir(dir); - (void)rmdir(kernel_meta_path_.c_str()); -} - std::string KernelMeta::Search(const std::string &kernel_name) const { if (!initialized_) { return ""; @@ -227,7 +206,7 @@ KernelPackPtr SearchCache(const std::string &kernel_name, const std::string &pro KernelPackPtr kernel_pack = std::make_shared(); // just a tmp solution. 
if (!kernel_pack->ReadFromJsonFile(kernel_json, processor)) { - MS_LOG(DEBUG) << "Read cache json and bin file failed[" << kernel_json << "]."; + MS_LOG(ERROR) << "Read cache json and bin file failed[" << kernel_json << "]."; return nullptr; } else { return kernel_pack; @@ -250,7 +229,7 @@ KernelPackPtr InsertCache(const std::string &kernel_name, const std::string &pro (void)kernel_json.append(kernel_name).append(kJsonSuffix); KernelPackPtr kernel_pack = std::make_shared(); if (!kernel_pack->ReadFromJsonFile(kernel_json, processor)) { - MS_LOG(DEBUG) << "Read json and bin file failed[" << kernel_json << "]."; + MS_LOG(ERROR) << "Read json and bin file failed[" << kernel_json << "]."; return nullptr; } diff --git a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h index 9c50ea0213f..507517954bd 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/common_utils.h +++ b/mindspore/ccsrc/backend/kernel_compiler/common_utils.h @@ -55,8 +55,7 @@ using KernelMetaPtr = std::shared_ptr; class KernelMeta { public: KernelMeta() = default; - void Initialize(int pid); - void RemoveKernelCache(); + void Initialize(); std::string Search(const std::string &kernel_name) const; bool Insert(const std::string &kernel_name, const std::string &kernel_json); std::string kernel_meta_path() const { return kernel_meta_path_; } diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc index 5be77aef128..6bb925b043b 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_build.cc @@ -41,8 +41,7 @@ void CreateGPUKernel(const std::vector &kernels) { if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) == KernelType::AKG_KERNEL) { if (!bin_map->initialized()) { - auto pid = mindspore::kernel::GpuKernelBuildClient::Instance().AkgGetPid(); - bin_map->Initialize(pid); + bin_map->Initialize(); } if (!already_check_nvcc) { already_check_nvcc = true; diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index e56bdcfa5ad..f483c796075 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -272,14 +272,6 @@ void GPUKernelRuntime::ReleaseDeviceRes() { if (mem_manager_ != nullptr) { mem_manager_->FreeDeviceMemory(); } - - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - if (!(context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG))) { - kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance(); - MS_EXCEPTION_IF_NULL(bin_map); - bin_map->RemoveKernelCache(); - } } void GPUKernelRuntime::ClearGraphRuntimeResource(uint32_t graph_id, const std::vector &inputs, diff --git a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc index 4264cdf6d81..fa92a5aac3f 100644 --- a/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc +++ b/mindspore/ccsrc/runtime/hardware/gpu/gpu_device_context.cc @@ -152,15 +152,6 @@ void GPUDeviceContext::Destroy() { mem_manager_->FreeDeviceMemory(); mem_manager_ = nullptr; } - - // Clean GPU cache kernels which is generated by AKG - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - if (!(context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG))) { - kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance(); - 
MS_EXCEPTION_IF_NULL(bin_map); - bin_map->RemoveKernelCache(); - } } bool GPUDeviceContext::AllocateMemory(DeviceAddress *const &address, size_t size) const {
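
Note: the new AkgKernelPool coordinates multiple builder processes (one per card) through a lockf()-guarded key file plus a System V shared-memory block holding three fixed-size lists (todo / doing / done), where slot [kMaxKernelNum_] of each list stores that list's element count. The standalone sketch below is not MindSpore code: the key value, file name, list size and kernel ids are illustrative, and it uses a blocking lockf(F_LOCK) instead of the pool's retrying F_TLOCK; it only shows the publish / claim / mark-done layout in isolation.

// Standalone sketch of the shared todo/doing/done layout used by AkgKernelPool.
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

constexpr size_t kMaxNum = 1000;  // capacity of each list
constexpr size_t kListNum = 3;    // todo, doing, done

int main() {
  // The key file doubles as the lock object shared by all builder processes.
  int fd = open("./akg_build_demo.key", O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
  if (fd == -1) { perror("open"); return 1; }
  if (lockf(fd, F_LOCK, 0) == -1) { perror("lockf"); return 1; }

  // One shared block: kListNum lists, each with kMaxNum entries plus a trailing count slot.
  size_t mem_size = sizeof(size_t) * kListNum * (kMaxNum + 1);
  int shm_id = shmget(0x5745, mem_size, IPC_CREAT | 0600);
  if (shm_id == -1) { perror("shmget"); return 1; }
  size_t *base = static_cast<size_t *>(shmat(shm_id, nullptr, 0));
  if (base == reinterpret_cast<size_t *>(-1)) { perror("shmat"); return 1; }

  size_t *todo = base;                      // list 0
  size_t *done = base + 2 * (kMaxNum + 1);  // list 2 ("doing" is skipped in this demo)

  // Publish two pending kernel ids, as AddKernels does with the name hash ids.
  todo[todo[kMaxNum]] = 101;
  todo[kMaxNum] += 1;
  todo[todo[kMaxNum]] = 202;
  todo[kMaxNum] += 1;

  // Claim every pending id and mark it done, as FetchKernels + UpdateAndWait do
  // around the actual compile step.
  for (size_t i = 0; i < todo[kMaxNum]; ++i) {
    done[done[kMaxNum]] = todo[i];
    done[kMaxNum] += 1;
  }
  todo[kMaxNum] = 0;

  printf("kernels marked done: %zu\n", done[kMaxNum]);

  (void)lockf(fd, F_ULOCK, 0);
  (void)shmdt(base);
  (void)shmctl(shm_id, IPC_RMID, nullptr);  // demo cleanup; the real pool removes it only as creator
  (void)close(fd);
  return 0;
}

In the patch itself, AkgOpParallelBuild drives this as Init() -> FetchKernels() -> compile -> UpdateAndWait(); the creator process removes the segment in its destructor, and stale segments whose attach count has dropped to zero are cleaned up on the next Init().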