diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
index ebf766f42a9..65c5a0aaf1e 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_kernel_executor.cc
@@ -375,7 +375,6 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
   auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
   KernelType kernel_type = AnfAlgo::GetKernelType(kernel);
   MS_EXCEPTION_IF_NULL(kernel);
-  MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
   (void)res_manager_->BindDeviceToCurrentThread();
 
   std::vector<AddressPtr> real_inputs;
@@ -407,7 +406,6 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
-  MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
 #ifndef ENABLE_SECURITY
   auto profiler_inst = profiler::ascend::PynativeProfiler::GetInstance();
   MS_EXCEPTION_IF_NULL(profiler_inst);
@@ -415,7 +413,9 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
   profiler_inst->OpDataProducerBegin(res_manager_->runtime_instance_, stream, t_id, kernel->fullname_with_scope(),
                                      is_dynamic_shape);
 #endif
+  MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
   ret = kernel_mod->Launch(real_inputs, workspace, outputs, stream);
+  MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
 #ifndef ENABLE_SECURITY
   profiler_inst->OpDataProducerEnd(t_id, is_dynamic_shape);
 #endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc
index 21656d826f6..a04f6461f98 100644
--- a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_device_context.cc
@@ -340,7 +340,6 @@ bool CPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
                                      const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs,
                                      size_t /* stream_id */) const {
   MS_EXCEPTION_IF_NULL(kernel);
-  MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
   auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
   MS_EXCEPTION_IF_NULL(kernel_mod);
@@ -355,10 +354,16 @@ bool CPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
   if (profiler_inst->GetEnableFlag()) {
-    return LaunchKernelWithProfiling(kernel, inputs, workspace, outputs);
+    MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
+    auto ret = LaunchKernelWithProfiling(kernel, inputs, workspace, outputs);
+    MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
+    return ret;
   }
 #endif
-  return DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
+  MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
+  auto ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
+  MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
+  return ret;
 }
 
 bool CPUDeviceResManager::LoadCollectiveCommLib() {
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
index aa7501cdc43..c88de549986 100644
--- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc
@@ -599,13 +599,15 @@ bool GPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
   if (!profiler_inst->GetEnableFlag()) {
 #endif
     auto lock = LockLaunchKernel(stream);
-    MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
+    MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
     ret = DoLaunchKernel(kernel, inputs, workspace, outputs, stream);
+    MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
 #ifndef ENABLE_SECURITY
   } else {
     auto lock = LockLaunchKernel(stream);
-    MS_LOG(DEBUG) << "Launch kernel: " << kernel->fullname_with_scope();
+    MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
     ret = LaunchKernelWithProfiling(kernel, inputs, workspace, outputs, stream);
+    MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
   }
 #endif
   if (!ret) {
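
Reviewer note: all three device executors (Ascend, CPU, GPU) replace the single "Launch kernel" debug line with a Begin/End pair wrapped tightly around the actual launch call, so a device hang or crash can be attributed to the one kernel whose Begin line has no matching End. Below is a minimal standalone sketch of the same bracketing idea, using only the standard library and a hypothetical ScopedLaunchLog guard; the patch itself logs explicitly rather than via RAII.

    #include <iostream>
    #include <string>
    #include <utility>

    // Hypothetical RAII helper: emits the same Begin/End bracket on every path out of scope.
    class ScopedLaunchLog {
     public:
      explicit ScopedLaunchLog(std::string name) : name_(std::move(name)) {
        std::clog << "Begin launch kernel: " << name_ << '\n';
      }
      ~ScopedLaunchLog() { std::clog << "End launch kernel: " << name_ << '\n'; }

     private:
      std::string name_;
    };

    bool LaunchWithBracket() {
      ScopedLaunchLog bracket("Default/Add-op1");  // illustrative kernel scope name
      // ... the real code would call kernel_mod->Launch(...) here ...
      return true;
    }

    int main() { return LaunchWithBracket() ? 0 : 1; }
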
diff --git a/mindspore/ccsrc/runtime/device/common_somas_allocator.cc b/mindspore/ccsrc/runtime/device/common_somas_allocator.cc
index 0f8b62da569..bdd0dce2823 100644
--- a/mindspore/ccsrc/runtime/device/common_somas_allocator.cc
+++ b/mindspore/ccsrc/runtime/device/common_somas_allocator.cc
@@ -53,11 +53,11 @@ uint8_t *CommonSomasAllocator::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) {
   MS_EXCEPTION_IF_NULL(node);
   auto kernel_info = dynamic_cast<device::KernelInfo *>(node->kernel_info());
   MS_EXCEPTION_IF_NULL(kernel_info);
-  if (index >= kernel_info->somas_output_offset_aligned_size_list().size()) {
+  if (index >= kernel_info->somas_output_result().size()) {
     MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:["
-                      << kernel_info->somas_output_offset_aligned_size_list().size() << "]";
+                      << kernel_info->somas_output_result().size() << "]";
   }
-  auto somas_offset_aligned_size = kernel_info->somas_output_offset_aligned_size_list()[index];
+  auto somas_offset_aligned_size = kernel_info->somas_output_result()[index];
   if (somas_offset_aligned_size.second == 0) {
     return nullptr;
   }
@@ -70,11 +70,11 @@ uint8_t *CommonSomasAllocator::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) {
   MS_EXCEPTION_IF_NULL(node);
   auto kernel_info = dynamic_cast<device::KernelInfo *>(node->kernel_info());
   MS_EXCEPTION_IF_NULL(kernel_info);
-  if (index >= kernel_info->somas_workspace_offset_aligned_size_list().size()) {
+  if (index >= kernel_info->somas_workspace_result().size()) {
     MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:["
-                      << kernel_info->somas_workspace_offset_aligned_size_list().size() << "]";
+                      << kernel_info->somas_workspace_result().size() << "]";
   }
-  auto somas_offset_aligned_size = kernel_info->somas_workspace_offset_aligned_size_list()[index];
+  auto somas_offset_aligned_size = kernel_info->somas_workspace_result()[index];
   if (somas_offset_aligned_size.second == 0) {
     return nullptr;
   }
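
Reviewer note: the renamed accessors return the somas plan as (offset, aligned size) pairs, and a zero aligned size is the sentinel for "this tensor is not somas-managed", which is why both getters above return nullptr in that case. A standalone sketch of that lookup, with illustrative names only:

    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Resolve a somas record to a concrete pointer inside the somas base block.
    // A zero aligned size means the tensor was not planned by somas.
    uint8_t *ResolveSomasPtr(uint8_t *base_address, const std::vector<std::pair<size_t, size_t>> &somas_result,
                             size_t index) {
      const auto &offset_aligned_size = somas_result.at(index);  // throws on out-of-range, like the MS_LOG(EXCEPTION) path
      if (offset_aligned_size.second == 0) {
        return nullptr;
      }
      return base_address + offset_aligned_size.first;
    }
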
diff --git a/mindspore/ccsrc/runtime/device/kernel_info.cc b/mindspore/ccsrc/runtime/device/kernel_info.cc
index 0e9058984c7..a08ac6ec670 100644
--- a/mindspore/ccsrc/runtime/device/kernel_info.cc
+++ b/mindspore/ccsrc/runtime/device/kernel_info.cc
@@ -16,6 +16,7 @@
 
 #include "runtime/device/kernel_info.h"
 #include <utility>
+#include "utils/ms_context.h"
 
 namespace mindspore {
 namespace device {
@@ -116,6 +117,25 @@ bool KernelInfo::SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
   return true;
 }
 
+bool KernelInfo::IsTensorEnableSomas(const std::vector<std::pair<size_t, size_t>> &somas_result,
+                                     size_t tensor_index) const {
+  // Somas is currently disabled in the runtime, so return false for every memory optimize level.
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if ((ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO0) ||
+      (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1)) {
+    return false;
+  }
+
+  if (somas_result.empty()) {
+    return false;
+  }
+  if (tensor_index >= somas_result.size()) {
+    MS_LOG(EXCEPTION) << "The tensor index:" << tensor_index << " is out of range:" << somas_result.size();
+  }
+  return (somas_result[tensor_index].second != 0);
+}
+
 void KernelInfo::set_kernel_mod(const kernel::KernelModPtr &kernel_mod) { kernel_mod_ = kernel_mod; }
 
 kernel::KernelMod *KernelInfo::MutableKernelMod() const { return kernel_mod_.get(); }
diff --git a/mindspore/ccsrc/runtime/device/kernel_info.h b/mindspore/ccsrc/runtime/device/kernel_info.h
index 9c8dbf5dc12..83c81c4a943 100644
--- a/mindspore/ccsrc/runtime/device/kernel_info.h
+++ b/mindspore/ccsrc/runtime/device/kernel_info.h
@@ -58,8 +58,6 @@ class KernelInfo : public KernelInfoDevice {
   DeviceAddressPtr GetMutableWorkspaceAddr(size_t index) const;
   bool WorkspaceAddrExist(size_t index) const;
   bool SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t index);
-  bool SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
-                      std::vector<std::pair<size_t, size_t>> &&workspace_somas_result);
   void set_kernel_mod(const kernel::KernelModPtr &kernel_mod);
   kernel::KernelMod *MutableKernelMod() const;
   const kernel::KernelMod *kernel_mod() const;
@@ -73,12 +71,7 @@ class KernelInfo : public KernelInfoDevice {
   uint32_t graph_id() const { return graph_id_; }
   bool operator==(const KernelInfo &other) const;
   bool is_feature_map() const { return is_feature_map_; }
-  const std::vector<std::pair<size_t, size_t>> &somas_output_offset_aligned_size_list() const {
-    return somas_output_result_;
-  }
-  const std::vector<std::pair<size_t, size_t>> &somas_workspace_offset_aligned_size_list() const {
-    return somas_workspace_result_;
-  }
+
   const std::vector<DeviceAddressPtr> &output_address_list() const { return output_address_list_; }
   const std::vector<DeviceAddressPtr> &workspace_address_list() const { return workspace_address_list_; }
@@ -87,6 +80,13 @@ class KernelInfo : public KernelInfoDevice {
   void set_ref_map(const bool &all_ref, const OutputInputRefMap &ref_map);
   const OutputInputRefMap &out_in_ref_map() const { return out_in_ref_map_; }
 
+  // The somas interface.
+  bool SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
+                      std::vector<std::pair<size_t, size_t>> &&workspace_somas_result);
+  bool IsTensorEnableSomas(const std::vector<std::pair<size_t, size_t>> &somas_result, size_t tensor_index) const;
+  const std::vector<std::pair<size_t, size_t>> &somas_output_result() const { return somas_output_result_; }
+  const std::vector<std::pair<size_t, size_t>> &somas_workspace_result() const { return somas_workspace_result_; }
+
  private:
   bool is_feature_map_;
   kernel::KernelBuildInfoPtr select_kernel_build_info_;
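
Reviewer note: IsTensorEnableSomas is the single predicate the rest of the patch keys on: a tensor is somas-managed only if somas is enabled globally and the plan recorded a non-zero aligned size for it. A standalone sketch of the same decision, with the MS_CTX_MEMORY_OPTIMIZE_LEVEL lookup reduced to a plain bool parameter:

    #include <cstddef>
    #include <stdexcept>
    #include <utility>
    #include <vector>

    bool IsTensorEnableSomas(const std::vector<std::pair<size_t, size_t>> &somas_result, size_t tensor_index,
                             bool somas_globally_enabled) {
      if (!somas_globally_enabled || somas_result.empty()) {
        return false;
      }
      if (tensor_index >= somas_result.size()) {
        throw std::out_of_range("tensor index out of range");
      }
      // A non-zero aligned size means the tensor is covered by the somas plan.
      return somas_result[tensor_index].second != 0;
    }
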
"data_arrow" : "control_arrow"; + ofs << "\t\tmemory_free_insert_position:" << actor->memory_free_insert_position().first + << "\tinsert_arrow_type:" << arrow_type << "\n"; + } ofs << "\n"; } diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc index 18276cbe11c..cb3255d7f1a 100644 --- a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc +++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc @@ -27,6 +27,20 @@ namespace mindspore { namespace runtime { +namespace { +bool IsSomasEnable(const SomasInfo *somas_info) { + // Somas is currently closed in the runtime. + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if ((ms_context->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO0) || + (ms_context->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1)) { + return false; + } + + return ((somas_info != nullptr) && (somas_info->whole_block_size_ != 0)); +} +} // namespace + using distributed::collective::CollectiveManager; using distributed::recovery::RecoveryContext; @@ -42,46 +56,18 @@ void KernelActor::Init() { MS_EXCEPTION_IF_NULL(kernel_); real_input_num_ = common::AnfAlgo::GetInputTensorNum(kernel_); kernel_info_ = dynamic_cast(kernel_->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info_); is_dynamic_shape_ = common::AnfAlgo::IsDynamicShape(kernel_); - - for (size_t i = 0; i < real_input_num_; ++i) { - const auto &input_device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel_, i, false); - MS_EXCEPTION_IF_NULL(input_device_tensor); - (void)real_input_data_infos_.emplace_back( - std::make_shared(input_device_tensor->format(), input_device_tensor->host_shape(), - input_device_tensor->GetSize(), input_device_tensor->type_id())); + if (is_dynamic_shape_ && IsSomasEnable(somas_info_)) { + MS_LOG(EXCEPTION) << "Not support the somas for the dynamic shape: " << GetAID().Name(); } // Init the device tensors and kernel launch info. - copy_input_device_tensors_.resize(real_input_num_); - input_device_tensors_.resize(real_input_num_); - for (auto &input_address : input_device_tensors_) { - (void)memory_free_list_.emplace_back(input_address); - (void)launch_info_.inputs_.emplace_back(std::make_shared
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc
index 18276cbe11c..cb3255d7f1a 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.cc
@@ -27,6 +27,20 @@
 namespace mindspore {
 namespace runtime {
+namespace {
+bool IsSomasEnable(const SomasInfo *somas_info) {
+  // Somas is currently disabled in the runtime, so return false for every memory optimize level.
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if ((ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO0) ||
+      (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1)) {
+    return false;
+  }
+
+  return ((somas_info != nullptr) && (somas_info->whole_block_size_ != 0));
+}
+}  // namespace
+
 using distributed::collective::CollectiveManager;
 using distributed::recovery::RecoveryContext;
@@ -42,46 +56,18 @@ void KernelActor::Init() {
   MS_EXCEPTION_IF_NULL(kernel_);
   real_input_num_ = common::AnfAlgo::GetInputTensorNum(kernel_);
   kernel_info_ = dynamic_cast<KernelInfo *>(kernel_->kernel_info());
+  MS_EXCEPTION_IF_NULL(kernel_info_);
   is_dynamic_shape_ = common::AnfAlgo::IsDynamicShape(kernel_);
-
-  for (size_t i = 0; i < real_input_num_; ++i) {
-    const auto &input_device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel_, i, false);
-    MS_EXCEPTION_IF_NULL(input_device_tensor);
-    (void)real_input_data_infos_.emplace_back(
-      std::make_shared<InputDataInfo>(input_device_tensor->format(), input_device_tensor->host_shape(),
-                                      input_device_tensor->GetSize(), input_device_tensor->type_id()));
+  if (is_dynamic_shape_ && IsSomasEnable(somas_info_)) {
+    MS_LOG(EXCEPTION) << "Somas does not support dynamic shape: " << GetAID().Name();
   }
 
   // Init the device tensors and kernel launch info.
-  copy_input_device_tensors_.resize(real_input_num_);
-  input_device_tensors_.resize(real_input_num_);
-  for (auto &input_address : input_device_tensors_) {
-    (void)memory_free_list_.emplace_back(input_address);
-    (void)launch_info_.inputs_.emplace_back(std::make_shared<Address>());
-  }
-  MS_EXCEPTION_IF_NULL(kernel_info_);
-  for (auto &output_address : kernel_info_->output_address_list()) {
-    MS_EXCEPTION_IF_NULL(output_address);
-    (void)output_device_tensors_.emplace_back(output_address.get());
-    (void)memory_alloc_list_.emplace_back(output_address.get());
-    (void)memory_free_list_.emplace_back(output_address.get());
-    (void)launch_info_.outputs_.emplace_back(std::make_shared<Address>());
-  }
-  for (auto &external_reference_tensor : external_reference_tensors_) {
-    (void)memory_free_list_.emplace_back(external_reference_tensor);
-  }
-  // The size of workspace may be changed in dynamic shape, so put workspace_address at the end of memory_alloc_list_
-  // and memory_free_list_, for the dynamic shape handling in FetchWorkspaceDeviceTensor.
-  for (auto &workspace_address : kernel_info_->workspace_address_list()) {
-    MS_EXCEPTION_IF_NULL(workspace_address);
-    (void)workspace_device_tensors_.emplace_back(workspace_address.get());
-    (void)memory_alloc_list_.emplace_back(workspace_address.get());
-    (void)memory_free_list_.emplace_back(workspace_address.get());
-    (void)launch_info_.workspaces_.emplace_back(std::make_shared<Address>());
-  }
+  InitInputInfo();
+  InitOutputInfo();
+  InitWorkspaceInfo();
 
   // Init the output data.
-  output_data_by_output_index_.resize(output_device_tensors_.size());
   InitOutputData();
   if (output_data_.size() != output_data_arrows_.size()) {
     MS_LOG(EXCEPTION) << "The output data size is wrong: " << GetAID().Name();
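
Reviewer note: Init() is now a thin driver over InitInputInfo/InitOutputInfo/InitWorkspaceInfo, and it rejects the somas + dynamic-shape combination up front. The call order is load-bearing: inputs must be appended to memory_free_list_ first so that slot i still corresponds to input i when FetchInputDeviceTensor later overwrites memory_free_list_[input_index]. The IsSomasEnable helper also shows that somas stays hard-disabled for now, since both defined optimize levels return false. A standalone sketch of that gate, with MindSpore types reduced to plain declarations:

    #include <cstddef>

    enum MemoryOptimizeLevel { kOptimizeO0, kOptimizeO1 };  // mirrors the two context levels checked above

    struct SomasInfo {
      size_t whole_block_size_ = 0;  // size of the fused memory block planned by somas
    };

    bool IsSomasEnable(const SomasInfo *somas_info, MemoryOptimizeLevel level) {
      if (level == kOptimizeO0 || level == kOptimizeO1) {
        return false;  // somas is currently disabled in the runtime for every level
      }
      return somas_info != nullptr && somas_info->whole_block_size_ != 0;
    }
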
@@ -95,12 +81,88 @@ void KernelActor::Init() {
       MS_LOG(EXCEPTION) << "The output index is out of range: " << GetAID().Name();
     }
     data->data_ = output_device_tensors_[IntToSize(data_arrow->from_output_index_)];
-    (void)output_data_by_output_index_[IntToSize(data_arrow->from_output_index_)].emplace_back(data);
-
     ++output_data_index;
   }
 }
 
+void KernelActor::InitInputInfo() {
+  for (size_t i = 0; i < real_input_num_; ++i) {
+    const auto &input_device_tensor = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel_, i, false);
+    MS_EXCEPTION_IF_NULL(input_device_tensor);
+    (void)real_input_data_infos_.emplace_back(
+      std::make_shared<InputDataInfo>(input_device_tensor->format(), input_device_tensor->host_shape(),
+                                      input_device_tensor->GetSize(), input_device_tensor->type_id()));
+  }
+
+  copy_input_device_tensors_.resize(real_input_num_);
+  input_device_tensors_.resize(real_input_num_);
+  for (auto &input_address : input_device_tensors_) {
+    (void)memory_free_list_.emplace_back(input_address);
+    (void)launch_info_.inputs_.emplace_back(std::make_shared<Address>());
+  }
+}
+
+void KernelActor::InitOutputInfo() {
+  MS_EXCEPTION_IF_NULL(kernel_info_);
+  const auto &output_addresses = kernel_info_->output_address_list();
+  const auto &somas_outputs = kernel_info_->somas_output_result();
+  bool output_need_somas = false;
+  for (size_t i = 0; i < output_addresses.size(); ++i) {
+    auto &output_address = output_addresses[i];
+    MS_EXCEPTION_IF_NULL(output_address);
+    (void)output_device_tensors_.emplace_back(output_address.get());
+    (void)launch_info_.outputs_.emplace_back(std::make_shared<Address>());
+
+    // An output taken over by somas does not need to allocate memory.
+    if (kernel_info_->IsTensorEnableSomas(somas_outputs, i)) {
+      MS_EXCEPTION_IF_CHECK_FAIL((somas_outputs[i].second >= output_address->GetSize()), "The somas size is wrong.");
+      UpdateRefCount(output_address.get(), true);
+      output_need_somas = true;
+    } else {
+      (void)memory_alloc_list_.emplace_back(output_address.get());
+      (void)memory_free_list_.emplace_back(output_address.get());
+    }
+  }
+
+  if (output_need_somas && (!IsSomasEnable(somas_info_))) {
+    MS_LOG(EXCEPTION) << "Somas is not enabled for: " << GetAID().Name();
+  }
+
+  for (auto &external_reference_tensor : external_reference_tensors_) {
+    (void)memory_free_list_.emplace_back(external_reference_tensor);
+  }
+}
+
+void KernelActor::InitWorkspaceInfo() {
+  MS_EXCEPTION_IF_NULL(kernel_info_);
+  // The size of workspace may be changed in dynamic shape, so put the workspace addresses at the end of
+  // memory_alloc_list_ and memory_free_list_ for the dynamic shape handling in FetchWorkspaceDeviceTensor.
+  const auto &workspace_addresses = kernel_info_->workspace_address_list();
+  const auto &somas_workspace = kernel_info_->somas_workspace_result();
+  bool workspace_need_somas = false;
+  for (size_t i = 0; i < workspace_addresses.size(); ++i) {
+    auto &workspace_address = workspace_addresses[i];
+    MS_EXCEPTION_IF_NULL(workspace_address);
+    (void)workspace_device_tensors_.emplace_back(workspace_address.get());
+    (void)launch_info_.workspaces_.emplace_back(std::make_shared<Address>());
+
+    // A workspace taken over by somas does not need to allocate memory.
+    if (kernel_info_->IsTensorEnableSomas(somas_workspace, i)) {
+      MS_EXCEPTION_IF_CHECK_FAIL((somas_workspace[i].second >= workspace_address->GetSize()),
+                                 "The somas size is wrong.");
+      UpdateRefCount(workspace_address.get(), true);
+      workspace_need_somas = true;
+    } else {
+      (void)memory_alloc_list_.emplace_back(workspace_address.get());
+      (void)memory_free_list_.emplace_back(workspace_address.get());
+    }
+  }
+
+  if (workspace_need_somas && (!IsSomasEnable(somas_info_))) {
+    MS_LOG(EXCEPTION) << "Somas is not enabled for: " << GetAID().Name();
+  }
+}
+
 void KernelActor::Run(OpContext<DeviceTensor> *const context) {
   MS_EXCEPTION_IF_NULL(context);
   MS_EXCEPTION_IF_NULL(device_contexts_[0]);
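
Reviewer note: the two Init*Info loops partition each tensor list: somas-managed entries are pinned via UpdateRefCount(addr, true), so the normal free path never releases memory that belongs to the shared somas block, and they are excluded from memory_alloc_list_/memory_free_list_; only the remaining tensors keep going through per-kernel allocate/free. A standalone sketch of that partitioning, with MindSpore types reduced to plain structs:

    #include <cstddef>
    #include <utility>
    #include <vector>

    struct DeviceTensor {
      bool ref_count_pinned = false;  // stands in for UpdateRefCount(addr, true)
    };

    void PartitionBySomas(std::vector<DeviceTensor> &tensors,
                          const std::vector<std::pair<size_t, size_t>> &somas_result,
                          std::vector<DeviceTensor *> *alloc_list, std::vector<DeviceTensor *> *free_list) {
      for (size_t i = 0; i < tensors.size(); ++i) {
        const bool somas_managed = (i < somas_result.size()) && (somas_result[i].second != 0);
        if (somas_managed) {
          tensors[i].ref_count_pinned = true;  // memory lives in the somas block, never freed per kernel
        } else {
          alloc_list->push_back(&tensors[i]);
          free_list->push_back(&tensors[i]);
        }
      }
    }
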
@@ -129,9 +191,12 @@ void KernelActor::FetchWorkspaceDeviceTensor() {
   if (launch_info_.workspaces_.size() > workspace_sizes.size()) {
     size_t size = launch_info_.workspaces_.size() - workspace_sizes.size();
     (void)workspace_device_tensors_.erase(workspace_device_tensors_.end() - size, workspace_device_tensors_.end());
+    (void)launch_info_.workspaces_.erase(launch_info_.workspaces_.end() - size, launch_info_.workspaces_.end());
+
+    MS_EXCEPTION_IF_CHECK_FAIL((memory_alloc_list_.size() >= size), "The memory alloc list size is wrong.");
+    MS_EXCEPTION_IF_CHECK_FAIL((memory_free_list_.size() >= size), "The memory free list size is wrong.");
     (void)memory_alloc_list_.erase(memory_alloc_list_.end() - size, memory_alloc_list_.end());
     (void)memory_free_list_.erase(memory_free_list_.end() - size, memory_free_list_.end());
-    (void)launch_info_.workspaces_.erase(launch_info_.workspaces_.end() - size, launch_info_.workspaces_.end());
   } else if (launch_info_.workspaces_.size() < workspace_sizes.size()) {
     for (size_t i = launch_info_.workspaces_.size(); i < workspace_sizes.size(); ++i) {
       auto device_address = device_contexts_[0]->device_res_manager_->CreateDeviceAddress(
@@ -141,9 +206,9 @@ void KernelActor::FetchWorkspaceDeviceTensor() {
       AnfAlgo::SetWorkspaceAddr(device_address, i, kernel_.get());  // set to kernel_info
       MS_EXCEPTION_IF_NULL(device_address);
       (void)workspace_device_tensors_.emplace_back(device_address.get());
+      (void)launch_info_.workspaces_.emplace_back(std::make_shared<Address>());
       (void)memory_alloc_list_.emplace_back(device_address.get());
       (void)memory_free_list_.emplace_back(device_address.get());
-      (void)launch_info_.workspaces_.emplace_back(std::make_shared<Address>());
     }
   }
   // Set workspace address new size
@@ -191,8 +256,47 @@ void FreeMemory(const std::vector<DeviceTensor *> &free_list, const DeviceContext *device_context) {
   }
 }  // namespace
 
+void KernelActor::SetSomasMemory(OpContext<DeviceTensor> *const context) {
+  MS_EXCEPTION_IF_NULL(context);
+  MS_EXCEPTION_IF_NULL(kernel_info_);
+  if (!IsSomasEnable(somas_info_)) {
+    return;
+  }
+  if (somas_info_->base_address_ == nullptr) {
+    std::string error_info = "The somas base address isn't allocated when running " + GetAID().Name();
+    SET_OPCONTEXT_FAIL_RET_WITH_ERROR(*context, error_info);
+  }
+
+  // Set the memory address for the output tensors which use the somas.
+  const auto &somas_outputs = kernel_info_->somas_output_result();
+  MS_EXCEPTION_IF_CHECK_FAIL((output_device_tensors_.size() >= somas_outputs.size()), "The output num is wrong.");
+  for (size_t i = 0; i < somas_outputs.size(); ++i) {
+    if (somas_outputs[i].second > 0) {
+      // Init() has already ensured that these pointers are not nullptr, so for performance
+      // the pointer validity is not rechecked here.
+      output_device_tensors_[i]->set_ptr(AddressOffset(somas_info_->base_address_, somas_outputs[i].first));
+    }
+  }
+
+  // Set the memory address for the workspace tensors which use the somas.
+  const auto &somas_workspace = kernel_info_->somas_workspace_result();
+  MS_EXCEPTION_IF_CHECK_FAIL((workspace_device_tensors_.size() >= somas_workspace.size()),
+                             "The workspace num is wrong.");
+  for (size_t i = 0; i < somas_workspace.size(); ++i) {
+    if (somas_workspace[i].second > 0) {
+      // Init() has already ensured that these pointers are not nullptr, so for performance
+      // the pointer validity is not rechecked here.
+      workspace_device_tensors_[i]->set_ptr(AddressOffset(somas_info_->base_address_, somas_workspace[i].first));
+    }
+  }
+}
+
 void KernelActor::SendMemoryAllocReq(OpContext<DeviceTensor> *const context) {
   running_dependent_msg_num_ = 1;
+
+  // Set the memory address for the tensors which use the somas.
+  SetSomasMemory(context);
+
+  // Allocate the memory address for the other tensors which don't use the somas.
   if (strategy_ == GraphExecutionStrategy::kPipeline) {
     if (ActorDispatcher::is_memory_allocation_sync()) {
       ActorDispatcher::SendSync(memory_manager_aid_, &MemoryManagerActor::AllocateMemory, &memory_alloc_list_,
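
Reviewer note: SetSomasMemory runs before the regular allocation request, so by the time MemoryManagerActor::AllocateMemory is sent, every somas-managed tensor already points into the pre-planned block and only the leftover tensors remain in memory_alloc_list_. The pointer arithmetic is just base plus planned offset; assuming AddressOffset is the usual byte-offset helper, it is equivalent to:

    #include <cstddef>
    #include <cstdint>

    // Byte-offset helper: step `offset` bytes into the somas base block.
    inline void *AddressOffset(void *base_address, size_t offset) {
      return static_cast<void *>(static_cast<uint8_t *>(base_address) + offset);
    }
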
@@ -361,13 +465,14 @@ void KernelActor::FetchInputDeviceTensor(OpContext<DeviceTensor> *const context) {
   if (data_iter != input_op_datas_.end()) {
     for (auto &input_data : data_iter->second) {
       MS_EXCEPTION_IF_NULL(input_data);
-      if (IntToSize(input_data->index_) >= input_device_tensors_.size()) {
+      size_t input_index = IntToSize(input_data->index_);
+      if (input_index >= input_device_tensors_.size()) {
         SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, (*context), "The input index is out of range.");
       }
 
-      if (input_device_tensors_[IntToSize(input_data->index_)] != input_data->data_) {
-        input_device_tensors_[IntToSize(input_data->index_)] = input_data->data_;
-        memory_free_list_[IntToSize(input_data->index_)] = input_data->data_;
+      if (input_device_tensors_[input_index] != input_data->data_) {
+        input_device_tensors_[input_index] = input_data->data_;
+        memory_free_list_[input_index] = input_data->data_;
       }
       CopyInputDeviceTensor(input_data, context);
     }
   }
@@ -407,30 +512,36 @@ void KernelActor::FetchOutputDeviceTensor(OpContext<DeviceTensor> *const context) {
     SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, (*context), error_info);
   }
 
+  const auto &somas_outputs = kernel_info_->somas_output_result();
+
   // Update the size of output device tensor.
   for (size_t i = 0; i < output_addresses.size(); ++i) {
     auto output_address = output_addresses[i].get();
     MS_EXCEPTION_IF_NULL(output_address);
-    if (output_size_list[i] != output_address->GetSize()) {
-      // 1. The size of output address may be changed in dynamic shape scenario.
-      // 2. If the format of the DeviceAddress is different, then the size is originally different.
-      //    Such as NCHW(1,1,1,3) and NC1HWC0(1,1,1,1,16). So we don't need to update the size.
-      // 3. For example, we need to call cudnnGetRNNTrainingReserveSize to get real output size in LstmGpuKernelMod!
-      if (AnfAlgo::GetOutputFormat(kernel_, i) == output_address->format()) {
-        output_address->SetSize(output_size_list[i]);
-      }
+    // The output device tensor can't be changed.
+    if (output_device_tensors_[i] != output_address) {
+      std::string error_info =
+        "The device tensor can't be changed of " + GetAID().Name() + " with output index " + std::to_string(i);
+      SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, (*context), error_info);
     }
 
-    // When the tensor is the output of graph or in dynamic shape scenario, the output tensor may be changed.
-    if (output_device_tensors_[i] != output_address) {
-      output_device_tensors_[i] = output_address;
-      memory_alloc_list_[i] = output_address;
-      memory_free_list_[real_input_num_ + i] = output_address;
+    if (output_size_list[i] == output_address->GetSize()) {
+      continue;
+    }
 
-      // Update output data.
-      for (auto &output_data : output_data_by_output_index_[i]) {
-        MS_EXCEPTION_IF_NULL(output_data);
-        output_data->data_ = output_address;
-      }
+    // Somas doesn't support a variable size.
+    if (kernel_info_->IsTensorEnableSomas(somas_outputs, i) && (somas_outputs[i].second < output_size_list[i])) {
+      std::string error_info =
+        "Somas doesn't support variable size of " + GetAID().Name() + " with output index " + std::to_string(i) +
+        ". Suggest turning off memory optimization by setting the context 'memory_optimize_level' to 'O0'.";
+      SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(strategy_, (*context), error_info);
+    }
+
+    // 1. The size of output address may be changed in dynamic shape scenario.
+    // 2. If the format of the DeviceAddress is different, then the size is originally different.
+    //    Such as NCHW(1,1,1,3) and NC1HWC0(1,1,1,1,16). So we don't need to update the size.
+    // 3. For example, we need to call cudnnGetRNNTrainingReserveSize to get real output size in LstmGpuKernelMod!
+    if (AnfAlgo::GetOutputFormat(kernel_, i) == output_address->format()) {
+      output_address->SetSize(output_size_list[i]);
+    }
   }
 }
@@ -474,8 +585,11 @@ bool KernelActor::LaunchKernel(OpContext<DeviceTensor> *const) {
   }
 
   MS_EXCEPTION_IF_NULL(device_contexts_[0]);
-  return device_contexts_[0]->kernel_executor_->LaunchKernel(kernel_, launch_info_.inputs_, launch_info_.workspaces_,
-                                                             launch_info_.outputs_, kernel_info_->stream_id());
+  MS_LOG(DEBUG) << "Begin launch kernel of actor: " << GetAID().Name();
+  auto ret = device_contexts_[0]->kernel_executor_->LaunchKernel(
+    kernel_, launch_info_.inputs_, launch_info_.workspaces_, launch_info_.outputs_, kernel_info_->stream_id());
+  MS_LOG(DEBUG) << "End launch kernel of actor: " << GetAID().Name();
+  return ret;
 }
 
 void KernelActor::PostLaunchKernel(OpContext<DeviceTensor> *const context) {
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.h b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.h
index 9287edac4b0..ca3ceb2141f 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.h
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_actor.h
@@ -136,6 +136,11 @@ class KernelActor : public DebugAwareActor {
   friend class RpcNodeScheduler;
 #endif
 
+  // Init the device tensors and kernel launch info.
+  void InitInputInfo();
+  void InitOutputInfo();
+  void InitWorkspaceInfo();
+
   // Fetch the device tensor for launch.
   void FetchInputDeviceTensor(OpContext<DeviceTensor> *const context);
   void FetchOutputDeviceTensor(OpContext<DeviceTensor> *const context);
@@ -152,6 +157,9 @@ class KernelActor : public DebugAwareActor {
   // Back refresh the dynamic device tensor stores that have been triggered copy.
   void RefreshDeviceTensorCopyStore(OpContext<DeviceTensor> *const context);
 
+  // Set the memory address for the tensors which use the somas.
+  void SetSomasMemory(OpContext<DeviceTensor> *const context);
+
   // The real input number of kernel launch.
   size_t real_input_num_;
@@ -164,9 +172,6 @@ class KernelActor : public DebugAwareActor {
   std::set<size_t> modifiable_ref_input_indexes_;
   std::set<size_t> modifiable_ref_output_indexes_;
 
-  // Cache output data by output index to modify the output data effectively.
-  std::vector<std::vector<OpData<DeviceTensor> *>> output_data_by_output_index_;
-
   // Whether skip the kernel launch.
   bool is_launch_skipped_;
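
Reviewer note: FetchOutputDeviceTensor now treats a changed output device tensor as a hard error and only resizes in place, and the somas path additionally rejects any resize beyond the planned aligned size, since an offset inside a fused block cannot grow without overrunning the next tensor's slot. A standalone sketch of that compatibility check; the user-side escape hatch is the 'memory_optimize_level' = 'O0' context option named in the error string:

    #include <cstddef>

    // A somas-managed output may shrink within its planned slot but must never grow,
    // because the next somas tensor starts right after the planned aligned size.
    bool SomasSizeIsCompatible(size_t planned_aligned_size, size_t new_size) {
      return new_size <= planned_aligned_size;
    }
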