diff --git a/mindspore/ccsrc/backend/common/session/kernel_graph.h b/mindspore/ccsrc/backend/common/session/kernel_graph.h index 0d4c508f81e..abff236c169 100644 --- a/mindspore/ccsrc/backend/common/session/kernel_graph.h +++ b/mindspore/ccsrc/backend/common/session/kernel_graph.h @@ -50,6 +50,13 @@ struct KernelWithIndexCmp { } }; +struct SomasInfo { + // whole_block_size_ is 0 indicating that somas did not allocate memory for this graph. + size_t whole_block_size_{0}; + // offset -> aligned_size_ + std::map merged_blocks_map_; +}; + using DeviceType = device::DeviceType; using KernelMapTensor = std::map; @@ -57,6 +64,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph { public: KernelGraph() : inputs_(std::make_shared>()), + somas_info_(std::make_shared()), graph_id_(0), stream_distinction_label_(kInvalidDistincLabel), device_target_(DeviceType::kUnknown), @@ -69,6 +77,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph { KernelGraph(const KernelGraph &graph) : FuncGraph(graph) { inputs_ = graph.inputs_; + somas_info_ = graph.somas_info_; child_graph_result_ = graph.child_graph_result_; execution_order_ = graph.execution_order_; mem_reuse_exec_order_ = graph.mem_reuse_exec_order_; @@ -452,6 +461,11 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph { bool IsCommSubGraph(uint32_t id) const { return comm_sub_graph_ids_.find(id) != comm_sub_graph_ids_.end(); } void RecordNewCommSubGraphId(uint32_t id) { comm_sub_graph_ids_.insert(id); } + // somas total memory size + SomasInfo *MutableSomasInfo() const { return somas_info_.get(); } + size_t somas_whole_block_size() const { return somas_info_->whole_block_size_; } + const std::map &somas_merged_blocks_map() const { return somas_info_->merged_blocks_map_; } + private: // remove value node form graph bool RemoveValueNodeFromGraph(const ValueNodePtr &value_node); @@ -477,6 +491,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph { // members std::shared_ptr> inputs_; + std::shared_ptr somas_info_; 
std::vector child_graph_result_; std::vector execution_order_; std::vector mem_reuse_exec_order_; diff --git a/mindspore/ccsrc/backend/common/somas/somas.cc b/mindspore/ccsrc/backend/common/somas/somas.cc index 7cd5055c821..1bb8c78cc67 100644 --- a/mindspore/ccsrc/backend/common/somas/somas.cc +++ b/mindspore/ccsrc/backend/common/somas/somas.cc @@ -20,19 +20,16 @@ #include #include #include -#include #include +#include #include "backend/common/somas/somas_node.h" #include "backend/common/somas/somas_solver_pre.h" #include "backend/common/somas/somas_stream.h" #include "backend/common/somas/somas_tensor.h" -#ifdef ENABLE_D -#include "plugin/device/ascend/hal/device/ascend_stream_assign.h" -#endif #include "backend/common/optimizer/helper.h" -#include "utils/ms_context.h" #include "include/common/debug/common.h" +#include "include/common/debug/anf_ir_dump.h" #ifdef ENABLE_DUMP_IR #include "debug/rdr/string_recorder.h" #endif @@ -46,15 +43,16 @@ using mindspore::profiler::ascend::TensorMemory; #endif namespace mindspore { namespace somas { -constexpr auto kGapSize = 512; constexpr auto kRetryIntervalSeconds = 500; -constexpr size_t kRefNodeTensorNum = 2; +constexpr auto kRefNodeTensorNum = 2; constexpr auto kOnlyOneDestinationNode = 1; constexpr auto kOnlyTwoDestinationNode = 2; +constexpr auto kNopNodeRealInputIndex = 1; +constexpr auto kZeroAlignSize = 1; constexpr auto kGraphId = "graph_id"; constexpr auto kHashId = "hash_id"; -constexpr auto kMemOffset = "mem_offset"; +constexpr auto kReused_memory_size = "reused_memory_size"; constexpr auto kNodeSize = "node_size"; constexpr auto kTensorSize = "tensor_size"; constexpr auto kContiguousSize = "contiguous_size"; @@ -72,104 +70,203 @@ constexpr auto kLifeEnd = "life_end"; constexpr auto kOffset = "offset"; constexpr auto kCachedResultThreshold = 2000; -std::map tensor_type_name_map = {{kCommon, "Common"}, - {kOutputOnly, "OutputOnly"}, - {kWorkspace, "Workspace"}, - {kGetNextOutput, "GetNextOutput"}, - 
{kSummaryInput, "SummaryInput"}, - {kRefNodeInput, "RefNodeInput"}, - {kRefNodeOutput, "RefNodeOutput"}, - {kEventVirtualOutput, "EventVirtualOutput"}, - {kUnknown, "Unknown"}}; - -std::map life_long_name_map = {{kLifeLongNone, "LifeLongNone"}, - {kLifeLongGraphAll, "LifeLongGraphAll"}, - {kLifeLongGraphStart, "LifeLongGraphStart"}, - {kLifeLongGraphEnd, "LifeLongGraphEnd"}}; - -bool Somas::Allocate(const session::KernelGraph *graph) { - MS_LOG(DEBUG) << "Somas Allocate start..."; - auto ret = InitSomasTensors(graph); - if (!ret) { - MS_LOG(EXCEPTION) << "Somas Initialize Failed."; +// set somas result +void SetSomasResult(std::vector> &&output_somas_result, + std::vector> &&workspace_somas_result, AnfNode *node) { + MS_EXCEPTION_IF_NULL(node); + auto kernel_info = dynamic_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + if (!kernel_info->SetSomasResult(std::move(output_somas_result), std::move(workspace_somas_result))) { + MS_LOG(EXCEPTION) << "Node " << node->DebugString() << "set somas result fail. 
"; } +} + +void MergeBlocks(std::vector *block_list, std::stack *merged_blocks) { + if (block_list->empty()) { + MS_LOG(INFO) << "No block to merge."; + return; + } + std::sort(block_list->begin(), block_list->end(), [](const Block &block1, const Block &block2) { + return (block1.start_offset_ < block2.start_offset_) || + ((block1.start_offset_ == block2.start_offset_) && (block1.end_offset_ < block2.end_offset_)); + }); + merged_blocks->push(Block((*block_list)[0].start_offset_, (*block_list)[0].size_)); + for (size_t i = 1; i < block_list->size(); i++) { + Block &top = merged_blocks->top(); + auto &block = (*block_list)[i]; + if (block.start_offset_ >= top.end_offset_) { + merged_blocks->push(Block(block.start_offset_, block.size_)); + } else if (block.end_offset_ > top.end_offset_) { + top.end_offset_ = block.end_offset_; + top.size_ = top.end_offset_ - top.start_offset_; + } + } +} + +bool Somas::Assign(const session::KernelGraph &graph) { + MS_LOG(INFO) << "Start Somas Assign for graph " << graph.graph_id(); + if (graph.is_dynamic_shape()) { + MS_LOG(WARNING) << "Somas can't allocate graph with dynamic_shape now."; + return false; + } + auto ret = ConfigSomas(graph); + if (!ret) { + MS_LOG(EXCEPTION) << "Config Somas Failed."; + } + MS_LOG(INFO) << "Somas Configure success, configuration info: " + << "\nDevice Name: " << device_name_ << "\nRun by execution order: " << depend_exec_order_ + << "\nEnable debug log: " << save_debug_info_ << "\nDebug log path: " << debug_info_path_; + MS_LOG(INFO) << "Start Initialize SOMAS Model"; + + ret = InitSomasModel(graph); + if (!ret) { + MS_LOG(EXCEPTION) << "Somas modeling Failed for graph " << graph.graph_id(); + } + MS_LOG(INFO) << "End Initialize SOMAS Model"; if (tensors_list_.empty()) { - MS_LOG(INFO) << "No Tensor for Somas"; + MS_LOG(INFO) << "No Somas Tensor in graph " << graph.graph_id(); return true; } - ret = LoadSomasCache(graph); - if (ret) { - GenGraphStatisticInfo(); - return ret; + if (enable_cache_) { + 
ret = LoadSomasCache(graph); + if (ret) { + GenGraphStatisticInfo(); + UpdateSomasResultToGraph(graph); + DumpSomasModelInfo("somas_tensor_offset", graph.graph_id()); + MS_LOG(INFO) << "Somas Allocate end."; + return ret; + } } // Computing Conflict pairs - MS_LOG(INFO) << "Start Computing Conflict Pairs"; - ComputeConflictPairs(); - MS_LOG(INFO) << "End Computing Conflict Pairs"; + MS_LOG(INFO) << "Start Computing Conflict Matrix"; + ComputeConflictMatrix(); + MS_LOG(INFO) << "End Computing Conflict Matrix"; - ret = Assign(graph); + ret = Solve(graph); if (!ret) { MS_LOG(EXCEPTION) << "Somas Assign Failed."; } - SaveSomasResult(graph); + GenGraphStatisticInfo(); - MS_LOG(DEBUG) << "Somas Allocate end."; + if (enable_cache_) { + SaveSomasResult(graph); + } + + UpdateSomasResultToGraph(graph); + DumpSomasModelInfo("somas_tensor_offset", graph.graph_id()); + + MS_LOG(INFO) << "Somas Allocate end."; return ret; } -bool Somas::LoadSomasCache(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - MS_LOG(DEBUG) << "Somas LoadSomasCache start..."; - if (tensors_list_.size() < kCachedResultThreshold) { - MS_LOG(DEBUG) << "Tensors size (" << tensors_list_.size() << ") less than " << kCachedResultThreshold - << ", no need to load cached"; +bool Somas::Assign(const KernelGraphPtr &graph_ptr) { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + MS_EXCEPTION_IF_NULL(graph_ptr); +#ifndef ENABLE_SECURITY + auto enable_save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); + if (enable_save_graphs) { + std::string file_name = "somas_input_graph_" + std::to_string(graph_ptr->graph_id()) + ".ir"; + DumpIR(file_name, graph_ptr, true, kWholeStack); + } +#endif + return Assign(*graph_ptr); +} + +size_t Somas::GetCommunicationReservedSize() const { return 0; } + +bool Somas::GetEnableCacheFlag(const session::KernelGraph &graph) const { + return graph.execution_order().size() >= kCachedResultThreshold; +} + +std::pair 
Somas::GetDebugConfig() const { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + auto enable_save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); + auto save_graphs_path = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); + if (save_graphs_path.empty()) { + save_graphs_path = "."; + } + return std::make_pair(enable_save_graphs, save_graphs_path); +} + +std::vector> Somas::GetStreamGroupInfo(const session::KernelGraph &graph) const { + std::vector> stream_group; + return stream_group; +} + +std::map Somas::GetUnReuseNodeType(const session::KernelGraph &graph) const { + std::map node_type; + return node_type; +} + +std::map Somas::GetUnReuseNodeName(const session::KernelGraph &graph) const { + std::map name_type; + return name_type; +} + +bool Somas::ConfigSomas(const session::KernelGraph &graph) { + auto ret = Initialize(); + if (!ret) { + MS_LOG(ERROR) << "Somas Initialize failed. Please Check!!!"; return false; } + device_name_ = GetDeviceName(); + communication_gap_size_ = GetCommunicationReservedSize(); + enable_cache_ = GetEnableCacheFlag(graph); + depend_exec_order_ = GetDependExecOrderFlag(graph); + auto debug_config = GetDebugConfig(); + save_debug_info_ = debug_config.first; + debug_info_path_ = debug_config.second; + streams_groups_ = GetStreamGroupInfo(graph); + un_reuse_node_type_.clear(); + auto device_un_reuse_type = GetUnReuseNodeType(graph); + un_reuse_node_type_.insert(device_un_reuse_type.begin(), device_un_reuse_type.end()); + un_reuse_node_name_.clear(); + auto device_un_reuse_name = GetUnReuseNodeName(graph); + un_reuse_node_name_.insert(device_un_reuse_name.begin(), device_un_reuse_name.end()); + return true; +} +bool Somas::LoadSomasCache(const session::KernelGraph &graph) { + MS_LOG(DEBUG) << "Somas LoadSomasCache start..."; bool ret = CalcSomasModelHash(graph); if (ret) { std::string filename = Common::GetCompilerCachePath() + "/somas_meta/somas_graph_" + - std::to_string(graph->graph_id()) + "_" 
+ hash_id_ + ".json"; + std::to_string(graph.graph_id()) + "_" + hash_id_ + ".json"; ret = LoadSomasResult(graph, filename); if (ret) { MS_LOG(INFO) << "Load Somas Cache file " << filename << " Successfully."; } } else { - MS_LOG(ERROR) << "Calculate somas's model hash id failed."; + MS_LOG(ERROR) << "Calculate SOMAS model hash id failed."; } MS_LOG(DEBUG) << "Somas LoadSomasCache end."; return ret; } -bool Somas::CalcSomasModelHash(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); +bool Somas::CalcSomasModelHash(const session::KernelGraph &graph) { auto model_str = SomasInfo(true); hash_id_ = std::to_string(std::hash()(model_str)); - MS_LOG(INFO) << "Graph " << graph->graph_id() << "'s SOMAS Model hash id is " << hash_id_; + MS_LOG(INFO) << "Graph " << graph.graph_id() << "'s SOMAS Model hash id is " << hash_id_; std::string filename = Common::GetCompilerCachePath() + "/somas_meta/somas_graph_" + - std::to_string(graph->graph_id()) + "_" + hash_id_ + ".info"; + std::to_string(graph.graph_id()) + "_" + hash_id_ + ".info"; return Common::SaveStringToFile(filename, model_str); } -bool Somas::SaveSomasResult(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - if (tensors_list_.size() < kCachedResultThreshold) { - MS_LOG(DEBUG) << "Tensors size (" << tensors_list_.size() << ") less than " << kCachedResultThreshold - << ", no need to save result"; - return false; - } +bool Somas::SaveSomasResult(const session::KernelGraph &graph) { nlohmann::json somas_json; - somas_json[kGraphId] = graph->graph_id(); + somas_json[kGraphId] = graph.graph_id(); somas_json[kHashId] = hash_id_; - somas_json[kMemOffset] = mem_offset_; + somas_json[kReused_memory_size] = reused_memory_size_; somas_json[kNodeSize] = nodes_list_.size(); somas_json[kTensorSize] = tensors_list_.size(); somas_json[kContiguousSize] = contiguous_tensors_list_.size(); - somas_json[kRefNodeSize] = ref_node_constraints_.size(); - somas_json[kStreamSize] = streams_list_.size(); + 
somas_json[kRefNodeSize] = union_tensors_list_.size(); + somas_json[kStreamSize] = streams_map_.size(); somas_json[kStreamGroupSize] = streams_groups_.size(); std::vector tensors_json; for (auto &tensor : tensors_list_) { @@ -187,12 +284,48 @@ bool Somas::SaveSomasResult(const session::KernelGraph *graph) { somas_json[kTensors] = tensors_json; std::string filename = Common::GetCompilerCachePath() + "/somas_meta/somas_graph_" + - std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json"; + std::to_string(graph.graph_id()) + "_" + hash_id_ + ".json"; (void)Common::SaveStringToFile(filename, somas_json.dump()); return true; } -bool Somas::LoadSomasResult(const session::KernelGraph *graph, const string &filename) { +bool Somas::UpdateSomasResultToGraph(const session::KernelGraph &graph) { + auto &execution_nodes = graph.execution_order(); + std::vector block_list; + for (auto &node : execution_nodes) { + auto kernel_mod = AnfAlgo::GetKernelMod(node); + MS_EXCEPTION_IF_NULL(kernel_mod); + auto output_somas_result = GetNodeOutputSomasResult(node); + auto workspace_somas_result = GetNodeWorkSpaceSomasResult(node); + + for (const auto &somas_offset_aligned_size : output_somas_result) { + if (somas_offset_aligned_size.second > 0) { + block_list.emplace_back(somas_offset_aligned_size.first, somas_offset_aligned_size.second); + } + } + for (const auto &somas_offset_aligned_size : workspace_somas_result) { + if (somas_offset_aligned_size.second > 0) { + block_list.emplace_back(somas_offset_aligned_size.first, somas_offset_aligned_size.second); + } + } + + SetSomasResult(std::move(output_somas_result), std::move(workspace_somas_result), node.get()); + } + + std::stack merged_blocks; + MergeBlocks(&block_list, &merged_blocks); + session::SomasInfo *somas_info = graph.MutableSomasInfo(); + somas_info->whole_block_size_ = reused_memory_size_; + while (!merged_blocks.empty()) { + auto block = merged_blocks.top(); + merged_blocks.pop(); + 
somas_info->merged_blocks_map_[block.start_offset_] = block.size_; + dump_merged_blocks_.emplace_back(block.start_offset_, block.size_); + } + return true; +} + +bool Somas::LoadSomasResult(const session::KernelGraph &graph, const string &filename) { std::ifstream somas_json_fs(filename); if (!somas_json_fs.is_open()) { MS_LOG(INFO) << "Open json file: " << filename << " error, Somas Cache Missed."; @@ -220,27 +353,19 @@ bool Somas::LoadSomasResult(const session::KernelGraph *graph, const string &fil MS_LOG(WARNING) << "Verify Somas Result Failed."; return false; } - auto mem_offset = somas_json[kMemOffset]; - mem_offset_ = mem_offset; + reused_memory_size_ = somas_json[kReused_memory_size]; ret = UpdateTensorsOffset(somas_json[kTensors]); return ret; } -bool Somas::VerifySomasResult(const session::KernelGraph *graph, const nlohmann::json &somas_json) const { - MS_EXCEPTION_IF_NULL(graph); - auto graph_id = somas_json[kGraphId]; - auto hash_id = somas_json[kHashId]; - auto node_size = somas_json[kNodeSize]; - auto tensor_size = somas_json[kTensorSize]; - auto contiguous_size = somas_json[kContiguousSize]; - auto ref_node_size = somas_json[kRefNodeSize]; - auto stream_size = somas_json[kStreamSize]; - auto stream_group_size = somas_json[kStreamGroupSize]; - - if (graph_id != graph->graph_id()) { - MS_LOG(WARNING) << "Mismatch graph id " << graph_id << " vs " << graph->graph_id(); - return false; - } +bool Somas::VerifySomasResult(const session::KernelGraph &graph, const nlohmann::json &somas_json) const { + const auto &hash_id = somas_json[kHashId]; + const auto &node_size = somas_json[kNodeSize]; + const auto &tensor_size = somas_json[kTensorSize]; + const auto &contiguous_size = somas_json[kContiguousSize]; + const auto &ref_node_size = somas_json[kRefNodeSize]; + const auto &stream_size = somas_json[kStreamSize]; + const auto &stream_group_size = somas_json[kStreamGroupSize]; if (hash_id != hash_id_) { MS_LOG(WARNING) << "Mismatch hash id " << hash_id << " vs " 
<< hash_id_; @@ -262,13 +387,13 @@ bool Somas::VerifySomasResult(const session::KernelGraph *graph, const nlohmann: return false; } - if (ref_node_size != ref_node_constraints_.size()) { - MS_LOG(WARNING) << "Mismatch ref node size " << ref_node_size << " vs " << ref_node_constraints_.size(); + if (ref_node_size != union_tensors_list_.size()) { + MS_LOG(WARNING) << "Mismatch ref node size " << ref_node_size << " vs " << union_tensors_list_.size(); return false; } - if (stream_size != streams_list_.size()) { - MS_LOG(WARNING) << "Mismatch stream size " << stream_size << " vs " << streams_list_.size(); + if (stream_size != streams_map_.size()) { + MS_LOG(WARNING) << "Mismatch stream size " << stream_size << " vs " << streams_map_.size(); return false; } @@ -277,136 +402,221 @@ bool Somas::VerifySomasResult(const session::KernelGraph *graph, const nlohmann: return false; } + const auto &tensors_json = somas_json[kTensors]; + for (const auto &tensor_json : tensors_json) { + const auto &tensor_id = tensor_json[kTensorId]; + const auto &size = tensor_json[kSize]; + const auto &ori_size = tensor_json[kOriSize]; + const auto &lifelong_value = tensor_json[kLifelongValue]; + const auto &life_start = tensor_json[kLifeStart]; + const auto &life_end = tensor_json[kLifeEnd]; + if (tensor_id < tensors_list_.size()) { + auto &tensor = tensors_list_[tensor_id]; + MS_EXCEPTION_IF_NULL(tensor); + if (size != tensor->aligned_size_) { + MS_LOG(WARNING) << "Mismatch size of tensor " << tensor_id << " " << size << " vs " << tensor->aligned_size_; + return false; + } + + if (ori_size != tensor->GetOriginalSize()) { + MS_LOG(WARNING) << "Mismatch original size of tensor " << tensor_id << " " << ori_size << " vs " + << tensor->GetOriginalSize(); + return false; + } + + if (lifelong_value != tensor->lifelong_value_) { + MS_LOG(WARNING) << "Mismatch lifelong value of tensor " << tensor_id << " " << lifelong_value << " vs " + << tensor->lifelong_value_; + return false; + } + + if (life_start 
!= tensor->lifetime_.start_) { + MS_LOG(WARNING) << "Mismatch life start of tensor " << tensor_id << " " << life_start << " vs " + << tensor->lifetime_.start_; + return false; + } + + if (life_end != tensor->lifetime_.end_) { + MS_LOG(WARNING) << "Mismatch life start of tensor " << tensor_id << " " << life_end << " vs " + << tensor->lifetime_.end_; + return false; + } + } else { + MS_LOG(WARNING) << "Can't find tensor " << tensor_id; + return false; + } + } + return true; } bool Somas::UpdateTensorsOffset(const std::vector &tensors_json) { bool ret = true; for (auto &tensor_json : tensors_json) { - auto tensor_id = tensor_json[kTensorId]; - auto size = tensor_json[kSize]; - auto ori_size = tensor_json[kOriSize]; - auto lifelong_value = tensor_json[kLifelongValue]; - auto life_start = tensor_json[kLifeStart]; - auto life_end = tensor_json[kLifeEnd]; - auto offset = tensor_json[kOffset]; - auto iter = tensors_map_.find(tensor_id); - if (iter != tensors_map_.end()) { - MS_EXCEPTION_IF_NULL(iter->second); - if (size != iter->second->aligned_size_) { - MS_LOG(WARNING) << "Mismatch size of tensor " << tensor_id << " " << size << " vs " - << iter->second->aligned_size_; - ret = false; - break; - } - - if (ori_size != iter->second->GetOriginalSize()) { - MS_LOG(WARNING) << "Mismatch original size of tensor " << tensor_id << " " << ori_size << " vs " - << iter->second->GetOriginalSize(); - ret = false; - break; - } - - if (lifelong_value != iter->second->lifelong_value_) { - MS_LOG(WARNING) << "Mismatch lifelong value of tensor " << tensor_id << " " << lifelong_value << " vs " - << iter->second->lifelong_value_; - ret = false; - break; - } - - if (life_start != iter->second->lifetime_.start_) { - MS_LOG(WARNING) << "Mismatch life start of tensor " << tensor_id << " " << life_start << " vs " - << iter->second->lifetime_.start_; - ret = false; - break; - } - - if (life_end != iter->second->lifetime_.end_) { - MS_LOG(WARNING) << "Mismatch life start of tensor " << tensor_id << 
" " << life_end << " vs " - << iter->second->lifetime_.end_; - ret = false; - break; - } - - // verify pass, update memory offset - iter->second->offset_ = offset; - } else { - MS_LOG(WARNING) << "Can't find tensor " << tensor_id; - ret = false; - break; - } + const auto &tensor_id = tensor_json[kTensorId]; + const auto &size = tensor_json[kSize]; + const auto &offset = tensor_json[kOffset]; + auto &tensor = tensors_list_[tensor_id]; + MS_EXCEPTION_IF_NULL(tensor); + // update memory offset + tensor->offset_ = offset; + tensor->aligned_size_ = size; } return ret; } -bool Somas::InitSomasTensors(const session::KernelGraph *graph) { - MS_LOG(DEBUG) << "Somas InitSomasTensors start..."; - MS_EXCEPTION_IF_NULL(graph); - InitBasicInfo(graph); - IndependentNodeOutputProcess(graph); +bool Somas::InitSomasModel(const session::KernelGraph &graph) { + MS_EXCEPTION_IF_CHECK_FAIL(InitBasicInfoFromGraph(graph), "Init SOMAS basic info from graph failed."); +#if defined(ENABLE_DUMP_IR) && !defined(ENABLE_SECURITY) + SubModuleId module = SubModuleId::SM_OPTIMIZER; + std::string name = device_name_ + "_somas_initial_info." 
+ std::to_string(graph.graph_id()); + (void)mindspore::RDR::RecordString(module, name, SomasInfo()); +#endif + DumpSomasModelInfo("somas_initial_info", graph.graph_id()); + + MS_EXCEPTION_IF_CHECK_FAIL(InitDevSpecControlTensors(graph), "Init device special control tensors failed."); + DumpSomasModelInfo("somas_device_control_info", graph.graph_id()); + + MS_EXCEPTION_IF_CHECK_FAIL(CommonSpecNodeProcess(graph), "Common special node process failed."); + DumpSomasModelInfo("somas_common_spec_node_process", graph.graph_id()); + + MS_EXCEPTION_IF_CHECK_FAIL(DevSpecNodeProcess(graph), "Device specify special node process failed."); + DumpSomasModelInfo("somas_device_spec_node_process", graph.graph_id()); + + UnReuseNodeProcess(graph); + UpdateContiguousTensorList(); + if (tensors_list_.empty()) { + MS_LOG(INFO) << "No Tensor from graph " << graph.graph_id(); + return true; + } + + MS_LOG(INFO) << "Created " << streams_map_.size() << " streams (" << streams_groups_.size() << " groups), " + << nodes_list_.size() << " nodes, " << tensors_list_.size() << " tensors, " << union_tensors_list_.size() + << " union tensors lists, and " << contiguous_tensors_list_.size() << " contiguous tensors lists"; + +#if defined(ENABLE_DUMP_IR) && !defined(ENABLE_SECURITY) + name = device_name_ + "_somas_pre_processed_info." + std::to_string(graph.graph_id()); + (void)mindspore::RDR::RecordString(module, name, SomasInfo()); + name = device_name_ + "_somas_offline_log." 
+ std::to_string(graph.graph_id()); + (void)mindspore::RDR::RecordString(module, name, Offline()); +#endif + + DumpSomasModelInfo("somas_pre_processed_info", graph.graph_id()); + if (save_debug_info_) { + std::string offline_file_path = GetSaveGraphsPathName( + "/" + device_name_ + "_somas_offline_log_" + std::to_string(graph.graph_id()) + ".ir", debug_info_path_); + DumpOfflineIR(offline_file_path); + } + return true; +} + +void Somas::AddControlTensor(const SomasNodePtr &from, const SomasNodePtr &to) { + size_t control_tensor_index = control_tensors_list_.size(); + SomasTensorPtr tensor = + std::make_shared(control_tensor_index, from->GetId(), from->GetStreamId(), 0, 0, kLifeLongNone); + tensor->lifetime_.start_ = from->GetId(); + tensor->lifetime_.end_ = to->GetId(); + tensor->type_ = kControl; + tensor->destination_nodes_.insert(to->GetId()); + tensor->consumer_list_.emplace_back(to->GetId()); + from->control_output_tensors_.push_back(tensor); + to->control_input_tensors_.push_back(tensor); + to->ancestor_nodes_.insert(from); + control_tensors_list_.push_back(tensor); +} + +void Somas::AddControlTensorFromExecOrder(const session::KernelGraph &graph) { + // Loop to add control edges within each stream (node order within stream) + for (const auto &stream_kv : streams_map_) { + auto stream = stream_kv.second; + MS_EXCEPTION_IF_NULL(stream); + auto &nodes = stream->nodes_; + std::sort(nodes.begin(), nodes.end(), NodeSort); + for (size_t i = 1; i < nodes.size(); i++) { + const auto &previous_node = nodes[i - 1]; + const auto ¤t_node = nodes[i]; + MS_EXCEPTION_IF_NULL(current_node); + AddControlTensor(previous_node, current_node); + } + } + + // Loop to add control edges from end to beginning of next group + for (const auto &group : streams_groups_) { + for (size_t i = 1; i < group.size(); i++) { + size_t previous_stream = group[i - 1]; + size_t current_stream = group[i]; + + auto stream = GetSomasStream(previous_stream); + if (stream == nullptr) { + continue; + } + 
+ auto &last_node_in_prev_stream = stream->nodes_.back(); + + stream = GetSomasStream(current_stream); + if (stream == nullptr) { + continue; + } + auto &first_node_in_cur_stream = stream->nodes_.front(); + AddControlTensor(last_node_in_prev_stream, first_node_in_cur_stream); + } + } + + // Loop to compute max destinations in each stream + mindspore::HashMap stream_max_destination_node; + // Loop to compute max destinations in each stream + for (const auto &tensor : tensors_list_) { + MS_EXCEPTION_IF_NULL(tensor); + stream_max_destination_node.clear(); + for (const auto &node_id : tensor->destination_nodes_) { + auto node = GetSomasNode(node_id); + MS_EXCEPTION_IF_NULL(node); + if (node_id > stream_max_destination_node[node->GetStreamId()]) { + stream_max_destination_node[node->GetStreamId()] = node_id; + } + } + + tensor->consumer_list_.clear(); + for (const auto &dst_map : stream_max_destination_node) { + tensor->consumer_list_.emplace_back(dst_map.second); + } + } +} + +void Somas::InitControlTensors(const session::KernelGraph &graph) { + if (depend_exec_order_) { + AddControlTensorFromExecOrder(graph); + } +} + +bool Somas::CommonSpecNodeProcess(const session::KernelGraph &graph) { #ifndef ENABLE_SECURITY SummaryInputProcess(graph); #endif RefNodeProcess(graph); - NonTaskSplitProcess(graph); - UnReuseNodeProcess(graph); - GenContiguousList(graph); - GetNextOutputProcess(graph); - - if (tensors_list_.empty()) { - MS_LOG(INFO) << "No Tensor from graph " << graph->graph_id(); - return true; - } - - MS_LOG(INFO) << "Created " << streams_list_.size() << " streams (" << streams_groups_.size() << " groups), " - << nodes_list_.size() << " nodes, " << tensors_list_.size() << " tensors, and " - << contiguous_tensors_list_.size() << " contiguous lists"; - -#ifdef ENABLE_DUMP_IR - SubModuleId module = SubModuleId::SM_OPTIMIZER; - std::string name = "somas_pre_processed_info." 
+ std::to_string(graph->graph_id()); - (void)mindspore::RDR::RecordString(module, name, SomasInfo()); - name = "somas_offline_log." + std::to_string(graph->graph_id()); - (void)mindspore::RDR::RecordString(module, name, Offline()); -#endif - - if (save_graphs_) { - std::string file_path = GetSaveGraphsPathName( - "/somas_pre_processed_info_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_); - DumpSomasInfoIR(file_path); - - std::string offline_file_path = - GetSaveGraphsPathName("/somas_offline_log_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_); - DumpOfflineIR(offline_file_path); - } - MS_LOG(DEBUG) << "Somas InitSomasTensors end."; + CommunicationNodeProcess(graph); return true; } -void Somas::InitSomasStreamAndNode(const session::KernelGraph *graph) { +void Somas::InitSomasStreamAndNode(const session::KernelGraph &graph) { MS_LOG(DEBUG) << "Somas InitSomasStreamAndNode start..."; - MS_EXCEPTION_IF_NULL(graph); - std::vector kernel_cnodes; - streams_list_ = {}; + streams_map_.clear(); nodes_list_ = {}; - size_t node_index = 0; - if (graph->subgraph_multi_call()) { - kernel_cnodes = graph->mem_reuse_exec_order(); - } else { - kernel_cnodes = graph->execution_order(); - } + auto &kernel_cnodes = (graph.subgraph_multi_call()) ? 
graph.mem_reuse_exec_order() : graph.execution_order(); for (size_t i = 0; i < kernel_cnodes.size(); i++) { auto kernel = kernel_cnodes[i]; MS_EXCEPTION_IF_NULL(kernel); SomasStreamPtr stream; - auto stream_id = AnfAlgo::GetStreamId(kernel); - auto it = find_if(streams_list_.begin(), streams_list_.end(), - [stream_id](const SomasStreamPtr &s) { return s->GetId() == stream_id; }); - if (it == streams_list_.end()) { + size_t stream_id = i; + if (depend_exec_order_) { + stream_id = AnfAlgo::GetStreamId(kernel); + } + auto it = streams_map_.find(stream_id); + if (it == streams_map_.end()) { stream = std::make_shared(stream_id); - streams_list_.push_back(stream); + streams_map_[stream_id] = stream; } else { - stream = *it; + stream = (*it).second; } // Node @@ -414,31 +624,22 @@ void Somas::InitSomasStreamAndNode(const session::KernelGraph *graph) { if (common::AnfAlgo::IsCommunicationOp(kernel)) { type = kCommunicationNode; } - auto node = std::make_shared(kernel->fullname_with_scope(), node_index, type, stream->GetId()); + auto node = std::make_shared(kernel->fullname_with_scope(), i, type, stream->GetId()); MS_EXCEPTION_IF_NULL(node); + MS_EXCEPTION_IF_CHECK_FAIL(nodes_list_.size() == i, "node_list_ size error!!!"); nodes_list_.push_back(node); stream->nodes_.push_back(node); auto key = kernel.get(); auto &nodes = nodes_map_[key]; nodes.push_back(node); - node_index++; - } - - // make nodes_id map - for (const auto &node : nodes_list_) { - if (nodes_id_map_.find(node->GetId()) != nodes_id_map_.end()) { - MS_LOG(EXCEPTION) << "Duplicate node id [" << node->GetId() << "]"; - } - nodes_id_map_[node->GetId()] = node; } } -void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph) { +void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph &graph) { MS_LOG(DEBUG) << "Somas InitSomasOutputAndWorkspaceTensors start..."; - MS_EXCEPTION_IF_NULL(graph); tensors_list_ = {}; size_t tensor_index = 0; - auto kernel_cnodes = 
graph->execution_order(); + auto &kernel_cnodes = graph.execution_order(); for (const auto &kernel : kernel_cnodes) { auto nodes = nodes_map_[kernel.get()]; auto node = nodes[0]; @@ -449,77 +650,75 @@ void Somas::InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); auto output_sizes = kernel_mod->GetOutputSizeList(); - auto index = 0; for (const auto &size : output_sizes) { auto output_tensor_index = tensor_index; tensor_index++; - // Set all output tensor lifelong to true. - auto tensor = std::make_shared(output_tensor_index, node->GetId(), stream_id, size, kLifeLongNone); + size_t aligned_size = GetAlignSize(size); + if (aligned_size == 0) { + // Device Address still need to be allocated when output_size is 0 + aligned_size = GetAlignSize(kZeroAlignSize); + } + MS_LOG(INFO) << "Node " << kernel->fullname_with_scope() << " output size " << size << " align size " + << aligned_size; + auto tensor = + std::make_shared(output_tensor_index, node->GetId(), stream_id, size, aligned_size, kLifeLongNone); MS_EXCEPTION_IF_NULL(tensor); tensor->lifetime_.start_ = node->GetId(); tensor->lifetime_.end_ = (nodes.size() > 1) ? 
nodes.back()->GetId() : node->GetId(); tensor->type_ = kOutputOnly; - if (AnfAlgo::OutputAddrExist(kernel, IntToSize(index))) { - tensor->aligned_size_ = 0; - } + MS_EXCEPTION_IF_CHECK_FAIL(tensors_list_.size() == output_tensor_index, "tensors_list_ size error!!!"); tensors_list_.push_back(tensor); - tensors_map_[output_tensor_index] = tensor; std::for_each(nodes.begin(), nodes.end(), [tensor](auto &node) { MS_EXCEPTION_IF_NULL(node); - node->tensors_.insert(tensor); node->output_tensors_.push_back(tensor); }); - index++; } // WorkSpace Tensor auto workspace_sizes = kernel_mod->GetWorkspaceSizeList(); - index = 0; for (const auto &size : workspace_sizes) { auto workspace_tensor_index = tensor_index; tensor_index++; - SomasTensorPtr tensor = - std::make_shared(workspace_tensor_index, node->GetId(), stream_id, size, kLifeLongNone); + size_t aligned_size = GetAlignSize(size); + if (aligned_size == 0) { + // Device Address still need to be allocated when workspace_size is 0 + aligned_size = GetAlignSize(kZeroAlignSize); + } + SomasTensorPtr tensor = std::make_shared(workspace_tensor_index, node->GetId(), stream_id, size, + aligned_size, kLifeLongNone); MS_EXCEPTION_IF_NULL(tensor); tensor->type_ = kWorkspace; tensor->lifetime_.start_ = node->GetId(); tensor->lifetime_.end_ = (nodes.size() > 1) ? 
nodes.back()->GetId() : node->GetId(); - if (AnfAlgo::WorkspaceAddrExist(kernel, IntToSize(index))) { - tensor->aligned_size_ = 0; - } + + MS_EXCEPTION_IF_CHECK_FAIL(tensors_list_.size() == workspace_tensor_index, "tensors_list_ size error!!!"); tensors_list_.push_back(tensor); - tensors_map_[workspace_tensor_index] = tensor; std::for_each(nodes.begin(), nodes.end(), [tensor](auto &node) { MS_EXCEPTION_IF_NULL(node); - node->tensors_.insert(tensor); node->workspace_tensors_.push_back(tensor); }); - index++; } } } -void Somas::InitSomasInputTensors(const session::KernelGraph *graph) { +void Somas::InitSomasInputTensors(const session::KernelGraph &graph) { MS_LOG(DEBUG) << "Somas InitSomasInputTensors start..."; - MS_EXCEPTION_IF_NULL(graph); - bool is_all_nop_node = opt::IsAllNopNode(graph); static const auto enable_fusion_clear = (common::GetEnv("ENV_FUSION_CLEAR") == "1"); - auto kernel_cnodes = graph->execution_order(); + auto &kernel_cnodes = graph.execution_order(); for (const auto &kernel : kernel_cnodes) { if (common::AnfAlgo::GetCNodeName(kernel) != kAtomicAddrCleanOpName) { - InitCommonNodeInputs(is_all_nop_node, kernel); + InitCommonNodeInputs(kernel); } else { InitAtomicCleanInputs(enable_fusion_clear, kernel); } } } -void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { +void Somas::InitCommonNodeInputs(const CNodePtr &kernel) { auto nodes = nodes_map_[kernel.get()]; auto node = nodes[0]; MS_EXCEPTION_IF_NULL(node); - auto stream_id = node->GetStreamId(); // Input Tensor auto input_tensor_num = common::AnfAlgo::GetInputTensorNum(kernel); @@ -527,17 +726,12 @@ void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { for (size_t i = 0; i < input_tensor_num; i++) { auto input_node = kernel->input(i + 1); MS_EXCEPTION_IF_NULL(input_node); - session::KernelWithIndex prenode_index; - if (is_all_nop_node) { - prenode_index = common::AnfAlgo::VisitKernelWithReturnType(input_node, 0, false); - } else { - 
prenode_index = common::AnfAlgo::VisitKernelWithReturnType(input_node, 0, true); - } - if (common::AnfAlgo::CheckPrimitiveType(prenode_index.first, prim::kPrimMakeTuple)) { - MS_LOG(EXCEPTION) << "Input node [" << kernel->DebugString() << "]'s input " << i << " [" - << input_node->DebugString() << "] is MakeTuple"; - } + session::KernelWithIndex prenode_index = GetVisitKernelWithReturnType(input_node, 0); MS_EXCEPTION_IF_NULL(prenode_index.first); + if (common::AnfAlgo::CheckPrimitiveType(prenode_index.first, prim::kPrimMakeTuple)) { + MS_LOG(EXCEPTION) << "Node " << node->scope_full_name_ << "'s input node [" << input_node->DebugString() + << "]'s input " << i << " is MakeTuple"; + } if (!AnfUtils::IsRealCNodeKernel(prenode_index.first)) { auto op_name = common::AnfAlgo::GetCNodeName(kernel); TypeId input_origin_type = common::AnfAlgo::GetPrevNodeOutputInferDataType(kernel, i); @@ -556,7 +750,7 @@ void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { MS_LOG(EXCEPTION) << "Kernel[" << kernel->fullname_with_scope() << "]'s input " << i << " [" << prenode_index.first->fullname_with_scope() << "] is not init."; } - auto pre_somas_node = iter->second.at(0); + SomasNodePtr pre_somas_node = iter->second.at(0); if (prenode_index.second > pre_somas_node->output_tensors_.size()) { MS_LOG(EXCEPTION) << "Output index " << prenode_index.second << " exceed input node [" << prenode_index.first->fullname_with_scope() << "]'s outputs size " @@ -573,6 +767,7 @@ void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { for (auto &repeat_node : nodes) { input_somas_tensor->destination_nodes_.insert(repeat_node->GetId()); + input_somas_tensor->consumer_list_.emplace_back(repeat_node->GetId()); if (input_somas_tensor->lifetime_.end_ < repeat_node->GetId()) { input_somas_tensor->lifetime_.end_ = repeat_node->GetId(); } @@ -581,10 +776,6 @@ void Somas::InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel) { if (node != 
pre_somas_node) { node->ancestor_nodes_.insert(pre_somas_node); } - auto input_tensor_stream_id = input_somas_tensor->GetSourceStreamId(); - if (input_tensor_stream_id != stream_id) { - input_somas_tensor->between_streams_ = true; - } } } @@ -614,7 +805,7 @@ void Somas::InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kern MS_EXCEPTION_IF_NULL(input_somas_tensor); node->input_tensors_.push_back(input_somas_tensor); if (enable_fusion_clear) { - input_somas_tensor->lifelong_value_ = kLifeLongGraphAll; + input_somas_tensor->lifelong_value_ = kLifeLongGraphStart; MS_LOG(INFO) << "Set " << node->scope_full_name_ << "'s Input node " << pre_somas_node->scope_full_name_ << " 's output" << index << " to lifelong"; } @@ -633,7 +824,7 @@ void Somas::InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kern MS_EXCEPTION_IF_NULL(input_somas_tensor); node->input_tensors_.push_back(input_somas_tensor); if (enable_fusion_clear) { - input_somas_tensor->lifelong_value_ = kLifeLongGraphAll; + input_somas_tensor->lifelong_value_ = kLifeLongGraphStart; MS_LOG(INFO) << "Set " << node->scope_full_name_ << "'s Input node " << pre_somas_node->scope_full_name_ << " 's workspace" << index << " to lifelong"; } @@ -642,47 +833,6 @@ void Somas::InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kern } } -void Somas::InitSomasEventInfos() { - MS_LOG(DEBUG) << "Somas InitSomasEventInfos start..."; - event_map_ = {}; - std::map send_recv_map; -#ifdef ENABLE_D - send_recv_map = device::ascend::AscendStreamAssign::GetInstance().get_event_map(); -#endif - for (const auto &send_recv : send_recv_map) { - size_t event_id = common::AnfAlgo::GetNodeAttr(send_recv.first, kAttrEventId); - event_map_[event_id] = std::make_pair(send_recv.first, send_recv.second); - } - - auto tensor_index = tensors_list_.size(); - for (const auto &event : event_map_) { - std::pair send_recv_pair = event.second; - auto send_iter = nodes_map_.find(send_recv_pair.first.get()); - auto 
recv_iter = nodes_map_.find(send_recv_pair.second.get()); - if (send_iter == nodes_map_.end() || recv_iter == nodes_map_.end()) { - continue; - } - - auto &somas_send = send_iter->second.at(0); - auto &somas_recv = recv_iter->second.at(0); - auto output_tensor_index = tensor_index; - tensor_index++; - SomasTensorPtr tensor = std::make_shared(output_tensor_index, somas_send->GetId(), - somas_send->GetStreamId(), 0, kLifeLongNone); - tensor->lifetime_.start_ = somas_send->GetId(); - tensor->lifetime_.end_ = somas_recv->GetId(); - tensor->type_ = kEventVirtualOutput; - tensor->destination_nodes_.insert(somas_recv->GetId()); - somas_send->tensors_.insert(tensor); - somas_send->output_tensors_.push_back(tensor); - somas_recv->input_tensors_.push_back(tensor); - somas_recv->ancestor_nodes_.insert(somas_send); - tensors_list_.push_back(tensor); - tensors_map_[output_tensor_index] = tensor; - } - MS_LOG(DEBUG) << "Somas InitSomasEventInfos end."; -} - SomasParameterPtr Somas::CreateSomasParameter(const AnfNodePtr &node, size_t index) { MS_EXCEPTION_IF_NULL(node); auto id = parameters_list_.size(); @@ -722,95 +872,23 @@ SomasParameterPtr Somas::GetSomasParameter(const AnfNodePtr &node, size_t index) } } -void Somas::InitBasicInfo(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); -#ifdef ENABLE_D - streams_groups_ = device::ascend::AscendStreamAssign::GetInstance().get_stream_group(); -#endif +bool Somas::InitBasicInfoFromGraph(const session::KernelGraph &graph) { InitSomasStreamAndNode(graph); InitSomasOutputAndWorkspaceTensors(graph); InitSomasInputTensors(graph); - InitSomasEventInfos(); - - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); - -#ifdef ENABLE_DUMP_IR - SubModuleId module = SubModuleId::SM_OPTIMIZER; - std::string name = "somas_initial_info." 
+ std::to_string(graph->graph_id()); - (void)mindspore::RDR::RecordString(module, name, SomasInfo()); -#endif - - save_graphs_ = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); - save_graphs_path_ = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); - if (save_graphs_path_.empty()) { - save_graphs_path_ = "."; - } - if (save_graphs_) { - std::string file_path = - GetSaveGraphsPathName("/somas_initial_info_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path_); - DumpSomasInfoIR(file_path); - } -} - -void Somas::GetNextOutputProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto kernel_cnodes = graph->execution_order(); - size_t total_size = 0; - for (const auto &kernel : kernel_cnodes) { - if (common::AnfAlgo::GetCNodeName(kernel) != kGetNextOpName) { - continue; - } - auto iter = nodes_map_.find(kernel.get()); - if (iter != nodes_map_.end()) { - auto &node = iter->second.at(0); - MS_EXCEPTION_IF_NULL(node); - auto getnext_output_tensors = node->output_tensors_; - for (auto &tensor : getnext_output_tensors) { - MS_EXCEPTION_IF_NULL(tensor); - total_size += tensor->GetAlignedSize(); - tensor->lifelong_value_ = kLifeLongGraphAll; - tensor->type_ = kGetNextOutput; - } - } - } - MS_LOG(INFO) << "Special Tensor total size: GetNext Output " << total_size; -} - -void Somas::IndependentNodeOutputProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto kernel_cnodes = graph->execution_order(); - size_t total_size = 0; - for (const auto &kernel : kernel_cnodes) { - bool independent = AnfAlgo::IsIndependentNode(kernel); - if (!independent) { - continue; - } - auto iter = nodes_map_.find(kernel.get()); - if (iter != nodes_map_.end()) { - auto &node = iter->second.at(0); - MS_EXCEPTION_IF_NULL(node); - auto semi_reuse_output_tensors = node->output_tensors_; - for (auto &tensor : semi_reuse_output_tensors) { - MS_EXCEPTION_IF_NULL(tensor); - total_size += tensor->GetAlignedSize(); - tensor->lifelong_value_ = 
kLifeLongGraphEnd; - } - } - } - - MS_LOG(INFO) << "Special Tensor total size: Independent Node output " << total_size; + InitControlTensors(graph); + GraphOutputProcess(graph); + return true; } #ifndef ENABLE_SECURITY -void Somas::SummaryInputProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - bool summary_exist = graph->summary_node_exist(); +void Somas::SummaryInputProcess(const session::KernelGraph &graph) { + bool summary_exist = graph.summary_node_exist(); if (!summary_exist) { return; } - auto summary_nodes = graph->summary_nodes(); + auto summary_nodes = graph.summary_nodes(); if (summary_nodes.empty()) { return; } @@ -819,7 +897,7 @@ void Somas::SummaryInputProcess(const session::KernelGraph *graph) { for (const auto &node_item : summary_nodes) { auto origin_node = node_item.second.first; size_t origin_index = IntToSize(node_item.second.second); - auto item_with_index = common::AnfAlgo::VisitKernelWithReturnType(origin_node, origin_index, true); + auto item_with_index = GetVisitKernelWithReturnType(origin_node, origin_index); auto node = item_with_index.first; size_t index = item_with_index.second; auto iter = nodes_map_.find(node.get()); @@ -829,7 +907,7 @@ void Somas::SummaryInputProcess(const session::KernelGraph *graph) { if (index < input_node->output_tensors_.size()) { auto tensor = input_node->output_tensors_[index]; MS_EXCEPTION_IF_NULL(tensor); - tensor->lifelong_value_ = kLifeLongGraphAll; + tensor->lifelong_value_ = kLifeLongGraphEnd; tensor->type_ = kSummaryInput; total_summary_size += tensor->GetAlignedSize(); MS_LOG(INFO) << "Set summary node input tensor's lifelong, node: " << node->fullname_with_scope() @@ -847,9 +925,50 @@ void Somas::SummaryInputProcess(const session::KernelGraph *graph) { } #endif -void Somas::RefNodeProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto kernel_cnodes = graph->execution_order(); +void Somas::GraphOutputProcess(const session::KernelGraph &graph) { + 
size_t count = 0; + auto outputs = common::AnfAlgo::GetAllOutputWithIndex(graph.output()); + for (auto output_with_index : outputs) { + auto output_kernel = output_with_index.first; + MS_EXCEPTION_IF_NULL(output_kernel); + if (AnfUtils::IsRealCNodeKernel(output_kernel) && nodes_map_.find(output_kernel.get()) == nodes_map_.end()) { + auto cnode = output_kernel->cast(); + if (!common::AnfAlgo::IsNopNode(cnode)) { + MS_LOG(EXCEPTION) << "Node[" << cnode->fullname_with_scope() + << "] doesn't exist in nodes_map and is not a nop node!!!"; + } + output_with_index = common::AnfAlgo::VisitKernelWithReturnType(cnode->input(kNopNodeRealInputIndex), 0, false); + output_kernel = output_with_index.first; + } + + if (!AnfUtils::IsRealCNodeKernel(output_kernel)) { + continue; + } + + auto output_index = output_with_index.second; + auto iter = nodes_map_.find(output_kernel.get()); + if (iter != nodes_map_.end()) { + auto &node = iter->second.at(0); + MS_EXCEPTION_IF_NULL(node); + if (output_index <= node->output_tensors_.size()) { + auto &tensor = node->output_tensors_[output_index]; + tensor->aligned_size_ = 0; + tensor->type_ = kGraphOutput; + count++; + } else { + MS_LOG(EXCEPTION) << "Graph's output node " << output_kernel->fullname_with_scope() << "'s output index" + << output_index << " is larger than its output tensor number " + << node->output_tensors_.size(); + } + } else { + MS_LOG(EXCEPTION) << "Can't find somas node for graph output node " << output_kernel->fullname_with_scope(); + } + } + MS_LOG(INFO) << "Set " << count << " graph output tensors' aligned size to 0."; +} + +void Somas::RefNodeProcess(const session::KernelGraph &graph) { + auto &kernel_cnodes = graph.execution_order(); size_t total_output_size = 0; size_t total_input_size = 0; for (const auto &kernel : kernel_cnodes) { @@ -864,17 +983,30 @@ void Somas::RefNodeProcess(const session::KernelGraph *graph) { auto out_index = output_index; output_index++; session::AnfWithOutIndex out_pair(kernel, out_index); 
- if (graph->IsInRefOutputMap(out_pair)) { - auto origin_pair = graph->GetRefCorrespondOutput(out_pair); + if (graph.IsInRefOutputMap(out_pair)) { + auto origin_pair = graph.GetRefCorrespondOutput(out_pair); MS_EXCEPTION_IF_NULL(origin_pair.first); auto &node = nodes_map_[kernel.get()].at(0); MS_EXCEPTION_IF_NULL(node); auto output_tensor = node->output_tensors_[out_index]; MS_EXCEPTION_IF_NULL(output_tensor); - output_tensor->type_ = kRefNodeOutput; + output_tensor->type_ = kUnion; total_output_size += size; if (AnfUtils::IsRealCNodeKernel(origin_pair.first)) { + if (nodes_map_.find(origin_pair.first.get()) == nodes_map_.end()) { + auto cnode = origin_pair.first->cast(); + if (!common::AnfAlgo::IsNopNode(cnode)) { + MS_LOG(EXCEPTION) << "Node[" << origin_pair.first->fullname_with_scope() << "] find input node[" + << cnode->fullname_with_scope() + << "] doesn't exist in nodes_map and is not a nop node!!!!"; + } + origin_pair = common::AnfAlgo::VisitKernelWithReturnType(cnode->input(kNopNodeRealInputIndex), 0, false); + } + if (!origin_pair.first->isa()) { + MS_LOG(EXCEPTION) << "The origin_pair.first is not a cnode. 
Info origin_pair.first: " + << origin_pair.first->DebugString(); + } auto ori_node = origin_pair.first->cast(); auto ori_index = origin_pair.second; if (nodes_map_.find(ori_node.get()) == nodes_map_.end()) { @@ -886,13 +1018,16 @@ void Somas::RefNodeProcess(const session::KernelGraph *graph) { MS_EXCEPTION_IF_NULL(repeat_node); auto input_tensor = repeat_node->output_tensors_[ori_index]; MS_EXCEPTION_IF_NULL(input_tensor); - input_tensor->type_ = kRefNodeInput; + input_tensor->type_ = kUnion; total_input_size += input_tensor->aligned_size_; std::vector refnode_input_output; refnode_input_output.push_back(input_tensor->GetId()); refnode_input_output.push_back(output_tensor->GetId()); - ref_node_constraints_.push_back(refnode_input_output); + union_tensors_list_.push_back(refnode_input_output); MS_LOG(INFO) << "RefNode: input " << input_tensor->GetId() << " output " << output_tensor->GetId(); + } else { + output_tensor->type_ = kGraphInput; + output_tensor->aligned_size_ = 0; } } } @@ -901,66 +1036,60 @@ void Somas::RefNodeProcess(const session::KernelGraph *graph) { MS_LOG(INFO) << "Special Tensor total size: RefNode: input " << total_input_size << " output " << total_output_size; } -void Somas::NonTaskSplitProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - auto kernel_cnodes = graph->execution_order(); - for (const auto &kernel : kernel_cnodes) { - auto op_name = common::AnfAlgo::GetCNodeName(kernel); - if (common::AnfAlgo::IsNonTaskOp(kernel)) { - std::vector refnode_input_output; - auto node = nodes_map_[kernel.get()].at(0); - MS_EXCEPTION_IF_NULL(node); - if (node->input_tensors_.size() == 0) { - MS_LOG(EXCEPTION) << op_name << " has no input tensor, can not do split non_task process."; - } - auto input_tensor = node->input_tensors_[0]; - MS_EXCEPTION_IF_NULL(input_tensor); - input_tensor->type_ = kRefNodeInput; - refnode_input_output.push_back(input_tensor->GetId()); - - for (auto &output_tensor : node->output_tensors_) { - 
MS_EXCEPTION_IF_NULL(output_tensor); - output_tensor->type_ = kRefNodeOutput; - refnode_input_output.push_back(output_tensor->GetId()); - } - ref_node_constraints_.push_back(refnode_input_output); - } +void Somas::UnReuseNodeProcess(const session::KernelGraph &graph) { + std::map full_name_type = {}; + for (const auto &node : un_reuse_node_name_) { + full_name_type.insert(node); } -} -void Somas::UnReuseNodeProcess(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); - vector full_name_list = {}; - if (full_name_list.size() == 0) { + auto &kernel_cnodes = graph.execution_order(); + for (const auto &kernel : kernel_cnodes) { + auto type = common::AnfAlgo::GetCNodeName(kernel); + auto iter = un_reuse_node_type_.find(type); + if (iter == un_reuse_node_type_.end()) { + continue; + } + auto full_name = kernel->fullname_with_scope(); + full_name_type[full_name] = iter->second; + } + + if (full_name_type.empty()) { return; } - auto kernel_cnodes = graph->execution_order(); for (const auto &kernel : kernel_cnodes) { MS_EXCEPTION_IF_NULL(kernel); auto full_name = kernel->fullname_with_scope(); - auto iter = std::find(full_name_list.begin(), full_name_list.end(), full_name); - if (iter != full_name_list.end()) { - MS_LOG(INFO) << "Set UnReuse Node in somas, Node:" << full_name; - auto key = kernel.get(); - auto somas_node = nodes_map_[key].at(0); - MS_EXCEPTION_IF_NULL(somas_node); - // input + auto iter = full_name_type.find(full_name); + if (iter == full_name_type.end()) { + continue; + } + auto un_reuse_type = iter->second; + MS_LOG(INFO) << "Set UnReuse Node in somas, Node:" << iter->first << ", UnReuse type:" << un_reuse_type; + auto key = kernel.get(); + auto somas_node = nodes_map_[key].at(0); + MS_EXCEPTION_IF_NULL(somas_node); + // input + if (un_reuse_type == UnReuseType::kUnReuseAll || un_reuse_type == UnReuseType::kUnReuseInput) { auto inputs = somas_node->input_tensors_; for (auto &input : inputs) { MS_EXCEPTION_IF_NULL(input); 
input->lifelong_value_ = kLifeLongGraphAll; } + } - // output + // output + if (un_reuse_type == UnReuseType::kUnReuseAll || un_reuse_type == UnReuseType::kUnReuseOutput) { auto outputs = somas_node->output_tensors_; MS_LOG(INFO) << "Output size of " << kernel->fullname_with_scope() << " is " << outputs.size(); for (auto &output : outputs) { MS_EXCEPTION_IF_NULL(output); output->lifelong_value_ = kLifeLongGraphAll; } + } - // workspace + // workspace + if (un_reuse_type == UnReuseType::kUnReuseAll || un_reuse_type == UnReuseType::kUnReuseWorkspace) { auto workspaces = somas_node->workspace_tensors_; for (auto &workspace : workspaces) { MS_EXCEPTION_IF_NULL(workspace); @@ -970,8 +1099,7 @@ void Somas::UnReuseNodeProcess(const session::KernelGraph *graph) { } } -void Somas::GenContiguousList(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(graph); +void Somas::CommunicationNodeProcess(const session::KernelGraph &graph) { for (const auto &node : nodes_list_) { MS_EXCEPTION_IF_NULL(node); if (node->GetType() != kCommunicationNode) { @@ -980,12 +1108,14 @@ void Somas::GenContiguousList(const session::KernelGraph *graph) { // Contiguous input if ((!node->input_tensors_.empty()) && (!node->input_tensors_[0]->contiguous_)) { + // add gap for first and last input if (node->input_tensors_[0]->aligned_size_ != 0) { - node->input_tensors_[0]->aligned_size_ += kGapSize; + node->input_tensors_[0]->aligned_size_ += communication_gap_size_; } if (node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_ != 0) { - node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_ += kGapSize; + node->input_tensors_[node->input_tensors_.size() - 1]->aligned_size_ += communication_gap_size_; } + std::vector inputs; for (const auto &input_tensor : node->input_tensors_) { MS_EXCEPTION_IF_NULL(input_tensor); @@ -1002,12 +1132,14 @@ void Somas::GenContiguousList(const session::KernelGraph *graph) { // Contiguous output if ((!node->output_tensors_.empty()) && 
(!node->output_tensors_[0]->contiguous_)) { + // add gap for first and last output if (node->output_tensors_[0]->aligned_size_ != 0) { - node->output_tensors_[0]->aligned_size_ += kGapSize; + node->output_tensors_[0]->aligned_size_ += communication_gap_size_; } if (node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_ != 0) { - node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_ += kGapSize; + node->output_tensors_[node->output_tensors_.size() - 1]->aligned_size_ += communication_gap_size_; } + std::vector outputs; for (const auto &output_tensor : node->output_tensors_) { MS_EXCEPTION_IF_NULL(output_tensor); @@ -1021,9 +1153,22 @@ void Somas::GenContiguousList(const session::KernelGraph *graph) { } contiguous_tensors_list_.push_back(outputs); } + + // check the tensors of the list + std::set all_contiguous_tensors_set; + size_t all_contiguous_tensors_num = 0; + for (auto &tensors : contiguous_tensors_list_) { + all_contiguous_tensors_num += tensors.size(); + all_contiguous_tensors_set.insert(tensors.begin(), tensors.end()); + } + if (all_contiguous_tensors_num != all_contiguous_tensors_set.size()) { + MS_LOG(EXCEPTION) << "Please check the CommunicationNodes, some tensor are in multiple contiguous list"; + } } } +bool Somas::NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2) { return node1->GetId() < node2->GetId(); } + void Somas::BuildConflictInfo(const std::shared_ptr &tensor, TensorConflictInfo *tensor_conflict_info, std::vector *destination_node_list) { const auto &consumer_list = tensor->consumer_list_; @@ -1044,12 +1189,7 @@ void Somas::BuildConflictInfo(const std::shared_ptr &tensor, Tensor } } -void Somas::ComputeConflictPairs() { - if (tensors_list_.empty()) { - MS_LOG(INFO) << "No Tensor for Conflict computing"; - return; - } - +void Somas::ComputeBasicMatrix() { MS_LOG(INFO) << "Start Conflict Computing (Bitset Model)"; auto start_conflict = std::chrono::system_clock::now(); std::sort(nodes_list_.begin(), 
nodes_list_.end(), NodeSort); @@ -1127,11 +1267,8 @@ void Somas::ComputeConflictPairs() { common::ThreadPool::GetInstance().SyncRun(tasks); } - ProcessSemiLifeLongTensor(); - - MS_LOG(INFO) << "End Tensor Relation Computing"; auto end_conflict = std::chrono::system_clock::now(); - MS_LOG(INFO) << "End Conflict Computing (Bitset Model)(time taken " + MS_LOG(INFO) << "End Basic Conflict Computing (Bitset Model)(time taken " << std::chrono::duration_cast(end_conflict - start_conflict).count() << "ms)"; } @@ -1146,8 +1283,13 @@ void Somas::ProcessSemiLifeLongTensor() { if (calc_tensor == target_tensor) { continue; } - if ((calc_tensor->IsSemiLifelongStart() && target_tensor->GetId() < calc_tensor->GetId()) || - (calc_tensor->IsSemiLifelongEnd() && target_tensor->GetId() > calc_tensor->GetId())) { + if (depend_exec_order_) { + if ((calc_tensor->IsSemiLifelongStart() && target_tensor->GetId() < calc_tensor->GetId()) || + (calc_tensor->IsSemiLifelongEnd() && target_tensor->GetId() > calc_tensor->GetId())) { + reuse_matrix_[calc_tensor->GetId()].SetBitFalse(target_tensor->GetId()); + reuse_matrix_[target_tensor->GetId()].SetBitFalse(calc_tensor->GetId()); + } + } else { reuse_matrix_[calc_tensor->GetId()].SetBitFalse(target_tensor->GetId()); reuse_matrix_[target_tensor->GetId()].SetBitFalse(calc_tensor->GetId()); } @@ -1155,65 +1297,62 @@ void Somas::ProcessSemiLifeLongTensor() { } } +void Somas::ComputeConflictMatrix() { + if (tensors_list_.empty()) { + MS_LOG(INFO) << "No Tensor for Conflict computing"; + return; + } + ComputeBasicMatrix(); + ProcessSemiLifeLongTensor(); + UpdateUnionTensorsConflict(); +} + +void Somas::UpdateContiguousTensorList() { + processed_contiguous_tensors_list_.clear(); + processed_contiguous_tensors_list_.insert(processed_contiguous_tensors_list_.end(), contiguous_tensors_list_.begin(), + contiguous_tensors_list_.end()); + std::set> contiguous_tensors_list_to_remove; + + GetContiguousListContainUnionTensor(); + for (const auto &ref_list_pair : 
contiguous_list_with_ref_index_map_) { + contiguous_tensors_list_to_remove.insert(contiguous_tensors_list_[ref_list_pair.second]); + } + + // remove the contiguous list which all tensors' align size is 0 + for (const auto &contiguous_list : contiguous_tensors_list_) { + bool all_outputs = true; + for (auto tensor_id : contiguous_list) { + auto tensor = tensors_list_[tensor_id]; + MS_EXCEPTION_IF_NULL(tensor); + if (tensor->aligned_size_ != 0) { + all_outputs = false; + break; + } + } + + if (all_outputs) { + contiguous_tensors_list_to_remove.insert(contiguous_list); + } + } + + for (const auto &contiguous_list : contiguous_tensors_list_to_remove) { + auto iterator = + std::find(processed_contiguous_tensors_list_.begin(), processed_contiguous_tensors_list_.end(), contiguous_list); + if (iterator != processed_contiguous_tensors_list_.end()) { + processed_contiguous_tensors_list_.erase(iterator); + } else { + MS_LOG(WARNING) << "Could not find contiguous list to remove for ref"; + } + } +} + void Somas::UpdateTensorDestinations() { - // Loop to add edges within each stream (node order within stream) - for (const auto &stream : streams_list_) { - MS_EXCEPTION_IF_NULL(stream); - auto &nodes = stream->nodes_; - std::sort(nodes.begin(), nodes.end(), NodeSort); - for (size_t i = 1; i < nodes.size(); i++) { - const auto &previous_node = nodes[i - 1]; - const auto ¤t_node = nodes[i]; - MS_EXCEPTION_IF_NULL(current_node); - current_node->ancestor_nodes_.insert(previous_node); - } - } - - // Loop to add edges from end to beginning of next group - for (const auto &group : streams_groups_) { - for (size_t i = 1; i < group.size(); i++) { - size_t previous_stream = group[i - 1]; - size_t current_stream = group[i]; - - auto stream = GetSomasStream(previous_stream); - if (stream == nullptr) { - continue; - } - - auto &last_node_in_prev_stream = stream->nodes_.back(); - - stream = GetSomasStream(current_stream); - if (stream == nullptr) { - continue; - } - auto 
&first_node_in_cur_stream = stream->nodes_.front(); - - first_node_in_cur_stream->ancestor_nodes_.insert(last_node_in_prev_stream); - } - } - // Loop to avoid tensors with empty destinations (add itself) for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); - if (tensor->destination_nodes_.size() == 0) { + if (tensor->destination_nodes_.empty()) { tensor->destination_nodes_.insert(tensor->GetSourceNodeId()); - } - } - - mindspore::HashMap stream_max_destination_node; - // Loop to compute max destinations in each stream - for (const auto &tensor : tensors_list_) { - MS_EXCEPTION_IF_NULL(tensor); - stream_max_destination_node.clear(); - for (const auto &node_id : tensor->destination_nodes_) { - auto node = GetSomasNode(node_id); - MS_EXCEPTION_IF_NULL(node); - if (node_id > stream_max_destination_node[node->GetStreamId()]) { - stream_max_destination_node[node->GetStreamId()] = node_id; - } - } - for (const auto &dst_map : stream_max_destination_node) { - tensor->consumer_list_.emplace_back(dst_map.second); + tensor->consumer_list_.emplace_back(tensor->GetSourceNodeId()); } } } @@ -1270,7 +1409,7 @@ void Somas::ComputeOneTensorConflicts(const std::shared_ptr &target const std::vector &tensor_conflict_info_list, const std::vector &destination_node_list, const vector &nodes_dependency, - std::vector *tensor_relation) const { + std::vector *tensor_relation) { MS_EXCEPTION_IF_NULL(target_tensor); auto target_tensor_id = target_tensor->GetId(); auto target_src_node_id = target_tensor->GetSourceNodeId(); @@ -1281,86 +1420,41 @@ void Somas::ComputeOneTensorConflicts(const std::shared_ptr &target // the conflict info of per calc_tensor for (const auto &tensor_conflict_info : tensor_conflict_info_list) { - if (tensor_conflict_info.tensor_id_ == target_tensor_id || - tensor_conflict_info.src_node_id_ == target_src_node_id) { + if (tensor_conflict_info.tensor_id == target_tensor_id || tensor_conflict_info.src_node_id == target_src_node_id) { continue; } if 
(CheckIsDependency(tensor_conflict_info, target_src_node_id, nodes_dependency, destination_node_list) || - CheckIsDependency(target_info, tensor_conflict_info.src_node_id_, nodes_dependency, + CheckIsDependency(target_info, tensor_conflict_info.src_node_id, nodes_dependency, target_destination_node_list)) { // calc_tensor and target_tensor have dependencies so they can reuse each other - (*tensor_relation)[target_tensor_id].SetBitTrue(tensor_conflict_info.tensor_id_); + (*tensor_relation)[target_tensor_id].SetBitTrue(tensor_conflict_info.tensor_id); } } } -bool Somas::NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2) { return node1->GetId() < node2->GetId(); } - -bool Somas::Assign(const session::KernelGraph *graph) { - MS_LOG(DEBUG) << "Somas Assign start..."; +bool Somas::Solve(const session::KernelGraph &graph) { + MS_LOG(INFO) << "Somas Assign start..."; if (tensors_list_.empty()) { MS_LOG(INFO) << "No Tensor for Assigner"; return true; } - // Ref Node Preprocessing - UpdateRefTensorsConflict(); - std::map contiguous_list_with_ref_index_map = GetContiguousListContainRefTensor(); - vector> contiguous_tensors_list_removed = contiguous_tensors_list_; - std::set> contiguous_tensors_list_to_remove; - for (const auto &ref_list_pair : contiguous_list_with_ref_index_map) { - contiguous_tensors_list_to_remove.insert(contiguous_tensors_list_[ref_list_pair.second]); - } - - // remove the contiguous list which all tensors' align size is 0 - for (auto contiguous_list : contiguous_tensors_list_) { - bool all_outputs = true; - for (auto tensor_id : contiguous_list) { - auto tensor = tensors_list_[tensor_id]; - MS_EXCEPTION_IF_NULL(tensor); - if (tensor->aligned_size_ != 0) { - all_outputs = false; - break; - } - } - - if (all_outputs) { - contiguous_tensors_list_to_remove.insert(contiguous_list); - } - } - - for (const auto &contiguous_list : contiguous_tensors_list_to_remove) { - auto iterator = - std::find(contiguous_tensors_list_removed.begin(), 
contiguous_tensors_list_removed.end(), contiguous_list); - if (iterator != contiguous_tensors_list_removed.end()) { - contiguous_tensors_list_removed.erase(iterator); - } else { - MS_LOG(WARNING) << "Could not find contiguous list to remove for ref"; - } - } - MS_LOG(INFO) << "End Solving Preprocessing for Ref Node"; - UpdateRefOverlapTensorsConflicts(); - -#ifdef SOMAS_DEBUG - // Compute number of constraints for each tensor + // Compute number of constraints for each tensor which will used in solver auto tensors_num = tensors_list_.size(); - for (auto tensor1 : tensors_list_) { - auto ones_num = reuse_matrix_[tensor1->GetId()].CountOnesNum(); - tensor1->num_constraints_ = tensors_num - ones_num; + for (const auto &tensor : tensors_list_) { + auto ones_num = reuse_matrix_[tensor->GetId()].CountOnesNum(); + tensor->num_constraints_ = tensors_num - ones_num; } -#endif // Prepare solver info - MS_LOG(INFO) << "Start Loop to create solver info"; - for (auto tensor : tensors_list_) { + for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); if (tensor->GetSolverTensorDesc() != nullptr) { SomasSolverTensorDescPtr pSolverTensor = tensor->GetSolverTensorDesc(); (void)solver_tensor_desc_map_.emplace(pSolverTensor->index_, pSolverTensor); } } - MS_LOG(INFO) << "End Loop to create solver info"; MS_LOG(INFO) << "Start Solving"; if (solver_tensor_desc_map_.empty()) { @@ -1370,7 +1464,7 @@ bool Somas::Assign(const session::KernelGraph *graph) { somas_solver_ = std::make_shared(); auto status = - somas_solver_->Solving(graph, &solver_tensor_desc_map_, &reuse_matrix_, contiguous_tensors_list_removed, false); + somas_solver_->Solving(graph, &solver_tensor_desc_map_, &reuse_matrix_, processed_contiguous_tensors_list_, false); MS_LOG(INFO) << "End Solving"; if (status != SUCCESS) { GenGraphStatisticInfo(); @@ -1383,18 +1477,18 @@ bool Somas::Assign(const session::KernelGraph *graph) { tensor->SetOffset(); } - UpdateRefTensorsOffset(); - 
UpdateContiguousTensorsOffset(contiguous_list_with_ref_index_map); + UpdateUnionTensorsOffset(); + UpdateContiguousTensorsOffset(contiguous_list_with_ref_index_map_); - // Set mem_offset_ value by solver result - mem_offset_ = static_cast(somas_solver_->GetMaxOffset()); - MS_LOG(DEBUG) << "Somas Assign end."; + reused_memory_size_ = static_cast(somas_solver_->GetMaxOffset()); + + MS_LOG(INFO) << "Somas Assign end."; return true; } -std::map Somas::GetContiguousListContainRefTensor() { - // key: contiguous list index with ref node input; value: contiguous list index with ref node output - std::map contiguous_list_with_ref_index_map; +void Somas::GetContiguousListContainUnionTensor() { + // key: contiguous list index with first union tensor; value: contiguous list index with other union tensor + contiguous_list_with_ref_index_map_.clear(); std::map ref_tensors_in_contiguous_map = GetRefTensorsInContiguousList(); std::map>> contiguous_ref_list_error_check_map; for (const auto &ref_pair : ref_tensors_in_contiguous_map) { @@ -1433,9 +1527,9 @@ std::map Somas::GetContiguousListContainRefTensor() { if (!found_second) { MS_LOG(WARNING) << "Contiguous ref tensor " << ref_second << " not found in any contiguous list"; } - if (contiguous_list_with_ref_index_map.find(index_first) == contiguous_list_with_ref_index_map.end() || - contiguous_list_with_ref_index_map[index_first] == index_second) { - contiguous_list_with_ref_index_map[index_first] = index_second; + if (contiguous_list_with_ref_index_map_.find(index_first) == contiguous_list_with_ref_index_map_.end() || + contiguous_list_with_ref_index_map_[index_first] == index_second) { + contiguous_list_with_ref_index_map_[index_first] = index_second; // Checking for error cases if (index_in_list_first != index_in_list_second) { MS_LOG(WARNING) << "Inconsistency in contiguous ref: tensor " << ref_first << " in position " @@ -1445,7 +1539,7 @@ std::map Somas::GetContiguousListContainRefTensor() { 
contiguous_ref_list_error_check_map[index_first][index_second].insert(index_in_list_first); } else { MS_LOG(WARNING) << "Contiguous list " << index_first << " associated (ref node) with two other contiguous lists: " - << contiguous_list_with_ref_index_map[index_first] << " and " << index_second; + << contiguous_list_with_ref_index_map_[index_first] << " and " << index_second; } } @@ -1466,24 +1560,23 @@ std::map Somas::GetContiguousListContainRefTensor() { } } } - return contiguous_list_with_ref_index_map; } std::map Somas::GetRefTensorsInContiguousList() { // key: refnode input value: refnode output std::map ref_tensors_in_contiguous_map; - for (auto ref_node_list : ref_node_constraints_) { + for (auto ref_node_list : union_tensors_list_) { // Count contiguous tensors in ref list auto contiguous_in_ref_list = std::count_if(ref_node_list.begin(), ref_node_list.end(), - [this](size_t tid) { return tensors_map_[tid]->contiguous_; }); + [this](size_t tid) { return tensors_list_[tid]->contiguous_; }); // Keep info about contiguous and check for errors if (ref_node_list.size() > kRefNodeTensorNum && contiguous_in_ref_list > 0) { MS_LOG(WARNING) << "Ref node of size greater than two with at least one contiguous tensor in"; } if (ref_node_list.size() == kRefNodeTensorNum && contiguous_in_ref_list == 1) { MS_LOG(WARNING) << "Ref node of size two with only one contiguous tensor" << ref_node_list[0] << ":" - << tensors_map_[ref_node_list[0]]->contiguous_ << ", " << ref_node_list[1] << ":" - << tensors_map_[ref_node_list[1]]->contiguous_; + << tensors_list_[ref_node_list[0]]->contiguous_ << ", " << ref_node_list[1] << ":" + << tensors_list_[ref_node_list[1]]->contiguous_; } if (ref_node_list.size() == kRefNodeTensorNum && LongToSize(contiguous_in_ref_list) == kRefNodeTensorNum) { ref_tensors_in_contiguous_map[ref_node_list[0]] = ref_node_list[1]; @@ -1498,54 +1591,36 @@ void Somas::UpdateContiguousTensorsOffset(const std::map &contig size_t index_first = ref_list_pair.first; 
size_t index_second = ref_list_pair.second; for (size_t x = 0; x < contiguous_tensors_list_[index_second].size(); x++) { - tensors_map_[contiguous_tensors_list_[index_second][x]]->offset_ = - tensors_map_[contiguous_tensors_list_[index_first][x]]->offset_; + tensors_list_[contiguous_tensors_list_[index_second][x]]->offset_ = + tensors_list_[contiguous_tensors_list_[index_first][x]]->offset_; + tensors_list_[contiguous_tensors_list_[index_second][x]]->aligned_size_ = + tensors_list_[contiguous_tensors_list_[index_first][x]]->aligned_size_; } } // Contiguous gaps postprocessing for (auto list : contiguous_tensors_list_) { - tensors_map_[list[0]]->offset_ += kGapSize; + tensors_list_[list[0]]->offset_ += communication_gap_size_; } } -void Somas::UpdateRefTensorsOffset() { - // Ref Node Postprocessing - MS_LOG(INFO) << "\nStart Solving Postprocessing for Ref Node"; +void Somas::UpdateUnionTensorsOffset() { // Set offset for rest of ref node list (ignored by solver due to ref node preprocessing) - for (auto ref_node_list : ref_node_constraints_) { + for (auto ref_node_list : union_tensors_list_) { for (size_t i = 1; i < ref_node_list.size(); ++i) { - tensors_map_[ref_node_list[i]]->offset_ = tensors_map_[ref_node_list[0]]->offset_; + tensors_list_[ref_node_list[i]]->offset_ = tensors_list_[ref_node_list[0]]->offset_; + tensors_list_[ref_node_list[i]]->aligned_size_ = tensors_list_[ref_node_list[0]]->aligned_size_; } } } -void Somas::UpdateRefOverlapTensorsConflicts() { - // Ref Overlap Preprocessing - MS_LOG(INFO) << "Start Solving Preprocessing for Ref Overlap"; - // In ConflictComputing(), by use of ref_overlap_ flag, each tensor in a ref_overlap_list has all entries 1 in - // cannot_reuse_ array Here, we allow reuse only among tensors in same list - for (auto ref_overlap_list : ref_overlap_constraints_) { - for (size_t tid_1 : ref_overlap_list) { - for (size_t tid_2 : ref_overlap_list) { - reuse_matrix_[tid_1].SetBitTrue(tid_2); - 
reuse_matrix_[tid_2].SetBitTrue(tid_1); - } - } - } - MS_LOG(INFO) << "End Solving Preprocessing for Ref Overlap"; -} - -void Somas::UpdateRefTensorsConflict() { +void Somas::UpdateUnionTensorsConflict() { // Keep all constraints for first tensor in list - for (auto ref_node_list : ref_node_constraints_) { - size_t tid_0 = ref_node_list[0]; - for (SomasTensorPtr tensor : tensors_list_) { - if (reuse_matrix_[tid_0].IsBitTrue(tensor->GetId()) == false) { - continue; - } - for (size_t tid : ref_node_list) { - if (reuse_matrix_[tid].IsBitTrue(tensor->GetId()) == false) { + for (auto union_node_list : union_tensors_list_) { + size_t tid_0 = union_node_list[0]; + for (const SomasTensorPtr &tensor : tensors_list_) { + for (size_t tid : union_node_list) { + if (!reuse_matrix_[tid].IsBitTrue(tensor->GetId())) { reuse_matrix_[tid_0].SetBitFalse(tensor->GetId()); reuse_matrix_[tensor->GetId()].SetBitFalse(tid_0); break; @@ -1553,15 +1628,20 @@ void Somas::UpdateRefTensorsConflict() { } } // Set rest to size 0, so that solver ignores them (if not contiguous) - for (size_t i = 1; i < ref_node_list.size(); ++i) { - if (!tensors_map_[ref_node_list[i]]->contiguous_) { - tensors_map_[ref_node_list[i]]->aligned_size_ = 0; + for (size_t i = 1; i < union_node_list.size(); ++i) { + if (!tensors_list_[union_node_list[i]]->contiguous_) { + if (tensors_list_[union_node_list[i]]->aligned_size_ > tensors_list_[union_node_list[0]]->aligned_size_) { + MS_LOG(WARNING) << "The aligned_size of union tensor " << tensors_list_[union_node_list[i]]->GetId() + << " is bigger than the aligned_size of union tensor " + << tensors_list_[union_node_list[0]]->GetId(); + } + tensors_list_[union_node_list[i]]->aligned_size_ = 0; } } } } -std::string Somas::GetSplitName(const std::string &scope_name) const { +std::string Somas::GetSplitName(const std::string &scope_name) { auto index = scope_name.rfind('/'); if (index == std::string::npos) { return scope_name; @@ -1582,6 +1662,33 @@ std::string 
Somas::SomasInfo(bool calc_hash) const { DumpTensors(oss); DumpNodes(oss); + oss << "\n\nAll Union Tensors Info:\n\n"; + for (const auto &ref_in_out : union_tensors_list_) { + oss << "union tensors: ["; + for (const auto &item : ref_in_out) { + oss << "%" << item << "T "; + } + oss << "]\n"; + } + + oss << "\n\nAll Original Contiguous Tensors Info:\n\n"; + for (const auto &contiguous : contiguous_tensors_list_) { + oss << "contiguous tensors: ["; + for (const auto &item : contiguous) { + oss << "%" << item << "T "; + } + oss << "]\n"; + } + + oss << "\n\nAll Processed Contiguous Tensors Info:\n\n"; + for (const auto &contiguous : processed_contiguous_tensors_list_) { + oss << "contiguous tensors: ["; + for (const auto &item : contiguous) { + oss << "%" << item << "T "; + } + oss << "]\n"; + } + oss << "\n\nAll Stream Groups:\n\n"; for (const auto &stream_group : streams_groups_) { for (const auto &stream : stream_group) { @@ -1590,25 +1697,13 @@ std::string Somas::SomasInfo(bool calc_hash) const { oss << "\n"; } - if (!ref_node_constraints_.empty()) { - oss << "\n\nAll Ref Node Info:\n\n"; - for (const auto &ref_in_out : ref_node_constraints_) { - oss << "refnode input-output:"; - for (const auto &item : ref_in_out) { - oss << "%" << item << "T "; - } - oss << "\n"; - } + oss << "\n\nAll Merged Blocks:\n\n"; + oss << "start_offset:" + << "\tsize:\n"; + for (const auto &merged_block : dump_merged_blocks_) { + oss << merged_block.first << "\t" << merged_block.second << "\n"; } - - for (const auto &event : event_map_) { - std::pair send_recv_pair = event.second; - std::string send_split_name = GetSplitName(send_recv_pair.first->fullname_with_scope()); - std::string recv_split_name = GetSplitName(send_recv_pair.second->fullname_with_scope()); - oss << "event_id:" << event.first << " send:" << send_split_name << " recv:" << recv_split_name; - oss << "\n"; - } - + oss << "\nTotal Memory Size after reused:" << reused_memory_size_; return oss.str(); } @@ -1633,8 +1728,8 @@ 
void Somas::DumpNodes(std::ostringstream &oss) const { tensor_index++; } } - oss << "]"; + oss << "\toutputs["; for (const auto &out : node->output_tensors_) { MS_EXCEPTION_IF_NULL(out); @@ -1642,6 +1737,7 @@ void Somas::DumpNodes(std::ostringstream &oss) const { << ", "; } oss << "]"; + oss << "\tworkspace["; for (const auto &wk : node->workspace_tensors_) { MS_EXCEPTION_IF_NULL(wk); @@ -1649,6 +1745,23 @@ void Somas::DumpNodes(std::ostringstream &oss) const { << ", "; } oss << "]"; + + oss << "\tctrl_inputs["; + for (const auto &ctrl_in : node->control_input_tensors_) { + MS_EXCEPTION_IF_NULL(ctrl_in); + oss << "%" << ctrl_in->GetId() << "CT" + << ", "; + } + oss << "]"; + + oss << "\tctrl_outputs["; + for (const auto &ctrl_out : node->control_output_tensors_) { + MS_EXCEPTION_IF_NULL(ctrl_out); + oss << "%" << ctrl_out->GetId() << "CT" + << ", "; + } + oss << "]"; + oss << "\tstreamID[" << "@" << node->GetStreamId() << "]\n"; } @@ -1657,10 +1770,9 @@ void Somas::DumpNodes(std::ostringstream &oss) const { void Somas::DumpTensors(std::ostringstream &oss) const { oss << "\n\nAll Tensors:\n\n"; oss << "index:" - << "\tsize:" - << "\treal_size:" + << "\taligned_size:" + << "\toriginal_size:" << "\toffset:" - << "\taddr:" << "\ttype:" << "\tlifelong:" << "\tlife_start:" @@ -1680,9 +1792,23 @@ void Somas::DumpTensors(std::ostringstream &oss) const { << "#" << tensor->GetOriginalSize() << "S" << "\t" << "&" << tensor->GetOffset() << "" + << "\t" << tensor->GetTypeString() << "\t" << tensor->GetLifelongString() << "\t" << tensor->lifetime_.start_ + << "\t" << tensor->lifetime_.end_ << "\t" << split_name << "\n"; + } + for (const auto &tensor : control_tensors_list_) { + MS_EXCEPTION_IF_NULL(tensor); + auto node = GetSomasNode(tensor->GetSourceNodeId()); + MS_EXCEPTION_IF_NULL(node); + auto scope_name = node->scope_full_name_; + std::string split_name = GetSplitName(scope_name); + oss << "%" << tensor->GetId() << "T" << "\t" - << "&" << static_cast(tensor->GetOffset() + 
mem_base_addr_) << "\t" - << tensor_type_name_map[tensor->type_] << "\t" << tensor->IsLifelong() << "\t" << tensor->lifetime_.start_ + << "#" << tensor->GetAlignedSize() << "S" + << "\t" + << "#" << tensor->GetOriginalSize() << "S" + << "\t" + << "&" << tensor->GetOffset() << "" + << "\t" << tensor->GetTypeString() << "\t" << tensor->GetLifelongString() << "\t" << tensor->lifetime_.start_ << "\t" << tensor->lifetime_.end_ << "\t" << split_name << "\n"; } } @@ -1691,7 +1817,6 @@ void Somas::DumpParameters(std::ostringstream &oss) const { oss << "All Parameters:\n\n"; oss << "index:" << "\tsize:" - << "\tstart_addr:" << "\tsource node name:" << "\tnode out index:\n"; @@ -1700,19 +1825,26 @@ void Somas::DumpParameters(std::ostringstream &oss) const { oss << "%" << param->id_ << "P" << "\t" << "#" << param->size_ << "S" - << "\t" - << "&" << param->addr_ << "\t" << param->source_node_name_ << "\t" << param->output_index_ << "\n"; + << "\t" << param->source_node_name_ << "\t" << param->output_index_ << "\n"; } } -void Somas::DumpSomasInfoIR(const string filename) const { (void)Common::SaveStringToFile(filename, SomasInfo()); } +void Somas::DumpSomasModelInfo(const string &tag, uint32_t graph_id) const { +#ifndef ENABLE_SECURITY + if (save_debug_info_) { + std::string file_path = + GetSaveGraphsPathName("/" + device_name_ + "_" + tag + "_" + std::to_string(graph_id) + ".ir", debug_info_path_); + (void)Common::SaveStringToFile(file_path, SomasInfo()); + } +#endif +} std::string Somas::Offline() const { std::ostringstream oss; - for (auto tensor : tensors_list_) { + for (const auto &tensor : tensors_list_) { MS_EXCEPTION_IF_NULL(tensor); - if (tensor->IsOutputOnly() || tensor->type_ == TensorType::kRefNodeOutput) { + if (tensor->IsOutputOnly() || tensor->type_ == TensorType::kUnion) { oss << "Somas EDGE ERROR src=n" << tensor->GetSourceNodeId() << ", srcstm=" << tensor->GetSourceStreamId() << ", dst=nc" << ", dststm=nc" @@ -1736,7 +1868,7 @@ std::string Somas::Offline() 
const { } } } - for (vector tList : contiguous_tensors_list_) { + for (const vector &tList : contiguous_tensors_list_) { oss << "Somas CONTIGUOUS"; for (size_t tid : tList) { oss << " " << tid; @@ -1753,84 +1885,11 @@ std::string Somas::Offline() const { return oss.str(); } -void Somas::DumpOfflineIR(const string filename) const { +void Somas::DumpOfflineIR(const string &filename) const { MS_LOG(INFO) << "Printing somas-log-from-graph log: " << filename; (void)Common::SaveStringToFile(filename, Offline()); } -std::string Somas::SomasMemory() const { - std::ostringstream oss; - - std::map mem_map; - for (auto tensor : tensors_list_) { - MS_EXCEPTION_IF_NULL(tensor); - mem_map[tensor->GetOffset()] = 0; - } - - size_t num = 0; - for (auto iter = mem_map.begin(); iter != mem_map.end(); ++iter, ++num) { - iter->second = num; - } - - std::map> mem_list; - - for (const auto &output_tensor : tensors_list_) { - MS_EXCEPTION_IF_NULL(output_tensor); - size_t key = output_tensor->offset_; - auto iter = mem_list.find(key); - if (iter == mem_list.end()) { - std::map id_tensor_map; - id_tensor_map[output_tensor->GetId()] = output_tensor; - mem_list[key] = id_tensor_map; - } else { - iter->second[output_tensor->GetId()] = output_tensor; - } - } - - oss << "mem_id:" - << "\tstart_offset:" - << "\tend_offset:" - << "\ttensor_id:" - << "\torigin_size:" - << "\talign_size:" - << "\tstart_addr:" - << "\tend_addr:" - << "\ttype:" - << "\tsrc_node:" - << "\tsrc_stm_id:" - << "lifetime_start\t" - << "lifetime_end\n"; - - for (const auto &mem : mem_list) { - auto id_tensor_map = mem.second; - for (const auto &id_tensor : id_tensor_map) { - auto place_tensor = id_tensor.second; - MS_EXCEPTION_IF_NULL(place_tensor); - std::string scope_name; - int64_t src_stm_id = 0xffff; - auto node = GetSomasNode(place_tensor->GetSourceNodeId()); - if (node != nullptr) { - scope_name = node->scope_full_name_; - src_stm_id = SizeToLong(node->GetStreamId()); - } else { - scope_name = "Somas Tensor"; - } - - 
std::string split_name = GetSplitName(scope_name); - oss << "#" << mem_map[place_tensor->GetOffset()] << "\t" << place_tensor->GetOffset() << "\t" - << place_tensor->GetOffset() + place_tensor->GetAlignedSize() << "\t%" << place_tensor->GetId() << "T\t" - << place_tensor->GetOriginalSize() << "\t" << place_tensor->GetAlignedSize() << "\t&" - << static_cast(place_tensor->GetOffset() + mem_base_addr_) << "\t&" - << static_cast(place_tensor->GetOffset() + mem_base_addr_ + place_tensor->GetAlignedSize()) << "\t" - << tensor_type_name_map[place_tensor->type_] << "\t" << split_name << "\tstm" << src_stm_id << "\t" - << place_tensor->lifetime_.start_ << "\t" << place_tensor->lifetime_.end_ << "\n"; - } - } - return oss.str(); -} - -void Somas::DumpSomasMemoryIR(const string &filename) const { (void)Common::SaveStringToFile(filename, SomasMemory()); } - size_t Somas::CalcLowerBound() const { size_t max_node_id = std::accumulate(tensors_list_.begin(), tensors_list_.end(), 0, [](size_t max_id, auto tensor) { return std::max(max_id, tensor->lifetime_.end_); @@ -1884,8 +1943,8 @@ void Somas::GenGraphStatisticInfo() { } const double giga = 1024. * 1024. 
* 1024.; - MS_LOG(INFO) << "Lower Bound: " << lower_bound_ << " (" << lower_bound_ / giga - << " GB), Upper Bound: " << upper_bound_ << " (" << upper_bound_ / giga << " GB)"; + MS_LOG(INFO) << "Lower Bound: " << lower_bound_ << " (" << static_cast(lower_bound_) / giga + << " GB), Upper Bound: " << upper_bound_ << " (" << static_cast(upper_bound_) / giga << " GB)"; MS_LOG(INFO) << "\nTotal Dynamic Size (Upper Bound):\t" << upper_bound_ << "\n" << "Theoretical Optimal Size (Lower Bound):\t" << lower_bound_ << "\n" @@ -1895,104 +1954,73 @@ void Somas::GenGraphStatisticInfo() { << "Total LifeLong All Tensor Size:\t" << lifelong_all_total_size_ << "\n" << "Total LifeLong Start Tensor Size:\t" << lifelong_start_total_size_ << "\n" << "Total LifeLong End Tensor Size:\t" << lifelong_end_total_size_ << "\n" - << "Reused Size(Allocate Size):\t" << GetTotalMemSize() << "\n\n\n"; + << "Reused Size(Allocate Size):\t" << reused_memory_size_ << "\n\n\n"; } -uint8_t *Somas::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const { +std::vector> Somas::GetNodeOutputSomasResult(const AnfNodePtr &node) const { MS_EXCEPTION_IF_NULL(node); auto key = node.get(); auto iter = nodes_map_.find(key); - uint8_t *ptr = nullptr; + std::vector> output_somas_result; if (iter != nodes_map_.end()) { auto &somas_node = iter->second.at(0); MS_EXCEPTION_IF_NULL(somas_node); - if (index >= somas_node->output_tensors_.size()) { - MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:[" - << somas_node->output_tensors_.size() << "]"; - } - auto output_tensor = somas_node->output_tensors_[index]; - ptr = mem_base_addr_ + output_tensor->offset_; + std::transform(somas_node->output_tensors_.cbegin(), somas_node->output_tensors_.cend(), + std::back_inserter(output_somas_result), + [](const SomasTensorPtr &tensor) { return std::make_pair(tensor->offset_, tensor->aligned_size_); }); } else { MS_LOG(EXCEPTION) << "node [" << common::AnfAlgo::GetCNodeName(node) << "] don't exist in 
nodes_map"; } - return ptr; + return output_somas_result; } -uint8_t *Somas::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const { +std::vector> Somas::GetNodeWorkSpaceSomasResult(const AnfNodePtr &node) const { MS_EXCEPTION_IF_NULL(node); auto key = node.get(); auto iter = nodes_map_.find(key); - uint8_t *ptr = nullptr; + std::vector> workspace_somas_result; if (iter != nodes_map_.end()) { auto &somas_node = iter->second.at(0); MS_EXCEPTION_IF_NULL(somas_node); - if (index >= somas_node->workspace_tensors_.size()) { - MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's workspace size:[" - << somas_node->workspace_tensors_.size() << "]"; - } - auto workspace_tensor = somas_node->workspace_tensors_[index]; - ptr = mem_base_addr_ + workspace_tensor->offset_; + std::transform(somas_node->workspace_tensors_.cbegin(), somas_node->workspace_tensors_.cend(), + std::back_inserter(workspace_somas_result), + [](const SomasTensorPtr &tensor) { return std::make_pair(tensor->offset_, tensor->aligned_size_); }); + } else { + MS_LOG(EXCEPTION) << "node [" << common::AnfAlgo::GetCNodeName(node) << "] don't exist in nodes_map"; } - return ptr; -} -#ifndef ENABLE_SECURITY -void Somas::ConvertToProfilingNode(uint32_t graph_id) const { -#ifdef ENABLE_D - auto graph_node = MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id); - if (graph_node == nullptr) { - graph_node = MemoryProfiling::GetInstance().AddGraphMemoryNode(graph_id); - MS_LOG(INFO) << "Add graph memory node for dynamic memory profiling, graph id is " << graph_id; - } - - for (const auto &tensor : tensors_list_) { - TensorMemory tensor_memory; - tensor_memory.SetTensorId(tensor->GetId()); - tensor_memory.SetAlignedSize(tensor->GetAlignedSize()); - tensor_memory.SetType(tensor_type_name_map[tensor->type_]); - tensor_memory.SetLifeStart(tensor->lifetime_.start_); - tensor_memory.SetLifeEnd(tensor->lifetime_.end_); - tensor_memory.SetLifeLong(life_long_name_map[tensor->lifelong_value_]); - 
graph_node->AddTensorMemory(tensor_memory); - } - - for (const auto &node : nodes_list_) { - NodeMemory node_memory; - std::string name = GetSplitName(node->scope_full_name_); - node_memory.SetNodeName(name); - node_memory.SetNodeId(node->GetId()); - for (const auto &input_tensor : node->input_tensors_) { - node_memory.AddInputTensorId(input_tensor->GetId()); - } - for (const auto &output_tensor : node->output_tensors_) { - node_memory.AddOutputTensorId(output_tensor->GetId()); - } - for (const auto &workspace_tensor : node->workspace_tensors_) { - node_memory.AddWorkSpaceTensorId(workspace_tensor->GetId()); - } - graph_node->AddNodeMemory(node_memory); - } -#endif + return workspace_somas_result; } SomasStreamPtr Somas::GetSomasStream(size_t stream_id) const { - auto it = std::find_if(streams_list_.begin(), streams_list_.end(), - [stream_id](const SomasStreamPtr &stream) { return stream->GetId() == stream_id; }); - if (it != streams_list_.end()) { - return *(it); + auto it = streams_map_.find(stream_id); + if (it != streams_map_.end()) { + return (*it).second; } else { + MS_LOG(ERROR) << "Can't find somas stream for stream " << stream_id; return nullptr; } } SomasNodePtr Somas::GetSomasNode(size_t node_id) const { - auto it = nodes_id_map_.find(node_id); - if (it == nodes_id_map_.end()) { + if (node_id >= nodes_list_.size()) { return nullptr; } else { - return it->second; + return nodes_list_[node_id]; } } -#endif +common::KernelWithIndex Somas::GetVisitKernelWithReturnType(const AnfNodePtr &ori_node, size_t ori_index) { + auto prenode = common::AnfAlgo::VisitKernelWithReturnType(ori_node, ori_index, false); + while (prenode.first->isa() && nodes_map_.find(prenode.first.get()) == nodes_map_.end()) { + auto anf_node = prenode.first; + auto cnode = anf_node->cast(); + if (!common::AnfAlgo::IsNopNode(cnode)) { + MS_LOG(EXCEPTION) << "Node[" << ori_node->fullname_with_scope() << "] find input node[" + << cnode->fullname_with_scope() << "] doesn't exist in nodes_map 
and is not a nop node!!!!"; + } + prenode = common::AnfAlgo::VisitKernelWithReturnType(cnode->input(kNopNodeRealInputIndex), 0, false); + } + return prenode; +} } // namespace somas } // namespace mindspore diff --git a/mindspore/ccsrc/backend/common/somas/somas.h b/mindspore/ccsrc/backend/common/somas/somas.h index f91f4dfe9a9..abaaeacac21 100644 --- a/mindspore/ccsrc/backend/common/somas/somas.h +++ b/mindspore/ccsrc/backend/common/somas/somas.h @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include "utils/hash_map.h" #include "utils/hash_set.h" @@ -33,9 +34,15 @@ #include "backend/common/session/anf_runtime_algorithm.h" #include "include/common/utils/anfalgo.h" #include "backend/common/session/kernel_graph.h" +#include "runtime/hardware/device_type.h" namespace mindspore { namespace somas { +struct EventPair { + CNodePtr send_; + CNodePtr recv_; +}; + union DestinationUnion { size_t id; size_t index; @@ -43,81 +50,86 @@ union DestinationUnion { }; struct TensorConflictInfo { - size_t tensor_id_; - size_t src_node_id_; + size_t tensor_id; + size_t src_node_id; size_t destination_num; DestinationUnion l; DestinationUnion r; TensorConflictInfo(size_t tensor_id, size_t src_node_id) - : tensor_id_(tensor_id), src_node_id_(src_node_id), destination_num(0) {} + : tensor_id(tensor_id), src_node_id(src_node_id), destination_num(0) {} }; + +struct Block { + size_t start_offset_; + size_t size_; + size_t end_offset_; + + Block(size_t start, size_t size) : start_offset_(start), size_(size) { end_offset_ = start_offset_ + size_; } +}; + +void MergeBlocks(std::vector *block_list, std::stack *merged_blocks); + +enum class UnReuseType { kUnReuseAll, kUnReuseInput, kUnReuseOutput, kUnReuseWorkspace }; class Somas { 
public: // Constructors/Destructors Somas() = default; Somas(const Somas &) = delete; Somas &operator=(const Somas &) = delete; - ~Somas() { mem_base_addr_ = nullptr; } - - bool Allocate(const session::KernelGraph *graph); - const size_t GetTotalMemSize() const { return mem_offset_; } - void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; } - uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const; - uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const; + virtual ~Somas() = default; + bool Assign(const session::KernelGraph &graph); + bool Assign(const KernelGraphPtr &graph_ptr); std::string SomasInfo(bool calc_hash = false) const; - std::string SomasMemory() const; - void DumpSomasInfoIR(const string filename) const; - void DumpSomasMemoryIR(const string &filename) const; - - static bool NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2); #ifndef ENABLE_SECURITY - void ConvertToProfilingNode(uint32_t graph_id) const; + virtual void ConvertToProfilingNode(uint32_t graph_id) const {} #endif private: + // device implementation interface + virtual bool Initialize() = 0; + virtual string GetDeviceName() const = 0; + virtual size_t GetAlignSize(size_t original_size) const = 0; + virtual size_t GetCommunicationReservedSize() const; + + virtual bool GetEnableCacheFlag(const session::KernelGraph &graph) const; + virtual std::vector> GetStreamGroupInfo(const session::KernelGraph &graph) const; + virtual bool GetDependExecOrderFlag(const session::KernelGraph &graph) const = 0; + virtual std::pair GetDebugConfig() const; + + virtual std::map GetUnReuseNodeType(const session::KernelGraph &graph) const; + virtual std::map GetUnReuseNodeName(const session::KernelGraph &graph) const; + + virtual bool InitDevSpecControlTensors(const session::KernelGraph &graph) = 0; + virtual bool DevSpecNodeProcess(const session::KernelGraph &graph) = 0; + // end + + // SOMAS Configuration + std::string 
device_name_{"SOMAS"}; + size_t communication_gap_size_{0}; + + size_t depend_exec_order_{false}; + bool enable_cache_{false}; + bool save_debug_info_{false}; + std::string debug_info_path_; + + std::map un_reuse_node_type_; + std::map un_reuse_node_name_; + // end + std::vector reuse_matrix_; // hash id std::string hash_id_; - // Maps - mindspore::HashMap tensors_map_; - mindspore::HashMap> nodes_map_; - mindspore::HashMap> parameters_map_; - mindspore::HashMap nodes_id_map_; - - // Vectors - std::vector nodes_list_; - std::vector streams_list_; - std::vector tensors_list_; - std::vector parameters_list_; // Stream groups std::vector> streams_groups_; - // event info map - std::map> event_map_; - // Solver TensorsDescMap solver_tensor_desc_map_; SomasSolverPrePtr somas_solver_; - // Contiguous list - std::vector> contiguous_tensors_list_; - - // Ref lists - std::vector> ref_node_constraints_; std::vector> ref_overlap_constraints_; - // total Offset - size_t mem_offset_{0}; - - // Memory base addr - uint8_t *mem_base_addr_{nullptr}; - - // Save debug info - bool save_graphs_{false}; - std::string save_graphs_path_; - // statistic info size_t upper_bound_{0}; size_t lower_bound_{0}; @@ -128,74 +140,147 @@ class Somas { size_t lifelong_start_total_size_{0}; size_t lifelong_end_total_size_{0}; - bool InitSomasTensors(const session::KernelGraph *graph); - void InitBasicInfo(const session::KernelGraph *graph); - void InitSomasStreamAndNode(const session::KernelGraph *graph); - void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph); - void InitSomasInputTensors(const session::KernelGraph *graph); - void InitSomasEventInfos(); - void GetNextOutputProcess(const session::KernelGraph *graph); - void IndependentNodeOutputProcess(const session::KernelGraph *graph); -#ifndef ENABLE_SECURITY - void SummaryInputProcess(const session::KernelGraph *graph); -#endif - void RefNodeProcess(const session::KernelGraph *graph); - void NonTaskSplitProcess(const 
session::KernelGraph *graph); - void UnReuseNodeProcess(const session::KernelGraph *graph); - SomasTensorPtr CreateGapTensor(size_t gap_tensor_id); - void GenContiguousList(const session::KernelGraph *graph); + std::vector> processed_contiguous_tensors_list_; + // key: contiguous list index with first union tensor; value: contiguous list index with other union tensor + std::map contiguous_list_with_ref_index_map_; - void ComputeConflictPairs(); + bool ConfigSomas(const session::KernelGraph &graph); - bool Assign(const session::KernelGraph *graph); - - std::string Offline() const; - void DumpOfflineIR(const string filename) const; - std::string GetSplitName(const string &scope_name) const; - size_t CalcLowerBound() const; - void GenGraphStatisticInfo(); + // somas model + bool InitSomasModel(const session::KernelGraph &graph); + bool InitBasicInfoFromGraph(const session::KernelGraph &graph); + void InitSomasStreamAndNode(const session::KernelGraph &graph); + void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph &graph); + void InitSomasInputTensors(const session::KernelGraph &graph); + void InitCommonNodeInputs(const CNodePtr &kernel); + void InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel); SomasParameterPtr GetSomasParameter(const AnfNodePtr &node, size_t index); SomasParameterPtr CreateSomasParameter(const AnfNodePtr &node, size_t index); - void InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel); - void InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel); - void ComputeOneTensorConflicts(const std::shared_ptr &target_tensor, - const std::vector &tensor_conflict_info_list, - const std::vector &destination_node_list, - const vector &nodes_dependency, - std::vector *tensor_relation) const; + void InitControlTensors(const session::KernelGraph &graph); + bool CommonSpecNodeProcess(const session::KernelGraph &graph); + SomasStreamPtr GetSomasStream(size_t stream_id) const; +#ifndef ENABLE_SECURITY 
+ void SummaryInputProcess(const session::KernelGraph &graph); +#endif + void RefNodeProcess(const session::KernelGraph &graph); + void UnReuseNodeProcess(const session::KernelGraph &graph); + void CommunicationNodeProcess(const session::KernelGraph &graph); + void GetContiguousListContainUnionTensor(); + std::map GetRefTensorsInContiguousList(); + common::KernelWithIndex GetVisitKernelWithReturnType(const AnfNodePtr &ori_node, size_t ori_index); + + // conflict matrix + static bool NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2); + void ComputeConflictMatrix(); + void ComputeBasicMatrix(); + static void ComputeOneTensorConflicts(const std::shared_ptr &target_tensor, + const std::vector &tensor_conflict_info, + const std::vector &destination_node_list, + const vector &nodes_dependency, + std::vector *tensor_relation); void ComputeMultiTensorConflicts(const std::vector &target_tensors_list, - const std::vector &tensor_conflict_info_list, + const std::vector &tensor_conflict_info, const std::vector &destination_node_list, const vector &nodes_dependency, std::vector *tensor_relation) const; void UpdateTensorDestinations(); - void UpdateRefTensorsConflict(); - void UpdateRefOverlapTensorsConflicts(); - void UpdateRefTensorsOffset(); - void UpdateContiguousTensorsOffset(const std::map &contiguous_ref_list_map); - void DumpParameters(std::ostringstream &oss) const; - void DumpTensors(std::ostringstream &oss) const; - void DumpNodes(std::ostringstream &oss) const; - std::map GetContiguousListContainRefTensor(); - std::map GetRefTensorsInContiguousList(); - bool SaveSomasResult(const session::KernelGraph *graph); - bool VerifySomasResult(const session::KernelGraph *graph, const nlohmann::json &somas_json) const; - bool LoadSomasResult(const session::KernelGraph *graph, const string &filename); - bool UpdateTensorsOffset(const std::vector &tensors_json); - bool CalcSomasModelHash(const session::KernelGraph *graph); - void UpdateInputTensor(SomasNodePtr node, 
SomasNodePtr pre_somas_node, SomasTensorPtr input_somas_tensor) const; - bool LoadSomasCache(const session::KernelGraph *graph); - SomasStreamPtr GetSomasStream(size_t stream_id) const; - SomasNodePtr GetSomasNode(size_t node_id) const; + void UpdateUnionTensorsConflict(); static void BuildConflictInfo(const std::shared_ptr &tensor, TensorConflictInfo *tensor_conflict_info, std::vector *destination_node_list); static bool CheckIsDependency(const TensorConflictInfo &tensor_conflict_info, const size_t &src_node_id, const vector &nodes_dependency, const std::vector &destination_node_list); void ProcessSemiLifeLongTensor(); + + // solver + bool Solve(const session::KernelGraph &graph); + void UpdateUnionTensorsOffset(); + void UpdateContiguousTensorsOffset(const std::map &contiguous_ref_list_map); + + // cache + bool SaveSomasResult(const session::KernelGraph &graph); + bool VerifySomasResult(const session::KernelGraph &graph, const nlohmann::json &somas_json) const; + bool LoadSomasResult(const session::KernelGraph &graph, const string &filename); + bool UpdateTensorsOffset(const std::vector &tensors_json); + bool CalcSomasModelHash(const session::KernelGraph &graph); + bool LoadSomasCache(const session::KernelGraph &graph); + + // log + std::string Offline() const; + void DumpOfflineIR(const string &filename) const; + size_t CalcLowerBound() const; + void GenGraphStatisticInfo(); + void DumpParameters(std::ostringstream &oss) const; + void DumpTensors(std::ostringstream &oss) const; + void DumpNodes(std::ostringstream &oss) const; + void DumpSomasModelInfo(const string &tag, uint32_t graph_id) const; + + // update graph + std::vector> GetNodeOutputSomasResult(const AnfNodePtr &node) const; + std::vector> GetNodeWorkSpaceSomasResult(const AnfNodePtr &node) const; + bool UpdateSomasResultToGraph(const session::KernelGraph &graph); + + protected: + std::vector parameters_list_; + std::vector control_tensors_list_; + std::vector tensors_list_; + std::vector nodes_list_; 
+ + mindspore::HashMap streams_map_; + mindspore::HashMap> parameters_map_; + mindspore::HashMap> nodes_map_; + + std::vector> union_tensors_list_; + std::vector> contiguous_tensors_list_; + + void AddControlTensor(const SomasNodePtr &from, const SomasNodePtr &to); + void AddControlTensorFromExecOrder(const session::KernelGraph &graph); + void GraphOutputProcess(const session::KernelGraph &graph); + void UpdateContiguousTensorList(); + SomasNodePtr GetSomasNode(size_t node_id) const; + static std::string GetSplitName(const string &scope_name); + + size_t reused_memory_size_{0}; + std::vector> dump_merged_blocks_; }; using SomasPtr = std::shared_ptr; +using SomasCreator = std::function()>; + +// @todo will delete when old runtime remove +class SomasManager { + public: + static SomasManager &Instance() { + static SomasManager instance{}; + return instance; + } + void Register(device::DeviceType device_type, SomasCreator &&creator) { + if (base_map_.find(device_type) == base_map_.end()) { + (void)base_map_.emplace(device_type, creator); + } + } + SomasPtr GetSomas(device::DeviceType device_type) { + auto iter = base_map_.find(device_type); + if (base_map_.end() != iter) { + MS_EXCEPTION_IF_NULL(iter->second); + return (iter->second)(); + } + return nullptr; + } + + private: + std::map base_map_; +}; + +class SomasRegister { + public: + SomasRegister(device::DeviceType device_type, SomasCreator &&creator) { + SomasManager::Instance().Register(device_type, std::move(creator)); + } + ~SomasRegister() = default; +}; + +#define REG_SOMAS(S, T, C) static const somas::SomasRegister g_##S##_reg(T, []() { return std::make_shared(); }); } // namespace somas } // namespace mindspore #endif // MINDSPORE_CCSRC_BACKEND_COMMON_SOMAS_SOMAS_H_ diff --git a/mindspore/ccsrc/backend/common/somas/somas_node.h b/mindspore/ccsrc/backend/common/somas/somas_node.h index f7dda7b7d4f..777cb29537f 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_node.h +++ 
b/mindspore/ccsrc/backend/common/somas/somas_node.h @@ -39,14 +39,14 @@ class SomasNode { // node's dependency including data dependency and time dependency std::set> ancestor_nodes_; - std::set tensors_; - + // data tensor std::vector input_tensors_; std::vector output_tensors_; std::vector workspace_tensors_; std::map input_parameters_map_; - - mindspore::HashMap anc_stream_max_order_; + // control tensor + std::vector control_input_tensors_; + std::vector control_output_tensors_; // Constructors/Destructors SomasNode(std::string scope_full_name, size_t id, NodeType type, const size_t &stream_id) @@ -57,7 +57,7 @@ class SomasNode { // Accessors const size_t &GetId() const { return id_; } - const size_t GetStreamId() const { return stream_id_; } + const size_t &GetStreamId() const { return stream_id_; } const NodeType &GetType() const { return type_; } private: diff --git a/mindspore/ccsrc/backend/common/somas/somas_solver_pre.cc b/mindspore/ccsrc/backend/common/somas/somas_solver_pre.cc index e548912f1fd..8caeda86255 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_solver_pre.cc +++ b/mindspore/ccsrc/backend/common/somas/somas_solver_pre.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -98,7 +98,7 @@ vector SomasSolverPre::CreateTensorsMaps(const TensorsDescMap &t } return vecTensorsMap; } -Status SomasSolverPre::Solving(const session::KernelGraph *graph, TensorsDescMap *ptensors, +Status SomasSolverPre::Solving(const session::KernelGraph &graph, TensorsDescMap *ptensors, const std::vector *pConstraints, const vector> &continuous_v, bool bVerifySolution, bool ball, SortingType sorting, FittingType fitting, AlgorithmType algorithm) { @@ -198,7 +198,7 @@ Status SomasSolverPre::Solving(const session::KernelGraph *graph, TensorsDescMap return ret; } -void SomasSolverPre::Log(const session::KernelGraph *graph, const TensorsDescMap &tensors, +void SomasSolverPre::Log(const session::KernelGraph &graph, const TensorsDescMap &tensors, const std::vector *pConstraints, const vector> &continuous_v) const { auto context_ptr = MsContext::GetInstance(); @@ -213,13 +213,13 @@ void SomasSolverPre::Log(const session::KernelGraph *graph, const TensorsDescMap } void SomasSolverPre::TensorRelationLog(const std::vector *pConstraints, - const session::KernelGraph *graph) const { + const session::KernelGraph &graph) const { MS_LOG(INFO) << "SomasSolver::Log Writing somas_tensor_relation.ir.."; auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); auto save_graphs_path = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); std::string filename = - GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path); + GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path); std::ostringstream oss; for (size_t tid1 = 0; tid1 < pConstraints->size(); tid1++) { oss << 't' << tid1 << ' '; @@ -232,14 +232,14 @@ void SomasSolverPre::TensorRelationLog(const std::vector *pConstr MS_LOG(INFO) << "SomasSolver somas_tensor_relation Log done"; } -void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors, +void 
SomasSolverPre::SolverInputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors, const vector> &continuous_v) const { MS_LOG(INFO) << "SomasSolver::Log Writing somas_solver_input.."; auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); auto save_graphs_path = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); std::string filename = - GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path); + GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path); std::ostringstream oss; for (auto &t : tensors) { oss << "T " << t.second->index_ << " " << t.second->size_ << " " << t.second->lifelong_ << std::endl; @@ -256,13 +256,13 @@ void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const Ten MS_LOG(INFO) << "SomasSolver input Log done"; } -void SomasSolverPre::SolverOutputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors) const { +void SomasSolverPre::SolverOutputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors) const { MS_LOG(INFO) << "SomasSolver::Log Writing somas_solver_output_.."; auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); auto save_graphs_path = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_PATH); std::string out_filename = - GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path); + GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path); std::ostringstream oss; constexpr size_t contiguous_left = 1; constexpr size_t contiguous_mid = 2; diff --git a/mindspore/ccsrc/backend/common/somas/somas_solver_pre.h b/mindspore/ccsrc/backend/common/somas/somas_solver_pre.h index a6613974cf3..094d9148e6f 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_solver_pre.h +++ b/mindspore/ccsrc/backend/common/somas/somas_solver_pre.h @@ -1,5 +1,5 @@ /** - 
* Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -184,14 +184,14 @@ class SomasSolverPre { size_t GetMaxOffset() const { return max_offset_; } - Status Solving(const session::KernelGraph *graph, TensorsDescMap *ptensors, + Status Solving(const session::KernelGraph &graph, TensorsDescMap *ptensors, const std::vector *pConstraints, const vector> &continuous_v, bool bVerifySolution, // true -> Check continuous and non overlapping constraints solution bool ball = true, // true -> run full set of heuristics, false -> run single heuristic specified SortingType sorting = kGreaterSizeSmallerIndex, FittingType fitting = kBest, AlgorithmType algorithm = kManyObjects); - void Log(const session::KernelGraph *graph, const TensorsDescMap &tensors, + void Log(const session::KernelGraph &graph, const TensorsDescMap &tensors, const std::vector *pConstraints, const vector> &continuous_v) const; Status CheckTensors(const TensorsDescMap *pTensors, uint32_t index1, uint32_t index2) const; @@ -201,11 +201,11 @@ class SomasSolverPre { private: size_t max_offset_; - void SolverInputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors, + void SolverInputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors, const vector> &continuous_v) const; - void SolverOutputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors) const; + void SolverOutputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors) const; vector CreateTensorsMaps(const TensorsDescMap &tensors, size_t total_sol) const; - void TensorRelationLog(const std::vector *pConstraints, const session::KernelGraph *graph) const; + void TensorRelationLog(const std::vector *pConstraints, const session::KernelGraph &graph) const; }; using SomasSolverPrePtr = std::shared_ptr; } // namespace somas 
diff --git a/mindspore/ccsrc/backend/common/somas/somas_stream.h b/mindspore/ccsrc/backend/common/somas/somas_stream.h index 2108b8345e0..3766c73a48d 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_stream.h +++ b/mindspore/ccsrc/backend/common/somas/somas_stream.h @@ -31,7 +31,7 @@ class SomasStream { std::vector nodes_; // Constructors/Destructors - explicit SomasStream(int64_t id) : id_(id) {} + explicit SomasStream(size_t id) : id_(id) {} SomasStream(const SomasStream &) = delete; SomasStream &operator=(const SomasStream &) = delete; ~SomasStream() = default; diff --git a/mindspore/ccsrc/backend/common/somas/somas_tensor.cc b/mindspore/ccsrc/backend/common/somas/somas_tensor.cc index 960aa94f912..41bd88479c2 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_tensor.cc +++ b/mindspore/ccsrc/backend/common/somas/somas_tensor.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,25 +15,35 @@ */ #include "backend/common/somas/somas_tensor.h" +#include +#include namespace mindspore { namespace somas { -SomasTensor::SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t real_size, - LifeLongType lifelong_value) - : lifelong_value_(lifelong_value), - between_streams_(false), +std::map tensor_type_name_map = { + {kCommon, "Common"}, {kWorkspace, "Workspace"}, + {kOutputOnly, "OutputOnly"}, {kGraphOutput, "GraphOutput"}, + {kGraphInput, "GraphInput"}, {kSummaryInput, "SummaryInput"}, + {kUnion, "Union"}, {kControl, "Control"}, + {kUnknown, "Unknown"}}; + +std::map life_long_name_map = {{kLifeLongNone, "LifeLongNone"}, + {kLifeLongGraphAll, "LifeLongGraphAll"}, + {kLifeLongGraphStart, "LifeLongGraphStart"}, + {kLifeLongGraphEnd, "LifeLongGraphEnd"}}; + +SomasTensor::SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t ori_size, + size_t aligned_size, LifeLongType lifelong_value) + : aligned_size_(aligned_size), + lifelong_value_(lifelong_value), contiguous_(false), type_(kUnknown), offset_(0), num_constraints_(0), - ref_overlap_(false), id_(id), source_node_id_(source_node_id), source_stream_id_(source_stream_id), - original_size_(real_size) { - const size_t alignment = 512; - const size_t alignment_complement = 31; - aligned_size_ = (real_size > 0) ? 
((real_size + alignment + alignment_complement) / alignment) * alignment : 0; + original_size_(ori_size) { solver_tensor_desc_ = std::make_shared(id_, aligned_size_, offset_, false); } @@ -49,5 +59,9 @@ SomasSolverTensorDescPtr SomasTensor::GetSolverTensorDesc() { return solver_tensor_desc_; } } + +std::string SomasTensor::GetTypeString() { return tensor_type_name_map[type_]; } + +std::string SomasTensor::GetLifelongString() { return life_long_name_map[lifelong_value_]; } } // namespace somas } // namespace mindspore diff --git a/mindspore/ccsrc/backend/common/somas/somas_tensor.h b/mindspore/ccsrc/backend/common/somas/somas_tensor.h index 14a7ebe1003..6967c2a4de0 100644 --- a/mindspore/ccsrc/backend/common/somas/somas_tensor.h +++ b/mindspore/ccsrc/backend/common/somas/somas_tensor.h @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,7 +20,7 @@ #include #include #include - +#include #include "utils/hash_map.h" #include "backend/common/somas/somas_solver_pre.h" @@ -38,21 +38,21 @@ using lifetime_t = struct Lifetime; // Tensor type enum TensorType { kCommon, - kOutputOnly, kWorkspace, - kGetNextOutput, + kOutputOnly, + kGraphOutput, + kGraphInput, kSummaryInput, - kRefNodeInput, - kRefNodeOutput, - kEventVirtualOutput, + kUnion, + kControl, kUnknown }; enum LifeLongType { kLifeLongNone, // life time is from tensor start to tensor end - kLifeLongGraphAll, // life time is from graph start to graph end - kLifeLongGraphStart, // life time is from graph start to tensor end - kLifeLongGraphEnd // life time is from tensor start to graph end + kLifeLongGraphAll, // life time is from graph start to graph end + kLifeLongGraphStart, // life time is from graph start to tensor end + kLifeLongGraphEnd // life time is from tensor start to graph end }; class SomasTensor { @@ -60,7 +60,6 @@ class SomasTensor { size_t aligned_size_{0}; LifeLongType lifelong_value_; - bool between_streams_; bool contiguous_; lifetime_t lifetime_; @@ -72,7 +71,7 @@ class SomasTensor { vector consumer_list_; // Constructors/Destructors - explicit SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t real_size, + explicit SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t ori_size, size_t aligned_size, LifeLongType lifelong_value = kLifeLongNone); SomasTensor(const SomasTensor &) = delete; SomasTensor &operator=(const SomasTensor &) = delete; @@ -86,14 +85,12 @@ class SomasTensor { const size_t &GetAlignedSize() const { return aligned_size_; } const size_t &GetNumConstraints() const { return num_constraints_; } bool IsLifelong() const { return lifelong_value_ == kLifeLongGraphAll; } - bool IsWorkspace() const { return type_ == kWorkspace; } bool IsOutputOnly() const { return type_ == kOutputOnly; } size_t GetOffset() const { return offset_; } - bool IsBetweenStreams() const { 
return between_streams_; } bool IsSemiLifelongStart() const { return lifelong_value_ == kLifeLongGraphStart; } bool IsSemiLifelongEnd() const { return lifelong_value_ == kLifeLongGraphEnd; } - bool IsRefOverlap() const { return ref_overlap_; } - + string GetTypeString(); + string GetLifelongString(); // Computing functions void SetOffset() { if (aligned_size_ != 0) { @@ -104,7 +101,6 @@ class SomasTensor { size_t num_constraints_{0}; private: - bool ref_overlap_; const size_t id_{0}; const size_t source_node_id_; const size_t source_stream_id_; diff --git a/mindspore/ccsrc/backend/graph_compiler/backend.cc b/mindspore/ccsrc/backend/graph_compiler/backend.cc index 1e35cbfebd3..7307c9f878e 100644 --- a/mindspore/ccsrc/backend/graph_compiler/backend.cc +++ b/mindspore/ccsrc/backend/graph_compiler/backend.cc @@ -607,8 +607,8 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) { device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_}); MS_EXCEPTION_IF_NULL(device_context); bool all_support = device_context->PartitionGraph(func_graph); + auto run_mode = device_context->GetRunMode(func_graph); if (all_support) { - auto run_mode = device_context->GetRunMode(func_graph); if (run_mode == device::RunMode::kGraphMode) { auto graph_id = graph_compiler_->CompileWholeGraphForGraphRunMode(func_graph, device_context); graph_id_to_device_context_[graph_id] = device_context; @@ -1384,9 +1384,15 @@ std::unique_ptr MindRTBackend::ConstructGraphCompilerInfo(con std::vector *> tensors_mask; std::vector *> input_tensors; + auto strategy = runtime::GraphExecutionStrategy::kPipeline; + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (context_ptr->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) { + strategy = runtime::GraphExecutionStrategy::kPipelineWithExecutionOrder; + } return std::make_unique(graphs, device_contexts, tensors_mask, input_tensors, control_nodes_, 
root_graph->parameters(), parser, outputs_order, outputs_num, name, false, - runtime::GraphExecutionStrategy::kPipeline); + strategy); } std::unique_ptr MindRTBackend::ConstructGraphCompilerInfo( diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.cc index 2f9c3832d9d..fa803cbf8e5 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.cc +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.cc @@ -104,16 +104,6 @@ uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_m return communication_mem ? alloc_address + kMemAlignSize : alloc_address; } -void AscendMemoryManager::MallocSomasDynamicMem(const session::KernelGraph &graph) { - MemoryManager::MallocSomasDynamicMem(graph); -#ifndef ENABLE_SECURITY - if (MemoryProfiling::GetInstance().IsMemoryProfilingInitialized()) { - MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); - somas_reuse_util_ptr_->ConvertToProfilingNode(graph.graph_id()); - } -#endif -} - // communication memory: [512align_size + data + 512align_size] // return the pointer to the start of data address. 
uint8_t *AscendMemoryManager::MallocCommunicationMemFromMemPool(size_t size) { diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.h b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.h index aba272e3348..59173feaf75 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.h +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_manager.h @@ -36,7 +36,6 @@ class AscendMemoryManager : public MemoryManager { void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override; void FreeMemFromMemPool(void *device_ptr) override; uint64_t GetMsMaxMemSize() const; - void MallocSomasDynamicMem(const session::KernelGraph &graph) override; uint8_t *MallocCommunicationMemFromMemPool(size_t size) override; bool MallocContinuousMemFromMemPool(const DeviceAddressPtrList &addr_list, size_t total_size, std::vector size_list) override; diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.cc b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.cc new file mode 100644 index 00000000000..402c63f8456 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.cc @@ -0,0 +1,229 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/ascend/hal/hardware/ascend_somas.h" +#include +#include +#include +#include +#include "backend/common/optimizer/helper.h" +#include "utils/ms_context.h" +#include "plugin/device/ascend/hal/device/ascend_stream_assign.h" +#include "plugin/device/ascend/hal/profiler/memory_profiling.h" + +namespace mindspore { +namespace device { +namespace ascend { +using KernelGraph = session::KernelGraph; +using UnReuseType = somas::UnReuseType; +using TensorType = somas::TensorType; +using LifeLongType = somas::LifeLongType; +using mindspore::profiler::ascend::MemoryProfiling; + +#ifndef ENABLE_SECURITY +void AscendSomas::ConvertToProfilingNode(uint32_t graph_id) const { + if (!MemoryProfiling::GetInstance().IsMemoryProfilingInitialized()) { + return; + } + auto graph_node = profiler::ascend::MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id); + if (graph_node == nullptr) { + graph_node = profiler::ascend::MemoryProfiling::GetInstance().AddGraphMemoryNode(graph_id); + MS_LOG(INFO) << "Add graph memory node for dynamic memory profiling, graph id is " << graph_id; + } + + for (const auto &tensor : tensors_list_) { + profiler::ascend::TensorMemory tensor_memory; + tensor_memory.SetTensorId(tensor->GetId()); + tensor_memory.SetAlignedSize(tensor->GetAlignedSize()); + tensor_memory.SetType(tensor->GetTypeString()); + tensor_memory.SetLifeStart(tensor->lifetime_.start_); + tensor_memory.SetLifeEnd(tensor->lifetime_.end_); + tensor_memory.SetLifeLong(tensor->GetLifelongString()); + graph_node->AddTensorMemory(tensor_memory); + } + + for (const auto &node : nodes_list_) { + profiler::ascend::NodeMemory node_memory; + std::string name = GetSplitName(node->scope_full_name_); + node_memory.SetNodeName(name); + node_memory.SetNodeId(node->GetId()); + for (const auto &input_tensor : node->input_tensors_) { + node_memory.AddInputTensorId(input_tensor->GetId()); + } + for (const auto &output_tensor : node->output_tensors_) { + 
node_memory.AddOutputTensorId(output_tensor->GetId()); + } + for (const auto &workspace_tensor : node->workspace_tensors_) { + node_memory.AddWorkSpaceTensorId(workspace_tensor->GetId()); + } + graph_node->AddNodeMemory(node_memory); + } +} +#endif + +bool AscendSomas::Initialize() { return true; } + +std::string AscendSomas::GetDeviceName() const { return "Ascend"; } + +size_t AscendSomas::GetCommunicationReservedSize() const { + constexpr size_t gap_size = 512; + return gap_size; +} + +size_t AscendSomas::GetAlignSize(size_t original_size) const { + constexpr size_t alignment = 512; + constexpr size_t alignment_complement = 31; + size_t aligned_size = + (original_size > 0) ? ((original_size + alignment + alignment_complement) / alignment) * alignment : 0; + return aligned_size; +} + +bool AscendSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const { + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + auto task_sink = ms_context->get_param(MS_CTX_ENABLE_TASK_SINK); + auto opt_level = ms_context->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL); + if (task_sink || (opt_level == kOptimizeO1)) { + return true; + } else { + return false; + } +} + +std::vector> AscendSomas::GetStreamGroupInfo(const session::KernelGraph &graph) const { + std::vector> stream_group; + stream_group = device::ascend::AscendStreamAssign::GetInstance().get_stream_group(); + return stream_group; +} + +std::map AscendSomas::GetUnReuseNodeType(const session::KernelGraph &graph) const { + std::map node_type; + node_type[kGetNextOpName] = UnReuseType::kUnReuseOutput; + return node_type; +} + +bool AscendSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) { + InitEventInfo(graph); + return true; +} + +void AscendSomas::InitEventInfo(const session::KernelGraph &graph) { + event_map_ = {}; + auto &kernels = graph.execution_order(); + for (const auto &kernel : kernels) { + auto type = common::AnfAlgo::GetCNodeName(kernel); + if (type == 
kSendOpName) { + auto event = common::AnfAlgo::GetNodeAttr(kernel, kAttrEventId); + auto iter = event_map_.find(event); + if (iter == event_map_.end()) { + auto pair = somas::EventPair(); + pair.send_ = kernel; + event_map_[event] = pair; + } else { + iter->second.send_ = kernel; + } + } else if (type == kRecvOpName) { + auto event = common::AnfAlgo::GetNodeAttr(kernel, kAttrEventId); + auto iter = event_map_.find(event); + if (iter == event_map_.end()) { + auto pair = somas::EventPair(); + pair.recv_ = kernel; + event_map_[event] = pair; + } else { + iter->second.recv_ = kernel; + } + } + } + + for (auto &event : event_map_) { + auto pair = event.second; + auto send_iter = nodes_map_.find(pair.send_.get()); + if (send_iter == nodes_map_.end()) { + MS_LOG(WARNING) << "Can't find somas node for " << pair.send_->fullname_with_scope(); + continue; + } + + auto recv_iter = nodes_map_.find(pair.recv_.get()); + if (recv_iter == nodes_map_.end()) { + MS_LOG(WARNING) << "Can't find somas node for " << pair.recv_->fullname_with_scope(); + continue; + } + + auto &somas_send = send_iter->second.at(0); + auto &somas_recv = recv_iter->second.at(0); + AddControlTensor(somas_send, somas_recv); + } + MS_LOG(DEBUG) << "Somas InitEventInfo end."; +} + +bool AscendSomas::DevSpecNodeProcess(const session::KernelGraph &graph) { + IndependentNodeOutputProcess(graph); + NonTaskSplitProcess(graph); + return true; +} + +void AscendSomas::IndependentNodeOutputProcess(const session::KernelGraph &graph) { + auto &kernel_cnodes = graph.execution_order(); + size_t total_size = 0; + for (const auto &kernel : kernel_cnodes) { + bool independent = AnfAlgo::IsIndependentNode(kernel); + if (!independent) { + continue; + } + auto iter = nodes_map_.find(kernel.get()); + if (iter != nodes_map_.end()) { + auto &node = iter->second.at(0); + MS_EXCEPTION_IF_NULL(node); + auto semi_reuse_output_tensors = node->output_tensors_; + for (auto &tensor : semi_reuse_output_tensors) { + 
MS_EXCEPTION_IF_NULL(tensor); + total_size += tensor->GetAlignedSize(); + tensor->lifelong_value_ = LifeLongType::kLifeLongGraphEnd; + } + } + } + + MS_LOG(INFO) << "Special Tensor total size: Independent Node output " << total_size; +} + +void AscendSomas::NonTaskSplitProcess(const session::KernelGraph &graph) { + auto &kernel_cnodes = graph.execution_order(); + for (const auto &kernel : kernel_cnodes) { + auto op_name = common::AnfAlgo::GetCNodeName(kernel); + if (common::AnfAlgo::IsNonTaskOp(kernel)) { + std::vector refnode_input_output; + auto node = nodes_map_[kernel.get()].at(0); + MS_EXCEPTION_IF_NULL(node); + if (node->input_tensors_.empty()) { + MS_LOG(EXCEPTION) << op_name << " has no input tensor, can not do split non_task process."; + } + auto input_tensor = node->input_tensors_[0]; + MS_EXCEPTION_IF_NULL(input_tensor); + input_tensor->type_ = TensorType::kUnion; + refnode_input_output.push_back(input_tensor->GetId()); + + for (auto &output_tensor : node->output_tensors_) { + MS_EXCEPTION_IF_NULL(output_tensor); + output_tensor->type_ = TensorType::kUnion; + refnode_input_output.push_back(output_tensor->GetId()); + } + union_tensors_list_.push_back(refnode_input_output); + } + } +} +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.h b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.h new file mode 100644 index 00000000000..d741f2d613e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_somas.h @@ -0,0 +1,61 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_ + +#include +#include +#include +#include +#include +#include "backend/common/somas/somas.h" +#include "runtime/hardware/device_type.h" + +namespace mindspore { +namespace device { +namespace ascend { +using KernelGraph = session::KernelGraph; +using UnReuseType = somas::UnReuseType; +class AscendSomas : public somas::Somas { + public: +#ifndef ENABLE_SECURITY + void ConvertToProfilingNode(uint32_t graph_id) const override; +#endif + private: + bool Initialize() override; + string GetDeviceName() const override; + size_t GetCommunicationReservedSize() const override; + size_t GetAlignSize(size_t original_size) const override; + + bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override; + std::vector> GetStreamGroupInfo(const session::KernelGraph &graph) const override; + std::map GetUnReuseNodeType(const session::KernelGraph &graph) const override; + + bool InitDevSpecControlTensors(const session::KernelGraph &graph) override; + bool DevSpecNodeProcess(const session::KernelGraph &graph) override; + + void InitEventInfo(const session::KernelGraph &graph); + void IndependentNodeOutputProcess(const session::KernelGraph &graph); + void NonTaskSplitProcess(const session::KernelGraph &graph); + std::map event_map_; +}; +REG_SOMAS(Ascend, DeviceType::kAscend, AscendSomas) +} // namespace ascend +} // namespace device +} // namespace mindspore + +#endif // 
MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.cc b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.cc new file mode 100644 index 00000000000..9c108a2dec1 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.cc @@ -0,0 +1,41 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "plugin/device/cpu/hal/hardware/cpu_somas.h" +#include +#include "utils/ms_context.h" + +namespace mindspore { +namespace device { +namespace cpu { +bool CPUSomas::Initialize() { return true; } + +std::string CPUSomas::GetDeviceName() const { return "CPU"; } + +size_t CPUSomas::GetAlignSize(size_t original_size) const { + constexpr size_t alignment = 512; + size_t aligned_size = (original_size > 0) ? 
((original_size + alignment - 1) / alignment) * alignment : 0; + return aligned_size; +} + +bool CPUSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const { return false; } + +bool CPUSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) { return true; } + +bool CPUSomas::DevSpecNodeProcess(const session::KernelGraph &graph) { return true; } +} // namespace cpu +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.h b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.h new file mode 100644 index 00000000000..3df3b0369d7 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/hal/hardware/cpu_somas.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_plugin_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H__ +#define MINDSPORE_CCSRC_plugin_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H__ + +#include +#include "backend/common/somas/somas.h" +#include "runtime/hardware/device_type.h" + +namespace mindspore { +namespace device { +namespace cpu { +using KernelGraph = session::KernelGraph; +class CPUSomas : public somas::Somas { + private: + bool Initialize() override; + string GetDeviceName() const override; + size_t GetAlignSize(size_t original_size) const override; + + bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override; + bool InitDevSpecControlTensors(const session::KernelGraph &graph) override; + bool DevSpecNodeProcess(const session::KernelGraph &graph) override; +}; +REG_SOMAS(CPU, DeviceType::kCPU, CPUSomas) +} // namespace cpu +} // namespace device +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_plugin_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H__ diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc index 47af2b2173a..d2f91b0d4e8 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc +++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.cc @@ -25,6 +25,7 @@ #include "plugin/device/gpu/hal/device/gpu_stream_assign.h" #include "plugin/device/gpu/hal/device/distribution/collective_init.h" #include "plugin/device/gpu/hal/device/gpu_device_manager.h" +#include "plugin/device/gpu/hal/hardware/gpu_somas.h" #include "runtime/data_queue/data_queue_mgr.h" #include "kernel/common_utils.h" #include "plugin/device/gpu/hal/device/gpu_common.h" @@ -40,6 +41,7 @@ #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "backend/common/optimizer/common_backend_optimization.h" #include "backend/common/optimizer/dynamic_shape/dynamic_shape_helper.h" +#include "include/common/debug/anf_ir_dump.h" #ifdef ENABLE_DUMP_IR #include 
"include/common/debug/rdr/recorder_manager.h" #include "debug/rdr/mem_address_recorder.h" @@ -258,6 +260,25 @@ DeviceAddressPtr GPUDeviceResManager::CreateDeviceAddress(void *const device_ptr return device_address; } +void GPUKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + auto kernel_graph = graph->cast(); + MS_EXCEPTION_IF_NULL(kernel_graph); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (ms_context->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) { + auto somas = std::make_shared(); + bool ret = somas->Assign(kernel_graph); + if (ret) { + MS_LOG(INFO) << "Somas allocate success for graph " << kernel_graph->graph_id() + << " somas size: " << kernel_graph->somas_whole_block_size(); + } else { + MS_LOG(WARNING) << "Somas allocate failed for graph " << kernel_graph->graph_id(); + } + } + MS_LOG(INFO) << "Status record: end preprocess before run graph. graph id: " << kernel_graph->graph_id(); +} + void GPUKernelExecutor::OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph) const { MS_EXCEPTION_IF_NULL(graph); // Operator fusion optimization. 
diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.h b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.h index a0b670a4a62..0c1edc10db6 100644 --- a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.h +++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_device_context.h @@ -82,6 +82,8 @@ class GPUKernelExecutor : public DeprecatedKernelExecutor { void CreateKernel(const std::vector &nodes) const override; + void PreprocessBeforeRun(const FuncGraphPtr &graph) const override; + bool LaunchKernel(const CNodePtr &kernel, const std::vector &inputs, const std::vector &workspace, const std::vector &outputs) const override; diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.cc b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.cc new file mode 100644 index 00000000000..2a477dc78bd --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.cc @@ -0,0 +1,141 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin/device/gpu/hal/hardware/gpu_somas.h" +#include +#include +#include "backend/common/optimizer/helper.h" +#include "utils/ms_context.h" + +namespace mindspore { +namespace device { +namespace gpu { +bool GPUSomas::Initialize() { return true; } + +std::string GPUSomas::GetDeviceName() const { return "GPU"; } + +size_t GPUSomas::GetAlignSize(size_t original_size) const { + constexpr size_t alignment = 512; + size_t aligned_size = (original_size > 0) ? ((original_size + alignment - 1) / alignment) * alignment : 0; + return aligned_size; +} + +bool GPUSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + if (context_ptr->get_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) { + return true; + } else { + return false; + } +} + +bool GPUSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) { + InitEventInfo(graph); + return true; +} + +void GPUSomas::InitEventInfo(const session::KernelGraph &graph) { + event_map_ = {}; + auto &kernels = graph.execution_order(); + for (const auto &kernel : kernels) { + auto type = common::AnfAlgo::GetCNodeName(kernel); + if (type == kSendOpName) { + auto event = common::AnfAlgo::GetNodeAttr(kernel, kAttrRecordEvent); + auto iter = event_map_.find(event); + if (iter == event_map_.end()) { + auto pair = somas::EventPair(); + pair.send_ = kernel; + event_map_[event] = pair; + } else { + iter->second.send_ = kernel; + } + } else if (type == kRecvOpName) { + auto event = common::AnfAlgo::GetNodeAttr(kernel, kAttrWaitEvent); + auto iter = event_map_.find(event); + if (iter == event_map_.end()) { + auto pair = somas::EventPair(); + pair.recv_ = kernel; + event_map_[event] = pair; + } else { + iter->second.recv_ = kernel; + } + } + } + + for (auto &event : event_map_) { + auto pair = event.second; + auto send_iter = nodes_map_.find(pair.send_.get()); + if (send_iter == nodes_map_.end()) { + 
MS_LOG(WARNING) << "Can't find somas node for " << pair.send_->fullname_with_scope(); + continue; + } + + auto recv_iter = nodes_map_.find(pair.recv_.get()); + if (recv_iter == nodes_map_.end()) { + MS_LOG(WARNING) << "Can't find somas node for " << pair.recv_->fullname_with_scope(); + continue; + } + + auto &somas_send = send_iter->second.at(0); + auto &somas_recv = recv_iter->second.at(0); + AddControlTensor(somas_send, somas_recv); + } + MS_LOG(DEBUG) << "Somas InitEventInfo end."; +} + +bool GPUSomas::DevSpecNodeProcess(const session::KernelGraph &graph) { return InplaceNodeProcess(graph); } + +bool GPUSomas::InplaceNodeProcess(const session::KernelGraph &graph) { + auto &kernels = graph.execution_order(); + for (auto &kernel : kernels) { + if (!common::AnfAlgo::IsInplaceNode(kernel, "skip")) { + continue; + } + auto iter = nodes_map_.find(kernel.get()); + if (iter != nodes_map_.end()) { + auto &node = iter->second.at(0); + MS_EXCEPTION_IF_NULL(node); + auto input_tensors = node->input_tensors_; + auto output_tensors = node->output_tensors_; + std::vector union_tensors; + union_tensors.insert(union_tensors.end(), input_tensors.begin(), input_tensors.end()); + union_tensors.insert(union_tensors.end(), output_tensors.begin(), output_tensors.end()); + // check whether the union tensor already in other union tensors + for (auto &tensor : union_tensors) { + auto tensor_id = tensor->GetId(); + for (auto &union_list : union_tensors_list_) { + if (std::count(union_list.begin(), union_list.end(), tensor_id)) { + MS_LOG(EXCEPTION) << "Inplace node union Tensor " << tensor_id << " already in other union tensor list."; + } + } + } + std::vector inplace_union_tensor_list; + for (auto &tensor : union_tensors) { + tensor->type_ = somas::kUnion; + inplace_union_tensor_list.push_back(tensor->GetId()); + } + + union_tensors_list_.push_back(inplace_union_tensor_list); + } else { + MS_LOG(EXCEPTION) << "Can't find somas node for inplace node " << kernel->fullname_with_scope(); + } 
+ } + return true; +} +} // namespace gpu +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.h b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.h new file mode 100644 index 00000000000..8f64a1c5621 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/gpu/hal/hardware/gpu_somas.h @@ -0,0 +1,48 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_plugin_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H__ +#define MINDSPORE_CCSRC_plugin_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H__ + +#include +#include +#include "backend/common/somas/somas.h" +#include "runtime/hardware/device_type.h" + +namespace mindspore { +namespace device { +namespace gpu { +using KernelGraph = session::KernelGraph; + +class GPUSomas : public somas::Somas { + private: + bool Initialize() override; + string GetDeviceName() const override; + size_t GetAlignSize(size_t original_size) const override; + + bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override; + bool InitDevSpecControlTensors(const session::KernelGraph &graph) override; + bool DevSpecNodeProcess(const session::KernelGraph &graph) override; + bool InplaceNodeProcess(const session::KernelGraph &graph); + void InitEventInfo(const session::KernelGraph &graph); + std::map event_map_; +}; +REG_SOMAS(GPU, DeviceType::kGPU, GPUSomas) +} // namespace gpu +} // namespace 
device +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_plugin_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H__ diff --git a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc index 53d6b252849..908bf9ab3f0 100644 --- a/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc +++ b/mindspore/ccsrc/pybind_api/utils/ms_context_py.cc @@ -101,7 +101,8 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) { .value("graph_kernel_flags", MsCtxParam::MS_CTX_GRAPH_KERNEL_FLAGS) .value("grad_for_scalar", MsCtxParam::MS_CTX_GRAD_FOR_SCALAR) .value("pynative_synchronize", MsCtxParam::MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE) - .value("disable_format_transform", MsCtxParam::MS_CTX_DISABLE_FORMAT_TRANSFORM); + .value("disable_format_transform", MsCtxParam::MS_CTX_DISABLE_FORMAT_TRANSFORM) + .value("memory_optimize_level", MsCtxParam::MS_CTX_MEMORY_OPTIMIZE_LEVEL); (void)py::class_>(*m, "MSContext") .def_static("get_instance", &mindspore::MsContext::GetInstance, "Get ms context instance.") .def("get_param", &mindspore::MsCtxGetParameter, "Get value of specified parameter.") diff --git a/mindspore/ccsrc/runtime/device/CMakeLists.txt b/mindspore/ccsrc/runtime/device/CMakeLists.txt index eacb106bdd7..d73d8f167b9 100644 --- a/mindspore/ccsrc/runtime/device/CMakeLists.txt +++ b/mindspore/ccsrc/runtime/device/CMakeLists.txt @@ -3,6 +3,7 @@ file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/* "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" "memory_scheduler.cc" "memory_offload_strategy.cc" "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "tensor_array.cc" "ms_device_shape_transfer.cc" "context_extends.cc" "stream_synchronizer.cc" "tensors_queue.cc" "auto_mem_offload.cc" + "common_somas_allocator.cc" ) if("${ENABLE_HIDDEN}" STREQUAL "OFF") diff --git a/mindspore/ccsrc/runtime/device/common_somas_allocator.cc b/mindspore/ccsrc/runtime/device/common_somas_allocator.cc new file mode 100644 index 
00000000000..0f8b62da569 --- /dev/null +++ b/mindspore/ccsrc/runtime/device/common_somas_allocator.cc @@ -0,0 +1,86 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "runtime/device/common_somas_allocator.h" +#include +#include +#include "backend/common/optimizer/helper.h" +#include "utils/ms_context.h" +#ifdef ENABLE_DUMP_IR +#include "debug/rdr/string_recorder.h" +#endif + +namespace mindspore { +namespace device { +bool CommonSomasAllocator::Assign(const session::KernelGraph &graph) { + somas::SomasPtr somas_ptr{nullptr}; + if (GetTargetFromContext() == kAscendDevice) { + somas_ptr = somas::SomasManager::Instance().GetSomas(DeviceType::kAscend); + } else if (GetTargetFromContext() == kGPUDevice) { + somas_ptr = somas::SomasManager::Instance().GetSomas(DeviceType::kGPU); + } else { + somas_ptr = somas::SomasManager::Instance().GetSomas(DeviceType::kCPU); + } + MS_EXCEPTION_IF_NULL(somas_ptr); + bool ret = somas_ptr->Assign(graph); + if (ret) { +#ifdef ENABLE_DUMP_IR + SubModuleId module = SubModuleId::SM_OPTIMIZER; + std::string name = "somas_allocate_info." 
+ std::to_string(graph.graph_id()); + (void)mindspore::RDR::RecordString(module, name, somas_ptr->SomasInfo()); +#endif +#ifndef ENABLE_SECURITY + somas_ptr->ConvertToProfilingNode(graph.graph_id()); +#endif + } + return ret; +} + +uint8_t *CommonSomasAllocator::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const { + MS_EXCEPTION_IF_NULL(node); + auto kernel_info = dynamic_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + if (index >= kernel_info->somas_output_offset_aligned_size_list().size()) { + MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:[" + << kernel_info->somas_output_offset_aligned_size_list().size() << "]"; + } + auto somas_offset_aligned_size = kernel_info->somas_output_offset_aligned_size_list()[index]; + if (somas_offset_aligned_size.second == 0) { + return nullptr; + } + auto somas_offset = somas_offset_aligned_size.first; + uint8_t *ptr = mem_base_addr_ + somas_offset; + return ptr; +} + +uint8_t *CommonSomasAllocator::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const { + MS_EXCEPTION_IF_NULL(node); + auto kernel_info = dynamic_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + if (index >= kernel_info->somas_workspace_offset_aligned_size_list().size()) { + MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:[" + << kernel_info->somas_workspace_offset_aligned_size_list().size() << "]"; + } + auto somas_offset_aligned_size = kernel_info->somas_workspace_offset_aligned_size_list()[index]; + if (somas_offset_aligned_size.second == 0) { + return nullptr; + } + auto somas_offset = somas_offset_aligned_size.first; + uint8_t *ptr = mem_base_addr_ + somas_offset; + return ptr; +} +} // namespace device +} // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/common_somas_allocator.h b/mindspore/ccsrc/runtime/device/common_somas_allocator.h new file mode 100644 index 00000000000..e3c796c741a --- /dev/null +++ 
b/mindspore/ccsrc/runtime/device/common_somas_allocator.h @@ -0,0 +1,50 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H +#define MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H + +#include +#include +#include +#include +#include +#include "backend/common/somas/somas.h" +#include "runtime/hardware/device_type.h" +#include "utils/ms_context.h" + +namespace mindspore { +namespace device { +class CommonSomasAllocator { + public: + void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; } + static bool Assign(const session::KernelGraph &graph); + uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const; + uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const; + + private: + // Memory base addr + uint8_t *mem_base_addr_{nullptr}; + static std::string GetTargetFromContext() { + auto context_ptr = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(context_ptr); + return context_ptr->get_param(MS_CTX_DEVICE_TARGET); + } +}; +using CommonSomasAllocatorPtr = std::shared_ptr; +} // namespace device +} // namespace mindspore +#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H diff --git a/mindspore/ccsrc/runtime/device/kernel_info.cc b/mindspore/ccsrc/runtime/device/kernel_info.cc index 111dac5c5c6..0e9058984c7 100644 --- 
a/mindspore/ccsrc/runtime/device/kernel_info.cc +++ b/mindspore/ccsrc/runtime/device/kernel_info.cc @@ -15,6 +15,7 @@ */ #include "runtime/device/kernel_info.h" +#include namespace mindspore { namespace device { @@ -108,6 +109,13 @@ bool KernelInfo::SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t return true; } +bool KernelInfo::SetSomasResult(std::vector> &&output_somas_result, + std::vector> &&workspace_somas_result) { + somas_output_result_ = std::move(output_somas_result); + somas_workspace_result_ = std::move(workspace_somas_result); + return true; +} + void KernelInfo::set_kernel_mod(const kernel::KernelModPtr &kernel_mod) { kernel_mod_ = kernel_mod; } kernel::KernelMod *KernelInfo::MutableKernelMod() const { return kernel_mod_.get(); } diff --git a/mindspore/ccsrc/runtime/device/kernel_info.h b/mindspore/ccsrc/runtime/device/kernel_info.h index 7e2ef6802e7..9c8dbf5dc12 100644 --- a/mindspore/ccsrc/runtime/device/kernel_info.h +++ b/mindspore/ccsrc/runtime/device/kernel_info.h @@ -19,6 +19,7 @@ #include #include +#include #include "ir/kernel_info_dev.h" #include "kernel/kernel_build_info.h" #include "kernel/kernel.h" @@ -57,6 +58,8 @@ class KernelInfo : public KernelInfoDevice { DeviceAddressPtr GetMutableWorkspaceAddr(size_t index) const; bool WorkspaceAddrExist(size_t index) const; bool SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t index); + bool SetSomasResult(std::vector> &&output_somas_result, + std::vector> &&workspace_somas_result); void set_kernel_mod(const kernel::KernelModPtr &kernel_mod); kernel::KernelMod *MutableKernelMod() const; const kernel::KernelMod *kernel_mod() const; @@ -70,6 +73,12 @@ class KernelInfo : public KernelInfoDevice { uint32_t graph_id() const { return graph_id_; } bool operator==(const KernelInfo &other) const; bool is_feature_map() const { return is_feature_map_; } + const std::vector> &somas_output_offset_aligned_size_list() const { + return somas_output_result_; + } + const std::vector> 
&somas_workspace_offset_aligned_size_list() const { + return somas_workspace_result_; + } const std::vector> &output_address_list() const { return output_address_list_; } const std::vector> &workspace_address_list() const { return workspace_address_list_; } @@ -83,6 +92,12 @@ class KernelInfo : public KernelInfoDevice { kernel::KernelBuildInfoPtr select_kernel_build_info_; std::vector> output_address_list_; std::vector> workspace_address_list_; + // pair : (offset, aligned_size) + // aligned_size of 0 means no memory allocation + std::vector> somas_output_result_; + // pair : (offset, aligned_size) + // aligned_size of 0 means no memory allocation + std::vector> somas_workspace_result_; kernel::KernelModPtr kernel_mod_; // stream_id_ is the index of stream object vector uint32_t stream_id_; diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.cc b/mindspore/ccsrc/runtime/device/kernel_runtime.cc index ff3d51d548c..93105178a81 100644 --- a/mindspore/ccsrc/runtime/device/kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/kernel_runtime.cc @@ -985,7 +985,12 @@ void KernelRuntime::AssignNodeOutputMem(MemType type, const AnfNodePtr &node, in auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type, {node, i}); MS_EXCEPTION_IF_NULL(device_address); uint8_t *ptr = mem_manager_->MallocOutputMem(node, i, type, output_sizes[i], device_address, false); - MS_EXCEPTION_IF_NULL(ptr); + if (ptr == nullptr && type == kSomasReuseDynamicMem) { + MS_LOG(INFO) << "node: " << node->fullname_with_scope() << " could be a RefNode, please check it" + << " output index: " << i << " memory type: " << type; + } else { + MS_EXCEPTION_IF_NULL(ptr); + } device_address->set_host_shape(trans::GetRuntimePaddingShape(node, i)); AnfAlgo::SetOutputAddr(device_address, i, node.get()); } diff --git a/mindspore/ccsrc/runtime/device/memory_manager.cc b/mindspore/ccsrc/runtime/device/memory_manager.cc index 77aceb99341..85b47b96f6e 100644 --- 
a/mindspore/ccsrc/runtime/device/memory_manager.cc +++ b/mindspore/ccsrc/runtime/device/memory_manager.cc @@ -18,10 +18,6 @@ #include #include "backend/common/session/anf_runtime_algorithm.h" #include "include/common/utils/anfalgo.h" -#include "include/common/debug/common.h" -#ifdef ENABLE_DUMP_IR -#include "debug/rdr/string_recorder.h" -#endif #include "utils/ms_context.h" namespace mindspore { @@ -37,41 +33,21 @@ size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) { } void MemoryManager::MallocSomasDynamicMem(const session::KernelGraph &graph) { - SomasPtr somas_reuse_util_ptr = std::make_shared(); - MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr); - somas_reuse_util_ptr_ = somas_reuse_util_ptr; + SomasAllocatorPtr somas_allocator_ptr = std::make_shared(); + MS_EXCEPTION_IF_NULL(somas_allocator_ptr); + somas_allocator_ptr_ = somas_allocator_ptr; - if (!(somas_reuse_util_ptr->Allocate(&graph))) { + if (!(somas_allocator_ptr->Assign(graph))) { MS_LOG(EXCEPTION) << "Somas Allocate Failed."; } - size_t total_allocated_size = somas_reuse_util_ptr->GetTotalMemSize(); + size_t total_allocated_size = graph.somas_whole_block_size(); MS_LOG(INFO) << "Graph " << graph.graph_id() << ": TotalSomasReuseDynamicSize [" << total_allocated_size << "]"; if (total_allocated_size > 0) { auto base_ptr = MallocDynamicMem(total_allocated_size, false); MS_LOG(INFO) << "Somas Reuse Memory Base Address [" << static_cast(base_ptr) << "], End Address [" << static_cast(base_ptr + total_allocated_size) << "]"; - somas_reuse_util_ptr->set_mem_base_addr(base_ptr); - } - - auto context_ptr = MsContext::GetInstance(); - MS_EXCEPTION_IF_NULL(context_ptr); -#ifdef ENABLE_DUMP_IR - SubModuleId module = SubModuleId::SM_OPTIMIZER; - - std::string name = "somas_allocate_info." + std::to_string(graph.graph_id()); - (void)mindspore::RDR::RecordString(module, name, somas_reuse_util_ptr_->SomasInfo()); - - name = "somas_mem_info." 
+ std::to_string(graph.graph_id()); - (void)mindspore::RDR::RecordString(module, name, somas_reuse_util_ptr_->SomasMemory()); -#endif - bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); - if (save_graphs) { - std::string file_path = GetSaveGraphsPathName("somas_allocate_info_" + std::to_string(graph.graph_id()) + ".ir"); - somas_reuse_util_ptr_->DumpSomasInfoIR(file_path); - - std::string mem_file_path = GetSaveGraphsPathName("somas_mem_info_" + std::to_string(graph.graph_id()) + ".ir"); - somas_reuse_util_ptr_->DumpSomasMemoryIR(mem_file_path); + somas_allocator_ptr->set_mem_base_addr(base_ptr); } } @@ -94,8 +70,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me address->communication_ptr_ = ptr - kMemAlignSize; } } else if (type == kSomasReuseDynamicMem) { - MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); - ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index); + MS_EXCEPTION_IF_NULL(somas_allocator_ptr_); + ptr = somas_allocator_ptr_->GetNodeOutputPtr(node, index); } else { ptr = MallocDynamicMem(size, communication_mem); } @@ -109,8 +85,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me } else if (type == kDynamicMem) { ptr = MallocDynamicMem(size, false); } else if (type == kSomasReuseDynamicMem) { - MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); - ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index); + MS_EXCEPTION_IF_NULL(somas_allocator_ptr_); + ptr = somas_allocator_ptr_->GetNodeOutputPtr(node, index); } address->ptr_ = ptr; return ptr; @@ -118,8 +94,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, MemType type, size_t size) { if (type == kSomasReuseDynamicMem) { - MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_); - return somas_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index); + MS_EXCEPTION_IF_NULL(somas_allocator_ptr_); + return 
somas_allocator_ptr_->GetNodeWorkSpacePtr(node, index); } return MallocDynamicMem(size, false); } diff --git a/mindspore/ccsrc/runtime/device/memory_manager.h b/mindspore/ccsrc/runtime/device/memory_manager.h index d97bbdfca4a..c327ff35e2d 100644 --- a/mindspore/ccsrc/runtime/device/memory_manager.h +++ b/mindspore/ccsrc/runtime/device/memory_manager.h @@ -22,14 +22,15 @@ #include #include #include "common/mem_reuse/mem_reuse.h" -#include "backend/common/somas/somas.h" +#include "runtime/device/common_somas_allocator.h" + namespace mindspore { namespace device { enum MemType { kStaticMem, kDynamicMem, kSomasReuseDynamicMem }; constexpr int kGetAllOuts = -1; constexpr uint64_t kMemAlignSize = 512; constexpr uint64_t kTwiceMemAlignSize = kMemAlignSize << 1; -using SomasPtr = mindspore::somas::SomasPtr; +using SomasAllocatorPtr = mindspore::device::CommonSomasAllocatorPtr; class MemoryManager { public: @@ -80,7 +81,7 @@ class MemoryManager { return MallocStaticMem(size, communication_mem, kInvalidGraphId); } virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem); - SomasPtr somas_reuse_util_ptr_{nullptr}; + SomasAllocatorPtr somas_allocator_ptr_{nullptr}; }; } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/memory_manager_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/memory_manager_actor.cc index 0f64d3a08e9..565a18a1137 100644 --- a/mindspore/ccsrc/runtime/graph_scheduler/actor/memory_manager_actor.cc +++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/memory_manager_actor.cc @@ -81,6 +81,10 @@ void MemoryManagerActor::AllocateContinuousMemory(const std::vectorGetPtr() != nullptr) { + continue; + } // Allocate memory through the device context. 
device::DynamicMemAllocatorDebugInfo::SetDebugInfo(from_aid.Name(), device::AllocatorType::kKernelOutput); auto dev_ptr_list = device_context->device_res_manager_->AllocateContinuousMemory(size_list); diff --git a/mindspore/core/utils/ms_context.cc b/mindspore/core/utils/ms_context.cc index 2c1d9e39484..104a6cc006c 100644 --- a/mindspore/core/utils/ms_context.cc +++ b/mindspore/core/utils/ms_context.cc @@ -102,6 +102,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) { set_param(MS_CTX_ENABLE_RECOVERY, false); set_param(MS_CTX_ENABLE_GE_HETEROGENOUS, false); set_param(MS_CTX_DISABLE_FORMAT_TRANSFORM, false); + set_param(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO0); uint32_t kDefaultRuntimeNumThreads = 30; uint32_t cpu_core_num = std::thread::hardware_concurrency() - 1; diff --git a/mindspore/core/utils/ms_context.h b/mindspore/core/utils/ms_context.h index 4cd7afef3bc..3a9e68b2de3 100644 --- a/mindspore/core/utils/ms_context.h +++ b/mindspore/core/utils/ms_context.h @@ -55,6 +55,8 @@ const char kGpuInferenceDevice[] = "GpuInference"; const char kDavinciDevice[] = "Davinci"; const char KNpuLog[] = "_npu_log"; const unsigned int MAX_CALL_DEPTH_DEFAULT = 1000; +const int kOptimizeO0 = 0; +const int kOptimizeO1 = 1; const std::set kTargetSet = {kCPUDevice, kGPUDevice, kAscendDevice, kDavinciDevice}; // The default max available device memory is 1024GB. 
@@ -98,6 +100,7 @@ enum MsCtxParam : unsigned { // parameter of type int MS_CTX_TYPE_INT_BEGIN = MS_CTX_TYPE_BOOL_END, MS_CTX_EXECUTION_MODE = MS_CTX_TYPE_INT_BEGIN, + MS_CTX_MEMORY_OPTIMIZE_LEVEL, MS_CTX_TYPE_INT_END, // parameter of type uint32 diff --git a/mindspore/lite/src/extendrt/CMakeLists.txt b/mindspore/lite/src/extendrt/CMakeLists.txt index 4e53a2c97d3..4da401546df 100644 --- a/mindspore/lite/src/extendrt/CMakeLists.txt +++ b/mindspore/lite/src/extendrt/CMakeLists.txt @@ -98,7 +98,6 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE) ${CCSRC_DIR}/backend/common/somas/somas_solver_alg.cc ${CCSRC_DIR}/backend/graph_compiler/graph_partition.cc ${CMAKE_CURRENT_SOURCE_DIR}/mock/segment_runner.cc - ${CCSRC_DIR}/runtime/device/auto_mem_offload.cc ${CCSRC_DIR}/runtime/device/ms_device_shape_transfer.cc ${CCSRC_DIR}/runtime/device/kernel_info.cc ${CCSRC_DIR}/runtime/device/convert_tensor_utils.cc @@ -109,6 +108,7 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE) ${CCSRC_DIR}/runtime/device/memory_offload_strategy.cc ${CCSRC_DIR}/runtime/device/memory_manager.cc ${CCSRC_DIR}/runtime/device/auto_mem_offload.cc + ${CCSRC_DIR}/runtime/device/common_somas_allocator.cc ${CCSRC_DIR}/runtime/pynative/op_executor.cc ${CCSRC_DIR}/runtime/pynative/op_runtime_info.cc ${CCSRC_DIR}/runtime/hardware/device_type.cc @@ -117,6 +117,8 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE) ${CCSRC_DIR}/kernel/kernel.cc ${CCSRC_DIR}/kernel/kash/kernel_pack.cc ${CCSRC_DIR}/kernel/oplib/oplib.cc + ${CCSRC_DIR}/common/debug/anf_dump_utils.cc + ${CCSRC_DIR}/common/debug/anf_ir_dump.cc ${CCSRC_DIR}/common/debug/common.cc ${CCSRC_DIR}/common/debug/env_config_parser.cc ${CCSRC_DIR}/common/thread_pool.cc diff --git a/mindspore/python/mindspore/context.py b/mindspore/python/mindspore/context.py index 9282db75b36..5aaf93975bf 100644 --- a/mindspore/python/mindspore/context.py +++ b/mindspore/python/mindspore/context.py @@ -197,6 +197,22 @@ class _Context: f"or context.PYNATIVE_MODE (1), but got {mode}.") 
self.set_param(ms_ctx_param.mode, mode) + def set_memory_optimize_level(self, memory_optimize_level): + """ + The memory optimize level, support "O0", "O1". + + Args: + memory_optimize_level (str): "O0", "O1" + """ + memory_optimize_levels = ["O0", "O1"] + if memory_optimize_level not in memory_optimize_levels: + raise ValueError(f"For 'context.set_context', the argument 'memory_optimize_level' must be one of " + f"{memory_optimize_levels}, but got {memory_optimize_level}.") + if memory_optimize_level == "O0": + self.set_param(ms_ctx_param.memory_optimize_level, 0) + else: + self.set_param(ms_ctx_param.memory_optimize_level, 1) + def set_backend_policy(self, policy): success = self._context_handle.set_backend_policy(policy) if not success: @@ -353,7 +369,8 @@ class _Context: 'mempool_block_size': set_mempool_block_size, 'print_file_path': set_print_file_path, 'env_config_path': set_env_config_path, - 'runtime_num_threads': set_runtime_num_threads + 'runtime_num_threads': set_runtime_num_threads, + 'memory_optimize_level': set_memory_optimize_level } @property diff --git a/tests/st/networks/test_gpu_alexnet.py b/tests/st/networks/test_gpu_alexnet.py index 13561e7b7e2..a2cfd462ac9 100644 --- a/tests/st/networks/test_gpu_alexnet.py +++ b/tests/st/networks/test_gpu_alexnet.py @@ -87,3 +87,30 @@ def test_trainTensor(num_classes=10, epoch=15, batch_size=32): loss = train_network(data, label).asnumpy() losses.append(loss) assert losses[-1] < 0.01 + + +@pytest.mark.level1 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_train_tensor_memory_opt(num_classes=10, epoch=15, batch_size=32): + """ + Feature: Somas GPU kernel by kernel. + Description: AlexNet with Somas GPU kernel by kernel. + Expectation: No exception.
+ """ + context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1') + net = AlexNet(num_classes) + lr = 0.1 + momentum = 0.9 + optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, momentum, weight_decay=0.0001) + criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + net_with_criterion = WithLossCell(net, criterion) + train_network = TrainOneStepCell(net_with_criterion, optimizer) + train_network.set_train() + losses = [] + for i in range(0, epoch): + data = Tensor(np.ones([batch_size, 3, 227, 227]).astype(np.float32) * 0.01) + label = Tensor(np.ones([batch_size]).astype(np.int32)) + loss = train_network(data, label).asnumpy() + losses.append(loss) + assert losses[-1] < 0.01 diff --git a/tests/st/networks/test_gpu_lenet.py b/tests/st/networks/test_gpu_lenet.py index ca4d21d3601..5e1daa37fdc 100644 --- a/tests/st/networks/test_gpu_lenet.py +++ b/tests/st/networks/test_gpu_lenet.py @@ -150,6 +150,35 @@ def test_train_lenet(): assert losses[-1] < 0.01 +@pytest.mark.level1 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_train_lenet_memory_opt(): + """ + Feature: Somas GPU kernel by kernel. + Description: LeNet with Somas GPU kernel by kernel. + Expectation: No exception. 
+ """ + context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1') + epoch = 100 + net = LeNet() + momentum = 0.9 + learning_rate = multisteplr(epoch, 30) + + optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) + criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + net_with_criterion = WithLossCell(net, criterion) + train_network = TrainOneStepCell(net_with_criterion, optimizer) # optimizer + train_network.set_train() + losses = [] + for i in range(epoch): + data = Tensor(np.ones([net.batch_size, 3, 32, 32]).astype(np.float32) * 0.01) + label = Tensor(np.ones([net.batch_size]).astype(np.int32)) + loss = train_network(data, label).asnumpy() + losses.append(loss) + assert losses[-1] < 0.01 + + def create_dataset(data_path, batch_size=32, repeat_size=1, num_parallel_workers=1): """ diff --git a/tests/st/networks/test_gpu_lstm.py b/tests/st/networks/test_gpu_lstm.py index 4ec063278da..8cf29be06e5 100644 --- a/tests/st/networks/test_gpu_lstm.py +++ b/tests/st/networks/test_gpu_lstm.py @@ -142,3 +142,48 @@ def test_LSTM(): losses.append(loss) print("loss:", loss.asnumpy()) assert (losses[-1].asnumpy() < 0.01) + + +@pytest.mark.level1 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_lstm_memory_opt(): + """ + Feature: Somas GPU kernel by kernel. + Description: LSTM with Somas GPU kernel by kernel. + Expectation: No exception. 
+ """ + context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1') + num_epochs = 5 + embed_size = 100 + num_hiddens = 100 + num_layers = 2 + bidirectional = True + labels = 2 + vocab_size = 252193 + max_len = 500 + + weight = np.ones((vocab_size + 1, embed_size)).astype(np.float32) + + net = SentimentNet(vocab_size=(vocab_size + 1), embed_size=embed_size, + num_hiddens=num_hiddens, num_layers=num_layers, + bidirectional=bidirectional, weight=weight, + labels=labels, batch_size=batch_size) + + learning_rate = 0.1 + momentum = 0.9 + + optimizer = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate, momentum) + criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + net_with_criterion = WithLossCell(net, criterion) + train_network = TrainOneStepCell(net_with_criterion, optimizer) # optimizer + train_network.set_train() + + train_features = Tensor(np.ones([64, max_len]).astype(np.int32)) + train_labels = Tensor(np.ones([64,]).astype(np.int32)[0:64]) + losses = [] + for epoch in range(num_epochs): + loss = train_network(train_features, train_labels) + losses.append(loss) + print("loss:", loss.asnumpy()) + assert (losses[-1].asnumpy() < 0.01) diff --git a/tests/st/networks/test_gpu_resnet.py b/tests/st/networks/test_gpu_resnet.py index de67d16318d..521d0b3d23c 100644 --- a/tests/st/networks/test_gpu_resnet.py +++ b/tests/st/networks/test_gpu_resnet.py @@ -352,6 +352,36 @@ def test_trainTensor(num_classes=10, epoch=8, batch_size=1): assert (losses[-1].asnumpy() < 1) +@pytest.mark.level1 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_train_tensor_memory_opt(num_classes=10, epoch=8, batch_size=1): + """ + Feature: Somas GPU kernel by kernel. + Description: ResNet with Somas GPU kernel by kernel. + Expectation: No exception. 
+ """ + context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1') + net = resnet50(num_classes) + lr = 0.1 + momentum = 0.9 + optimizer = Momentum(filter(lambda x: x.requires_grad, + net.get_parameters()), lr, momentum) + criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + net_with_criterion = WithLossCell(net, criterion) + train_network = TrainOneStepCell( + net_with_criterion, optimizer) # optimizer + train_network.set_train() + losses = [] + for i in range(0, epoch): + data = Tensor(np.ones([batch_size, 3, 224, 224] + ).astype(np.float32) * 0.01) + label = Tensor(np.ones([batch_size]).astype(np.int32)) + loss = train_network(data, label) + losses.append(loss) + assert (losses[-1].asnumpy() < 1) + + @pytest.mark.level2 @pytest.mark.platform_x86_gpu_training @pytest.mark.env_onecard