Add GPU SOMAS support

This commit is contained in:
reku1997 2022-08-08 16:57:20 +08:00
parent cd63f6283c
commit 52534d1751
38 changed files with 1989 additions and 969 deletions

View File

@ -50,6 +50,13 @@ struct KernelWithIndexCmp {
}
};
// Result of SOMAS (memory-reuse) planning for one kernel graph.
struct SomasInfo {
// A whole_block_size_ of 0 indicates that SOMAS did not allocate memory for this graph.
size_t whole_block_size_{0};
// Maps each merged block's offset (within the whole block) to its aligned size.
std::map<size_t, size_t> merged_blocks_map_;
};
using DeviceType = device::DeviceType;
using KernelMapTensor = std::map<session::KernelWithIndex, BaseRef, session::KernelWithIndexCmp>;
@ -57,6 +64,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
public:
KernelGraph()
: inputs_(std::make_shared<std::vector<AnfNodePtr>>()),
somas_info_(std::make_shared<SomasInfo>()),
graph_id_(0),
stream_distinction_label_(kInvalidDistincLabel),
device_target_(DeviceType::kUnknown),
@ -69,6 +77,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
KernelGraph(const KernelGraph &graph) : FuncGraph(graph) {
inputs_ = graph.inputs_;
somas_info_ = graph.somas_info_;
child_graph_result_ = graph.child_graph_result_;
execution_order_ = graph.execution_order_;
mem_reuse_exec_order_ = graph.mem_reuse_exec_order_;
@ -452,6 +461,11 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
bool IsCommSubGraph(uint32_t id) const { return comm_sub_graph_ids_.find(id) != comm_sub_graph_ids_.end(); }
void RecordNewCommSubGraphId(uint32_t id) { comm_sub_graph_ids_.insert(id); }
// somas total memory size
SomasInfo *MutableSomasInfo() const { return somas_info_.get(); }
size_t somas_whole_block_size() const { return somas_info_->whole_block_size_; }
const std::map<size_t, size_t> &somas_merged_blocks_map() const { return somas_info_->merged_blocks_map_; }
private:
// remove value node form graph
bool RemoveValueNodeFromGraph(const ValueNodePtr &value_node);
@ -477,6 +491,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
// members
std::shared_ptr<std::vector<AnfNodePtr>> inputs_;
std::shared_ptr<SomasInfo> somas_info_;
std::vector<AnfNodePtr> child_graph_result_;
std::vector<CNodePtr> execution_order_;
std::vector<CNodePtr> mem_reuse_exec_order_;

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -22,6 +22,7 @@
#include <string>
#include <utility>
#include <vector>
#include <stack>
#include "utils/hash_map.h"
#include "utils/hash_set.h"
@ -33,9 +34,15 @@
#include "backend/common/session/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "backend/common/session/kernel_graph.h"
#include "runtime/hardware/device_type.h"
namespace mindspore {
namespace somas {
// A matched pair of event kernels: the send node and its corresponding
// recv node. Presumably used to model cross-stream synchronization when
// building SOMAS dependencies — confirm against the event_map_ usage.
struct EventPair {
CNodePtr send_;
CNodePtr recv_;
};
union DestinationUnion {
size_t id;
size_t index;
@ -43,81 +50,86 @@ union DestinationUnion {
};
struct TensorConflictInfo {
size_t tensor_id_;
size_t src_node_id_;
size_t tensor_id;
size_t src_node_id;
size_t destination_num;
DestinationUnion l;
DestinationUnion r;
TensorConflictInfo(size_t tensor_id, size_t src_node_id)
: tensor_id_(tensor_id), src_node_id_(src_node_id), destination_num(0) {}
: tensor_id(tensor_id), src_node_id(src_node_id), destination_num(0) {}
};
// A contiguous memory region covering [start_offset_, end_offset_),
// where end_offset_ = start_offset_ + size_.
struct Block {
  size_t start_offset_;
  size_t size_;
  size_t end_offset_;

  // end_offset_ is derived from start and size at construction time.
  Block(size_t start, size_t size) : start_offset_(start), size_(size), end_offset_(start + size) {}
};
void MergeBlocks(std::vector<Block> *block_list, std::stack<Block> *merged_blocks);
enum class UnReuseType { kUnReuseAll, kUnReuseInput, kUnReuseOutput, kUnReuseWorkspace };
class Somas {
public:
// Constructors/Destructors
Somas() = default;
Somas(const Somas &) = delete;
Somas &operator=(const Somas &) = delete;
~Somas() { mem_base_addr_ = nullptr; }
bool Allocate(const session::KernelGraph *graph);
const size_t GetTotalMemSize() const { return mem_offset_; }
void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; }
uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;
virtual ~Somas() = default;
bool Assign(const session::KernelGraph &graph);
bool Assign(const KernelGraphPtr &graph_ptr);
std::string SomasInfo(bool calc_hash = false) const;
std::string SomasMemory() const;
void DumpSomasInfoIR(const string filename) const;
void DumpSomasMemoryIR(const string &filename) const;
static bool NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2);
#ifndef ENABLE_SECURITY
void ConvertToProfilingNode(uint32_t graph_id) const;
virtual void ConvertToProfilingNode(uint32_t graph_id) const {}
#endif
private:
// device implementation interface
virtual bool Initialize() = 0;
virtual string GetDeviceName() const = 0;
virtual size_t GetAlignSize(size_t original_size) const = 0;
virtual size_t GetCommunicationReservedSize() const;
virtual bool GetEnableCacheFlag(const session::KernelGraph &graph) const;
virtual std::vector<vector<uint32_t>> GetStreamGroupInfo(const session::KernelGraph &graph) const;
virtual bool GetDependExecOrderFlag(const session::KernelGraph &graph) const = 0;
virtual std::pair<bool, std::string> GetDebugConfig() const;
virtual std::map<std::string, UnReuseType> GetUnReuseNodeType(const session::KernelGraph &graph) const;
virtual std::map<std::string, UnReuseType> GetUnReuseNodeName(const session::KernelGraph &graph) const;
virtual bool InitDevSpecControlTensors(const session::KernelGraph &graph) = 0;
virtual bool DevSpecNodeProcess(const session::KernelGraph &graph) = 0;
// end
// SOMAS Configuration
std::string device_name_{"SOMAS"};
size_t communication_gap_size_{0};
size_t depend_exec_order_{false};
bool enable_cache_{false};
bool save_debug_info_{false};
std::string debug_info_path_;
std::map<std::string, UnReuseType> un_reuse_node_type_;
std::map<std::string, UnReuseType> un_reuse_node_name_;
// end
std::vector<DynamicBitSet> reuse_matrix_;
// hash id
std::string hash_id_;
// Maps
mindspore::HashMap<size_t, SomasTensorPtr> tensors_map_;
mindspore::HashMap<void *, std::vector<SomasNodePtr>> nodes_map_;
mindspore::HashMap<void *, vector<SomasParameterPtr>> parameters_map_;
mindspore::HashMap<size_t, SomasNodePtr> nodes_id_map_;
// Vectors
std::vector<SomasNodePtr> nodes_list_;
std::vector<SomasStreamPtr> streams_list_;
std::vector<SomasTensorPtr> tensors_list_;
std::vector<SomasParameterPtr> parameters_list_;
// Stream groups
std::vector<vector<uint32_t>> streams_groups_;
// event info map
std::map<size_t, std::pair<CNodePtr, CNodePtr>> event_map_;
// Solver
TensorsDescMap solver_tensor_desc_map_;
SomasSolverPrePtr somas_solver_;
// Contiguous list
std::vector<vector<size_t>> contiguous_tensors_list_;
// Ref lists
std::vector<vector<size_t>> ref_node_constraints_;
std::vector<vector<size_t>> ref_overlap_constraints_;
// total Offset
size_t mem_offset_{0};
// Memory base addr
uint8_t *mem_base_addr_{nullptr};
// Save debug info
bool save_graphs_{false};
std::string save_graphs_path_;
// statistic info
size_t upper_bound_{0};
size_t lower_bound_{0};
@ -128,74 +140,147 @@ class Somas {
size_t lifelong_start_total_size_{0};
size_t lifelong_end_total_size_{0};
bool InitSomasTensors(const session::KernelGraph *graph);
void InitBasicInfo(const session::KernelGraph *graph);
void InitSomasStreamAndNode(const session::KernelGraph *graph);
void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph);
void InitSomasInputTensors(const session::KernelGraph *graph);
void InitSomasEventInfos();
void GetNextOutputProcess(const session::KernelGraph *graph);
void IndependentNodeOutputProcess(const session::KernelGraph *graph);
#ifndef ENABLE_SECURITY
void SummaryInputProcess(const session::KernelGraph *graph);
#endif
void RefNodeProcess(const session::KernelGraph *graph);
void NonTaskSplitProcess(const session::KernelGraph *graph);
void UnReuseNodeProcess(const session::KernelGraph *graph);
SomasTensorPtr CreateGapTensor(size_t gap_tensor_id);
void GenContiguousList(const session::KernelGraph *graph);
std::vector<vector<size_t>> processed_contiguous_tensors_list_;
// key: contiguous list index with first union tensor; value: contiguous list index with other union tensor
std::map<size_t, size_t> contiguous_list_with_ref_index_map_;
void ComputeConflictPairs();
bool ConfigSomas(const session::KernelGraph &graph);
bool Assign(const session::KernelGraph *graph);
std::string Offline() const;
void DumpOfflineIR(const string filename) const;
std::string GetSplitName(const string &scope_name) const;
size_t CalcLowerBound() const;
void GenGraphStatisticInfo();
// somas model
bool InitSomasModel(const session::KernelGraph &graph);
bool InitBasicInfoFromGraph(const session::KernelGraph &graph);
void InitSomasStreamAndNode(const session::KernelGraph &graph);
void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph &graph);
void InitSomasInputTensors(const session::KernelGraph &graph);
void InitCommonNodeInputs(const CNodePtr &kernel);
void InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel);
SomasParameterPtr GetSomasParameter(const AnfNodePtr &node, size_t index);
SomasParameterPtr CreateSomasParameter(const AnfNodePtr &node, size_t index);
void InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel);
void InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel);
void ComputeOneTensorConflicts(const std::shared_ptr<SomasTensor> &target_tensor,
const std::vector<TensorConflictInfo> &tensor_conflict_info_list,
const std::vector<size_t> &destination_node_list,
const vector<DynamicBitSet> &nodes_dependency,
std::vector<DynamicBitSet> *tensor_relation) const;
void InitControlTensors(const session::KernelGraph &graph);
bool CommonSpecNodeProcess(const session::KernelGraph &graph);
SomasStreamPtr GetSomasStream(size_t stream_id) const;
#ifndef ENABLE_SECURITY
void SummaryInputProcess(const session::KernelGraph &graph);
#endif
void RefNodeProcess(const session::KernelGraph &graph);
void UnReuseNodeProcess(const session::KernelGraph &graph);
void CommunicationNodeProcess(const session::KernelGraph &graph);
void GetContiguousListContainUnionTensor();
std::map<size_t, size_t> GetRefTensorsInContiguousList();
common::KernelWithIndex GetVisitKernelWithReturnType(const AnfNodePtr &ori_node, size_t ori_index);
// conflict matrix
static bool NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2);
void ComputeConflictMatrix();
void ComputeBasicMatrix();
static void ComputeOneTensorConflicts(const std::shared_ptr<SomasTensor> &target_tensor,
const std::vector<TensorConflictInfo> &tensor_conflict_info,
const std::vector<size_t> &destination_node_list,
const vector<DynamicBitSet> &nodes_dependency,
std::vector<DynamicBitSet> *tensor_relation);
void ComputeMultiTensorConflicts(const std::vector<SomasTensorPtr> &target_tensors_list,
const std::vector<TensorConflictInfo> &tensor_conflict_info_list,
const std::vector<TensorConflictInfo> &tensor_conflict_info,
const std::vector<size_t> &destination_node_list,
const vector<DynamicBitSet> &nodes_dependency,
std::vector<DynamicBitSet> *tensor_relation) const;
void UpdateTensorDestinations();
void UpdateRefTensorsConflict();
void UpdateRefOverlapTensorsConflicts();
void UpdateRefTensorsOffset();
void UpdateContiguousTensorsOffset(const std::map<size_t, size_t> &contiguous_ref_list_map);
void DumpParameters(std::ostringstream &oss) const;
void DumpTensors(std::ostringstream &oss) const;
void DumpNodes(std::ostringstream &oss) const;
std::map<size_t, size_t> GetContiguousListContainRefTensor();
std::map<size_t, size_t> GetRefTensorsInContiguousList();
bool SaveSomasResult(const session::KernelGraph *graph);
bool VerifySomasResult(const session::KernelGraph *graph, const nlohmann::json &somas_json) const;
bool LoadSomasResult(const session::KernelGraph *graph, const string &filename);
bool UpdateTensorsOffset(const std::vector<nlohmann::json> &tensors_json);
bool CalcSomasModelHash(const session::KernelGraph *graph);
void UpdateInputTensor(SomasNodePtr node, SomasNodePtr pre_somas_node, SomasTensorPtr input_somas_tensor) const;
bool LoadSomasCache(const session::KernelGraph *graph);
SomasStreamPtr GetSomasStream(size_t stream_id) const;
SomasNodePtr GetSomasNode(size_t node_id) const;
void UpdateUnionTensorsConflict();
static void BuildConflictInfo(const std::shared_ptr<SomasTensor> &tensor, TensorConflictInfo *tensor_conflict_info,
std::vector<size_t> *destination_node_list);
static bool CheckIsDependency(const TensorConflictInfo &tensor_conflict_info, const size_t &src_node_id,
const vector<DynamicBitSet> &nodes_dependency,
const std::vector<size_t> &destination_node_list);
void ProcessSemiLifeLongTensor();
// solver
bool Solve(const session::KernelGraph &graph);
void UpdateUnionTensorsOffset();
void UpdateContiguousTensorsOffset(const std::map<size_t, size_t> &contiguous_ref_list_map);
// cache
bool SaveSomasResult(const session::KernelGraph &graph);
bool VerifySomasResult(const session::KernelGraph &graph, const nlohmann::json &somas_json) const;
bool LoadSomasResult(const session::KernelGraph &graph, const string &filename);
bool UpdateTensorsOffset(const std::vector<nlohmann::json> &tensors_json);
bool CalcSomasModelHash(const session::KernelGraph &graph);
bool LoadSomasCache(const session::KernelGraph &graph);
// log
std::string Offline() const;
void DumpOfflineIR(const string &filename) const;
size_t CalcLowerBound() const;
void GenGraphStatisticInfo();
void DumpParameters(std::ostringstream &oss) const;
void DumpTensors(std::ostringstream &oss) const;
void DumpNodes(std::ostringstream &oss) const;
void DumpSomasModelInfo(const string &tag, uint32_t graph_id) const;
// update graph
std::vector<std::pair<size_t, size_t>> GetNodeOutputSomasResult(const AnfNodePtr &node) const;
std::vector<std::pair<size_t, size_t>> GetNodeWorkSpaceSomasResult(const AnfNodePtr &node) const;
bool UpdateSomasResultToGraph(const session::KernelGraph &graph);
protected:
std::vector<SomasParameterPtr> parameters_list_;
std::vector<SomasTensorPtr> control_tensors_list_;
std::vector<SomasTensorPtr> tensors_list_;
std::vector<SomasNodePtr> nodes_list_;
mindspore::HashMap<size_t, SomasStreamPtr> streams_map_;
mindspore::HashMap<void *, vector<SomasParameterPtr>> parameters_map_;
mindspore::HashMap<void *, std::vector<SomasNodePtr>> nodes_map_;
std::vector<vector<size_t>> union_tensors_list_;
std::vector<vector<size_t>> contiguous_tensors_list_;
void AddControlTensor(const SomasNodePtr &from, const SomasNodePtr &to);
void AddControlTensorFromExecOrder(const session::KernelGraph &graph);
void GraphOutputProcess(const session::KernelGraph &graph);
void UpdateContiguousTensorList();
SomasNodePtr GetSomasNode(size_t node_id) const;
static std::string GetSplitName(const string &scope_name);
size_t reused_memory_size_{0};
std::vector<std::pair<size_t, size_t>> dump_merged_blocks_;
};
using SomasPtr = std::shared_ptr<Somas>;
using SomasCreator = std::function<std::shared_ptr<Somas>()>;
// @todo will delete when old runtime remove
// Registry that maps a device type to a factory creating the matching
// device-specific Somas implementation.
class SomasManager {
 public:
  // Process-wide singleton accessor (thread-safe since C++11 via the
  // magic-static initialization guarantee).
  static SomasManager &Instance() {
    static SomasManager instance{};
    return instance;
  }

  // Registers a creator for the given device type. The first registration
  // wins; later registrations for the same device type are ignored.
  void Register(device::DeviceType device_type, SomasCreator &&creator) {
    if (base_map_.find(device_type) == base_map_.end()) {
      // Move instead of copying the std::function: the key is known to be
      // absent here, so emplace always inserts and consumes the argument.
      (void)base_map_.emplace(device_type, std::move(creator));
    }
  }

  // Creates and returns a Somas instance for the device type, or nullptr
  // when no creator has been registered for it.
  SomasPtr GetSomas(device::DeviceType device_type) {
    auto iter = base_map_.find(device_type);
    if (base_map_.end() != iter) {
      MS_EXCEPTION_IF_NULL(iter->second);
      return (iter->second)();
    }
    return nullptr;
  }

 private:
  std::map<device::DeviceType, SomasCreator> base_map_;
};
// Static-registration helper: constructing a SomasRegister (see the
// REG_SOMAS macro) registers the given creator with the global
// SomasManager during static initialization.
class SomasRegister {
public:
SomasRegister(device::DeviceType device_type, SomasCreator &&creator) {
SomasManager::Instance().Register(device_type, std::move(creator));
}
~SomasRegister() = default;
};
#define REG_SOMAS(S, T, C) static const somas::SomasRegister g_##S##_reg(T, []() { return std::make_shared<C>(); });
} // namespace somas
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_COMMON_SOMAS_SOMAS_H_

View File

@ -39,14 +39,14 @@ class SomasNode {
// node's dependency including data dependency and time dependency
std::set<std::shared_ptr<SomasNode>> ancestor_nodes_;
std::set<SomasTensorPtr> tensors_;
// data tensor
std::vector<SomasTensorPtr> input_tensors_;
std::vector<SomasTensorPtr> output_tensors_;
std::vector<SomasTensorPtr> workspace_tensors_;
std::map<size_t, SomasParameterPtr> input_parameters_map_;
mindspore::HashMap<int64_t, size_t> anc_stream_max_order_;
// control tensor
std::vector<SomasTensorPtr> control_input_tensors_;
std::vector<SomasTensorPtr> control_output_tensors_;
// Constructors/Destructors
SomasNode(std::string scope_full_name, size_t id, NodeType type, const size_t &stream_id)
@ -57,7 +57,7 @@ class SomasNode {
// Accessors
const size_t &GetId() const { return id_; }
const size_t GetStreamId() const { return stream_id_; }
const size_t &GetStreamId() const { return stream_id_; }
const NodeType &GetType() const { return type_; }
private:

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -98,7 +98,7 @@ vector<TensorsDescMap> SomasSolverPre::CreateTensorsMaps(const TensorsDescMap &t
}
return vecTensorsMap;
}
Status SomasSolverPre::Solving(const session::KernelGraph *graph, TensorsDescMap *ptensors,
Status SomasSolverPre::Solving(const session::KernelGraph &graph, TensorsDescMap *ptensors,
const std::vector<DynamicBitSet> *pConstraints,
const vector<vector<size_t>> &continuous_v, bool bVerifySolution, bool ball,
SortingType sorting, FittingType fitting, AlgorithmType algorithm) {
@ -198,7 +198,7 @@ Status SomasSolverPre::Solving(const session::KernelGraph *graph, TensorsDescMap
return ret;
}
void SomasSolverPre::Log(const session::KernelGraph *graph, const TensorsDescMap &tensors,
void SomasSolverPre::Log(const session::KernelGraph &graph, const TensorsDescMap &tensors,
const std::vector<DynamicBitSet> *pConstraints,
const vector<vector<size_t>> &continuous_v) const {
auto context_ptr = MsContext::GetInstance();
@ -213,13 +213,13 @@ void SomasSolverPre::Log(const session::KernelGraph *graph, const TensorsDescMap
}
void SomasSolverPre::TensorRelationLog(const std::vector<DynamicBitSet> *pConstraints,
const session::KernelGraph *graph) const {
const session::KernelGraph &graph) const {
MS_LOG(INFO) << "SomasSolver::Log Writing somas_tensor_relation.ir..";
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
std::string filename =
GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path);
std::ostringstream oss;
for (size_t tid1 = 0; tid1 < pConstraints->size(); tid1++) {
oss << 't' << tid1 << ' ';
@ -232,14 +232,14 @@ void SomasSolverPre::TensorRelationLog(const std::vector<DynamicBitSet> *pConstr
MS_LOG(INFO) << "SomasSolver somas_tensor_relation Log done";
}
void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors,
void SomasSolverPre::SolverInputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors,
const vector<vector<size_t>> &continuous_v) const {
MS_LOG(INFO) << "SomasSolver::Log Writing somas_solver_input..";
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
std::string filename =
GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path);
std::ostringstream oss;
for (auto &t : tensors) {
oss << "T " << t.second->index_ << " " << t.second->size_ << " " << t.second->lifelong_ << std::endl;
@ -256,13 +256,13 @@ void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const Ten
MS_LOG(INFO) << "SomasSolver input Log done";
}
void SomasSolverPre::SolverOutputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors) const {
void SomasSolverPre::SolverOutputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors) const {
MS_LOG(INFO) << "SomasSolver::Log Writing somas_solver_output_..";
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
std::string out_filename =
GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path);
std::ostringstream oss;
constexpr size_t contiguous_left = 1;
constexpr size_t contiguous_mid = 2;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -184,14 +184,14 @@ class SomasSolverPre {
size_t GetMaxOffset() const { return max_offset_; }
Status Solving(const session::KernelGraph *graph, TensorsDescMap *ptensors,
Status Solving(const session::KernelGraph &graph, TensorsDescMap *ptensors,
const std::vector<DynamicBitSet> *pConstraints, const vector<vector<size_t>> &continuous_v,
bool bVerifySolution, // true -> Check continuous and non overlapping constraints solution
bool ball = true, // true -> run full set of heuristics, false -> run single heuristic specified
SortingType sorting = kGreaterSizeSmallerIndex, FittingType fitting = kBest,
AlgorithmType algorithm = kManyObjects);
void Log(const session::KernelGraph *graph, const TensorsDescMap &tensors,
void Log(const session::KernelGraph &graph, const TensorsDescMap &tensors,
const std::vector<DynamicBitSet> *pConstraints, const vector<vector<size_t>> &continuous_v) const;
Status CheckTensors(const TensorsDescMap *pTensors, uint32_t index1, uint32_t index2) const;
@ -201,11 +201,11 @@ class SomasSolverPre {
private:
size_t max_offset_;
void SolverInputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors,
void SolverInputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors,
const vector<vector<size_t>> &continuous_v) const;
void SolverOutputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors) const;
void SolverOutputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors) const;
vector<TensorsDescMap> CreateTensorsMaps(const TensorsDescMap &tensors, size_t total_sol) const;
void TensorRelationLog(const std::vector<DynamicBitSet> *pConstraints, const session::KernelGraph *graph) const;
void TensorRelationLog(const std::vector<DynamicBitSet> *pConstraints, const session::KernelGraph &graph) const;
};
using SomasSolverPrePtr = std::shared_ptr<SomasSolverPre>;
} // namespace somas

View File

@ -31,7 +31,7 @@ class SomasStream {
std::vector<SomasNodePtr> nodes_;
// Constructors/Destructors
explicit SomasStream(int64_t id) : id_(id) {}
explicit SomasStream(size_t id) : id_(id) {}
SomasStream(const SomasStream &) = delete;
SomasStream &operator=(const SomasStream &) = delete;
~SomasStream() = default;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -15,25 +15,35 @@
*/
#include "backend/common/somas/somas_tensor.h"
#include <map>
#include <string>
namespace mindspore {
namespace somas {
SomasTensor::SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t real_size,
LifeLongType lifelong_value)
: lifelong_value_(lifelong_value),
between_streams_(false),
std::map<somas::TensorType, std::string> tensor_type_name_map = {
{kCommon, "Common"}, {kWorkspace, "Workspace"},
{kOutputOnly, "OutputOnly"}, {kGraphOutput, "GraphOutput"},
{kGraphInput, "GraphInput"}, {kSummaryInput, "SummaryInput"},
{kUnion, "Union"}, {kControl, "Control"},
{kUnknown, "Unknown"}};
std::map<LifeLongType, std::string> life_long_name_map = {{kLifeLongNone, "LifeLongNone"},
{kLifeLongGraphAll, "LifeLongGraphAll"},
{kLifeLongGraphStart, "LifeLongGraphStart"},
{kLifeLongGraphEnd, "LifeLongGraphEnd"}};
SomasTensor::SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t ori_size,
size_t aligned_size, LifeLongType lifelong_value)
: aligned_size_(aligned_size),
lifelong_value_(lifelong_value),
contiguous_(false),
type_(kUnknown),
offset_(0),
num_constraints_(0),
ref_overlap_(false),
id_(id),
source_node_id_(source_node_id),
source_stream_id_(source_stream_id),
original_size_(real_size) {
const size_t alignment = 512;
const size_t alignment_complement = 31;
aligned_size_ = (real_size > 0) ? ((real_size + alignment + alignment_complement) / alignment) * alignment : 0;
original_size_(ori_size) {
solver_tensor_desc_ = std::make_shared<SomasSolverTensorDesc>(id_, aligned_size_, offset_, false);
}
@ -49,5 +59,9 @@ SomasSolverTensorDescPtr SomasTensor::GetSolverTensorDesc() {
return solver_tensor_desc_;
}
}
std::string SomasTensor::GetTypeString() { return tensor_type_name_map[type_]; }
std::string SomasTensor::GetLifelongString() { return life_long_name_map[lifelong_value_]; }
} // namespace somas
} // namespace mindspore

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -20,7 +20,7 @@
#include <memory>
#include <set>
#include <vector>
#include <string>
#include "utils/hash_map.h"
#include "backend/common/somas/somas_solver_pre.h"
@ -38,21 +38,21 @@ using lifetime_t = struct Lifetime;
// Tensor type
enum TensorType {
kCommon,
kOutputOnly,
kWorkspace,
kGetNextOutput,
kOutputOnly,
kGraphOutput,
kGraphInput,
kSummaryInput,
kRefNodeInput,
kRefNodeOutput,
kEventVirtualOutput,
kUnion,
kControl,
kUnknown
};
enum LifeLongType {
kLifeLongNone, // life time is from tensor start to tensor end
kLifeLongGraphAll, // life time is from graph start to graph end
kLifeLongGraphStart, // life time is from graph start to tensor end
kLifeLongGraphEnd // life time is from tensor start to graph end
kLifeLongGraphAll, // life time is from graph start to graph end
kLifeLongGraphStart, // life time is from graph start to tensor end
kLifeLongGraphEnd // life time is from tensor start to graph end
};
class SomasTensor {
@ -60,7 +60,6 @@ class SomasTensor {
size_t aligned_size_{0};
LifeLongType lifelong_value_;
bool between_streams_;
bool contiguous_;
lifetime_t lifetime_;
@ -72,7 +71,7 @@ class SomasTensor {
vector<size_t> consumer_list_;
// Constructors/Destructors
explicit SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t real_size,
explicit SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t ori_size, size_t aligned_size,
LifeLongType lifelong_value = kLifeLongNone);
SomasTensor(const SomasTensor &) = delete;
SomasTensor &operator=(const SomasTensor &) = delete;
@ -86,14 +85,12 @@ class SomasTensor {
const size_t &GetAlignedSize() const { return aligned_size_; }
const size_t &GetNumConstraints() const { return num_constraints_; }
bool IsLifelong() const { return lifelong_value_ == kLifeLongGraphAll; }
bool IsWorkspace() const { return type_ == kWorkspace; }
bool IsOutputOnly() const { return type_ == kOutputOnly; }
size_t GetOffset() const { return offset_; }
bool IsBetweenStreams() const { return between_streams_; }
bool IsSemiLifelongStart() const { return lifelong_value_ == kLifeLongGraphStart; }
bool IsSemiLifelongEnd() const { return lifelong_value_ == kLifeLongGraphEnd; }
bool IsRefOverlap() const { return ref_overlap_; }
string GetTypeString();
string GetLifelongString();
// Computing functions
void SetOffset() {
if (aligned_size_ != 0) {
@ -104,7 +101,6 @@ class SomasTensor {
size_t num_constraints_{0};
private:
bool ref_overlap_;
const size_t id_{0};
const size_t source_node_id_;
const size_t source_stream_id_;

View File

@ -607,8 +607,8 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) {
device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_});
MS_EXCEPTION_IF_NULL(device_context);
bool all_support = device_context->PartitionGraph(func_graph);
auto run_mode = device_context->GetRunMode(func_graph);
if (all_support) {
auto run_mode = device_context->GetRunMode(func_graph);
if (run_mode == device::RunMode::kGraphMode) {
auto graph_id = graph_compiler_->CompileWholeGraphForGraphRunMode(func_graph, device_context);
graph_id_to_device_context_[graph_id] = device_context;
@ -1384,9 +1384,15 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(con
std::vector<std::vector<int64_t> *> tensors_mask;
std::vector<std::vector<tensor::TensorPtr> *> input_tensors;
auto strategy = runtime::GraphExecutionStrategy::kPipeline;
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
if (context_ptr->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) {
strategy = runtime::GraphExecutionStrategy::kPipelineWithExecutionOrder;
}
return std::make_unique<GraphCompilerInfo>(graphs, device_contexts, tensors_mask, input_tensors, control_nodes_,
root_graph->parameters(), parser, outputs_order, outputs_num, name, false,
runtime::GraphExecutionStrategy::kPipeline);
strategy);
}
std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(

View File

@ -104,16 +104,6 @@ uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_m
return communication_mem ? alloc_address + kMemAlignSize : alloc_address;
}
void AscendMemoryManager::MallocSomasDynamicMem(const session::KernelGraph &graph) {
MemoryManager::MallocSomasDynamicMem(graph);
#ifndef ENABLE_SECURITY
if (MemoryProfiling::GetInstance().IsMemoryProfilingInitialized()) {
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
somas_reuse_util_ptr_->ConvertToProfilingNode(graph.graph_id());
}
#endif
}
// communication memory: [512align_size + data + 512align_size]
// return the pointer to the start of data address.
uint8_t *AscendMemoryManager::MallocCommunicationMemFromMemPool(size_t size) {

View File

@ -36,7 +36,6 @@ class AscendMemoryManager : public MemoryManager {
void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override;
void FreeMemFromMemPool(void *device_ptr) override;
uint64_t GetMsMaxMemSize() const;
void MallocSomasDynamicMem(const session::KernelGraph &graph) override;
uint8_t *MallocCommunicationMemFromMemPool(size_t size) override;
bool MallocContinuousMemFromMemPool(const DeviceAddressPtrList &addr_list, size_t total_size,
std::vector<size_t> size_list) override;

View File

@ -0,0 +1,229 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/ascend/hal/hardware/ascend_somas.h"
#include <string>
#include <map>
#include <utility>
#include <vector>
#include "backend/common/optimizer/helper.h"
#include "utils/ms_context.h"
#include "plugin/device/ascend/hal/device/ascend_stream_assign.h"
#include "plugin/device/ascend/hal/profiler/memory_profiling.h"
namespace mindspore {
namespace device {
namespace ascend {
using KernelGraph = session::KernelGraph;
using UnReuseType = somas::UnReuseType;
using TensorType = somas::TensorType;
using LifeLongType = somas::LifeLongType;
using mindspore::profiler::ascend::MemoryProfiling;
#ifndef ENABLE_SECURITY
// Export the solved somas layout of one graph to the Ascend memory profiler:
// each somas tensor (id, aligned size, type string, lifetime interval) and each
// somas node (split name, id, input/output/workspace tensor ids).
void AscendSomas::ConvertToProfilingNode(uint32_t graph_id) const {
  auto &profiling = MemoryProfiling::GetInstance();
  if (!profiling.IsMemoryProfilingInitialized()) {
    return;
  }
  auto graph_node = profiling.GetGraphMemoryNode(graph_id);
  if (graph_node == nullptr) {
    graph_node = profiling.AddGraphMemoryNode(graph_id);
    MS_LOG(INFO) << "Add graph memory node for dynamic memory profiling, graph id is " << graph_id;
  }
  // Record every somas tensor.
  for (const auto &somas_tensor : tensors_list_) {
    profiler::ascend::TensorMemory tensor_memory;
    tensor_memory.SetTensorId(somas_tensor->GetId());
    tensor_memory.SetAlignedSize(somas_tensor->GetAlignedSize());
    tensor_memory.SetType(somas_tensor->GetTypeString());
    tensor_memory.SetLifeStart(somas_tensor->lifetime_.start_);
    tensor_memory.SetLifeEnd(somas_tensor->lifetime_.end_);
    tensor_memory.SetLifeLong(somas_tensor->GetLifelongString());
    graph_node->AddTensorMemory(tensor_memory);
  }
  // Record every somas node together with the ids of the tensors it touches.
  for (const auto &somas_node : nodes_list_) {
    profiler::ascend::NodeMemory node_memory;
    node_memory.SetNodeName(GetSplitName(somas_node->scope_full_name_));
    node_memory.SetNodeId(somas_node->GetId());
    for (const auto &input_tensor : somas_node->input_tensors_) {
      node_memory.AddInputTensorId(input_tensor->GetId());
    }
    for (const auto &output_tensor : somas_node->output_tensors_) {
      node_memory.AddOutputTensorId(output_tensor->GetId());
    }
    for (const auto &workspace_tensor : somas_node->workspace_tensors_) {
      node_memory.AddWorkSpaceTensorId(workspace_tensor->GetId());
    }
    graph_node->AddNodeMemory(node_memory);
  }
}
#endif
// No Ascend-specific initialization is required.
bool AscendSomas::Initialize() { return true; }
std::string AscendSomas::GetDeviceName() const { return "Ascend"; }
// Communication kernels reserve a 512-byte gap around their buffers.
size_t AscendSomas::GetCommunicationReservedSize() const {
  constexpr size_t gap_size = 512;
  return gap_size;
}
// Round a tensor size up to a 512-byte boundary, adding a 31-byte headroom
// term before rounding (so sizes just under a boundary spill into the next
// block); zero stays zero, meaning no allocation.
// NOTE(review): the +31 headroom mirrors the legacy Ascend allocator's
// formula — confirm it is still intended for somas.
size_t AscendSomas::GetAlignSize(size_t original_size) const {
  constexpr size_t alignment = 512;
  constexpr size_t alignment_complement = 31;
  size_t aligned_size =
    (original_size > 0) ? ((original_size + alignment + alignment_complement) / alignment) * alignment : 0;
  return aligned_size;
}
// Whether somas must take the kernels' execution order into account when
// computing tensor lifetimes: true in task-sink mode or at memory optimize
// level O1.
bool AscendSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  const bool task_sink = ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
  const int opt_level = ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL);
  return task_sink || (opt_level == kOptimizeO1);
}
// Stream groups are taken from the global Ascend stream assigner.
std::vector<vector<uint32_t>> AscendSomas::GetStreamGroupInfo(const session::KernelGraph &graph) const {
  std::vector<vector<uint32_t>> stream_group;
  stream_group = device::ascend::AscendStreamAssign::GetInstance().get_stream_group();
  return stream_group;
}
// Mark GetNext outputs as not reusable: somas must not hand their memory to
// other tensors.
std::map<std::string, UnReuseType> AscendSomas::GetUnReuseNodeType(const session::KernelGraph &graph) const {
  std::map<std::string, UnReuseType> node_type;
  node_type[kGetNextOpName] = UnReuseType::kUnReuseOutput;
  return node_type;
}
// Ascend-specific control tensors are derived from Send/Recv event pairs.
bool AscendSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) {
  InitEventInfo(graph);
  return true;
}
// Rebuild event_map_ from the graph's Send/Recv kernels (paired by their
// kAttrEventId attribute), then add one somas control tensor from each Send to
// its matching Recv.
void AscendSomas::InitEventInfo(const session::KernelGraph &graph) {
  event_map_ = {};
  auto &kernels = graph.execution_order();
  for (const auto &kernel : kernels) {
    auto type = common::AnfAlgo::GetCNodeName(kernel);
    if (type == kSendOpName) {
      auto event = common::AnfAlgo::GetNodeAttr<uint32_t>(kernel, kAttrEventId);
      auto iter = event_map_.find(event);
      if (iter == event_map_.end()) {
        // First kernel seen for this event id: create the pair.
        auto pair = somas::EventPair();
        pair.send_ = kernel;
        event_map_[event] = pair;
      } else {
        iter->second.send_ = kernel;
      }
    } else if (type == kRecvOpName) {
      auto event = common::AnfAlgo::GetNodeAttr<uint32_t>(kernel, kAttrEventId);
      auto iter = event_map_.find(event);
      if (iter == event_map_.end()) {
        auto pair = somas::EventPair();
        pair.recv_ = kernel;
        event_map_[event] = pair;
      } else {
        iter->second.recv_ = kernel;
      }
    }
  }
  // Turn each completed Send/Recv pair into a somas control edge. Pairs whose
  // somas node was not created are skipped with a warning.
  for (auto &event : event_map_) {
    auto pair = event.second;
    auto send_iter = nodes_map_.find(pair.send_.get());
    if (send_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.send_->fullname_with_scope();
      continue;
    }
    auto recv_iter = nodes_map_.find(pair.recv_.get());
    if (recv_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.recv_->fullname_with_scope();
      continue;
    }
    auto &somas_send = send_iter->second.at(0);
    auto &somas_recv = recv_iter->second.at(0);
    AddControlTensor(somas_send, somas_recv);
  }
  MS_LOG(DEBUG) << "Somas InitEventInfo end.";
}
// Run the Ascend-only somas passes: pin independent-node outputs to the end of
// the graph and union the tensors of non-task split kernels.
bool AscendSomas::DevSpecNodeProcess(const session::KernelGraph &graph) {
  IndependentNodeOutputProcess(graph);
  NonTaskSplitProcess(graph);
  return true;
}
// Extend the lifetime of every independent node's output tensors to the end of
// the graph, and log the total aligned size this pins down.
void AscendSomas::IndependentNodeOutputProcess(const session::KernelGraph &graph) {
  size_t total_size = 0;
  for (const auto &kernel : graph.execution_order()) {
    if (!AnfAlgo::IsIndependentNode(kernel)) {
      continue;
    }
    auto node_iter = nodes_map_.find(kernel.get());
    if (node_iter == nodes_map_.end()) {
      continue;
    }
    auto &somas_node = node_iter->second.at(0);
    MS_EXCEPTION_IF_NULL(somas_node);
    for (auto &output_tensor : somas_node->output_tensors_) {
      MS_EXCEPTION_IF_NULL(output_tensor);
      total_size += output_tensor->GetAlignedSize();
      // Keep the output alive until the whole graph finishes.
      output_tensor->lifelong_value_ = LifeLongType::kLifeLongGraphEnd;
    }
  }
  MS_LOG(INFO) << "Special Tensor total size: Independent Node output " << total_size;
}
// For every non-task kernel, put its first input tensor and all of its output
// tensors into one union group so somas treats them as aliases of the same
// memory.
// Fixes vs. the previous version: GetCNodeName is no longer computed for
// kernels that are not non-task ops, and a kernel missing from nodes_map_ now
// raises a descriptive exception instead of operator[] silently inserting an
// empty vector and .at(0) throwing an opaque std::out_of_range (this matches
// the error handling of GPUSomas::InplaceNodeProcess).
void AscendSomas::NonTaskSplitProcess(const session::KernelGraph &graph) {
  auto &kernel_cnodes = graph.execution_order();
  for (const auto &kernel : kernel_cnodes) {
    if (!common::AnfAlgo::IsNonTaskOp(kernel)) {
      continue;
    }
    auto iter = nodes_map_.find(kernel.get());
    if (iter == nodes_map_.end()) {
      MS_LOG(EXCEPTION) << "Can't find somas node for non-task node " << kernel->fullname_with_scope();
    }
    auto node = iter->second.at(0);
    MS_EXCEPTION_IF_NULL(node);
    auto op_name = common::AnfAlgo::GetCNodeName(kernel);
    if (node->input_tensors_.empty()) {
      MS_LOG(EXCEPTION) << op_name << " has no input tensor, can not do split non_task process.";
    }
    // Union the first input with every output of the kernel.
    std::vector<size_t> refnode_input_output;
    auto input_tensor = node->input_tensors_[0];
    MS_EXCEPTION_IF_NULL(input_tensor);
    input_tensor->type_ = TensorType::kUnion;
    refnode_input_output.push_back(input_tensor->GetId());
    for (auto &output_tensor : node->output_tensors_) {
      MS_EXCEPTION_IF_NULL(output_tensor);
      output_tensor->type_ = TensorType::kUnion;
      refnode_input_output.push_back(output_tensor->GetId());
    }
    union_tensors_list_.push_back(refnode_input_output);
  }
}
} // namespace ascend
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,61 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_
#include <vector>
#include <string>
#include <map>
#include <utility>
#include <memory>
#include "backend/common/somas/somas.h"
#include "runtime/hardware/device_type.h"
namespace mindspore {
namespace device {
namespace ascend {
using KernelGraph = session::KernelGraph;
using UnReuseType = somas::UnReuseType;
// Ascend specialization of somas: supplies the device's alignment and
// communication-gap sizes, stream-group info, Send/Recv event control edges,
// and Ascend-only passes for independent nodes and non-task split kernels.
class AscendSomas : public somas::Somas {
 public:
#ifndef ENABLE_SECURITY
  // Export the solved tensor/node layout to the Ascend memory profiler.
  void ConvertToProfilingNode(uint32_t graph_id) const override;
#endif
 private:
  bool Initialize() override;
  string GetDeviceName() const override;
  size_t GetCommunicationReservedSize() const override;
  size_t GetAlignSize(size_t original_size) const override;
  bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;
  std::vector<vector<uint32_t>> GetStreamGroupInfo(const session::KernelGraph &graph) const override;
  std::map<std::string, UnReuseType> GetUnReuseNodeType(const session::KernelGraph &graph) const override;
  bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
  bool DevSpecNodeProcess(const session::KernelGraph &graph) override;
  // Pair Send/Recv kernels by their event-id attribute and add control edges.
  void InitEventInfo(const session::KernelGraph &graph);
  // Pin independent-node outputs until graph end.
  void IndependentNodeOutputProcess(const session::KernelGraph &graph);
  // Union the tensors of non-task split kernels.
  void NonTaskSplitProcess(const session::KernelGraph &graph);
  // Event id -> its matched Send/Recv kernel pair.
  std::map<uint32_t, somas::EventPair> event_map_;
};
REG_SOMAS(Ascend, DeviceType::kAscend, AscendSomas)
} // namespace ascend
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/cpu/hal/hardware/cpu_somas.h"
#include <string>
#include "utils/ms_context.h"
namespace mindspore {
namespace device {
namespace cpu {
// No CPU-specific initialization is required.
bool CPUSomas::Initialize() { return true; }
std::string CPUSomas::GetDeviceName() const { return "CPU"; }
// Round a tensor size up to a 512-byte boundary; zero stays zero (no allocation).
size_t CPUSomas::GetAlignSize(size_t original_size) const {
  constexpr size_t alignment = 512;
  size_t aligned_size = (original_size > 0) ? ((original_size + alignment - 1) / alignment) * alignment : 0;
  return aligned_size;
}
// CPU imposes no execution-order dependence on tensor lifetimes.
bool CPUSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const { return false; }
// No device-specific control tensors or extra passes on CPU.
bool CPUSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) { return true; }
bool CPUSomas::DevSpecNodeProcess(const session::KernelGraph &graph) { return true; }
} // namespace cpu
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_plugin_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H__
#define MINDSPORE_CCSRC_plugin_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H__
#include <string>
#include "backend/common/somas/somas.h"
#include "runtime/hardware/device_type.h"
namespace mindspore {
namespace device {
namespace cpu {
using KernelGraph = session::KernelGraph;
// CPU specialization of somas: plain 512-byte alignment, no execution-order
// constraint and no device-specific passes.
class CPUSomas : public somas::Somas {
 private:
  bool Initialize() override;
  string GetDeviceName() const override;
  size_t GetAlignSize(size_t original_size) const override;
  bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;
  bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
  bool DevSpecNodeProcess(const session::KernelGraph &graph) override;
};
REG_SOMAS(CPU, DeviceType::kCPU, CPUSomas)
} // namespace cpu
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_plugin_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H__

View File

@ -25,6 +25,7 @@
#include "plugin/device/gpu/hal/device/gpu_stream_assign.h"
#include "plugin/device/gpu/hal/device/distribution/collective_init.h"
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
#include "plugin/device/gpu/hal/hardware/gpu_somas.h"
#include "runtime/data_queue/data_queue_mgr.h"
#include "kernel/common_utils.h"
#include "plugin/device/gpu/hal/device/gpu_common.h"
@ -40,6 +41,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "backend/common/optimizer/common_backend_optimization.h"
#include "backend/common/optimizer/dynamic_shape/dynamic_shape_helper.h"
#include "include/common/debug/anf_ir_dump.h"
#ifdef ENABLE_DUMP_IR
#include "include/common/debug/rdr/recorder_manager.h"
#include "debug/rdr/mem_address_recorder.h"
@ -258,6 +260,25 @@ DeviceAddressPtr GPUDeviceResManager::CreateDeviceAddress(void *const device_ptr
return device_address;
}
// Graph-level preparation before launch: at memory optimize level O1, run GPU
// somas static memory planning over the kernel graph.
void GPUKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto kernel_graph = graph->cast<KernelGraphPtr>();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) {
    auto somas = std::make_shared<GPUSomas>();
    bool ret = somas->Assign(kernel_graph);
    if (ret) {
      MS_LOG(INFO) << "Somas allocate success for graph " << kernel_graph->graph_id()
                   << " somas size: " << kernel_graph->somas_whole_block_size();
    } else {
      // Planning failure is non-fatal here; only a warning is emitted —
      // presumably allocation falls back to the regular path (confirm).
      MS_LOG(WARNING) << "Somas allocate failed for graph " << kernel_graph->graph_id();
    }
  }
  MS_LOG(INFO) << "Status record: end preprocess before run graph. graph id: " << kernel_graph->graph_id();
}
void GPUKernelExecutor::OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
// Operator fusion optimization.

View File

@ -82,6 +82,8 @@ class GPUKernelExecutor : public DeprecatedKernelExecutor {
void CreateKernel(const std::vector<CNodePtr> &nodes) const override;
void PreprocessBeforeRun(const FuncGraphPtr &graph) const override;
bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs) const override;

View File

@ -0,0 +1,141 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/gpu/hal/hardware/gpu_somas.h"
#include <string>
#include <vector>
#include "backend/common/optimizer/helper.h"
#include "utils/ms_context.h"
namespace mindspore {
namespace device {
namespace gpu {
// No GPU-specific initialization is required.
bool GPUSomas::Initialize() { return true; }
std::string GPUSomas::GetDeviceName() const { return "GPU"; }
// Round a tensor size up to a 512-byte boundary; zero stays zero (no allocation).
size_t GPUSomas::GetAlignSize(size_t original_size) const {
  constexpr size_t alignment = 512;
  size_t aligned_size = (original_size > 0) ? ((original_size + alignment - 1) / alignment) * alignment : 0;
  return aligned_size;
}
// Somas honours the kernels' execution order only at memory optimize level O1.
bool GPUSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  return context_ptr->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1;
}
// GPU-specific control tensors are derived from Send/Recv event pairs.
bool GPUSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) {
  InitEventInfo(graph);
  return true;
}
// Rebuild event_map_ from the graph's Send/Recv kernels. Unlike Ascend, the key
// is the event handle itself: Send carries it in kAttrRecordEvent, Recv in
// kAttrWaitEvent. Each completed pair becomes one somas control tensor from the
// Send to the matching Recv.
void GPUSomas::InitEventInfo(const session::KernelGraph &graph) {
  event_map_ = {};
  auto &kernels = graph.execution_order();
  for (const auto &kernel : kernels) {
    auto type = common::AnfAlgo::GetCNodeName(kernel);
    if (type == kSendOpName) {
      auto event = common::AnfAlgo::GetNodeAttr<uintptr_t>(kernel, kAttrRecordEvent);
      auto iter = event_map_.find(event);
      if (iter == event_map_.end()) {
        // First kernel seen for this event handle: create the pair.
        auto pair = somas::EventPair();
        pair.send_ = kernel;
        event_map_[event] = pair;
      } else {
        iter->second.send_ = kernel;
      }
    } else if (type == kRecvOpName) {
      auto event = common::AnfAlgo::GetNodeAttr<uintptr_t>(kernel, kAttrWaitEvent);
      auto iter = event_map_.find(event);
      if (iter == event_map_.end()) {
        auto pair = somas::EventPair();
        pair.recv_ = kernel;
        event_map_[event] = pair;
      } else {
        iter->second.recv_ = kernel;
      }
    }
  }
  // Pairs whose somas node is missing are skipped with a warning.
  for (auto &event : event_map_) {
    auto pair = event.second;
    auto send_iter = nodes_map_.find(pair.send_.get());
    if (send_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.send_->fullname_with_scope();
      continue;
    }
    auto recv_iter = nodes_map_.find(pair.recv_.get());
    if (recv_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.recv_->fullname_with_scope();
      continue;
    }
    auto &somas_send = send_iter->second.at(0);
    auto &somas_recv = recv_iter->second.at(0);
    AddControlTensor(somas_send, somas_recv);
  }
  MS_LOG(DEBUG) << "Somas InitEventInfo end.";
}
// The only GPU-specific somas pass is inplace-node handling.
bool GPUSomas::DevSpecNodeProcess(const session::KernelGraph &graph) { return InplaceNodeProcess(graph); }
// For every inplace node (as reported by IsInplaceNode(kernel, "skip")), merge
// all of its input and output tensors into a single union group so somas
// assigns them the same memory. A tensor may belong to at most one union
// group; a conflict or a missing somas node is a fatal error.
bool GPUSomas::InplaceNodeProcess(const session::KernelGraph &graph) {
  auto &kernels = graph.execution_order();
  for (auto &kernel : kernels) {
    if (!common::AnfAlgo::IsInplaceNode(kernel, "skip")) {
      continue;
    }
    auto iter = nodes_map_.find(kernel.get());
    if (iter != nodes_map_.end()) {
      auto &node = iter->second.at(0);
      MS_EXCEPTION_IF_NULL(node);
      auto input_tensors = node->input_tensors_;
      auto output_tensors = node->output_tensors_;
      std::vector<somas::SomasTensorPtr> union_tensors;
      union_tensors.insert(union_tensors.end(), input_tensors.begin(), input_tensors.end());
      union_tensors.insert(union_tensors.end(), output_tensors.begin(), output_tensors.end());
      // check whether the union tensor already in other union tensors
      for (auto &tensor : union_tensors) {
        auto tensor_id = tensor->GetId();
        for (auto &union_list : union_tensors_list_) {
          if (std::count(union_list.begin(), union_list.end(), tensor_id)) {
            MS_LOG(EXCEPTION) << "Inplace node union Tensor " << tensor_id << " already in other union tensor list.";
          }
        }
      }
      // Mark every tensor as kUnion and record the group.
      std::vector<size_t> inplace_union_tensor_list;
      for (auto &tensor : union_tensors) {
        tensor->type_ = somas::kUnion;
        inplace_union_tensor_list.push_back(tensor->GetId());
      }
      union_tensors_list_.push_back(inplace_union_tensor_list);
    } else {
      MS_LOG(EXCEPTION) << "Can't find somas node for inplace node " << kernel->fullname_with_scope();
    }
  }
  return true;
}
} // namespace gpu
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,48 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_plugin_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H__
#define MINDSPORE_CCSRC_plugin_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H__
#include <map>
#include <string>
#include "backend/common/somas/somas.h"
#include "runtime/hardware/device_type.h"
namespace mindspore {
namespace device {
namespace gpu {
using KernelGraph = session::KernelGraph;
// GPU specialization of somas: 512-byte alignment, Send/Recv event control
// edges (keyed by the record/wait event handles) and union grouping of
// inplace-node tensors.
class GPUSomas : public somas::Somas {
 private:
  bool Initialize() override;
  string GetDeviceName() const override;
  size_t GetAlignSize(size_t original_size) const override;
  bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;
  bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
  bool DevSpecNodeProcess(const session::KernelGraph &graph) override;
  // Merge inplace-node inputs/outputs into union tensor groups.
  bool InplaceNodeProcess(const session::KernelGraph &graph);
  // Pair Send/Recv kernels by event handle and add control edges.
  void InitEventInfo(const session::KernelGraph &graph);
  // Event handle -> its matched Send/Recv kernel pair.
  std::map<uintptr_t, somas::EventPair> event_map_;
};
REG_SOMAS(GPU, DeviceType::kGPU, GPUSomas)
} // namespace gpu
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_plugin_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H__

View File

@ -101,7 +101,8 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) {
.value("graph_kernel_flags", MsCtxParam::MS_CTX_GRAPH_KERNEL_FLAGS)
.value("grad_for_scalar", MsCtxParam::MS_CTX_GRAD_FOR_SCALAR)
.value("pynative_synchronize", MsCtxParam::MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE)
.value("disable_format_transform", MsCtxParam::MS_CTX_DISABLE_FORMAT_TRANSFORM);
.value("disable_format_transform", MsCtxParam::MS_CTX_DISABLE_FORMAT_TRANSFORM)
.value("memory_optimize_level", MsCtxParam::MS_CTX_MEMORY_OPTIMIZE_LEVEL);
(void)py::class_<mindspore::MsContext, std::shared_ptr<mindspore::MsContext>>(*m, "MSContext")
.def_static("get_instance", &mindspore::MsContext::GetInstance, "Get ms context instance.")
.def("get_param", &mindspore::MsCtxGetParameter, "Get value of specified parameter.")

View File

@ -3,6 +3,7 @@ file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*
"memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" "memory_scheduler.cc"
"memory_offload_strategy.cc" "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "tensor_array.cc"
"ms_device_shape_transfer.cc" "context_extends.cc" "stream_synchronizer.cc" "tensors_queue.cc" "auto_mem_offload.cc"
"common_somas_allocator.cc"
)
if("${ENABLE_HIDDEN}" STREQUAL "OFF")

View File

@ -0,0 +1,86 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/common_somas_allocator.h"
#include <utility>
#include <string>
#include "backend/common/optimizer/helper.h"
#include "utils/ms_context.h"
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/string_recorder.h"
#endif
namespace mindspore {
namespace device {
// Run somas planning for `graph`, dispatching to the somas implementation
// registered for the current device target (Ascend/GPU, anything else falls
// back to CPU). On success the allocation layout is recorded to RDR and, when
// security hardening is off, exported to the memory profiler.
bool CommonSomasAllocator::Assign(const session::KernelGraph &graph) {
  somas::SomasPtr somas_ptr{nullptr};
  if (GetTargetFromContext() == kAscendDevice) {
    somas_ptr = somas::SomasManager::Instance().GetSomas(DeviceType::kAscend);
  } else if (GetTargetFromContext() == kGPUDevice) {
    somas_ptr = somas::SomasManager::Instance().GetSomas(DeviceType::kGPU);
  } else {
    somas_ptr = somas::SomasManager::Instance().GetSomas(DeviceType::kCPU);
  }
  MS_EXCEPTION_IF_NULL(somas_ptr);
  bool ret = somas_ptr->Assign(graph);
  if (ret) {
#ifdef ENABLE_DUMP_IR
    SubModuleId module = SubModuleId::SM_OPTIMIZER;
    std::string name = "somas_allocate_info." + std::to_string(graph.graph_id());
    (void)mindspore::RDR::RecordString(module, name, somas_ptr->SomasInfo());
#endif
#ifndef ENABLE_SECURITY
    somas_ptr->ConvertToProfilingNode(graph.graph_id());
#endif
  }
  return ret;
}
// Resolve the somas-planned device address of `node`'s index-th output.
// Returns nullptr when somas assigned no memory for it (aligned size 0).
uint8_t *CommonSomasAllocator::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const {
  MS_EXCEPTION_IF_NULL(node);
  auto kernel_info = dynamic_cast<KernelInfo *>(node->kernel_info());
  MS_EXCEPTION_IF_NULL(kernel_info);
  const auto &output_list = kernel_info->somas_output_offset_aligned_size_list();
  if (index >= output_list.size()) {
    MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:[" << output_list.size() << "]";
  }
  const auto &[offset, aligned_size] = output_list[index];
  if (aligned_size == 0) {
    return nullptr;
  }
  return mem_base_addr_ + offset;
}
// Resolve the somas-planned device address of `node`'s index-th workspace.
// Returns nullptr when somas assigned no memory for it (aligned size 0).
// Fix: the out-of-range message previously said "output size" although it
// reports the workspace list (copy-paste from GetNodeOutputPtr).
uint8_t *CommonSomasAllocator::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const {
  MS_EXCEPTION_IF_NULL(node);
  auto kernel_info = dynamic_cast<KernelInfo *>(node->kernel_info());
  MS_EXCEPTION_IF_NULL(kernel_info);
  if (index >= kernel_info->somas_workspace_offset_aligned_size_list().size()) {
    MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's workspace size:["
                      << kernel_info->somas_workspace_offset_aligned_size_list().size() << "]";
  }
  auto somas_offset_aligned_size = kernel_info->somas_workspace_offset_aligned_size_list()[index];
  if (somas_offset_aligned_size.second == 0) {
    return nullptr;
  }
  auto somas_offset = somas_offset_aligned_size.first;
  uint8_t *ptr = mem_base_addr_ + somas_offset;
  return ptr;
}
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,50 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H
#include <vector>
#include <string>
#include <map>
#include <utility>
#include <memory>
#include "backend/common/somas/somas.h"
#include "runtime/hardware/device_type.h"
#include "utils/ms_context.h"
namespace mindspore {
namespace device {
// Bridges somas planning results to real device addresses: Assign() runs the
// device-specific somas planner over a graph, and the GetNode*Ptr helpers
// translate each kernel's (offset, aligned_size) result against mem_base_addr_.
class CommonSomasAllocator {
 public:
  // Base address of the contiguous somas memory block; all somas offsets are
  // relative to it.
  void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; }
  static bool Assign(const session::KernelGraph &graph);
  // Planned address of node's index-th output; nullptr when somas assigned
  // no memory for it.
  uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
  // Planned address of node's index-th workspace; nullptr when somas assigned
  // no memory for it.
  uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;
 private:
  // Memory base addr
  uint8_t *mem_base_addr_{nullptr};
  // Device target string ("Ascend"/"GPU"/...) from the global MsContext.
  static std::string GetTargetFromContext() {
    auto context_ptr = MsContext::GetInstance();
    MS_EXCEPTION_IF_NULL(context_ptr);
    return context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  }
};
using CommonSomasAllocatorPtr = std::shared_ptr<CommonSomasAllocator>;
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H

View File

@ -15,6 +15,7 @@
*/
#include "runtime/device/kernel_info.h"
#include <utility>
namespace mindspore {
namespace device {
@ -108,6 +109,13 @@ bool KernelInfo::SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t
return true;
}
// Record the somas planning result for this kernel: one (offset, aligned_size)
// pair per output and per workspace. An aligned_size of 0 means somas assigned
// no memory for that slot.
bool KernelInfo::SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
                                std::vector<std::pair<size_t, size_t>> &&workspace_somas_result) {
  somas_output_result_ = std::move(output_somas_result);
  somas_workspace_result_ = std::move(workspace_somas_result);
  return true;
}
void KernelInfo::set_kernel_mod(const kernel::KernelModPtr &kernel_mod) { kernel_mod_ = kernel_mod; }
kernel::KernelMod *KernelInfo::MutableKernelMod() const { return kernel_mod_.get(); }

View File

@ -19,6 +19,7 @@
#include <vector>
#include <memory>
#include <utility>
#include "ir/kernel_info_dev.h"
#include "kernel/kernel_build_info.h"
#include "kernel/kernel.h"
@ -57,6 +58,8 @@ class KernelInfo : public KernelInfoDevice {
DeviceAddressPtr GetMutableWorkspaceAddr(size_t index) const;
bool WorkspaceAddrExist(size_t index) const;
bool SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t index);
bool SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
std::vector<std::pair<size_t, size_t>> &&workspace_somas_result);
void set_kernel_mod(const kernel::KernelModPtr &kernel_mod);
kernel::KernelMod *MutableKernelMod() const;
const kernel::KernelMod *kernel_mod() const;
@ -70,6 +73,12 @@ class KernelInfo : public KernelInfoDevice {
uint32_t graph_id() const { return graph_id_; }
bool operator==(const KernelInfo &other) const;
bool is_feature_map() const { return is_feature_map_; }
const std::vector<std::pair<size_t, size_t>> &somas_output_offset_aligned_size_list() const {
return somas_output_result_;
}
const std::vector<std::pair<size_t, size_t>> &somas_workspace_offset_aligned_size_list() const {
return somas_workspace_result_;
}
const std::vector<std::shared_ptr<DeviceAddress>> &output_address_list() const { return output_address_list_; }
const std::vector<std::shared_ptr<DeviceAddress>> &workspace_address_list() const { return workspace_address_list_; }
@ -83,6 +92,12 @@ class KernelInfo : public KernelInfoDevice {
kernel::KernelBuildInfoPtr select_kernel_build_info_;
std::vector<std::shared_ptr<DeviceAddress>> output_address_list_;
std::vector<std::shared_ptr<DeviceAddress>> workspace_address_list_;
// pair<size_t, size_t> : (offset, aligned_size)
// aligned_size of 0 means no memory allocation
std::vector<std::pair<size_t, size_t>> somas_output_result_;
// pair<size_t, size_t> : (offset, aligned_size)
// aligned_size of 0 means no memory allocation
std::vector<std::pair<size_t, size_t>> somas_workspace_result_;
kernel::KernelModPtr kernel_mod_;
// stream_id_ is the index of stream object vector
uint32_t stream_id_;

View File

@ -985,7 +985,12 @@ void KernelRuntime::AssignNodeOutputMem(MemType type, const AnfNodePtr &node, in
auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type, {node, i});
MS_EXCEPTION_IF_NULL(device_address);
uint8_t *ptr = mem_manager_->MallocOutputMem(node, i, type, output_sizes[i], device_address, false);
MS_EXCEPTION_IF_NULL(ptr);
if (ptr == nullptr && type == kSomasReuseDynamicMem) {
MS_LOG(INFO) << "node: " << node->fullname_with_scope() << " could be a RefNode, please check it"
<< " output index: " << i << " memory type: " << type;
} else {
MS_EXCEPTION_IF_NULL(ptr);
}
device_address->set_host_shape(trans::GetRuntimePaddingShape(node, i));
AnfAlgo::SetOutputAddr(device_address, i, node.get());
}

View File

@ -18,10 +18,6 @@
#include <string>
#include "backend/common/session/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "include/common/debug/common.h"
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/string_recorder.h"
#endif
#include "utils/ms_context.h"
namespace mindspore {
@ -37,41 +33,21 @@ size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) {
}
// Allocate somas-reused dynamic memory for the whole graph: let the somas
// allocator assign per-tensor offsets, then back all of them with a single
// contiguous dynamic block whose base address is handed to the allocator.
// Note: the shown text interleaved the pre-commit somas_reuse_util_ptr_ path
// with the new allocator path; only the allocator path is kept here.
void MemoryManager::MallocSomasDynamicMem(const session::KernelGraph &graph) {
  SomasAllocatorPtr somas_allocator_ptr = std::make_shared<device::CommonSomasAllocator>();
  MS_EXCEPTION_IF_NULL(somas_allocator_ptr);
  somas_allocator_ptr_ = somas_allocator_ptr;
  if (!(somas_allocator_ptr->Assign(graph))) {
    MS_LOG(EXCEPTION) << "Somas Allocate Failed.";
  }
  // whole_block_size is 0 when somas did not allocate memory for this graph.
  size_t total_allocated_size = graph.somas_whole_block_size();
  MS_LOG(INFO) << "Graph " << graph.graph_id() << ": TotalSomasReuseDynamicSize [" << total_allocated_size << "]";
  if (total_allocated_size > 0) {
    auto base_ptr = MallocDynamicMem(total_allocated_size, false);
    MS_LOG(INFO) << "Somas Reuse Memory Base Address [" << static_cast<void *>(base_ptr) << "], End Address ["
                 << static_cast<void *>(base_ptr + total_allocated_size) << "]";
    somas_allocator_ptr->set_mem_base_addr(base_ptr);
  }
}
@ -94,8 +70,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
address->communication_ptr_ = ptr - kMemAlignSize;
}
} else if (type == kSomasReuseDynamicMem) {
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index);
MS_EXCEPTION_IF_NULL(somas_allocator_ptr_);
ptr = somas_allocator_ptr_->GetNodeOutputPtr(node, index);
} else {
ptr = MallocDynamicMem(size, communication_mem);
}
@ -109,8 +85,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
} else if (type == kDynamicMem) {
ptr = MallocDynamicMem(size, false);
} else if (type == kSomasReuseDynamicMem) {
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index);
MS_EXCEPTION_IF_NULL(somas_allocator_ptr_);
ptr = somas_allocator_ptr_->GetNodeOutputPtr(node, index);
}
address->ptr_ = ptr;
return ptr;
@ -118,8 +94,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
// Return the workspace pointer for kernel `node` at `index`.
// Somas-managed requests are resolved through the somas allocator (which must
// already be initialized by MallocSomasDynamicMem); every other request falls
// back to a fresh dynamic allocation of `size` bytes.
// The stale somas_reuse_util_ptr_ lines (pre-commit residue duplicated in the
// diff) are removed: only the allocator path remains.
uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, MemType type, size_t size) {
  if (type == kSomasReuseDynamicMem) {
    MS_EXCEPTION_IF_NULL(somas_allocator_ptr_);
    return somas_allocator_ptr_->GetNodeWorkSpacePtr(node, index);
  }
  return MallocDynamicMem(size, false);
}

View File

@ -22,14 +22,15 @@
#include <map>
#include <queue>
#include "common/mem_reuse/mem_reuse.h"
#include "backend/common/somas/somas.h"
#include "runtime/device/common_somas_allocator.h"
namespace mindspore {
namespace device {
enum MemType { kStaticMem, kDynamicMem, kSomasReuseDynamicMem };
constexpr int kGetAllOuts = -1;
constexpr uint64_t kMemAlignSize = 512;
constexpr uint64_t kTwiceMemAlignSize = kMemAlignSize << 1;
using SomasPtr = mindspore::somas::SomasPtr;
using SomasAllocatorPtr = mindspore::device::CommonSomasAllocatorPtr;
class MemoryManager {
public:
@ -80,7 +81,7 @@ class MemoryManager {
return MallocStaticMem(size, communication_mem, kInvalidGraphId);
}
virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem);
SomasPtr somas_reuse_util_ptr_{nullptr};
SomasAllocatorPtr somas_allocator_ptr_{nullptr};
};
} // namespace device
} // namespace mindspore

View File

@ -81,6 +81,10 @@ void MemoryManagerActor::AllocateContinuousMemory(const std::vector<std::vector<
auto &size_list = (*size_list_list)[i];
auto &device_context = (*device_contexts)[i];
MS_EXCEPTION_IF_NULL(device_context);
// if the address of continuous tensor has already been allocated, skip the tensor
if (alloc_list[0]->GetPtr() != nullptr) {
continue;
}
// Allocate memory through the device context.
device::DynamicMemAllocatorDebugInfo::SetDebugInfo(from_aid.Name(), device::AllocatorType::kKernelOutput);
auto dev_ptr_list = device_context->device_res_manager_->AllocateContinuousMemory(size_list);

View File

@ -102,6 +102,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
set_param<bool>(MS_CTX_ENABLE_RECOVERY, false);
set_param<bool>(MS_CTX_ENABLE_GE_HETEROGENOUS, false);
set_param<bool>(MS_CTX_DISABLE_FORMAT_TRANSFORM, false);
set_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO0);
uint32_t kDefaultRuntimeNumThreads = 30;
uint32_t cpu_core_num = std::thread::hardware_concurrency() - 1;

View File

@ -55,6 +55,8 @@ const char kGpuInferenceDevice[] = "GpuInference";
const char kDavinciDevice[] = "Davinci";
const char KNpuLog[] = "_npu_log";
const unsigned int MAX_CALL_DEPTH_DEFAULT = 1000;
const int kOptimizeO0 = 0;
const int kOptimizeO1 = 1;
const std::set<std::string> kTargetSet = {kCPUDevice, kGPUDevice, kAscendDevice, kDavinciDevice};
// The default max available device memory is 1024GB.
@ -98,6 +100,7 @@ enum MsCtxParam : unsigned {
// parameter of type int
MS_CTX_TYPE_INT_BEGIN = MS_CTX_TYPE_BOOL_END,
MS_CTX_EXECUTION_MODE = MS_CTX_TYPE_INT_BEGIN,
MS_CTX_MEMORY_OPTIMIZE_LEVEL,
MS_CTX_TYPE_INT_END,
// parameter of type uint32

View File

@ -98,7 +98,6 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
${CCSRC_DIR}/backend/common/somas/somas_solver_alg.cc
${CCSRC_DIR}/backend/graph_compiler/graph_partition.cc
${CMAKE_CURRENT_SOURCE_DIR}/mock/segment_runner.cc
${CCSRC_DIR}/runtime/device/auto_mem_offload.cc
${CCSRC_DIR}/runtime/device/ms_device_shape_transfer.cc
${CCSRC_DIR}/runtime/device/kernel_info.cc
${CCSRC_DIR}/runtime/device/convert_tensor_utils.cc
@ -109,6 +108,7 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
${CCSRC_DIR}/runtime/device/memory_offload_strategy.cc
${CCSRC_DIR}/runtime/device/memory_manager.cc
${CCSRC_DIR}/runtime/device/auto_mem_offload.cc
${CCSRC_DIR}/runtime/device/common_somas_allocator.cc
${CCSRC_DIR}/runtime/pynative/op_executor.cc
${CCSRC_DIR}/runtime/pynative/op_runtime_info.cc
${CCSRC_DIR}/runtime/hardware/device_type.cc
@ -117,6 +117,8 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
${CCSRC_DIR}/kernel/kernel.cc
${CCSRC_DIR}/kernel/kash/kernel_pack.cc
${CCSRC_DIR}/kernel/oplib/oplib.cc
${CCSRC_DIR}/common/debug/anf_dump_utils.cc
${CCSRC_DIR}/common/debug/anf_ir_dump.cc
${CCSRC_DIR}/common/debug/common.cc
${CCSRC_DIR}/common/debug/env_config_parser.cc
${CCSRC_DIR}/common/thread_pool.cc

View File

@ -197,6 +197,22 @@ class _Context:
f"or context.PYNATIVE_MODE (1), but got {mode}.")
self.set_param(ms_ctx_param.mode, mode)
def set_memory_optimize_level(self, memory_optimize_level):
    """
    Set the memory optimize level, support "O0", "O1".

    Args:
        memory_optimize_level (str): Either "O0" or "O1". "O1" enables the
            memory-reuse optimization (used by the somas GPU tests) —
            confirm exact backend semantics against the context docs.

    Raises:
        ValueError: If `memory_optimize_level` is not "O0" or "O1".
    """
    memory_optimize_levels = ["O0", "O1"]
    if memory_optimize_level not in memory_optimize_levels:
        raise ValueError(f"For 'context.set_context', the argument 'memory_optimize_level' must be one of "
                         f"{memory_optimize_levels}, but got {memory_optimize_level}.")
    # "O0" -> 0, "O1" -> 1; index() replaces the original if/else mapping.
    self.set_param(ms_ctx_param.memory_optimize_level, memory_optimize_levels.index(memory_optimize_level))
def set_backend_policy(self, policy):
success = self._context_handle.set_backend_policy(policy)
if not success:
@ -353,7 +369,8 @@ class _Context:
'mempool_block_size': set_mempool_block_size,
'print_file_path': set_print_file_path,
'env_config_path': set_env_config_path,
'runtime_num_threads': set_runtime_num_threads
'runtime_num_threads': set_runtime_num_threads,
'memory_optimize_level': set_memory_optimize_level
}
@property

View File

@ -87,3 +87,30 @@ def test_trainTensor(num_classes=10, epoch=15, batch_size=32):
loss = train_network(data, label).asnumpy()
losses.append(loss)
assert losses[-1] < 0.01
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_train_tensor_memory_opt(num_classes=10, epoch=15, batch_size=32):
    """
    Feature: Somas GPU kernel by kernel.
    Description: Train AlexNet with somas memory reuse enabled via memory_optimize_level='O1'.
    Expectation: Loss converges below 0.01 and no exception is raised.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    net = AlexNet(num_classes)
    trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
    optimizer = Momentum(trainable_params, 0.1, 0.9, weight_decay=0.0001)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    train_network = TrainOneStepCell(WithLossCell(net, criterion), optimizer)
    train_network.set_train()
    loss_history = []
    for _ in range(epoch):
        data = Tensor(np.ones([batch_size, 3, 227, 227]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([batch_size]).astype(np.int32))
        loss_history.append(train_network(data, label).asnumpy())
    assert loss_history[-1] < 0.01

View File

@ -150,6 +150,35 @@ def test_train_lenet():
assert losses[-1] < 0.01
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_train_lenet_memory_opt():
    """
    Feature: Somas GPU kernel by kernel.
    Description: Train LeNet with somas memory reuse enabled via memory_optimize_level='O1'.
    Expectation: Loss converges below 0.01 and no exception is raised.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    epoch = 100
    net = LeNet()
    lr_schedule = multisteplr(epoch, 30)
    trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
    optimizer = Momentum(trainable_params, lr_schedule, 0.9)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    train_network = TrainOneStepCell(WithLossCell(net, criterion), optimizer)
    train_network.set_train()
    loss_history = []
    for _ in range(epoch):
        data = Tensor(np.ones([net.batch_size, 3, 32, 32]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([net.batch_size]).astype(np.int32))
        loss_history.append(train_network(data, label).asnumpy())
    assert loss_history[-1] < 0.01
def create_dataset(data_path, batch_size=32, repeat_size=1,
num_parallel_workers=1):
"""

View File

@ -142,3 +142,48 @@ def test_LSTM():
losses.append(loss)
print("loss:", loss.asnumpy())
assert (losses[-1].asnumpy() < 0.01)
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lstm_memory_opt():
    """
    Feature: Somas GPU kernel by kernel.
    Description: Train a bidirectional sentiment LSTM with somas memory reuse
        enabled via memory_optimize_level='O1'.
    Expectation: Loss converges below 0.01 and no exception is raised.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    num_epochs = 5
    embed_size = 100
    num_hiddens = 100
    num_layers = 2
    bidirectional = True
    labels = 2
    vocab_size = 252193
    max_len = 500
    weight = np.ones((vocab_size + 1, embed_size)).astype(np.float32)
    # batch_size is a module-level constant shared with the other LSTM tests.
    net = SentimentNet(vocab_size=(vocab_size + 1), embed_size=embed_size,
                       num_hiddens=num_hiddens, num_layers=num_layers,
                       bidirectional=bidirectional, weight=weight,
                       labels=labels, batch_size=batch_size)
    trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
    optimizer = Momentum(trainable_params, 0.1, 0.9)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    train_network = TrainOneStepCell(WithLossCell(net, criterion), optimizer)
    train_network.set_train()
    train_features = Tensor(np.ones([64, max_len]).astype(np.int32))
    train_labels = Tensor(np.ones([64,]).astype(np.int32)[0:64])
    loss_history = []
    for _ in range(num_epochs):
        step_loss = train_network(train_features, train_labels)
        loss_history.append(step_loss)
        print("loss:", step_loss.asnumpy())
    assert (loss_history[-1].asnumpy() < 0.01)

View File

@ -352,6 +352,36 @@ def test_trainTensor(num_classes=10, epoch=8, batch_size=1):
assert (losses[-1].asnumpy() < 1)
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_train_tensor_memory_opt(num_classes=10, epoch=8, batch_size=1):
    """
    Feature: Somas GPU kernel by kernel.
    Description: Train ResNet-50 with somas memory reuse enabled via memory_optimize_level='O1'.
    Expectation: Loss drops below 1 and no exception is raised.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    net = resnet50(num_classes)
    trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
    optimizer = Momentum(trainable_params, 0.1, 0.9)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    train_network = TrainOneStepCell(WithLossCell(net, criterion), optimizer)
    train_network.set_train()
    loss_history = []
    for _ in range(epoch):
        data = Tensor(np.ones([batch_size, 3, 224, 224]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([batch_size]).astype(np.int32))
        loss_history.append(train_network(data, label))
    assert (loss_history[-1].asnumpy() < 1)
@pytest.mark.level2
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard