add gpu somas

This commit is contained in:
reku1997 2022-08-08 16:57:20 +08:00
parent cd63f6283c
commit 52534d1751
38 changed files with 1989 additions and 969 deletions

View File

@ -50,6 +50,13 @@ struct KernelWithIndexCmp {
}
};
// SOMAS allocation summary held by a kernel graph.
struct SomasInfo {
  // Size of the whole SOMAS block; 0 indicates that somas did not allocate memory for this graph.
  size_t whole_block_size_ = 0;
  // Merged blocks: key is the offset, value is the aligned size.
  std::map<size_t, size_t> merged_blocks_map_{};
};
using DeviceType = device::DeviceType;
using KernelMapTensor = std::map<session::KernelWithIndex, BaseRef, session::KernelWithIndexCmp>;
@ -57,6 +64,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
public:
KernelGraph()
: inputs_(std::make_shared<std::vector<AnfNodePtr>>()),
somas_info_(std::make_shared<SomasInfo>()),
graph_id_(0),
stream_distinction_label_(kInvalidDistincLabel),
device_target_(DeviceType::kUnknown),
@ -69,6 +77,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
KernelGraph(const KernelGraph &graph) : FuncGraph(graph) {
inputs_ = graph.inputs_;
somas_info_ = graph.somas_info_;
child_graph_result_ = graph.child_graph_result_;
execution_order_ = graph.execution_order_;
mem_reuse_exec_order_ = graph.mem_reuse_exec_order_;
@ -452,6 +461,11 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
bool IsCommSubGraph(uint32_t id) const { return comm_sub_graph_ids_.find(id) != comm_sub_graph_ids_.end(); }
void RecordNewCommSubGraphId(uint32_t id) { comm_sub_graph_ids_.insert(id); }
// somas info accessors: mutable info pointer, whole block size and merged blocks map
SomasInfo *MutableSomasInfo() const { return somas_info_.get(); }
size_t somas_whole_block_size() const { return somas_info_->whole_block_size_; }
const std::map<size_t, size_t> &somas_merged_blocks_map() const { return somas_info_->merged_blocks_map_; }
private:
// remove value node from graph
bool RemoveValueNodeFromGraph(const ValueNodePtr &value_node);
@ -477,6 +491,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
// members
std::shared_ptr<std::vector<AnfNodePtr>> inputs_;
std::shared_ptr<SomasInfo> somas_info_;
std::vector<AnfNodePtr> child_graph_result_;
std::vector<CNodePtr> execution_order_;
std::vector<CNodePtr> mem_reuse_exec_order_;

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -22,6 +22,7 @@
#include <string>
#include <utility>
#include <vector>
#include <stack>
#include "utils/hash_map.h"
#include "utils/hash_set.h"
@ -33,9 +34,15 @@
#include "backend/common/session/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "backend/common/session/kernel_graph.h"
#include "runtime/hardware/device_type.h"
namespace mindspore {
namespace somas {
// Groups two kernel nodes as a pair.
// NOTE(review): presumably the send and recv kernels of one stream event — confirm against usage.
struct EventPair {
CNodePtr send_;
CNodePtr recv_;
};
union DestinationUnion {
size_t id;
size_t index;
@ -43,81 +50,86 @@ union DestinationUnion {
};
struct TensorConflictInfo {
size_t tensor_id_;
size_t src_node_id_;
size_t tensor_id;
size_t src_node_id;
size_t destination_num;
DestinationUnion l;
DestinationUnion r;
TensorConflictInfo(size_t tensor_id, size_t src_node_id)
: tensor_id_(tensor_id), src_node_id_(src_node_id), destination_num(0) {}
: tensor_id(tensor_id), src_node_id(src_node_id), destination_num(0) {}
};
// A block described by its start offset and size; the end offset is derived from those two values.
struct Block {
  size_t start_offset_;
  size_t size_;
  size_t end_offset_;
  // end_offset_ is always start + size. The initializer order matches the member declaration order.
  Block(size_t start, size_t size) : start_offset_(start), size_(size), end_offset_(start + size) {}
};
void MergeBlocks(std::vector<Block> *block_list, std::stack<Block> *merged_blocks);
// Un-reuse categories: all, input, output or workspace.
// NOTE(review): presumably selects which tensors of a node are kept out of memory reuse — confirm against usage.
enum class UnReuseType { kUnReuseAll, kUnReuseInput, kUnReuseOutput, kUnReuseWorkspace };
class Somas {
public:
// Constructors/Destructors
Somas() = default;
Somas(const Somas &) = delete;
Somas &operator=(const Somas &) = delete;
~Somas() { mem_base_addr_ = nullptr; }
bool Allocate(const session::KernelGraph *graph);
const size_t GetTotalMemSize() const { return mem_offset_; }
void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; }
uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;
virtual ~Somas() = default;
bool Assign(const session::KernelGraph &graph);
bool Assign(const KernelGraphPtr &graph_ptr);
std::string SomasInfo(bool calc_hash = false) const;
std::string SomasMemory() const;
void DumpSomasInfoIR(const string filename) const;
void DumpSomasMemoryIR(const string &filename) const;
static bool NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2);
#ifndef ENABLE_SECURITY
void ConvertToProfilingNode(uint32_t graph_id) const;
virtual void ConvertToProfilingNode(uint32_t graph_id) const {}
#endif
private:
// device implementation interface
virtual bool Initialize() = 0;
virtual string GetDeviceName() const = 0;
virtual size_t GetAlignSize(size_t original_size) const = 0;
virtual size_t GetCommunicationReservedSize() const;
virtual bool GetEnableCacheFlag(const session::KernelGraph &graph) const;
virtual std::vector<vector<uint32_t>> GetStreamGroupInfo(const session::KernelGraph &graph) const;
virtual bool GetDependExecOrderFlag(const session::KernelGraph &graph) const = 0;
virtual std::pair<bool, std::string> GetDebugConfig() const;
virtual std::map<std::string, UnReuseType> GetUnReuseNodeType(const session::KernelGraph &graph) const;
virtual std::map<std::string, UnReuseType> GetUnReuseNodeName(const session::KernelGraph &graph) const;
virtual bool InitDevSpecControlTensors(const session::KernelGraph &graph) = 0;
virtual bool DevSpecNodeProcess(const session::KernelGraph &graph) = 0;
// end
// SOMAS Configuration
// Name used for this SOMAS instance; defaults to "SOMAS".
std::string device_name_{"SOMAS"};
// Gap size used for communication tensors; defaults to 0.
size_t communication_gap_size_{0};
// Boolean flag, the same type that GetDependExecOrderFlag() returns (a size_t initialised with `false` hid that).
// NOTE(review): presumably filled from GetDependExecOrderFlag() — confirm in the implementation file.
bool depend_exec_order_{false};
bool enable_cache_{false};
bool save_debug_info_{false};
std::string debug_info_path_;
// Un-reuse settings keyed by string; one map for node types and one for node names.
std::map<std::string, UnReuseType> un_reuse_node_type_;
std::map<std::string, UnReuseType> un_reuse_node_name_;
// end
std::vector<DynamicBitSet> reuse_matrix_;
// hash id
std::string hash_id_;
// Maps
mindspore::HashMap<size_t, SomasTensorPtr> tensors_map_;
mindspore::HashMap<void *, std::vector<SomasNodePtr>> nodes_map_;
mindspore::HashMap<void *, vector<SomasParameterPtr>> parameters_map_;
mindspore::HashMap<size_t, SomasNodePtr> nodes_id_map_;
// Vectors
std::vector<SomasNodePtr> nodes_list_;
std::vector<SomasStreamPtr> streams_list_;
std::vector<SomasTensorPtr> tensors_list_;
std::vector<SomasParameterPtr> parameters_list_;
// Stream groups
std::vector<vector<uint32_t>> streams_groups_;
// event info map
std::map<size_t, std::pair<CNodePtr, CNodePtr>> event_map_;
// Solver
TensorsDescMap solver_tensor_desc_map_;
SomasSolverPrePtr somas_solver_;
// Contiguous list
std::vector<vector<size_t>> contiguous_tensors_list_;
// Ref lists
std::vector<vector<size_t>> ref_node_constraints_;
std::vector<vector<size_t>> ref_overlap_constraints_;
// total Offset
size_t mem_offset_{0};
// Memory base addr
uint8_t *mem_base_addr_{nullptr};
// Save debug info
bool save_graphs_{false};
std::string save_graphs_path_;
// statistic info
size_t upper_bound_{0};
size_t lower_bound_{0};
@ -128,74 +140,147 @@ class Somas {
size_t lifelong_start_total_size_{0};
size_t lifelong_end_total_size_{0};
bool InitSomasTensors(const session::KernelGraph *graph);
void InitBasicInfo(const session::KernelGraph *graph);
void InitSomasStreamAndNode(const session::KernelGraph *graph);
void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph);
void InitSomasInputTensors(const session::KernelGraph *graph);
void InitSomasEventInfos();
void GetNextOutputProcess(const session::KernelGraph *graph);
void IndependentNodeOutputProcess(const session::KernelGraph *graph);
#ifndef ENABLE_SECURITY
void SummaryInputProcess(const session::KernelGraph *graph);
#endif
void RefNodeProcess(const session::KernelGraph *graph);
void NonTaskSplitProcess(const session::KernelGraph *graph);
void UnReuseNodeProcess(const session::KernelGraph *graph);
SomasTensorPtr CreateGapTensor(size_t gap_tensor_id);
void GenContiguousList(const session::KernelGraph *graph);
std::vector<vector<size_t>> processed_contiguous_tensors_list_;
// key: contiguous list index with first union tensor; value: contiguous list index with other union tensor
std::map<size_t, size_t> contiguous_list_with_ref_index_map_;
void ComputeConflictPairs();
bool ConfigSomas(const session::KernelGraph &graph);
bool Assign(const session::KernelGraph *graph);
std::string Offline() const;
void DumpOfflineIR(const string filename) const;
std::string GetSplitName(const string &scope_name) const;
size_t CalcLowerBound() const;
void GenGraphStatisticInfo();
// somas model
bool InitSomasModel(const session::KernelGraph &graph);
bool InitBasicInfoFromGraph(const session::KernelGraph &graph);
void InitSomasStreamAndNode(const session::KernelGraph &graph);
void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph &graph);
void InitSomasInputTensors(const session::KernelGraph &graph);
void InitCommonNodeInputs(const CNodePtr &kernel);
void InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel);
SomasParameterPtr GetSomasParameter(const AnfNodePtr &node, size_t index);
SomasParameterPtr CreateSomasParameter(const AnfNodePtr &node, size_t index);
void InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel);
void InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel);
void ComputeOneTensorConflicts(const std::shared_ptr<SomasTensor> &target_tensor,
const std::vector<TensorConflictInfo> &tensor_conflict_info_list,
void InitControlTensors(const session::KernelGraph &graph);
bool CommonSpecNodeProcess(const session::KernelGraph &graph);
SomasStreamPtr GetSomasStream(size_t stream_id) const;
#ifndef ENABLE_SECURITY
void SummaryInputProcess(const session::KernelGraph &graph);
#endif
void RefNodeProcess(const session::KernelGraph &graph);
void UnReuseNodeProcess(const session::KernelGraph &graph);
void CommunicationNodeProcess(const session::KernelGraph &graph);
void GetContiguousListContainUnionTensor();
std::map<size_t, size_t> GetRefTensorsInContiguousList();
common::KernelWithIndex GetVisitKernelWithReturnType(const AnfNodePtr &ori_node, size_t ori_index);
// conflict matrix
static bool NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2);
void ComputeConflictMatrix();
void ComputeBasicMatrix();
static void ComputeOneTensorConflicts(const std::shared_ptr<SomasTensor> &target_tensor,
const std::vector<TensorConflictInfo> &tensor_conflict_info,
const std::vector<size_t> &destination_node_list,
const vector<DynamicBitSet> &nodes_dependency,
std::vector<DynamicBitSet> *tensor_relation) const;
std::vector<DynamicBitSet> *tensor_relation);
void ComputeMultiTensorConflicts(const std::vector<SomasTensorPtr> &target_tensors_list,
const std::vector<TensorConflictInfo> &tensor_conflict_info_list,
const std::vector<TensorConflictInfo> &tensor_conflict_info,
const std::vector<size_t> &destination_node_list,
const vector<DynamicBitSet> &nodes_dependency,
std::vector<DynamicBitSet> *tensor_relation) const;
void UpdateTensorDestinations();
void UpdateRefTensorsConflict();
void UpdateRefOverlapTensorsConflicts();
void UpdateRefTensorsOffset();
void UpdateContiguousTensorsOffset(const std::map<size_t, size_t> &contiguous_ref_list_map);
void DumpParameters(std::ostringstream &oss) const;
void DumpTensors(std::ostringstream &oss) const;
void DumpNodes(std::ostringstream &oss) const;
std::map<size_t, size_t> GetContiguousListContainRefTensor();
std::map<size_t, size_t> GetRefTensorsInContiguousList();
bool SaveSomasResult(const session::KernelGraph *graph);
bool VerifySomasResult(const session::KernelGraph *graph, const nlohmann::json &somas_json) const;
bool LoadSomasResult(const session::KernelGraph *graph, const string &filename);
bool UpdateTensorsOffset(const std::vector<nlohmann::json> &tensors_json);
bool CalcSomasModelHash(const session::KernelGraph *graph);
void UpdateInputTensor(SomasNodePtr node, SomasNodePtr pre_somas_node, SomasTensorPtr input_somas_tensor) const;
bool LoadSomasCache(const session::KernelGraph *graph);
SomasStreamPtr GetSomasStream(size_t stream_id) const;
SomasNodePtr GetSomasNode(size_t node_id) const;
void UpdateUnionTensorsConflict();
static void BuildConflictInfo(const std::shared_ptr<SomasTensor> &tensor, TensorConflictInfo *tensor_conflict_info,
std::vector<size_t> *destination_node_list);
static bool CheckIsDependency(const TensorConflictInfo &tensor_conflict_info, const size_t &src_node_id,
const vector<DynamicBitSet> &nodes_dependency,
const std::vector<size_t> &destination_node_list);
void ProcessSemiLifeLongTensor();
// solver
bool Solve(const session::KernelGraph &graph);
void UpdateUnionTensorsOffset();
void UpdateContiguousTensorsOffset(const std::map<size_t, size_t> &contiguous_ref_list_map);
// cache
bool SaveSomasResult(const session::KernelGraph &graph);
bool VerifySomasResult(const session::KernelGraph &graph, const nlohmann::json &somas_json) const;
bool LoadSomasResult(const session::KernelGraph &graph, const string &filename);
bool UpdateTensorsOffset(const std::vector<nlohmann::json> &tensors_json);
bool CalcSomasModelHash(const session::KernelGraph &graph);
bool LoadSomasCache(const session::KernelGraph &graph);
// log
std::string Offline() const;
void DumpOfflineIR(const string &filename) const;
size_t CalcLowerBound() const;
void GenGraphStatisticInfo();
void DumpParameters(std::ostringstream &oss) const;
void DumpTensors(std::ostringstream &oss) const;
void DumpNodes(std::ostringstream &oss) const;
void DumpSomasModelInfo(const string &tag, uint32_t graph_id) const;
// update graph
std::vector<std::pair<size_t, size_t>> GetNodeOutputSomasResult(const AnfNodePtr &node) const;
std::vector<std::pair<size_t, size_t>> GetNodeWorkSpaceSomasResult(const AnfNodePtr &node) const;
bool UpdateSomasResultToGraph(const session::KernelGraph &graph);
protected:
std::vector<SomasParameterPtr> parameters_list_;
std::vector<SomasTensorPtr> control_tensors_list_;
std::vector<SomasTensorPtr> tensors_list_;
std::vector<SomasNodePtr> nodes_list_;
mindspore::HashMap<size_t, SomasStreamPtr> streams_map_;
mindspore::HashMap<void *, vector<SomasParameterPtr>> parameters_map_;
mindspore::HashMap<void *, std::vector<SomasNodePtr>> nodes_map_;
std::vector<vector<size_t>> union_tensors_list_;
std::vector<vector<size_t>> contiguous_tensors_list_;
void AddControlTensor(const SomasNodePtr &from, const SomasNodePtr &to);
void AddControlTensorFromExecOrder(const session::KernelGraph &graph);
void GraphOutputProcess(const session::KernelGraph &graph);
void UpdateContiguousTensorList();
SomasNodePtr GetSomasNode(size_t node_id) const;
static std::string GetSplitName(const string &scope_name);
size_t reused_memory_size_{0};
std::vector<std::pair<size_t, size_t>> dump_merged_blocks_;
};
using SomasPtr = std::shared_ptr<Somas>;
using SomasCreator = std::function<std::shared_ptr<Somas>()>;
// @todo: to be deleted when the old runtime is removed
// Registry that maps a device type to the creator (SomasCreator) used to obtain a Somas for that device.
class SomasManager {
 public:
  // Returns the shared registry instance, held in a function-local static.
  static SomasManager &Instance() {
    static SomasManager instance{};
    return instance;
  }
  // Registers `creator` for `device_type`. The first registration for a device type wins; later ones are ignored.
  void Register(device::DeviceType device_type, SomasCreator &&creator) {
    if (base_map_.find(device_type) == base_map_.end()) {
      // A named rvalue-reference parameter is an lvalue; without std::move the std::function would be copied.
      (void)base_map_.emplace(device_type, std::move(creator));
    }
  }
  // Invokes the creator registered for `device_type` and returns its result.
  // Returns nullptr when no creator is registered; an empty creator is reported through MS_EXCEPTION_IF_NULL.
  SomasPtr GetSomas(device::DeviceType device_type) {
    auto iter = base_map_.find(device_type);
    if (base_map_.end() != iter) {
      MS_EXCEPTION_IF_NULL(iter->second);
      return (iter->second)();
    }
    return nullptr;
  }

 private:
  // device type -> creator
  std::map<device::DeviceType, SomasCreator> base_map_;
};
// Helper whose constructor registers a Somas creator for a device type with the SomasManager instance.
class SomasRegister {
 public:
  SomasRegister(device::DeviceType device_type, SomasCreator &&creator) {
    auto &manager = SomasManager::Instance();
    manager.Register(device_type, std::move(creator));
  }
  ~SomasRegister() = default;
};
// Defines a static const SomasRegister named g_<S>_reg that registers, for device type T, a creator
// returning std::make_shared<C>().
#define REG_SOMAS(S, T, C) static const somas::SomasRegister g_##S##_reg(T, []() { return std::make_shared<C>(); });
} // namespace somas
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_COMMON_SOMAS_SOMAS_H_

View File

@ -39,14 +39,14 @@ class SomasNode {
// node's dependency including data dependency and time dependency
std::set<std::shared_ptr<SomasNode>> ancestor_nodes_;
std::set<SomasTensorPtr> tensors_;
// data tensor
std::vector<SomasTensorPtr> input_tensors_;
std::vector<SomasTensorPtr> output_tensors_;
std::vector<SomasTensorPtr> workspace_tensors_;
std::map<size_t, SomasParameterPtr> input_parameters_map_;
mindspore::HashMap<int64_t, size_t> anc_stream_max_order_;
// control tensor
std::vector<SomasTensorPtr> control_input_tensors_;
std::vector<SomasTensorPtr> control_output_tensors_;
// Constructors/Destructors
SomasNode(std::string scope_full_name, size_t id, NodeType type, const size_t &stream_id)
@ -57,7 +57,7 @@ class SomasNode {
// Accessors
const size_t &GetId() const { return id_; }
const size_t GetStreamId() const { return stream_id_; }
const size_t &GetStreamId() const { return stream_id_; }
const NodeType &GetType() const { return type_; }
private:

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -98,7 +98,7 @@ vector<TensorsDescMap> SomasSolverPre::CreateTensorsMaps(const TensorsDescMap &t
}
return vecTensorsMap;
}
Status SomasSolverPre::Solving(const session::KernelGraph *graph, TensorsDescMap *ptensors,
Status SomasSolverPre::Solving(const session::KernelGraph &graph, TensorsDescMap *ptensors,
const std::vector<DynamicBitSet> *pConstraints,
const vector<vector<size_t>> &continuous_v, bool bVerifySolution, bool ball,
SortingType sorting, FittingType fitting, AlgorithmType algorithm) {
@ -198,7 +198,7 @@ Status SomasSolverPre::Solving(const session::KernelGraph *graph, TensorsDescMap
return ret;
}
void SomasSolverPre::Log(const session::KernelGraph *graph, const TensorsDescMap &tensors,
void SomasSolverPre::Log(const session::KernelGraph &graph, const TensorsDescMap &tensors,
const std::vector<DynamicBitSet> *pConstraints,
const vector<vector<size_t>> &continuous_v) const {
auto context_ptr = MsContext::GetInstance();
@ -213,13 +213,13 @@ void SomasSolverPre::Log(const session::KernelGraph *graph, const TensorsDescMap
}
void SomasSolverPre::TensorRelationLog(const std::vector<DynamicBitSet> *pConstraints,
const session::KernelGraph *graph) const {
const session::KernelGraph &graph) const {
MS_LOG(INFO) << "SomasSolver::Log Writing somas_tensor_relation.ir..";
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
std::string filename =
GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path);
std::ostringstream oss;
for (size_t tid1 = 0; tid1 < pConstraints->size(); tid1++) {
oss << 't' << tid1 << ' ';
@ -232,14 +232,14 @@ void SomasSolverPre::TensorRelationLog(const std::vector<DynamicBitSet> *pConstr
MS_LOG(INFO) << "SomasSolver somas_tensor_relation Log done";
}
void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors,
void SomasSolverPre::SolverInputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors,
const vector<vector<size_t>> &continuous_v) const {
MS_LOG(INFO) << "SomasSolver::Log Writing somas_solver_input..";
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
std::string filename =
GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path);
std::ostringstream oss;
for (auto &t : tensors) {
oss << "T " << t.second->index_ << " " << t.second->size_ << " " << t.second->lifelong_ << std::endl;
@ -256,13 +256,13 @@ void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const Ten
MS_LOG(INFO) << "SomasSolver input Log done";
}
void SomasSolverPre::SolverOutputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors) const {
void SomasSolverPre::SolverOutputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors) const {
MS_LOG(INFO) << "SomasSolver::Log Writing somas_solver_output_..";
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
std::string out_filename =
GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path);
std::ostringstream oss;
constexpr size_t contiguous_left = 1;
constexpr size_t contiguous_mid = 2;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -184,14 +184,14 @@ class SomasSolverPre {
size_t GetMaxOffset() const { return max_offset_; }
Status Solving(const session::KernelGraph *graph, TensorsDescMap *ptensors,
Status Solving(const session::KernelGraph &graph, TensorsDescMap *ptensors,
const std::vector<DynamicBitSet> *pConstraints, const vector<vector<size_t>> &continuous_v,
bool bVerifySolution, // true -> Check continuous and non overlapping constraints solution
bool ball = true, // true -> run full set of heuristics, false -> run single heuristic specified
SortingType sorting = kGreaterSizeSmallerIndex, FittingType fitting = kBest,
AlgorithmType algorithm = kManyObjects);
void Log(const session::KernelGraph *graph, const TensorsDescMap &tensors,
void Log(const session::KernelGraph &graph, const TensorsDescMap &tensors,
const std::vector<DynamicBitSet> *pConstraints, const vector<vector<size_t>> &continuous_v) const;
Status CheckTensors(const TensorsDescMap *pTensors, uint32_t index1, uint32_t index2) const;
@ -201,11 +201,11 @@ class SomasSolverPre {
private:
size_t max_offset_;
void SolverInputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors,
void SolverInputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors,
const vector<vector<size_t>> &continuous_v) const;
void SolverOutputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors) const;
void SolverOutputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors) const;
vector<TensorsDescMap> CreateTensorsMaps(const TensorsDescMap &tensors, size_t total_sol) const;
void TensorRelationLog(const std::vector<DynamicBitSet> *pConstraints, const session::KernelGraph *graph) const;
void TensorRelationLog(const std::vector<DynamicBitSet> *pConstraints, const session::KernelGraph &graph) const;
};
using SomasSolverPrePtr = std::shared_ptr<SomasSolverPre>;
} // namespace somas

View File

@ -31,7 +31,7 @@ class SomasStream {
std::vector<SomasNodePtr> nodes_;
// Constructors/Destructors
explicit SomasStream(int64_t id) : id_(id) {}
explicit SomasStream(size_t id) : id_(id) {}
SomasStream(const SomasStream &) = delete;
SomasStream &operator=(const SomasStream &) = delete;
~SomasStream() = default;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -15,25 +15,35 @@
*/
#include "backend/common/somas/somas_tensor.h"
#include <map>
#include <string>
namespace mindspore {
namespace somas {
SomasTensor::SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t real_size,
LifeLongType lifelong_value)
: lifelong_value_(lifelong_value),
between_streams_(false),
// Display name for each TensorType value; read by SomasTensor::GetTypeString().
std::map<somas::TensorType, std::string> tensor_type_name_map{
  {kCommon, "Common"},
  {kWorkspace, "Workspace"},
  {kOutputOnly, "OutputOnly"},
  {kGraphOutput, "GraphOutput"},
  {kGraphInput, "GraphInput"},
  {kSummaryInput, "SummaryInput"},
  {kUnion, "Union"},
  {kControl, "Control"},
  {kUnknown, "Unknown"},
};
// Display name for each LifeLongType value; read by SomasTensor::GetLifelongString().
std::map<LifeLongType, std::string> life_long_name_map{
  {kLifeLongNone, "LifeLongNone"},
  {kLifeLongGraphAll, "LifeLongGraphAll"},
  {kLifeLongGraphStart, "LifeLongGraphStart"},
  {kLifeLongGraphEnd, "LifeLongGraphEnd"},
};
SomasTensor::SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t ori_size,
size_t aligned_size, LifeLongType lifelong_value)
: aligned_size_(aligned_size),
lifelong_value_(lifelong_value),
contiguous_(false),
type_(kUnknown),
offset_(0),
num_constraints_(0),
ref_overlap_(false),
id_(id),
source_node_id_(source_node_id),
source_stream_id_(source_stream_id),
original_size_(real_size) {
const size_t alignment = 512;
const size_t alignment_complement = 31;
aligned_size_ = (real_size > 0) ? ((real_size + alignment + alignment_complement) / alignment) * alignment : 0;
original_size_(ori_size) {
solver_tensor_desc_ = std::make_shared<SomasSolverTensorDesc>(id_, aligned_size_, offset_, false);
}
@ -49,5 +59,9 @@ SomasSolverTensorDescPtr SomasTensor::GetSolverTensorDesc() {
return solver_tensor_desc_;
}
}
// Returns the display name of type_ from tensor_type_name_map, or an empty string when the type has no entry.
// Looks the key up with find() rather than operator[], so a query never inserts into the shared map.
std::string SomasTensor::GetTypeString() {
  const auto iter = tensor_type_name_map.find(type_);
  return iter == tensor_type_name_map.end() ? std::string() : iter->second;
}
// Returns the display name of lifelong_value_ from life_long_name_map, or an empty string when it has no entry.
// Looks the key up with find() rather than operator[], so a query never inserts into the shared map.
std::string SomasTensor::GetLifelongString() {
  const auto iter = life_long_name_map.find(lifelong_value_);
  return iter == life_long_name_map.end() ? std::string() : iter->second;
}
} // namespace somas
} // namespace mindspore

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -20,7 +20,7 @@
#include <memory>
#include <set>
#include <vector>
#include <string>
#include "utils/hash_map.h"
#include "backend/common/somas/somas_solver_pre.h"
@ -38,13 +38,13 @@ using lifetime_t = struct Lifetime;
// Tensor type
enum TensorType {
kCommon,
kOutputOnly,
kWorkspace,
kGetNextOutput,
kOutputOnly,
kGraphOutput,
kGraphInput,
kSummaryInput,
kRefNodeInput,
kRefNodeOutput,
kEventVirtualOutput,
kUnion,
kControl,
kUnknown
};
@ -60,7 +60,6 @@ class SomasTensor {
size_t aligned_size_{0};
LifeLongType lifelong_value_;
bool between_streams_;
bool contiguous_;
lifetime_t lifetime_;
@ -72,7 +71,7 @@ class SomasTensor {
vector<size_t> consumer_list_;
// Constructors/Destructors
explicit SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t real_size,
explicit SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t ori_size, size_t aligned_size,
LifeLongType lifelong_value = kLifeLongNone);
SomasTensor(const SomasTensor &) = delete;
SomasTensor &operator=(const SomasTensor &) = delete;
@ -86,14 +85,12 @@ class SomasTensor {
const size_t &GetAlignedSize() const { return aligned_size_; }
const size_t &GetNumConstraints() const { return num_constraints_; }
bool IsLifelong() const { return lifelong_value_ == kLifeLongGraphAll; }
bool IsWorkspace() const { return type_ == kWorkspace; }
bool IsOutputOnly() const { return type_ == kOutputOnly; }
size_t GetOffset() const { return offset_; }
bool IsBetweenStreams() const { return between_streams_; }
bool IsSemiLifelongStart() const { return lifelong_value_ == kLifeLongGraphStart; }
bool IsSemiLifelongEnd() const { return lifelong_value_ == kLifeLongGraphEnd; }
bool IsRefOverlap() const { return ref_overlap_; }
string GetTypeString();
string GetLifelongString();
// Computing functions
void SetOffset() {
if (aligned_size_ != 0) {
@ -104,7 +101,6 @@ class SomasTensor {
size_t num_constraints_{0};
private:
bool ref_overlap_;
const size_t id_{0};
const size_t source_node_id_;
const size_t source_stream_id_;

View File

@ -607,8 +607,8 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) {
device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_});
MS_EXCEPTION_IF_NULL(device_context);
bool all_support = device_context->PartitionGraph(func_graph);
if (all_support) {
auto run_mode = device_context->GetRunMode(func_graph);
if (all_support) {
if (run_mode == device::RunMode::kGraphMode) {
auto graph_id = graph_compiler_->CompileWholeGraphForGraphRunMode(func_graph, device_context);
graph_id_to_device_context_[graph_id] = device_context;
@ -1384,9 +1384,15 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(con
std::vector<std::vector<int64_t> *> tensors_mask;
std::vector<std::vector<tensor::TensorPtr> *> input_tensors;
auto strategy = runtime::GraphExecutionStrategy::kPipeline;
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
if (context_ptr->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) {
strategy = runtime::GraphExecutionStrategy::kPipelineWithExecutionOrder;
}
return std::make_unique<GraphCompilerInfo>(graphs, device_contexts, tensors_mask, input_tensors, control_nodes_,
root_graph->parameters(), parser, outputs_order, outputs_num, name, false,
runtime::GraphExecutionStrategy::kPipeline);
strategy);
}
std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(

View File

@ -104,16 +104,6 @@ uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_m
return communication_mem ? alloc_address + kMemAlignSize : alloc_address;
}
// Runs the base-class MallocSomasDynamicMem for `graph`. When ENABLE_SECURITY is not defined and memory
// profiling is initialized, it then calls ConvertToProfilingNode on the somas reuse util with the graph id.
void AscendMemoryManager::MallocSomasDynamicMem(const session::KernelGraph &graph) {
  MemoryManager::MallocSomasDynamicMem(graph);
#ifndef ENABLE_SECURITY
  // Nothing to report unless memory profiling has been initialized.
  if (!MemoryProfiling::GetInstance().IsMemoryProfilingInitialized()) {
    return;
  }
  MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
  somas_reuse_util_ptr_->ConvertToProfilingNode(graph.graph_id());
#endif
}
// communication memory: [512align_size + data + 512align_size]
// return the pointer to the start of data address.
uint8_t *AscendMemoryManager::MallocCommunicationMemFromMemPool(size_t size) {

View File

@ -36,7 +36,6 @@ class AscendMemoryManager : public MemoryManager {
void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override;
void FreeMemFromMemPool(void *device_ptr) override;
uint64_t GetMsMaxMemSize() const;
void MallocSomasDynamicMem(const session::KernelGraph &graph) override;
uint8_t *MallocCommunicationMemFromMemPool(size_t size) override;
bool MallocContinuousMemFromMemPool(const DeviceAddressPtrList &addr_list, size_t total_size,
std::vector<size_t> size_list) override;

View File

@ -0,0 +1,229 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/ascend/hal/hardware/ascend_somas.h"
#include <string>
#include <map>
#include <utility>
#include <vector>
#include "backend/common/optimizer/helper.h"
#include "utils/ms_context.h"
#include "plugin/device/ascend/hal/device/ascend_stream_assign.h"
#include "plugin/device/ascend/hal/profiler/memory_profiling.h"
namespace mindspore {
namespace device {
namespace ascend {
using KernelGraph = session::KernelGraph;
using UnReuseType = somas::UnReuseType;
using TensorType = somas::TensorType;
using LifeLongType = somas::LifeLongType;
using mindspore::profiler::ascend::MemoryProfiling;
#ifndef ENABLE_SECURITY
void AscendSomas::ConvertToProfilingNode(uint32_t graph_id) const {
if (!MemoryProfiling::GetInstance().IsMemoryProfilingInitialized()) {
return;
}
auto graph_node = profiler::ascend::MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id);
if (graph_node == nullptr) {
graph_node = profiler::ascend::MemoryProfiling::GetInstance().AddGraphMemoryNode(graph_id);
MS_LOG(INFO) << "Add graph memory node for dynamic memory profiling, graph id is " << graph_id;
}
for (const auto &tensor : tensors_list_) {
profiler::ascend::TensorMemory tensor_memory;
tensor_memory.SetTensorId(tensor->GetId());
tensor_memory.SetAlignedSize(tensor->GetAlignedSize());
tensor_memory.SetType(tensor->GetTypeString());
tensor_memory.SetLifeStart(tensor->lifetime_.start_);
tensor_memory.SetLifeEnd(tensor->lifetime_.end_);
tensor_memory.SetLifeLong(tensor->GetLifelongString());
graph_node->AddTensorMemory(tensor_memory);
}
for (const auto &node : nodes_list_) {
profiler::ascend::NodeMemory node_memory;
std::string name = GetSplitName(node->scope_full_name_);
node_memory.SetNodeName(name);
node_memory.SetNodeId(node->GetId());
for (const auto &input_tensor : node->input_tensors_) {
node_memory.AddInputTensorId(input_tensor->GetId());
}
for (const auto &output_tensor : node->output_tensors_) {
node_memory.AddOutputTensorId(output_tensor->GetId());
}
for (const auto &workspace_tensor : node->workspace_tensors_) {
node_memory.AddWorkSpaceTensorId(workspace_tensor->GetId());
}
graph_node->AddNodeMemory(node_memory);
}
}
#endif
// Initialization hook of the Ascend planner; performs no work and always reports success.
bool AscendSomas::Initialize() {
  return true;
}
// Returns the device name of this planner: "Ascend".
std::string AscendSomas::GetDeviceName() const {
  return std::string("Ascend");
}
// Returns the gap size reserved for communication memory on Ascend: a fixed 512 bytes.
size_t AscendSomas::GetCommunicationReservedSize() const {
  constexpr size_t kReservedGapBytes = 512;
  return kReservedGapBytes;
}
// Returns the aligned size used for a tensor of `original_size` bytes.
// A size of 0 stays 0; any other size has 512 + 31 bytes added and is then truncated to a multiple of 512
// (for example 1 -> 512, 480 -> 512, 481 -> 1024).
size_t AscendSomas::GetAlignSize(size_t original_size) const {
  if (original_size == 0) {
    return 0;
  }
  constexpr size_t kAlignment = 512;
  constexpr size_t kComplement = 31;
  const size_t block_count = (original_size + kAlignment + kComplement) / kAlignment;
  return block_count * kAlignment;
}
// Returns true when task sink is enabled in the context or the memory optimize level is kOptimizeO1.
// `graph` is not consulted.
bool AscendSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  const auto task_sink_enabled = ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
  const auto memory_opt_level = ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL);
  return task_sink_enabled || (memory_opt_level == kOptimizeO1);
}
std::vector<vector<uint32_t>> AscendSomas::GetStreamGroupInfo(const session::KernelGraph &graph) const {
std::vector<vector<uint32_t>> stream_group;
stream_group = device::ascend::AscendStreamAssign::GetInstance().get_stream_group();
return stream_group;
}
// Returns the operator-name -> un-reuse-type table for Ascend.
// The only entry maps kGetNextOpName to UnReuseType::kUnReuseOutput. `graph` is not consulted.
std::map<std::string, UnReuseType> AscendSomas::GetUnReuseNodeType(const session::KernelGraph &graph) const {
  std::map<std::string, UnReuseType> unreuse_types;
  (void)unreuse_types.emplace(kGetNextOpName, UnReuseType::kUnReuseOutput);
  return unreuse_types;
}
// Device-specific control-tensor setup for Ascend: delegates to InitEventInfo and always reports success.
bool AscendSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) {
  InitEventInfo(graph);
  constexpr bool kSucceeded = true;
  return kSucceeded;
}
// Rebuilds event_map_ from the Send/Recv kernels in the execution order of `graph` and adds a control tensor
// between the somas nodes of every matched Send/Recv pair.
//
// Kernels are paired by the value of their kAttrEventId attribute. An event for which only a Send or only a
// Recv kernel was seen is skipped with a warning instead of being dereferenced, as is a pair whose kernels
// have no somas node in nodes_map_.
void AscendSomas::InitEventInfo(const session::KernelGraph &graph) {
  event_map_.clear();
  for (const auto &kernel : graph.execution_order()) {
    const auto op_name = common::AnfAlgo::GetCNodeName(kernel);
    if (op_name == kSendOpName) {
      const auto event_id = common::AnfAlgo::GetNodeAttr<uint32_t>(kernel, kAttrEventId);
      // operator[] creates an empty pair on first sight of the event; a later kernel overwrites the slot.
      event_map_[event_id].send_ = kernel;
    } else if (op_name == kRecvOpName) {
      const auto event_id = common::AnfAlgo::GetNodeAttr<uint32_t>(kernel, kAttrEventId);
      event_map_[event_id].recv_ = kernel;
    }
  }

  for (const auto &event : event_map_) {
    const auto &pair = event.second;
    // A half-filled pair holds a null kernel pointer; using it in the lookups and log messages below
    // would dereference null.
    if (pair.send_ == nullptr || pair.recv_ == nullptr) {
      MS_LOG(WARNING) << "Event " << event.first << " has no " << (pair.send_ == nullptr ? "send" : "recv")
                      << " kernel, skip adding control tensor for it.";
      continue;
    }
    const auto send_iter = nodes_map_.find(pair.send_.get());
    if (send_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.send_->fullname_with_scope();
      continue;
    }
    const auto recv_iter = nodes_map_.find(pair.recv_.get());
    if (recv_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.recv_->fullname_with_scope();
      continue;
    }
    auto &somas_send = send_iter->second.at(0);
    auto &somas_recv = recv_iter->second.at(0);
    AddControlTensor(somas_send, somas_recv);
  }
  MS_LOG(DEBUG) << "Somas InitEventInfo end.";
}
// Ascend-specific node processing: first the independent-node outputs, then the non-task operators.
// Always reports success.
bool AscendSomas::DevSpecNodeProcess(const session::KernelGraph &graph) {
  IndependentNodeOutputProcess(graph);
  NonTaskSplitProcess(graph);
  constexpr bool kSucceeded = true;
  return kSucceeded;
}
// Marks every output tensor of each independent kernel in `graph` as LifeLongType::kLifeLongGraphEnd and
// logs the summed aligned size of those tensors. Kernels without a somas node in nodes_map_ are ignored.
void AscendSomas::IndependentNodeOutputProcess(const session::KernelGraph &graph) {
  size_t independent_output_bytes = 0;
  for (const auto &cnode : graph.execution_order()) {
    if (!AnfAlgo::IsIndependentNode(cnode)) {
      continue;
    }
    const auto found = nodes_map_.find(cnode.get());
    if (found == nodes_map_.end()) {
      continue;
    }
    auto &somas_node = found->second.at(0);
    MS_EXCEPTION_IF_NULL(somas_node);
    for (const auto &out_tensor : somas_node->output_tensors_) {
      MS_EXCEPTION_IF_NULL(out_tensor);
      independent_output_bytes += out_tensor->GetAlignedSize();
      out_tensor->lifelong_value_ = LifeLongType::kLifeLongGraphEnd;
    }
  }
  MS_LOG(INFO) << "Special Tensor total size: Independent Node output " << independent_output_bytes;
}
// For every non-task operator in `graph`, marks its first input tensor and all of its output tensors as
// TensorType::kUnion and records their ids as one entry of union_tensors_list_.
//
// Raises an exception when the kernel has no somas node in nodes_map_ or when its somas node has no input
// tensor. The lookup uses find() so that an unknown kernel is reported by name instead of silently
// inserting an empty entry into nodes_map_.
void AscendSomas::NonTaskSplitProcess(const session::KernelGraph &graph) {
  for (const auto &kernel : graph.execution_order()) {
    if (!common::AnfAlgo::IsNonTaskOp(kernel)) {
      continue;
    }
    const auto iter = nodes_map_.find(kernel.get());
    if (iter == nodes_map_.end()) {
      MS_LOG(EXCEPTION) << "Can't find somas node for non-task node " << kernel->fullname_with_scope();
    }
    const auto &node = iter->second.at(0);
    MS_EXCEPTION_IF_NULL(node);
    if (node->input_tensors_.empty()) {
      MS_LOG(EXCEPTION) << common::AnfAlgo::GetCNodeName(kernel)
                        << " has no input tensor, can not do split non_task process.";
    }

    // The union list starts with the first input, followed by every output in order.
    std::vector<size_t> refnode_input_output;
    const auto &input_tensor = node->input_tensors_[0];
    MS_EXCEPTION_IF_NULL(input_tensor);
    input_tensor->type_ = TensorType::kUnion;
    refnode_input_output.push_back(input_tensor->GetId());
    for (const auto &output_tensor : node->output_tensors_) {
      MS_EXCEPTION_IF_NULL(output_tensor);
      output_tensor->type_ = TensorType::kUnion;
      refnode_input_output.push_back(output_tensor->GetId());
    }
    union_tensors_list_.push_back(refnode_input_output);
  }
}
} // namespace ascend
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,61 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_
#include <vector>
#include <string>
#include <map>
#include <utility>
#include <memory>
#include "backend/common/somas/somas.h"
#include "runtime/hardware/device_type.h"
namespace mindspore {
namespace device {
namespace ascend {
using KernelGraph = session::KernelGraph;
using UnReuseType = somas::UnReuseType;
class AscendSomas : public somas::Somas {
public:
#ifndef ENABLE_SECURITY
void ConvertToProfilingNode(uint32_t graph_id) const override;
#endif
private:
bool Initialize() override;
string GetDeviceName() const override;
size_t GetCommunicationReservedSize() const override;
size_t GetAlignSize(size_t original_size) const override;
bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;
std::vector<vector<uint32_t>> GetStreamGroupInfo(const session::KernelGraph &graph) const override;
std::map<std::string, UnReuseType> GetUnReuseNodeType(const session::KernelGraph &graph) const override;
bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
bool DevSpecNodeProcess(const session::KernelGraph &graph) override;
void InitEventInfo(const session::KernelGraph &graph);
void IndependentNodeOutputProcess(const session::KernelGraph &graph);
void NonTaskSplitProcess(const session::KernelGraph &graph);
std::map<uint32_t, somas::EventPair> event_map_;
};
// Associates AscendSomas with DeviceType::kAscend through the REG_SOMAS macro.
// NOTE(review): the macro is defined outside this file; presumably it registers the class with
// somas::SomasManager so that GetSomas(DeviceType::kAscend) can return it — confirm.
REG_SOMAS(Ascend, DeviceType::kAscend, AscendSomas)
} // namespace ascend
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/cpu/hal/hardware/cpu_somas.h"
#include <string>
#include "utils/ms_context.h"
namespace mindspore {
namespace device {
namespace cpu {
// Initialization hook of the CPU planner; performs no work and always reports success.
bool CPUSomas::Initialize() {
  return true;
}
// Returns the device name of this planner: "CPU".
std::string CPUSomas::GetDeviceName() const {
  return std::string("CPU");
}
// Rounds `original_size` up to the next multiple of 512 bytes; a size of 0 stays 0.
size_t CPUSomas::GetAlignSize(size_t original_size) const {
  if (original_size == 0) {
    return 0;
  }
  constexpr size_t kAlignment = 512;
  const size_t block_count = (original_size + kAlignment - 1) / kAlignment;
  return block_count * kAlignment;
}
// Always returns false on CPU; `graph` is not consulted.
bool CPUSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const {
  return false;
}
// CPU adds no device-specific control tensors: nothing is done and success is reported.
bool CPUSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) {
  return true;
}
// CPU performs no device-specific node processing: nothing is done and success is reported.
bool CPUSomas::DevSpecNodeProcess(const session::KernelGraph &graph) {
  return true;
}
} // namespace cpu
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Include guard: upper-case path with a single trailing underscore. Identifiers containing a double
// underscore are reserved for the implementation, so the former "..._H__" spelling is avoided.
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H_

#include <string>
#include "backend/common/somas/somas.h"
#include "runtime/hardware/device_type.h"

namespace mindspore {
namespace device {
namespace cpu {
using KernelGraph = session::KernelGraph;

// CPU implementation of the somas::Somas device hooks.
class CPUSomas : public somas::Somas {
 private:
  bool Initialize() override;
  std::string GetDeviceName() const override;
  size_t GetAlignSize(size_t original_size) const override;

  bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;
  bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
  bool DevSpecNodeProcess(const session::KernelGraph &graph) override;
};
// Associates CPUSomas with DeviceType::kCPU through the REG_SOMAS macro (defined outside this file).
REG_SOMAS(CPU, DeviceType::kCPU, CPUSomas)
}  // namespace cpu
}  // namespace device
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H_

View File

@ -25,6 +25,7 @@
#include "plugin/device/gpu/hal/device/gpu_stream_assign.h"
#include "plugin/device/gpu/hal/device/distribution/collective_init.h"
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
#include "plugin/device/gpu/hal/hardware/gpu_somas.h"
#include "runtime/data_queue/data_queue_mgr.h"
#include "kernel/common_utils.h"
#include "plugin/device/gpu/hal/device/gpu_common.h"
@ -40,6 +41,7 @@
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "backend/common/optimizer/common_backend_optimization.h"
#include "backend/common/optimizer/dynamic_shape/dynamic_shape_helper.h"
#include "include/common/debug/anf_ir_dump.h"
#ifdef ENABLE_DUMP_IR
#include "include/common/debug/rdr/recorder_manager.h"
#include "debug/rdr/mem_address_recorder.h"
@ -258,6 +260,25 @@ DeviceAddressPtr GPUDeviceResManager::CreateDeviceAddress(void *const device_ptr
return device_address;
}
void GPUKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
auto kernel_graph = graph->cast<KernelGraphPtr>();
MS_EXCEPTION_IF_NULL(kernel_graph);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) {
auto somas = std::make_shared<GPUSomas>();
bool ret = somas->Assign(kernel_graph);
if (ret) {
MS_LOG(INFO) << "Somas allocate success for graph " << kernel_graph->graph_id()
<< " somas size: " << kernel_graph->somas_whole_block_size();
} else {
MS_LOG(WARNING) << "Somas allocate failed for graph " << kernel_graph->graph_id();
}
}
MS_LOG(INFO) << "Status record: end preprocess before run graph. graph id: " << kernel_graph->graph_id();
}
void GPUKernelExecutor::OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
// Operator fusion optimization.

View File

@ -82,6 +82,8 @@ class GPUKernelExecutor : public DeprecatedKernelExecutor {
void CreateKernel(const std::vector<CNodePtr> &nodes) const override;
void PreprocessBeforeRun(const FuncGraphPtr &graph) const override;
bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs) const override;

View File

@ -0,0 +1,141 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/gpu/hal/hardware/gpu_somas.h"
#include <string>
#include <vector>
#include "backend/common/optimizer/helper.h"
#include "utils/ms_context.h"
namespace mindspore {
namespace device {
namespace gpu {
// Initialization hook of the GPU planner; performs no work and always reports success.
bool GPUSomas::Initialize() {
  return true;
}
// Returns the device name of this planner: "GPU".
std::string GPUSomas::GetDeviceName() const {
  return std::string("GPU");
}
// Rounds `original_size` up to the next multiple of 512 bytes; a size of 0 stays 0.
size_t GPUSomas::GetAlignSize(size_t original_size) const {
  if (original_size == 0) {
    return 0;
  }
  constexpr size_t kAlignment = 512;
  const size_t block_count = (original_size + kAlignment - 1) / kAlignment;
  return block_count * kAlignment;
}
// Returns true exactly when the context's memory optimize level is kOptimizeO1; `graph` is not consulted.
bool GPUSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  const auto memory_opt_level = ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL);
  return memory_opt_level == kOptimizeO1;
}
// Device-specific control-tensor setup for GPU: delegates to InitEventInfo and always reports success.
bool GPUSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) {
  InitEventInfo(graph);
  constexpr bool kSucceeded = true;
  return kSucceeded;
}
// Rebuilds event_map_ from the Send/Recv kernels in the execution order of `graph` and adds a control tensor
// between the somas nodes of every matched Send/Recv pair.
//
// A Send kernel is keyed by its kAttrRecordEvent attribute and a Recv kernel by its kAttrWaitEvent
// attribute. An event for which only a Send or only a Recv kernel was seen is skipped with a warning
// instead of being dereferenced, as is a pair whose kernels have no somas node in nodes_map_.
void GPUSomas::InitEventInfo(const session::KernelGraph &graph) {
  event_map_.clear();
  for (const auto &kernel : graph.execution_order()) {
    const auto op_name = common::AnfAlgo::GetCNodeName(kernel);
    if (op_name == kSendOpName) {
      const auto event = common::AnfAlgo::GetNodeAttr<uintptr_t>(kernel, kAttrRecordEvent);
      // operator[] creates an empty pair on first sight of the event; a later kernel overwrites the slot.
      event_map_[event].send_ = kernel;
    } else if (op_name == kRecvOpName) {
      const auto event = common::AnfAlgo::GetNodeAttr<uintptr_t>(kernel, kAttrWaitEvent);
      event_map_[event].recv_ = kernel;
    }
  }

  for (const auto &event : event_map_) {
    const auto &pair = event.second;
    // A half-filled pair holds a null kernel pointer; using it in the lookups and log messages below
    // would dereference null.
    if (pair.send_ == nullptr || pair.recv_ == nullptr) {
      MS_LOG(WARNING) << "Event " << event.first << " has no " << (pair.send_ == nullptr ? "send" : "recv")
                      << " kernel, skip adding control tensor for it.";
      continue;
    }
    const auto send_iter = nodes_map_.find(pair.send_.get());
    if (send_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.send_->fullname_with_scope();
      continue;
    }
    const auto recv_iter = nodes_map_.find(pair.recv_.get());
    if (recv_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.recv_->fullname_with_scope();
      continue;
    }
    auto &somas_send = send_iter->second.at(0);
    auto &somas_recv = recv_iter->second.at(0);
    AddControlTensor(somas_send, somas_recv);
  }
  MS_LOG(DEBUG) << "Somas InitEventInfo end.";
}
// GPU-specific node processing consists solely of the inplace-node handling; its result is returned.
bool GPUSomas::DevSpecNodeProcess(const session::KernelGraph &graph) {
  const bool processed = InplaceNodeProcess(graph);
  return processed;
}
// For every kernel of `graph` that IsInplaceNode(kernel, "skip") accepts, groups all of the kernel's input
// and output tensors into one union: each tensor is typed somas::kUnion and their ids are appended as one
// entry of union_tensors_list_.
// Raises an exception when such a kernel has no somas node, or when one of its tensors already belongs to
// a previously recorded union. Returns true otherwise.
bool GPUSomas::InplaceNodeProcess(const session::KernelGraph &graph) {
  for (auto &cnode : graph.execution_order()) {
    if (!common::AnfAlgo::IsInplaceNode(cnode, "skip")) {
      continue;
    }
    auto found = nodes_map_.find(cnode.get());
    if (found == nodes_map_.end()) {
      MS_LOG(EXCEPTION) << "Can't find somas node for inplace node " << cnode->fullname_with_scope();
    }
    auto &somas_node = found->second.at(0);
    MS_EXCEPTION_IF_NULL(somas_node);

    // Inputs first, then outputs.
    std::vector<somas::SomasTensorPtr> members(somas_node->input_tensors_.begin(),
                                               somas_node->input_tensors_.end());
    members.insert(members.end(), somas_node->output_tensors_.begin(), somas_node->output_tensors_.end());

    // Reject the kernel if any member is already part of an earlier union.
    for (auto &member : members) {
      auto member_id = member->GetId();
      for (auto &recorded_union : union_tensors_list_) {
        if (std::count(recorded_union.begin(), recorded_union.end(), member_id)) {
          MS_LOG(EXCEPTION) << "Inplace node union Tensor " << member_id << " already in other union tensor list.";
        }
      }
    }

    std::vector<size_t> member_ids;
    for (auto &member : members) {
      member->type_ = somas::kUnion;
      member_ids.push_back(member->GetId());
    }
    union_tensors_list_.push_back(member_ids);
  }
  return true;
}
} // namespace gpu
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,48 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Include guard: upper-case path with a single trailing underscore. Identifiers containing a double
// underscore are reserved for the implementation, so the former "..._H__" spelling is avoided.
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H_

#include <cstdint>
#include <map>
#include <string>
#include "backend/common/somas/somas.h"
#include "runtime/hardware/device_type.h"

namespace mindspore {
namespace device {
namespace gpu {
using KernelGraph = session::KernelGraph;

// GPU implementation of the somas::Somas device hooks.
class GPUSomas : public somas::Somas {
 private:
  bool Initialize() override;
  std::string GetDeviceName() const override;
  size_t GetAlignSize(size_t original_size) const override;

  bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;
  bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
  bool DevSpecNodeProcess(const session::KernelGraph &graph) override;

  // Helpers used by the two device-specific hooks above.
  bool InplaceNodeProcess(const session::KernelGraph &graph);
  void InitEventInfo(const session::KernelGraph &graph);

  // event value (kAttrRecordEvent of a Send / kAttrWaitEvent of a Recv) -> the kernels seen for that event.
  std::map<uintptr_t, somas::EventPair> event_map_;
};
// Associates GPUSomas with DeviceType::kGPU through the REG_SOMAS macro (defined outside this file).
REG_SOMAS(GPU, DeviceType::kGPU, GPUSomas)
}  // namespace gpu
}  // namespace device
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H_

View File

@ -101,7 +101,8 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) {
.value("graph_kernel_flags", MsCtxParam::MS_CTX_GRAPH_KERNEL_FLAGS)
.value("grad_for_scalar", MsCtxParam::MS_CTX_GRAD_FOR_SCALAR)
.value("pynative_synchronize", MsCtxParam::MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE)
.value("disable_format_transform", MsCtxParam::MS_CTX_DISABLE_FORMAT_TRANSFORM);
.value("disable_format_transform", MsCtxParam::MS_CTX_DISABLE_FORMAT_TRANSFORM)
.value("memory_optimize_level", MsCtxParam::MS_CTX_MEMORY_OPTIMIZE_LEVEL);
(void)py::class_<mindspore::MsContext, std::shared_ptr<mindspore::MsContext>>(*m, "MSContext")
.def_static("get_instance", &mindspore::MsContext::GetInstance, "Get ms context instance.")
.def("get_param", &mindspore::MsCtxGetParameter, "Get value of specified parameter.")

View File

@ -3,6 +3,7 @@ file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*
"memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" "memory_scheduler.cc"
"memory_offload_strategy.cc" "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "tensor_array.cc"
"ms_device_shape_transfer.cc" "context_extends.cc" "stream_synchronizer.cc" "tensors_queue.cc" "auto_mem_offload.cc"
"common_somas_allocator.cc"
)
if("${ENABLE_HIDDEN}" STREQUAL "OFF")

View File

@ -0,0 +1,86 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/common_somas_allocator.h"
#include <utility>
#include <string>
#include "backend/common/optimizer/helper.h"
#include "utils/ms_context.h"
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/string_recorder.h"
#endif
namespace mindspore {
namespace device {
// Runs somas on `graph` with the planner registered for the context's device target.
// The target string selects DeviceType::kAscend or DeviceType::kGPU; any other target uses DeviceType::kCPU.
// On success the allocation info is recorded to RDR (ENABLE_DUMP_IR builds) and exported to the memory
// profiler (builds without ENABLE_SECURITY). Returns the result of the planner's Assign.
bool CommonSomasAllocator::Assign(const session::KernelGraph &graph) {
  const auto target = GetTargetFromContext();
  auto device_type = DeviceType::kCPU;
  if (target == kAscendDevice) {
    device_type = DeviceType::kAscend;
  } else if (target == kGPUDevice) {
    device_type = DeviceType::kGPU;
  }
  somas::SomasPtr somas_ptr = somas::SomasManager::Instance().GetSomas(device_type);
  MS_EXCEPTION_IF_NULL(somas_ptr);

  const bool assigned = somas_ptr->Assign(graph);
  if (assigned) {
#ifdef ENABLE_DUMP_IR
    const std::string record_name = "somas_allocate_info." + std::to_string(graph.graph_id());
    (void)mindspore::RDR::RecordString(SubModuleId::SM_OPTIMIZER, record_name, somas_ptr->SomasInfo());
#endif
#ifndef ENABLE_SECURITY
    somas_ptr->ConvertToProfilingNode(graph.graph_id());
#endif
  }
  return assigned;
}
// Returns the address somas planned for output `index` of `node`: mem_base_addr_ plus the planned offset.
// Returns nullptr when the planned aligned size is 0 (no memory was allocated for that output).
// Raises an exception when `node` or its kernel info is null, or when `index` is out of range.
uint8_t *CommonSomasAllocator::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const {
  MS_EXCEPTION_IF_NULL(node);
  auto kernel_info = dynamic_cast<KernelInfo *>(node->kernel_info());
  MS_EXCEPTION_IF_NULL(kernel_info);

  const auto &planned_outputs = kernel_info->somas_output_offset_aligned_size_list();
  if (index >= planned_outputs.size()) {
    MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:[" << planned_outputs.size()
                      << "]";
  }
  // Each entry is (offset, aligned_size).
  const auto &offset_and_size = planned_outputs[index];
  if (offset_and_size.second == 0) {
    return nullptr;
  }
  return mem_base_addr_ + offset_and_size.first;
}
// Returns the address somas planned for workspace `index` of `node`: mem_base_addr_ plus the planned offset.
// Returns nullptr when the planned aligned size is 0 (no memory was allocated for that workspace).
// Raises an exception when `node` or its kernel info is null, or when `index` is out of range; the message
// names the workspace list (it previously said "output size", copied from GetNodeOutputPtr).
uint8_t *CommonSomasAllocator::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const {
  MS_EXCEPTION_IF_NULL(node);
  auto kernel_info = dynamic_cast<KernelInfo *>(node->kernel_info());
  MS_EXCEPTION_IF_NULL(kernel_info);

  const auto &planned_workspaces = kernel_info->somas_workspace_offset_aligned_size_list();
  if (index >= planned_workspaces.size()) {
    MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's workspace size:["
                      << planned_workspaces.size() << "]";
  }
  // Each entry is (offset, aligned_size).
  const auto &offset_and_size = planned_workspaces[index];
  if (offset_and_size.second == 0) {
    return nullptr;
  }
  return mem_base_addr_ + offset_and_size.first;
}
} // namespace device
} // namespace mindspore

View File

@ -0,0 +1,50 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H
#include <vector>
#include <string>
#include <map>
#include <utility>
#include <memory>
#include "backend/common/somas/somas.h"
#include "runtime/hardware/device_type.h"
#include "utils/ms_context.h"
namespace mindspore {
namespace device {
// Turns the offsets planned by somas into addresses relative to a caller-supplied base address.
class CommonSomasAllocator {
 public:
  // Sets the base address that planned offsets are added to.
  void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; }

  // Runs somas on `graph` with the planner selected by the context's device target.
  static bool Assign(const session::KernelGraph &graph);

  // Address planned for output / workspace `index` of `node`; nullptr when its planned aligned size is 0.
  uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
  uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;

 private:
  // Memory base addr
  uint8_t *mem_base_addr_{nullptr};

  // Reads MS_CTX_DEVICE_TARGET from the global MsContext.
  static std::string GetTargetFromContext() {
    auto ms_context = MsContext::GetInstance();
    MS_EXCEPTION_IF_NULL(ms_context);
    std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
    return device_target;
  }
};
using CommonSomasAllocatorPtr = std::shared_ptr<CommonSomasAllocator>;
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H

View File

@ -15,6 +15,7 @@
*/
#include "runtime/device/kernel_info.h"
#include <utility>
namespace mindspore {
namespace device {
@ -108,6 +109,13 @@ bool KernelInfo::SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t
return true;
}
// Takes ownership of the somas plan for this kernel.
// Both arguments are lists of (offset, aligned_size) pairs and are moved into the corresponding members.
// Always returns true.
bool KernelInfo::SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
                                std::vector<std::pair<size_t, size_t>> &&workspace_somas_result) {
  // The two assignments are independent of each other.
  somas_workspace_result_ = std::move(workspace_somas_result);
  somas_output_result_ = std::move(output_somas_result);
  constexpr bool kStored = true;
  return kStored;
}
// Stores `kernel_mod` as the kernel module of this kernel info.
void KernelInfo::set_kernel_mod(const kernel::KernelModPtr &kernel_mod) {
  kernel_mod_ = kernel_mod;
}
// Returns the raw pointer held by kernel_mod_.
kernel::KernelMod *KernelInfo::MutableKernelMod() const {
  kernel::KernelMod *raw_kernel_mod = kernel_mod_.get();
  return raw_kernel_mod;
}

View File

@ -19,6 +19,7 @@
#include <vector>
#include <memory>
#include <utility>
#include "ir/kernel_info_dev.h"
#include "kernel/kernel_build_info.h"
#include "kernel/kernel.h"
@ -57,6 +58,8 @@ class KernelInfo : public KernelInfoDevice {
DeviceAddressPtr GetMutableWorkspaceAddr(size_t index) const;
bool WorkspaceAddrExist(size_t index) const;
bool SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t index);
bool SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
std::vector<std::pair<size_t, size_t>> &&workspace_somas_result);
void set_kernel_mod(const kernel::KernelModPtr &kernel_mod);
kernel::KernelMod *MutableKernelMod() const;
const kernel::KernelMod *kernel_mod() const;
@ -70,6 +73,12 @@ class KernelInfo : public KernelInfoDevice {
// Returns the stored graph id (graph_id_).
uint32_t graph_id() const { return graph_id_; }
bool operator==(const KernelInfo &other) const;
// Returns the stored is_feature_map_ flag.
bool is_feature_map() const { return is_feature_map_; }
// Returns the somas result for this kernel's outputs as a list of (offset, aligned_size) pairs.
// An aligned_size of 0 means no memory was allocated for that entry.
const std::vector<std::pair<size_t, size_t>> &somas_output_offset_aligned_size_list() const {
return somas_output_result_;
}
// Returns the somas result for this kernel's workspaces as a list of (offset, aligned_size) pairs.
// An aligned_size of 0 means no memory was allocated for that entry.
const std::vector<std::pair<size_t, size_t>> &somas_workspace_offset_aligned_size_list() const {
return somas_workspace_result_;
}
// Returns a read-only reference to the list of output device addresses.
const std::vector<std::shared_ptr<DeviceAddress>> &output_address_list() const { return output_address_list_; }
// Returns a read-only reference to the list of workspace device addresses.
const std::vector<std::shared_ptr<DeviceAddress>> &workspace_address_list() const { return workspace_address_list_; }
@ -83,6 +92,12 @@ class KernelInfo : public KernelInfoDevice {
kernel::KernelBuildInfoPtr select_kernel_build_info_;
std::vector<std::shared_ptr<DeviceAddress>> output_address_list_;
std::vector<std::shared_ptr<DeviceAddress>> workspace_address_list_;
// pair<size_t, size_t> : (offset, aligned_size)
// aligned_size of 0 means no memory allocation
std::vector<std::pair<size_t, size_t>> somas_output_result_;
// pair<size_t, size_t> : (offset, aligned_size)
// aligned_size of 0 means no memory allocation
std::vector<std::pair<size_t, size_t>> somas_workspace_result_;
kernel::KernelModPtr kernel_mod_;
// stream_id_ is the index of stream object vector
uint32_t stream_id_;

View File

@ -985,7 +985,12 @@ void KernelRuntime::AssignNodeOutputMem(MemType type, const AnfNodePtr &node, in
auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type, {node, i});
MS_EXCEPTION_IF_NULL(device_address);
uint8_t *ptr = mem_manager_->MallocOutputMem(node, i, type, output_sizes[i], device_address, false);
if (ptr == nullptr && type == kSomasReuseDynamicMem) {
MS_LOG(INFO) << "node: " << node->fullname_with_scope() << " could be a RefNode, please check it"
<< " output index: " << i << " memory type: " << type;
} else {
MS_EXCEPTION_IF_NULL(ptr);
}
device_address->set_host_shape(trans::GetRuntimePaddingShape(node, i));
AnfAlgo::SetOutputAddr(device_address, i, node.get());
}

View File

@ -18,10 +18,6 @@
#include <string>
#include "backend/common/session/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "include/common/debug/common.h"
#ifdef ENABLE_DUMP_IR
#include "debug/rdr/string_recorder.h"
#endif
#include "utils/ms_context.h"
namespace mindspore {
@ -37,41 +33,21 @@ size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) {
}
// Runs SOMAS memory planning for `graph` and, when the plan is non-empty, backs it with one
// dynamic allocation whose base address is handed to the planner/allocator object.
// Raises via MS_LOG(EXCEPTION) when planning reports failure.
// NOTE(review): this span is taken from a rendered diff, so statements that use
// somas_reuse_util_ptr (presumably the removed side) appear next to their somas_allocator_ptr
// counterparts — confirm against the commit before treating it as compilable code.
void MemoryManager::MallocSomasDynamicMem(const session::KernelGraph &graph) {
SomasPtr somas_reuse_util_ptr = std::make_shared<somas::Somas>();
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr);
somas_reuse_util_ptr_ = somas_reuse_util_ptr;
// Create the allocator and keep it in the member: MallocOutputMem / MallocWorkSpaceMem read
// node addresses back through somas_allocator_ptr_.
SomasAllocatorPtr somas_allocator_ptr = std::make_shared<device::CommonSomasAllocator>();
MS_EXCEPTION_IF_NULL(somas_allocator_ptr);
somas_allocator_ptr_ = somas_allocator_ptr;
if (!(somas_reuse_util_ptr->Allocate(&graph))) {
if (!(somas_allocator_ptr->Assign(graph))) {
MS_LOG(EXCEPTION) << "Somas Allocate Failed.";
}
size_t total_allocated_size = somas_reuse_util_ptr->GetTotalMemSize();
// The total is read from the graph's SomasInfo; 0 indicates that somas did not allocate
// memory for this graph.
size_t total_allocated_size = graph.somas_whole_block_size();
MS_LOG(INFO) << "Graph " << graph.graph_id() << ": TotalSomasReuseDynamicSize [" << total_allocated_size << "]";
// Only request memory when the plan has a non-zero size.
if (total_allocated_size > 0) {
auto base_ptr = MallocDynamicMem(total_allocated_size, false);
MS_LOG(INFO) << "Somas Reuse Memory Base Address [" << static_cast<void *>(base_ptr) << "], End Address ["
<< static_cast<void *>(base_ptr + total_allocated_size) << "]";
somas_reuse_util_ptr->set_mem_base_addr(base_ptr);
}
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
// Records the SOMAS allocation and memory summaries as strings when ENABLE_DUMP_IR is defined.
#ifdef ENABLE_DUMP_IR
SubModuleId module = SubModuleId::SM_OPTIMIZER;
std::string name = "somas_allocate_info." + std::to_string(graph.graph_id());
(void)mindspore::RDR::RecordString(module, name, somas_reuse_util_ptr_->SomasInfo());
name = "somas_mem_info." + std::to_string(graph.graph_id());
(void)mindspore::RDR::RecordString(module, name, somas_reuse_util_ptr_->SomasMemory());
#endif
// When MS_CTX_SAVE_GRAPHS_FLAG is set, the same information is dumped to per-graph .ir files.
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
if (save_graphs) {
std::string file_path = GetSaveGraphsPathName("somas_allocate_info_" + std::to_string(graph.graph_id()) + ".ir");
somas_reuse_util_ptr_->DumpSomasInfoIR(file_path);
std::string mem_file_path = GetSaveGraphsPathName("somas_mem_info_" + std::to_string(graph.graph_id()) + ".ir");
somas_reuse_util_ptr_->DumpSomasMemoryIR(mem_file_path);
// Hand the base address of the freshly allocated block to the allocator.
somas_allocator_ptr->set_mem_base_addr(base_ptr);
}
}
@ -94,8 +70,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
address->communication_ptr_ = ptr - kMemAlignSize;
}
} else if (type == kSomasReuseDynamicMem) {
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index);
MS_EXCEPTION_IF_NULL(somas_allocator_ptr_);
ptr = somas_allocator_ptr_->GetNodeOutputPtr(node, index);
} else {
ptr = MallocDynamicMem(size, communication_mem);
}
@ -109,8 +85,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
} else if (type == kDynamicMem) {
ptr = MallocDynamicMem(size, false);
} else if (type == kSomasReuseDynamicMem) {
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index);
MS_EXCEPTION_IF_NULL(somas_allocator_ptr_);
ptr = somas_allocator_ptr_->GetNodeOutputPtr(node, index);
}
address->ptr_ = ptr;
return ptr;
@ -118,8 +94,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
// Returns the workspace address for workspace `index` of `node`.
// For kSomasReuseDynamicMem the address is looked up from the SOMAS object (which must already
// be set, otherwise MS_EXCEPTION_IF_NULL raises); every other type falls back to a fresh
// non-communication dynamic allocation of `size`.
// NOTE(review): this span is taken from a rendered diff; the somas_reuse_util_ptr_ pair and the
// somas_allocator_ptr_ pair are presumably the old and new versions of the same two
// statements — confirm against the commit.
uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, MemType type, size_t size) {
if (type == kSomasReuseDynamicMem) {
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
return somas_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index);
MS_EXCEPTION_IF_NULL(somas_allocator_ptr_);
return somas_allocator_ptr_->GetNodeWorkSpacePtr(node, index);
}
return MallocDynamicMem(size, false);
}

View File

@ -22,14 +22,15 @@
#include <map>
#include <queue>
#include "common/mem_reuse/mem_reuse.h"
#include "backend/common/somas/somas.h"
#include "runtime/device/common_somas_allocator.h"
namespace mindspore {
namespace device {
// Memory kind requested from MemoryManager. In MallocOutputMem / MallocWorkSpaceMem,
// kSomasReuseDynamicMem addresses are fetched from the SOMAS object instead of being allocated.
enum MemType { kStaticMem, kDynamicMem, kSomasReuseDynamicMem };
// NOTE(review): presumably a sentinel index meaning "all outputs of a node" — confirm at the call sites.
constexpr int kGetAllOuts = -1;
// Alignment unit (512); MallocOutputMem places communication_ptr_ this far before the returned ptr.
constexpr uint64_t kMemAlignSize = 512;
// Two alignment units (1024).
constexpr uint64_t kTwiceMemAlignSize = kMemAlignSize << 1;
// Shared-pointer aliases for the SOMAS planner and the common SOMAS allocator.
using SomasPtr = mindspore::somas::SomasPtr;
using SomasAllocatorPtr = mindspore::device::CommonSomasAllocatorPtr;
class MemoryManager {
public:
@ -80,7 +81,7 @@ class MemoryManager {
return MallocStaticMem(size, communication_mem, kInvalidGraphId);
}
virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem);
SomasPtr somas_reuse_util_ptr_{nullptr};
SomasAllocatorPtr somas_allocator_ptr_{nullptr};
};
} // namespace device
} // namespace mindspore

View File

@ -81,6 +81,10 @@ void MemoryManagerActor::AllocateContinuousMemory(const std::vector<std::vector<
auto &size_list = (*size_list_list)[i];
auto &device_context = (*device_contexts)[i];
MS_EXCEPTION_IF_NULL(device_context);
// if the address of continuous tensor has already been allocated, skip the tensor
if (alloc_list[0]->GetPtr() != nullptr) {
continue;
}
// Allocate memory through the device context.
device::DynamicMemAllocatorDebugInfo::SetDebugInfo(from_aid.Name(), device::AllocatorType::kKernelOutput);
auto dev_ptr_list = device_context->device_res_manager_->AllocateContinuousMemory(size_list);

View File

@ -102,6 +102,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
set_param<bool>(MS_CTX_ENABLE_RECOVERY, false);
set_param<bool>(MS_CTX_ENABLE_GE_HETEROGENOUS, false);
set_param<bool>(MS_CTX_DISABLE_FORMAT_TRANSFORM, false);
set_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO0);
uint32_t kDefaultRuntimeNumThreads = 30;
uint32_t cpu_core_num = std::thread::hardware_concurrency() - 1;

View File

@ -55,6 +55,8 @@ const char kGpuInferenceDevice[] = "GpuInference";
const char kDavinciDevice[] = "Davinci";
const char KNpuLog[] = "_npu_log";
const unsigned int MAX_CALL_DEPTH_DEFAULT = 1000;
// Values stored under MS_CTX_MEMORY_OPTIMIZE_LEVEL. kOptimizeO0 is the default installed by the
// MsContext constructor.
const int kOptimizeO0 = 0;
const int kOptimizeO1 = 1;
const std::set<std::string> kTargetSet = {kCPUDevice, kGPUDevice, kAscendDevice, kDavinciDevice};
// The default max available device memory is 1024GB.
@ -98,6 +100,7 @@ enum MsCtxParam : unsigned {
// parameter of type int
MS_CTX_TYPE_INT_BEGIN = MS_CTX_TYPE_BOOL_END,
MS_CTX_EXECUTION_MODE = MS_CTX_TYPE_INT_BEGIN,
MS_CTX_MEMORY_OPTIMIZE_LEVEL,
MS_CTX_TYPE_INT_END,
// parameter of type uint32

View File

@ -98,7 +98,6 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
${CCSRC_DIR}/backend/common/somas/somas_solver_alg.cc
${CCSRC_DIR}/backend/graph_compiler/graph_partition.cc
${CMAKE_CURRENT_SOURCE_DIR}/mock/segment_runner.cc
${CCSRC_DIR}/runtime/device/auto_mem_offload.cc
${CCSRC_DIR}/runtime/device/ms_device_shape_transfer.cc
${CCSRC_DIR}/runtime/device/kernel_info.cc
${CCSRC_DIR}/runtime/device/convert_tensor_utils.cc
@ -109,6 +108,7 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
${CCSRC_DIR}/runtime/device/memory_offload_strategy.cc
${CCSRC_DIR}/runtime/device/memory_manager.cc
${CCSRC_DIR}/runtime/device/auto_mem_offload.cc
${CCSRC_DIR}/runtime/device/common_somas_allocator.cc
${CCSRC_DIR}/runtime/pynative/op_executor.cc
${CCSRC_DIR}/runtime/pynative/op_runtime_info.cc
${CCSRC_DIR}/runtime/hardware/device_type.cc
@ -117,6 +117,8 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
${CCSRC_DIR}/kernel/kernel.cc
${CCSRC_DIR}/kernel/kash/kernel_pack.cc
${CCSRC_DIR}/kernel/oplib/oplib.cc
${CCSRC_DIR}/common/debug/anf_dump_utils.cc
${CCSRC_DIR}/common/debug/anf_ir_dump.cc
${CCSRC_DIR}/common/debug/common.cc
${CCSRC_DIR}/common/debug/env_config_parser.cc
${CCSRC_DIR}/common/thread_pool.cc

View File

@ -197,6 +197,22 @@ class _Context:
f"or context.PYNATIVE_MODE (1), but got {mode}.")
self.set_param(ms_ctx_param.mode, mode)
def set_memory_optimize_level(self, memory_optimize_level):
    """
    Set the memory optimize level of the context.

    Args:
        memory_optimize_level (str): The memory optimize level, "O0" or "O1".
            "O0" is stored in the context as the integer 0 and "O1" as the integer 1.

    Raises:
        ValueError: If `memory_optimize_level` is neither "O0" nor "O1".
    """
    level_values = {"O0": 0, "O1": 1}
    # Membership is tested against a list (not the dict) so that unhashable arguments still
    # end up in the ValueError below instead of raising TypeError.
    memory_optimize_levels = list(level_values)
    if memory_optimize_level not in memory_optimize_levels:
        raise ValueError(f"For 'context.set_context', the argument 'memory_optimize_level' must be one of "
                         f"{memory_optimize_levels}, but got {memory_optimize_level}.")
    self.set_param(ms_ctx_param.memory_optimize_level, level_values[memory_optimize_level])
def set_backend_policy(self, policy):
success = self._context_handle.set_backend_policy(policy)
if not success:
@ -353,7 +369,8 @@ class _Context:
'mempool_block_size': set_mempool_block_size,
'print_file_path': set_print_file_path,
'env_config_path': set_env_config_path,
'runtime_num_threads': set_runtime_num_threads
'runtime_num_threads': set_runtime_num_threads,
'memory_optimize_level': set_memory_optimize_level
}
@property

View File

@ -87,3 +87,30 @@ def test_trainTensor(num_classes=10, epoch=15, batch_size=32):
loss = train_network(data, label).asnumpy()
losses.append(loss)
assert losses[-1] < 0.01
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_train_tensor_memory_opt(num_classes=10, epoch=15, batch_size=32):
    """
    Feature: Somas GPU kernel by kernel.
    Description: AlexNet with Somas GPU kernel by kernel.
    Expectation: No exception.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    try:
        net = AlexNet(num_classes)
        lr = 0.1
        momentum = 0.9
        trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
        optimizer = Momentum(trainable_params, lr, momentum, weight_decay=0.0001)
        criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        net_with_criterion = WithLossCell(net, criterion)
        train_network = TrainOneStepCell(net_with_criterion, optimizer)
        train_network.set_train()
        losses = []
        for _ in range(epoch):
            data = Tensor(np.ones([batch_size, 3, 227, 227]).astype(np.float32) * 0.01)
            label = Tensor(np.ones([batch_size]).astype(np.int32))
            loss = train_network(data, label).asnumpy()
            losses.append(loss)
        assert losses[-1] < 0.01
    finally:
        # The context is process-wide: put the level back to its default ('O0') so that tests
        # running after this one are not silently executed with 'O1'.
        context.set_context(memory_optimize_level='O0')

View File

@ -150,6 +150,35 @@ def test_train_lenet():
assert losses[-1] < 0.01
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_train_lenet_memory_opt():
    """
    Feature: Somas GPU kernel by kernel.
    Description: LeNet with Somas GPU kernel by kernel.
    Expectation: No exception.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    try:
        epoch = 100
        net = LeNet()
        momentum = 0.9
        learning_rate = multisteplr(epoch, 30)
        trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
        optimizer = Momentum(trainable_params, learning_rate, momentum)
        criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        net_with_criterion = WithLossCell(net, criterion)
        train_network = TrainOneStepCell(net_with_criterion, optimizer)
        train_network.set_train()
        losses = []
        for _ in range(epoch):
            data = Tensor(np.ones([net.batch_size, 3, 32, 32]).astype(np.float32) * 0.01)
            label = Tensor(np.ones([net.batch_size]).astype(np.int32))
            loss = train_network(data, label).asnumpy()
            losses.append(loss)
        assert losses[-1] < 0.01
    finally:
        # The context is process-wide: put the level back to its default ('O0') so that tests
        # running after this one are not silently executed with 'O1'.
        context.set_context(memory_optimize_level='O0')
def create_dataset(data_path, batch_size=32, repeat_size=1,
num_parallel_workers=1):
"""

View File

@ -142,3 +142,48 @@ def test_LSTM():
losses.append(loss)
print("loss:", loss.asnumpy())
assert (losses[-1].asnumpy() < 0.01)
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lstm_memory_opt():
    """
    Feature: Somas GPU kernel by kernel.
    Description: LSTM with Somas GPU kernel by kernel.
    Expectation: No exception.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    try:
        num_epochs = 5
        embed_size = 100
        num_hiddens = 100
        num_layers = 2
        bidirectional = True
        labels = 2
        vocab_size = 252193
        max_len = 500

        weight = np.ones((vocab_size + 1, embed_size)).astype(np.float32)

        # batch_size is the module-level value shared with the other tests of this file.
        net = SentimentNet(vocab_size=(vocab_size + 1), embed_size=embed_size,
                           num_hiddens=num_hiddens, num_layers=num_layers,
                           bidirectional=bidirectional, weight=weight,
                           labels=labels, batch_size=batch_size)

        learning_rate = 0.1
        momentum = 0.9

        trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
        optimizer = Momentum(trainable_params, learning_rate, momentum)
        criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        net_with_criterion = WithLossCell(net, criterion)
        train_network = TrainOneStepCell(net_with_criterion, optimizer)
        train_network.set_train()

        train_features = Tensor(np.ones([64, max_len]).astype(np.int32))
        train_labels = Tensor(np.ones([64]).astype(np.int32))
        losses = []
        for _ in range(num_epochs):
            loss = train_network(train_features, train_labels)
            losses.append(loss)
            print("loss:", loss.asnumpy())
        assert losses[-1].asnumpy() < 0.01
    finally:
        # The context is process-wide: put the level back to its default ('O0') so that tests
        # running after this one are not silently executed with 'O1'.
        context.set_context(memory_optimize_level='O0')

View File

@ -352,6 +352,36 @@ def test_trainTensor(num_classes=10, epoch=8, batch_size=1):
assert (losses[-1].asnumpy() < 1)
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_train_tensor_memory_opt(num_classes=10, epoch=8, batch_size=1):
    """
    Feature: Somas GPU kernel by kernel.
    Description: ResNet with Somas GPU kernel by kernel.
    Expectation: No exception.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    try:
        net = resnet50(num_classes)
        lr = 0.1
        momentum = 0.9
        trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
        optimizer = Momentum(trainable_params, lr, momentum)
        criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        net_with_criterion = WithLossCell(net, criterion)
        train_network = TrainOneStepCell(net_with_criterion, optimizer)
        train_network.set_train()
        losses = []
        for _ in range(epoch):
            data = Tensor(np.ones([batch_size, 3, 224, 224]).astype(np.float32) * 0.01)
            label = Tensor(np.ones([batch_size]).astype(np.int32))
            loss = train_network(data, label)
            losses.append(loss)
        assert losses[-1].asnumpy() < 1
    finally:
        # The context is process-wide: put the level back to its default ('O0') so that tests
        # running after this one (this file has more below) are not silently executed with 'O1'.
        context.set_context(memory_optimize_level='O0')
@pytest.mark.level2
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard