add gpu somas
This commit is contained in:
parent
cd63f6283c
commit
52534d1751
|
@ -50,6 +50,13 @@ struct KernelWithIndexCmp {
|
|||
}
|
||||
};
|
||||
|
||||
// Per-graph SOMAS result record; KernelGraph holds one through its somas_info_ member.
struct SomasInfo {
|
||||
// Total size of the SOMAS memory block. A value of 0 means SOMAS did not allocate memory for this graph.
|
||||
size_t whole_block_size_{0};
|
||||
// Merged memory blocks: key is the block offset, value is the block's aligned size.
|
||||
std::map<size_t, size_t> merged_blocks_map_;
|
||||
};
|
||||
|
||||
using DeviceType = device::DeviceType;
|
||||
using KernelMapTensor = std::map<session::KernelWithIndex, BaseRef, session::KernelWithIndexCmp>;
|
||||
|
||||
|
@ -57,6 +64,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
|
|||
public:
|
||||
KernelGraph()
|
||||
: inputs_(std::make_shared<std::vector<AnfNodePtr>>()),
|
||||
somas_info_(std::make_shared<SomasInfo>()),
|
||||
graph_id_(0),
|
||||
stream_distinction_label_(kInvalidDistincLabel),
|
||||
device_target_(DeviceType::kUnknown),
|
||||
|
@ -69,6 +77,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
|
|||
|
||||
KernelGraph(const KernelGraph &graph) : FuncGraph(graph) {
|
||||
inputs_ = graph.inputs_;
|
||||
somas_info_ = graph.somas_info_;
|
||||
child_graph_result_ = graph.child_graph_result_;
|
||||
execution_order_ = graph.execution_order_;
|
||||
mem_reuse_exec_order_ = graph.mem_reuse_exec_order_;
|
||||
|
@ -452,6 +461,11 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
|
|||
// Returns true when `id` has been recorded in comm_sub_graph_ids_.
bool IsCommSubGraph(uint32_t id) const { return comm_sub_graph_ids_.find(id) != comm_sub_graph_ids_.end(); }
|
||||
// Records `id` in comm_sub_graph_ids_ so that IsCommSubGraph(id) subsequently returns true.
void RecordNewCommSubGraphId(uint32_t id) { comm_sub_graph_ids_.insert(id); }
|
||||
|
||||
// SOMAS allocation info: total block size and merged blocks
|
||||
// Returns a non-owning, mutable pointer to the SomasInfo held by somas_info_; ownership stays with the shared_ptr.
// Note that the method is const yet hands out a non-const pointer, so callers can modify the info of a const graph.
SomasInfo *MutableSomasInfo() const { return somas_info_.get(); }
|
||||
// Returns somas_info_->whole_block_size_. Dereferences somas_info_ without a null check.
size_t somas_whole_block_size() const { return somas_info_->whole_block_size_; }
|
||||
// Returns a const reference to somas_info_->merged_blocks_map_. Dereferences somas_info_ without a null check.
const std::map<size_t, size_t> &somas_merged_blocks_map() const { return somas_info_->merged_blocks_map_; }
|
||||
|
||||
private:
|
||||
// remove value node from graph
|
||||
bool RemoveValueNodeFromGraph(const ValueNodePtr &value_node);
|
||||
|
@ -477,6 +491,7 @@ class BACKEND_EXPORT KernelGraph : public FuncGraph {
|
|||
|
||||
// members
|
||||
std::shared_ptr<std::vector<AnfNodePtr>> inputs_;
|
||||
std::shared_ptr<SomasInfo> somas_info_;
|
||||
std::vector<AnfNodePtr> child_graph_result_;
|
||||
std::vector<CNodePtr> execution_order_;
|
||||
std::vector<CNodePtr> mem_reuse_exec_order_;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2022 Huawei Technologies Co., Ltd
|
||||
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -22,6 +22,7 @@
|
|||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <stack>
|
||||
|
||||
#include "utils/hash_map.h"
|
||||
#include "utils/hash_set.h"
|
||||
|
@ -33,9 +34,15 @@
|
|||
#include "backend/common/session/anf_runtime_algorithm.h"
|
||||
#include "include/common/utils/anfalgo.h"
|
||||
#include "backend/common/session/kernel_graph.h"
|
||||
#include "runtime/hardware/device_type.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace somas {
|
||||
// Groups two CNodes, send_ and recv_.
// NOTE(review): presumably the send and receive kernels of one stream event — confirm against the users of this type.
struct EventPair {
|
||||
CNodePtr send_;
|
||||
CNodePtr recv_;
|
||||
};
|
||||
|
||||
union DestinationUnion {
|
||||
size_t id;
|
||||
size_t index;
|
||||
|
@ -43,81 +50,86 @@ union DestinationUnion {
|
|||
};
|
||||
|
||||
struct TensorConflictInfo {
|
||||
size_t tensor_id_;
|
||||
size_t src_node_id_;
|
||||
size_t tensor_id;
|
||||
size_t src_node_id;
|
||||
size_t destination_num;
|
||||
DestinationUnion l;
|
||||
DestinationUnion r;
|
||||
TensorConflictInfo(size_t tensor_id, size_t src_node_id)
|
||||
: tensor_id_(tensor_id), src_node_id_(src_node_id), destination_num(0) {}
|
||||
: tensor_id(tensor_id), src_node_id(src_node_id), destination_num(0) {}
|
||||
};
|
||||
|
||||
// A memory range described by its start offset, its size, and the derived end offset (start + size).
struct Block {
|
||||
// Offset at which the block begins.
size_t start_offset_;
|
||||
// Size of the block.
size_t size_;
|
||||
// start_offset_ + size_, computed once in the constructor.
size_t end_offset_;
|
||||
|
||||
// Builds a block from `start` and `size`; end_offset_ is assigned in the body from the already-initialised members.
Block(size_t start, size_t size) : start_offset_(start), size_(size) { end_offset_ = start_offset_ + size_; }
|
||||
};
|
||||
|
||||
void MergeBlocks(std::vector<Block> *block_list, std::stack<Block> *merged_blocks);
|
||||
|
||||
enum class UnReuseType { kUnReuseAll, kUnReuseInput, kUnReuseOutput, kUnReuseWorkspace };
|
||||
class Somas {
|
||||
public:
|
||||
// Constructors/Destructors
|
||||
Somas() = default;
|
||||
Somas(const Somas &) = delete;
|
||||
Somas &operator=(const Somas &) = delete;
|
||||
~Somas() { mem_base_addr_ = nullptr; }
|
||||
|
||||
bool Allocate(const session::KernelGraph *graph);
|
||||
const size_t GetTotalMemSize() const { return mem_offset_; }
|
||||
void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; }
|
||||
uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
|
||||
uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;
|
||||
virtual ~Somas() = default;
|
||||
|
||||
bool Assign(const session::KernelGraph &graph);
|
||||
bool Assign(const KernelGraphPtr &graph_ptr);
|
||||
std::string SomasInfo(bool calc_hash = false) const;
|
||||
std::string SomasMemory() const;
|
||||
void DumpSomasInfoIR(const string filename) const;
|
||||
void DumpSomasMemoryIR(const string &filename) const;
|
||||
|
||||
static bool NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2);
|
||||
#ifndef ENABLE_SECURITY
|
||||
void ConvertToProfilingNode(uint32_t graph_id) const;
|
||||
virtual void ConvertToProfilingNode(uint32_t graph_id) const {}
|
||||
#endif
|
||||
|
||||
private:
|
||||
// device implementation interface
|
||||
virtual bool Initialize() = 0;
|
||||
virtual string GetDeviceName() const = 0;
|
||||
virtual size_t GetAlignSize(size_t original_size) const = 0;
|
||||
virtual size_t GetCommunicationReservedSize() const;
|
||||
|
||||
virtual bool GetEnableCacheFlag(const session::KernelGraph &graph) const;
|
||||
virtual std::vector<vector<uint32_t>> GetStreamGroupInfo(const session::KernelGraph &graph) const;
|
||||
virtual bool GetDependExecOrderFlag(const session::KernelGraph &graph) const = 0;
|
||||
virtual std::pair<bool, std::string> GetDebugConfig() const;
|
||||
|
||||
virtual std::map<std::string, UnReuseType> GetUnReuseNodeType(const session::KernelGraph &graph) const;
|
||||
virtual std::map<std::string, UnReuseType> GetUnReuseNodeName(const session::KernelGraph &graph) const;
|
||||
|
||||
virtual bool InitDevSpecControlTensors(const session::KernelGraph &graph) = 0;
|
||||
virtual bool DevSpecNodeProcess(const session::KernelGraph &graph) = 0;
|
||||
// end
|
||||
|
||||
// SOMAS Configuration
|
||||
std::string device_name_{"SOMAS"};
|
||||
size_t communication_gap_size_{0};
|
||||
|
||||
size_t depend_exec_order_{false};
|
||||
bool enable_cache_{false};
|
||||
bool save_debug_info_{false};
|
||||
std::string debug_info_path_;
|
||||
|
||||
std::map<std::string, UnReuseType> un_reuse_node_type_;
|
||||
std::map<std::string, UnReuseType> un_reuse_node_name_;
|
||||
// end
|
||||
|
||||
std::vector<DynamicBitSet> reuse_matrix_;
|
||||
// hash id
|
||||
std::string hash_id_;
|
||||
// Maps
|
||||
mindspore::HashMap<size_t, SomasTensorPtr> tensors_map_;
|
||||
mindspore::HashMap<void *, std::vector<SomasNodePtr>> nodes_map_;
|
||||
mindspore::HashMap<void *, vector<SomasParameterPtr>> parameters_map_;
|
||||
mindspore::HashMap<size_t, SomasNodePtr> nodes_id_map_;
|
||||
|
||||
// Vectors
|
||||
std::vector<SomasNodePtr> nodes_list_;
|
||||
std::vector<SomasStreamPtr> streams_list_;
|
||||
std::vector<SomasTensorPtr> tensors_list_;
|
||||
std::vector<SomasParameterPtr> parameters_list_;
|
||||
|
||||
// Stream groups
|
||||
std::vector<vector<uint32_t>> streams_groups_;
|
||||
|
||||
// event info map
|
||||
std::map<size_t, std::pair<CNodePtr, CNodePtr>> event_map_;
|
||||
|
||||
// Solver
|
||||
TensorsDescMap solver_tensor_desc_map_;
|
||||
SomasSolverPrePtr somas_solver_;
|
||||
|
||||
// Contiguous list
|
||||
std::vector<vector<size_t>> contiguous_tensors_list_;
|
||||
|
||||
// Ref lists
|
||||
std::vector<vector<size_t>> ref_node_constraints_;
|
||||
std::vector<vector<size_t>> ref_overlap_constraints_;
|
||||
|
||||
// total Offset
|
||||
size_t mem_offset_{0};
|
||||
|
||||
// Memory base addr
|
||||
uint8_t *mem_base_addr_{nullptr};
|
||||
|
||||
// Save debug info
|
||||
bool save_graphs_{false};
|
||||
std::string save_graphs_path_;
|
||||
|
||||
// statistic info
|
||||
size_t upper_bound_{0};
|
||||
size_t lower_bound_{0};
|
||||
|
@ -128,74 +140,147 @@ class Somas {
|
|||
size_t lifelong_start_total_size_{0};
|
||||
size_t lifelong_end_total_size_{0};
|
||||
|
||||
bool InitSomasTensors(const session::KernelGraph *graph);
|
||||
void InitBasicInfo(const session::KernelGraph *graph);
|
||||
void InitSomasStreamAndNode(const session::KernelGraph *graph);
|
||||
void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph *graph);
|
||||
void InitSomasInputTensors(const session::KernelGraph *graph);
|
||||
void InitSomasEventInfos();
|
||||
void GetNextOutputProcess(const session::KernelGraph *graph);
|
||||
void IndependentNodeOutputProcess(const session::KernelGraph *graph);
|
||||
#ifndef ENABLE_SECURITY
|
||||
void SummaryInputProcess(const session::KernelGraph *graph);
|
||||
#endif
|
||||
void RefNodeProcess(const session::KernelGraph *graph);
|
||||
void NonTaskSplitProcess(const session::KernelGraph *graph);
|
||||
void UnReuseNodeProcess(const session::KernelGraph *graph);
|
||||
SomasTensorPtr CreateGapTensor(size_t gap_tensor_id);
|
||||
void GenContiguousList(const session::KernelGraph *graph);
|
||||
std::vector<vector<size_t>> processed_contiguous_tensors_list_;
|
||||
// key: contiguous list index with first union tensor; value: contiguous list index with other union tensor
|
||||
std::map<size_t, size_t> contiguous_list_with_ref_index_map_;
|
||||
|
||||
void ComputeConflictPairs();
|
||||
bool ConfigSomas(const session::KernelGraph &graph);
|
||||
|
||||
bool Assign(const session::KernelGraph *graph);
|
||||
|
||||
std::string Offline() const;
|
||||
void DumpOfflineIR(const string filename) const;
|
||||
std::string GetSplitName(const string &scope_name) const;
|
||||
size_t CalcLowerBound() const;
|
||||
void GenGraphStatisticInfo();
|
||||
// somas model
|
||||
bool InitSomasModel(const session::KernelGraph &graph);
|
||||
bool InitBasicInfoFromGraph(const session::KernelGraph &graph);
|
||||
void InitSomasStreamAndNode(const session::KernelGraph &graph);
|
||||
void InitSomasOutputAndWorkspaceTensors(const session::KernelGraph &graph);
|
||||
void InitSomasInputTensors(const session::KernelGraph &graph);
|
||||
void InitCommonNodeInputs(const CNodePtr &kernel);
|
||||
void InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel);
|
||||
SomasParameterPtr GetSomasParameter(const AnfNodePtr &node, size_t index);
|
||||
SomasParameterPtr CreateSomasParameter(const AnfNodePtr &node, size_t index);
|
||||
void InitCommonNodeInputs(bool is_all_nop_node, const CNodePtr &kernel);
|
||||
void InitAtomicCleanInputs(bool enable_fusion_clear, const CNodePtr &kernel);
|
||||
void ComputeOneTensorConflicts(const std::shared_ptr<SomasTensor> &target_tensor,
|
||||
const std::vector<TensorConflictInfo> &tensor_conflict_info_list,
|
||||
const std::vector<size_t> &destination_node_list,
|
||||
const vector<DynamicBitSet> &nodes_dependency,
|
||||
std::vector<DynamicBitSet> *tensor_relation) const;
|
||||
void InitControlTensors(const session::KernelGraph &graph);
|
||||
bool CommonSpecNodeProcess(const session::KernelGraph &graph);
|
||||
SomasStreamPtr GetSomasStream(size_t stream_id) const;
|
||||
#ifndef ENABLE_SECURITY
|
||||
void SummaryInputProcess(const session::KernelGraph &graph);
|
||||
#endif
|
||||
void RefNodeProcess(const session::KernelGraph &graph);
|
||||
void UnReuseNodeProcess(const session::KernelGraph &graph);
|
||||
void CommunicationNodeProcess(const session::KernelGraph &graph);
|
||||
void GetContiguousListContainUnionTensor();
|
||||
std::map<size_t, size_t> GetRefTensorsInContiguousList();
|
||||
common::KernelWithIndex GetVisitKernelWithReturnType(const AnfNodePtr &ori_node, size_t ori_index);
|
||||
|
||||
// conflict matrix
|
||||
static bool NodeSort(const SomasNodePtr &node1, const SomasNodePtr &node2);
|
||||
void ComputeConflictMatrix();
|
||||
void ComputeBasicMatrix();
|
||||
static void ComputeOneTensorConflicts(const std::shared_ptr<SomasTensor> &target_tensor,
|
||||
const std::vector<TensorConflictInfo> &tensor_conflict_info,
|
||||
const std::vector<size_t> &destination_node_list,
|
||||
const vector<DynamicBitSet> &nodes_dependency,
|
||||
std::vector<DynamicBitSet> *tensor_relation);
|
||||
void ComputeMultiTensorConflicts(const std::vector<SomasTensorPtr> &target_tensors_list,
|
||||
const std::vector<TensorConflictInfo> &tensor_conflict_info_list,
|
||||
const std::vector<TensorConflictInfo> &tensor_conflict_info,
|
||||
const std::vector<size_t> &destination_node_list,
|
||||
const vector<DynamicBitSet> &nodes_dependency,
|
||||
std::vector<DynamicBitSet> *tensor_relation) const;
|
||||
void UpdateTensorDestinations();
|
||||
void UpdateRefTensorsConflict();
|
||||
void UpdateRefOverlapTensorsConflicts();
|
||||
void UpdateRefTensorsOffset();
|
||||
void UpdateContiguousTensorsOffset(const std::map<size_t, size_t> &contiguous_ref_list_map);
|
||||
void DumpParameters(std::ostringstream &oss) const;
|
||||
void DumpTensors(std::ostringstream &oss) const;
|
||||
void DumpNodes(std::ostringstream &oss) const;
|
||||
std::map<size_t, size_t> GetContiguousListContainRefTensor();
|
||||
std::map<size_t, size_t> GetRefTensorsInContiguousList();
|
||||
bool SaveSomasResult(const session::KernelGraph *graph);
|
||||
bool VerifySomasResult(const session::KernelGraph *graph, const nlohmann::json &somas_json) const;
|
||||
bool LoadSomasResult(const session::KernelGraph *graph, const string &filename);
|
||||
bool UpdateTensorsOffset(const std::vector<nlohmann::json> &tensors_json);
|
||||
bool CalcSomasModelHash(const session::KernelGraph *graph);
|
||||
void UpdateInputTensor(SomasNodePtr node, SomasNodePtr pre_somas_node, SomasTensorPtr input_somas_tensor) const;
|
||||
bool LoadSomasCache(const session::KernelGraph *graph);
|
||||
SomasStreamPtr GetSomasStream(size_t stream_id) const;
|
||||
SomasNodePtr GetSomasNode(size_t node_id) const;
|
||||
void UpdateUnionTensorsConflict();
|
||||
static void BuildConflictInfo(const std::shared_ptr<SomasTensor> &tensor, TensorConflictInfo *tensor_conflict_info,
|
||||
std::vector<size_t> *destination_node_list);
|
||||
static bool CheckIsDependency(const TensorConflictInfo &tensor_conflict_info, const size_t &src_node_id,
|
||||
const vector<DynamicBitSet> &nodes_dependency,
|
||||
const std::vector<size_t> &destination_node_list);
|
||||
void ProcessSemiLifeLongTensor();
|
||||
|
||||
// solver
|
||||
bool Solve(const session::KernelGraph &graph);
|
||||
void UpdateUnionTensorsOffset();
|
||||
void UpdateContiguousTensorsOffset(const std::map<size_t, size_t> &contiguous_ref_list_map);
|
||||
|
||||
// cache
|
||||
bool SaveSomasResult(const session::KernelGraph &graph);
|
||||
bool VerifySomasResult(const session::KernelGraph &graph, const nlohmann::json &somas_json) const;
|
||||
bool LoadSomasResult(const session::KernelGraph &graph, const string &filename);
|
||||
bool UpdateTensorsOffset(const std::vector<nlohmann::json> &tensors_json);
|
||||
bool CalcSomasModelHash(const session::KernelGraph &graph);
|
||||
bool LoadSomasCache(const session::KernelGraph &graph);
|
||||
|
||||
// log
|
||||
std::string Offline() const;
|
||||
void DumpOfflineIR(const string &filename) const;
|
||||
size_t CalcLowerBound() const;
|
||||
void GenGraphStatisticInfo();
|
||||
void DumpParameters(std::ostringstream &oss) const;
|
||||
void DumpTensors(std::ostringstream &oss) const;
|
||||
void DumpNodes(std::ostringstream &oss) const;
|
||||
void DumpSomasModelInfo(const string &tag, uint32_t graph_id) const;
|
||||
|
||||
// update graph
|
||||
std::vector<std::pair<size_t, size_t>> GetNodeOutputSomasResult(const AnfNodePtr &node) const;
|
||||
std::vector<std::pair<size_t, size_t>> GetNodeWorkSpaceSomasResult(const AnfNodePtr &node) const;
|
||||
bool UpdateSomasResultToGraph(const session::KernelGraph &graph);
|
||||
|
||||
protected:
|
||||
std::vector<SomasParameterPtr> parameters_list_;
|
||||
std::vector<SomasTensorPtr> control_tensors_list_;
|
||||
std::vector<SomasTensorPtr> tensors_list_;
|
||||
std::vector<SomasNodePtr> nodes_list_;
|
||||
|
||||
mindspore::HashMap<size_t, SomasStreamPtr> streams_map_;
|
||||
mindspore::HashMap<void *, vector<SomasParameterPtr>> parameters_map_;
|
||||
mindspore::HashMap<void *, std::vector<SomasNodePtr>> nodes_map_;
|
||||
|
||||
std::vector<vector<size_t>> union_tensors_list_;
|
||||
std::vector<vector<size_t>> contiguous_tensors_list_;
|
||||
|
||||
void AddControlTensor(const SomasNodePtr &from, const SomasNodePtr &to);
|
||||
void AddControlTensorFromExecOrder(const session::KernelGraph &graph);
|
||||
void GraphOutputProcess(const session::KernelGraph &graph);
|
||||
void UpdateContiguousTensorList();
|
||||
SomasNodePtr GetSomasNode(size_t node_id) const;
|
||||
static std::string GetSplitName(const string &scope_name);
|
||||
|
||||
size_t reused_memory_size_{0};
|
||||
std::vector<std::pair<size_t, size_t>> dump_merged_blocks_;
|
||||
};
|
||||
|
||||
using SomasPtr = std::shared_ptr<Somas>;
|
||||
using SomasCreator = std::function<std::shared_ptr<Somas>()>;
|
||||
|
||||
// @todo Delete this when the old runtime is removed.
|
||||
class SomasManager {
|
||||
public:
|
||||
static SomasManager &Instance() {
|
||||
static SomasManager instance{};
|
||||
return instance;
|
||||
}
|
||||
void Register(device::DeviceType device_type, SomasCreator &&creator) {
|
||||
if (base_map_.find(device_type) == base_map_.end()) {
|
||||
(void)base_map_.emplace(device_type, creator);
|
||||
}
|
||||
}
|
||||
SomasPtr GetSomas(device::DeviceType device_type) {
|
||||
auto iter = base_map_.find(device_type);
|
||||
if (base_map_.end() != iter) {
|
||||
MS_EXCEPTION_IF_NULL(iter->second);
|
||||
return (iter->second)();
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
private:
|
||||
std::map<device::DeviceType, SomasCreator> base_map_;
|
||||
};
|
||||
|
||||
// Helper whose construction registers a Somas factory with SomasManager; it holds no state of its own.
class SomasRegister {
|
||||
public:
|
||||
// Forwards `creator` to SomasManager::Instance().Register() for `device_type`.
SomasRegister(device::DeviceType device_type, SomasCreator &&creator) {
|
||||
SomasManager::Instance().Register(device_type, std::move(creator));
|
||||
}
|
||||
~SomasRegister() = default;
|
||||
};
|
||||
|
||||
// Defines a file-static SomasRegister named g_<S>_reg that registers, for device type T, a factory returning
// std::make_shared<C>(). The expansion already ends with ';'.
#define REG_SOMAS(S, T, C) static const somas::SomasRegister g_##S##_reg(T, []() { return std::make_shared<C>(); });
|
||||
} // namespace somas
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_COMMON_SOMAS_SOMAS_H_
|
||||
|
|
|
@ -39,14 +39,14 @@ class SomasNode {
|
|||
|
||||
// node's dependency including data dependency and time dependency
|
||||
std::set<std::shared_ptr<SomasNode>> ancestor_nodes_;
|
||||
std::set<SomasTensorPtr> tensors_;
|
||||
|
||||
// data tensor
|
||||
std::vector<SomasTensorPtr> input_tensors_;
|
||||
std::vector<SomasTensorPtr> output_tensors_;
|
||||
std::vector<SomasTensorPtr> workspace_tensors_;
|
||||
std::map<size_t, SomasParameterPtr> input_parameters_map_;
|
||||
|
||||
mindspore::HashMap<int64_t, size_t> anc_stream_max_order_;
|
||||
// control tensor
|
||||
std::vector<SomasTensorPtr> control_input_tensors_;
|
||||
std::vector<SomasTensorPtr> control_output_tensors_;
|
||||
|
||||
// Constructors/Destructors
|
||||
SomasNode(std::string scope_full_name, size_t id, NodeType type, const size_t &stream_id)
|
||||
|
@ -57,7 +57,7 @@ class SomasNode {
|
|||
|
||||
// Accessors
|
||||
const size_t &GetId() const { return id_; }
|
||||
const size_t GetStreamId() const { return stream_id_; }
|
||||
const size_t &GetStreamId() const { return stream_id_; }
|
||||
const NodeType &GetType() const { return type_; }
|
||||
|
||||
private:
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2022 Huawei Technologies Co., Ltd
|
||||
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -98,7 +98,7 @@ vector<TensorsDescMap> SomasSolverPre::CreateTensorsMaps(const TensorsDescMap &t
|
|||
}
|
||||
return vecTensorsMap;
|
||||
}
|
||||
Status SomasSolverPre::Solving(const session::KernelGraph *graph, TensorsDescMap *ptensors,
|
||||
Status SomasSolverPre::Solving(const session::KernelGraph &graph, TensorsDescMap *ptensors,
|
||||
const std::vector<DynamicBitSet> *pConstraints,
|
||||
const vector<vector<size_t>> &continuous_v, bool bVerifySolution, bool ball,
|
||||
SortingType sorting, FittingType fitting, AlgorithmType algorithm) {
|
||||
|
@ -198,7 +198,7 @@ Status SomasSolverPre::Solving(const session::KernelGraph *graph, TensorsDescMap
|
|||
return ret;
|
||||
}
|
||||
|
||||
void SomasSolverPre::Log(const session::KernelGraph *graph, const TensorsDescMap &tensors,
|
||||
void SomasSolverPre::Log(const session::KernelGraph &graph, const TensorsDescMap &tensors,
|
||||
const std::vector<DynamicBitSet> *pConstraints,
|
||||
const vector<vector<size_t>> &continuous_v) const {
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
|
@ -213,13 +213,13 @@ void SomasSolverPre::Log(const session::KernelGraph *graph, const TensorsDescMap
|
|||
}
|
||||
|
||||
void SomasSolverPre::TensorRelationLog(const std::vector<DynamicBitSet> *pConstraints,
|
||||
const session::KernelGraph *graph) const {
|
||||
const session::KernelGraph &graph) const {
|
||||
MS_LOG(INFO) << "SomasSolver::Log Writing somas_tensor_relation.ir..";
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
|
||||
std::string filename =
|
||||
GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
|
||||
GetSaveGraphsPathName("somas_tensor_relation_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path);
|
||||
std::ostringstream oss;
|
||||
for (size_t tid1 = 0; tid1 < pConstraints->size(); tid1++) {
|
||||
oss << 't' << tid1 << ' ';
|
||||
|
@ -232,14 +232,14 @@ void SomasSolverPre::TensorRelationLog(const std::vector<DynamicBitSet> *pConstr
|
|||
MS_LOG(INFO) << "SomasSolver somas_tensor_relation Log done";
|
||||
}
|
||||
|
||||
void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors,
|
||||
void SomasSolverPre::SolverInputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors,
|
||||
const vector<vector<size_t>> &continuous_v) const {
|
||||
MS_LOG(INFO) << "SomasSolver::Log Writing somas_solver_input..";
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
|
||||
std::string filename =
|
||||
GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
|
||||
GetSaveGraphsPathName("somas_solver_input_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path);
|
||||
std::ostringstream oss;
|
||||
for (auto &t : tensors) {
|
||||
oss << "T " << t.second->index_ << " " << t.second->size_ << " " << t.second->lifelong_ << std::endl;
|
||||
|
@ -256,13 +256,13 @@ void SomasSolverPre::SolverInputLog(const session::KernelGraph *graph, const Ten
|
|||
MS_LOG(INFO) << "SomasSolver input Log done";
|
||||
}
|
||||
|
||||
void SomasSolverPre::SolverOutputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors) const {
|
||||
void SomasSolverPre::SolverOutputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors) const {
|
||||
MS_LOG(INFO) << "SomasSolver::Log Writing somas_solver_output_..";
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
auto save_graphs_path = context_ptr->get_param<std::string>(MS_CTX_SAVE_GRAPHS_PATH);
|
||||
std::string out_filename =
|
||||
GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph->graph_id()) + ".ir", save_graphs_path);
|
||||
GetSaveGraphsPathName("somas_solver_output_" + std::to_string(graph.graph_id()) + ".ir", save_graphs_path);
|
||||
std::ostringstream oss;
|
||||
constexpr size_t contiguous_left = 1;
|
||||
constexpr size_t contiguous_mid = 2;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2022 Huawei Technologies Co., Ltd
|
||||
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -184,14 +184,14 @@ class SomasSolverPre {
|
|||
|
||||
size_t GetMaxOffset() const { return max_offset_; }
|
||||
|
||||
Status Solving(const session::KernelGraph *graph, TensorsDescMap *ptensors,
|
||||
Status Solving(const session::KernelGraph &graph, TensorsDescMap *ptensors,
|
||||
const std::vector<DynamicBitSet> *pConstraints, const vector<vector<size_t>> &continuous_v,
|
||||
bool bVerifySolution, // true -> Check continuous and non overlapping constraints solution
|
||||
bool ball = true, // true -> run full set of heuristics, false -> run single heuristic specified
|
||||
SortingType sorting = kGreaterSizeSmallerIndex, FittingType fitting = kBest,
|
||||
AlgorithmType algorithm = kManyObjects);
|
||||
|
||||
void Log(const session::KernelGraph *graph, const TensorsDescMap &tensors,
|
||||
void Log(const session::KernelGraph &graph, const TensorsDescMap &tensors,
|
||||
const std::vector<DynamicBitSet> *pConstraints, const vector<vector<size_t>> &continuous_v) const;
|
||||
|
||||
Status CheckTensors(const TensorsDescMap *pTensors, uint32_t index1, uint32_t index2) const;
|
||||
|
@ -201,11 +201,11 @@ class SomasSolverPre {
|
|||
|
||||
private:
|
||||
size_t max_offset_;
|
||||
void SolverInputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors,
|
||||
void SolverInputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors,
|
||||
const vector<vector<size_t>> &continuous_v) const;
|
||||
void SolverOutputLog(const session::KernelGraph *graph, const TensorsDescMap &tensors) const;
|
||||
void SolverOutputLog(const session::KernelGraph &graph, const TensorsDescMap &tensors) const;
|
||||
vector<TensorsDescMap> CreateTensorsMaps(const TensorsDescMap &tensors, size_t total_sol) const;
|
||||
void TensorRelationLog(const std::vector<DynamicBitSet> *pConstraints, const session::KernelGraph *graph) const;
|
||||
void TensorRelationLog(const std::vector<DynamicBitSet> *pConstraints, const session::KernelGraph &graph) const;
|
||||
};
|
||||
using SomasSolverPrePtr = std::shared_ptr<SomasSolverPre>;
|
||||
} // namespace somas
|
||||
|
|
|
@ -31,7 +31,7 @@ class SomasStream {
|
|||
std::vector<SomasNodePtr> nodes_;
|
||||
|
||||
// Constructors/Destructors
|
||||
explicit SomasStream(int64_t id) : id_(id) {}
|
||||
explicit SomasStream(size_t id) : id_(id) {}
|
||||
SomasStream(const SomasStream &) = delete;
|
||||
SomasStream &operator=(const SomasStream &) = delete;
|
||||
~SomasStream() = default;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2022 Huawei Technologies Co., Ltd
|
||||
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -15,25 +15,35 @@
|
|||
*/
|
||||
|
||||
#include "backend/common/somas/somas_tensor.h"
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
namespace mindspore {
|
||||
namespace somas {
|
||||
SomasTensor::SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t real_size,
|
||||
LifeLongType lifelong_value)
|
||||
: lifelong_value_(lifelong_value),
|
||||
between_streams_(false),
|
||||
// Display name for each TensorType enumerator; looked up by SomasTensor::GetTypeString().
std::map<somas::TensorType, std::string> tensor_type_name_map = {
|
||||
{kCommon, "Common"}, {kWorkspace, "Workspace"},
|
||||
{kOutputOnly, "OutputOnly"}, {kGraphOutput, "GraphOutput"},
|
||||
{kGraphInput, "GraphInput"}, {kSummaryInput, "SummaryInput"},
|
||||
{kUnion, "Union"}, {kControl, "Control"},
|
||||
{kUnknown, "Unknown"}};
|
||||
|
||||
// Display name for each LifeLongType enumerator; looked up by SomasTensor::GetLifelongString().
std::map<LifeLongType, std::string> life_long_name_map = {{kLifeLongNone, "LifeLongNone"},
|
||||
{kLifeLongGraphAll, "LifeLongGraphAll"},
|
||||
{kLifeLongGraphStart, "LifeLongGraphStart"},
|
||||
{kLifeLongGraphEnd, "LifeLongGraphEnd"}};
|
||||
|
||||
SomasTensor::SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t ori_size,
|
||||
size_t aligned_size, LifeLongType lifelong_value)
|
||||
: aligned_size_(aligned_size),
|
||||
lifelong_value_(lifelong_value),
|
||||
contiguous_(false),
|
||||
type_(kUnknown),
|
||||
offset_(0),
|
||||
num_constraints_(0),
|
||||
ref_overlap_(false),
|
||||
id_(id),
|
||||
source_node_id_(source_node_id),
|
||||
source_stream_id_(source_stream_id),
|
||||
original_size_(real_size) {
|
||||
const size_t alignment = 512;
|
||||
const size_t alignment_complement = 31;
|
||||
aligned_size_ = (real_size > 0) ? ((real_size + alignment + alignment_complement) / alignment) * alignment : 0;
|
||||
original_size_(ori_size) {
|
||||
solver_tensor_desc_ = std::make_shared<SomasSolverTensorDesc>(id_, aligned_size_, offset_, false);
|
||||
}
|
||||
|
||||
|
@ -49,5 +59,9 @@ SomasSolverTensorDescPtr SomasTensor::GetSolverTensorDesc() {
|
|||
return solver_tensor_desc_;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the display name of this tensor's type_, or an empty string when type_ has no entry in
// tensor_type_name_map.
std::string SomasTensor::GetTypeString() {
  // find() is used instead of operator[]: operator[] would insert an empty entry into the shared
  // map for an unmapped type, mutating global state from what is logically a read.
  const auto iter = tensor_type_name_map.find(type_);
  return iter == tensor_type_name_map.end() ? std::string() : iter->second;
}
|
||||
|
||||
// Returns the display name of this tensor's lifelong_value_, or an empty string when the value has
// no entry in life_long_name_map.
std::string SomasTensor::GetLifelongString() {
  // find() is used instead of operator[]: operator[] would insert an empty entry into the shared
  // map for an unmapped value, mutating global state from what is logically a read.
  const auto iter = life_long_name_map.find(lifelong_value_);
  return iter == life_long_name_map.end() ? std::string() : iter->second;
}
|
||||
} // namespace somas
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2022 Huawei Technologies Co., Ltd
|
||||
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -20,7 +20,7 @@
|
|||
#include <memory>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#include <string>
|
||||
#include "utils/hash_map.h"
|
||||
#include "backend/common/somas/somas_solver_pre.h"
|
||||
|
||||
|
@ -38,21 +38,21 @@ using lifetime_t = struct Lifetime;
|
|||
// Tensor type
|
||||
enum TensorType {
|
||||
kCommon,
|
||||
kOutputOnly,
|
||||
kWorkspace,
|
||||
kGetNextOutput,
|
||||
kOutputOnly,
|
||||
kGraphOutput,
|
||||
kGraphInput,
|
||||
kSummaryInput,
|
||||
kRefNodeInput,
|
||||
kRefNodeOutput,
|
||||
kEventVirtualOutput,
|
||||
kUnion,
|
||||
kControl,
|
||||
kUnknown
|
||||
};
|
||||
|
||||
enum LifeLongType {
|
||||
kLifeLongNone, // life time is from tensor start to tensor end
|
||||
kLifeLongGraphAll, // life time is from graph start to graph end
|
||||
kLifeLongGraphStart, // life time is from graph start to tensor end
|
||||
kLifeLongGraphEnd // life time is from tensor start to graph end
|
||||
kLifeLongGraphAll, // life time is from graph start to graph end
|
||||
kLifeLongGraphStart, // life time is from graph start to tensor end
|
||||
kLifeLongGraphEnd // life time is from tensor start to graph end
|
||||
};
|
||||
|
||||
class SomasTensor {
|
||||
|
@ -60,7 +60,6 @@ class SomasTensor {
|
|||
size_t aligned_size_{0};
|
||||
LifeLongType lifelong_value_;
|
||||
|
||||
bool between_streams_;
|
||||
bool contiguous_;
|
||||
|
||||
lifetime_t lifetime_;
|
||||
|
@ -72,7 +71,7 @@ class SomasTensor {
|
|||
vector<size_t> consumer_list_;
|
||||
|
||||
// Constructors/Destructors
|
||||
explicit SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t real_size,
|
||||
explicit SomasTensor(size_t id, size_t source_node_id, size_t source_stream_id, size_t ori_size, size_t aligned_size,
|
||||
LifeLongType lifelong_value = kLifeLongNone);
|
||||
SomasTensor(const SomasTensor &) = delete;
|
||||
SomasTensor &operator=(const SomasTensor &) = delete;
|
||||
|
@ -86,14 +85,12 @@ class SomasTensor {
|
|||
const size_t &GetAlignedSize() const { return aligned_size_; }
|
||||
const size_t &GetNumConstraints() const { return num_constraints_; }
|
||||
bool IsLifelong() const { return lifelong_value_ == kLifeLongGraphAll; }
|
||||
bool IsWorkspace() const { return type_ == kWorkspace; }
|
||||
bool IsOutputOnly() const { return type_ == kOutputOnly; }
|
||||
size_t GetOffset() const { return offset_; }
|
||||
bool IsBetweenStreams() const { return between_streams_; }
|
||||
bool IsSemiLifelongStart() const { return lifelong_value_ == kLifeLongGraphStart; }
|
||||
bool IsSemiLifelongEnd() const { return lifelong_value_ == kLifeLongGraphEnd; }
|
||||
bool IsRefOverlap() const { return ref_overlap_; }
|
||||
|
||||
string GetTypeString();
|
||||
string GetLifelongString();
|
||||
// Computing functions
|
||||
void SetOffset() {
|
||||
if (aligned_size_ != 0) {
|
||||
|
@ -104,7 +101,6 @@ class SomasTensor {
|
|||
size_t num_constraints_{0};
|
||||
|
||||
private:
|
||||
bool ref_overlap_;
|
||||
const size_t id_{0};
|
||||
const size_t source_node_id_;
|
||||
const size_t source_stream_id_;
|
||||
|
|
|
@ -607,8 +607,8 @@ const ActorInfo &MindRTBackend::CompileGraphs(const FuncGraphPtr &func_graph) {
|
|||
device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name_, device_id_});
|
||||
MS_EXCEPTION_IF_NULL(device_context);
|
||||
bool all_support = device_context->PartitionGraph(func_graph);
|
||||
auto run_mode = device_context->GetRunMode(func_graph);
|
||||
if (all_support) {
|
||||
auto run_mode = device_context->GetRunMode(func_graph);
|
||||
if (run_mode == device::RunMode::kGraphMode) {
|
||||
auto graph_id = graph_compiler_->CompileWholeGraphForGraphRunMode(func_graph, device_context);
|
||||
graph_id_to_device_context_[graph_id] = device_context;
|
||||
|
@ -1384,9 +1384,15 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(con
|
|||
|
||||
std::vector<std::vector<int64_t> *> tensors_mask;
|
||||
std::vector<std::vector<tensor::TensorPtr> *> input_tensors;
|
||||
auto strategy = runtime::GraphExecutionStrategy::kPipeline;
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
if (context_ptr->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) {
|
||||
strategy = runtime::GraphExecutionStrategy::kPipelineWithExecutionOrder;
|
||||
}
|
||||
return std::make_unique<GraphCompilerInfo>(graphs, device_contexts, tensors_mask, input_tensors, control_nodes_,
|
||||
root_graph->parameters(), parser, outputs_order, outputs_num, name, false,
|
||||
runtime::GraphExecutionStrategy::kPipeline);
|
||||
strategy);
|
||||
}
|
||||
|
||||
std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(
|
||||
|
|
|
@ -104,16 +104,6 @@ uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_m
|
|||
return communication_mem ? alloc_address + kMemAlignSize : alloc_address;
|
||||
}
|
||||
|
||||
// Delegates the somas allocation to the base MemoryManager and, unless the build defines
// ENABLE_SECURITY, forwards the result to the memory profiler when profiling is initialized.
void AscendMemoryManager::MallocSomasDynamicMem(const session::KernelGraph &graph) {
  MemoryManager::MallocSomasDynamicMem(graph);
#ifndef ENABLE_SECURITY
  if (!MemoryProfiling::GetInstance().IsMemoryProfilingInitialized()) {
    return;
  }
  MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
  somas_reuse_util_ptr_->ConvertToProfilingNode(graph.graph_id());
#endif
}
|
||||
|
||||
// communication memory: [512align_size + data + 512align_size]
|
||||
// return the pointer to the start of data address.
|
||||
uint8_t *AscendMemoryManager::MallocCommunicationMemFromMemPool(size_t size) {
|
||||
|
|
|
@ -36,7 +36,6 @@ class AscendMemoryManager : public MemoryManager {
|
|||
void *MallocMemFromMemPool(size_t size, bool from_persistent_mem) override;
|
||||
void FreeMemFromMemPool(void *device_ptr) override;
|
||||
uint64_t GetMsMaxMemSize() const;
|
||||
void MallocSomasDynamicMem(const session::KernelGraph &graph) override;
|
||||
uint8_t *MallocCommunicationMemFromMemPool(size_t size) override;
|
||||
bool MallocContinuousMemFromMemPool(const DeviceAddressPtrList &addr_list, size_t total_size,
|
||||
std::vector<size_t> size_list) override;
|
||||
|
|
|
@ -0,0 +1,229 @@
|
|||
/**
|
||||
* Copyright 2021-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/ascend/hal/hardware/ascend_somas.h"
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "backend/common/optimizer/helper.h"
|
||||
#include "utils/ms_context.h"
|
||||
#include "plugin/device/ascend/hal/device/ascend_stream_assign.h"
|
||||
#include "plugin/device/ascend/hal/profiler/memory_profiling.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace ascend {
|
||||
using KernelGraph = session::KernelGraph;
|
||||
using UnReuseType = somas::UnReuseType;
|
||||
using TensorType = somas::TensorType;
|
||||
using LifeLongType = somas::LifeLongType;
|
||||
using mindspore::profiler::ascend::MemoryProfiling;
|
||||
|
||||
#ifndef ENABLE_SECURITY
|
||||
void AscendSomas::ConvertToProfilingNode(uint32_t graph_id) const {
|
||||
if (!MemoryProfiling::GetInstance().IsMemoryProfilingInitialized()) {
|
||||
return;
|
||||
}
|
||||
auto graph_node = profiler::ascend::MemoryProfiling::GetInstance().GetGraphMemoryNode(graph_id);
|
||||
if (graph_node == nullptr) {
|
||||
graph_node = profiler::ascend::MemoryProfiling::GetInstance().AddGraphMemoryNode(graph_id);
|
||||
MS_LOG(INFO) << "Add graph memory node for dynamic memory profiling, graph id is " << graph_id;
|
||||
}
|
||||
|
||||
for (const auto &tensor : tensors_list_) {
|
||||
profiler::ascend::TensorMemory tensor_memory;
|
||||
tensor_memory.SetTensorId(tensor->GetId());
|
||||
tensor_memory.SetAlignedSize(tensor->GetAlignedSize());
|
||||
tensor_memory.SetType(tensor->GetTypeString());
|
||||
tensor_memory.SetLifeStart(tensor->lifetime_.start_);
|
||||
tensor_memory.SetLifeEnd(tensor->lifetime_.end_);
|
||||
tensor_memory.SetLifeLong(tensor->GetLifelongString());
|
||||
graph_node->AddTensorMemory(tensor_memory);
|
||||
}
|
||||
|
||||
for (const auto &node : nodes_list_) {
|
||||
profiler::ascend::NodeMemory node_memory;
|
||||
std::string name = GetSplitName(node->scope_full_name_);
|
||||
node_memory.SetNodeName(name);
|
||||
node_memory.SetNodeId(node->GetId());
|
||||
for (const auto &input_tensor : node->input_tensors_) {
|
||||
node_memory.AddInputTensorId(input_tensor->GetId());
|
||||
}
|
||||
for (const auto &output_tensor : node->output_tensors_) {
|
||||
node_memory.AddOutputTensorId(output_tensor->GetId());
|
||||
}
|
||||
for (const auto &workspace_tensor : node->workspace_tensors_) {
|
||||
node_memory.AddWorkSpaceTensorId(workspace_tensor->GetId());
|
||||
}
|
||||
graph_node->AddNodeMemory(node_memory);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// No Ascend-specific initialization is performed here; always reports success.
bool AscendSomas::Initialize() {
  return true;
}
|
||||
|
||||
// Returns the device name string of this somas implementation.
std::string AscendSomas::GetDeviceName() const {
  return "Ascend";
}
|
||||
|
||||
// Returns the reserved size, in bytes, reported for communication memory (a fixed 512).
size_t AscendSomas::GetCommunicationReservedSize() const {
  constexpr size_t kReservedGap = 512;
  return kReservedGap;
}
|
||||
|
||||
// Computes the aligned size for `original_size`.
// A size of 0 stays 0. Any other size has 512 + 31 added and is then rounded down to a multiple
// of 512.
size_t AscendSomas::GetAlignSize(size_t original_size) const {
  if (original_size == 0) {
    return 0;
  }
  constexpr size_t kAlignment = 512;
  constexpr size_t kComplement = 31;
  return ((original_size + kAlignment + kComplement) / kAlignment) * kAlignment;
}
|
||||
|
||||
// Returns true when task sink is enabled in the context or the memory optimize level is
// kOptimizeO1; false otherwise. The `graph` argument is not consulted.
bool AscendSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  const bool task_sink = ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
  const int opt_level = ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL);
  return task_sink || (opt_level == kOptimizeO1);
}
|
||||
|
||||
// Returns the stream groups reported by the AscendStreamAssign singleton.
// The `graph` argument is not consulted.
std::vector<vector<uint32_t>> AscendSomas::GetStreamGroupInfo(const session::KernelGraph &graph) const {
  return device::ascend::AscendStreamAssign::GetInstance().get_stream_group();
}
|
||||
|
||||
// Returns the op-name -> UnReuseType table for Ascend: it contains a single entry that marks
// kGetNextOpName as UnReuseType::kUnReuseOutput. The `graph` argument is not consulted.
std::map<std::string, UnReuseType> AscendSomas::GetUnReuseNodeType(const session::KernelGraph &graph) const {
  std::map<std::string, UnReuseType> unreuse_types;
  (void)unreuse_types.emplace(kGetNextOpName, UnReuseType::kUnReuseOutput);
  return unreuse_types;
}
|
||||
|
||||
// Device-specific control tensor setup: runs InitEventInfo on the graph and reports success.
bool AscendSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) {
  InitEventInfo(graph);
  return true;
}
|
||||
|
||||
// Rebuilds event_map_ from the Send/Recv kernels found in the graph's execution order, keyed by
// their kAttrEventId attribute, then calls AddControlTensor for every event whose send and recv
// kernels both have an entry in nodes_map_.
//
// Events that have only one side recorded are skipped with a warning. Previously such an event
// left a null send_/recv_ pointer that was dereferenced while building the warning message.
void AscendSomas::InitEventInfo(const session::KernelGraph &graph) {
  event_map_ = {};
  for (const auto &kernel : graph.execution_order()) {
    const auto type = common::AnfAlgo::GetCNodeName(kernel);
    if (type == kSendOpName) {
      const auto event = common::AnfAlgo::GetNodeAttr<uint32_t>(kernel, kAttrEventId);
      // operator[] value-initializes the EventPair when the event is seen for the first time.
      event_map_[event].send_ = kernel;
    } else if (type == kRecvOpName) {
      const auto event = common::AnfAlgo::GetNodeAttr<uint32_t>(kernel, kAttrEventId);
      event_map_[event].recv_ = kernel;
    }
  }

  for (const auto &event : event_map_) {
    const auto &pair = event.second;
    // Both kernels are dereferenced below, so an unpaired event cannot be processed.
    if (pair.send_ == nullptr || pair.recv_ == nullptr) {
      MS_LOG(WARNING) << "Event " << event.first << " has no matching "
                      << (pair.send_ == nullptr ? "send" : "recv") << " kernel, skip it.";
      continue;
    }

    auto send_iter = nodes_map_.find(pair.send_.get());
    if (send_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.send_->fullname_with_scope();
      continue;
    }

    auto recv_iter = nodes_map_.find(pair.recv_.get());
    if (recv_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.recv_->fullname_with_scope();
      continue;
    }

    // The first somas node registered for each kernel is the one that gets the control tensor.
    auto &somas_send = send_iter->second.at(0);
    auto &somas_recv = recv_iter->second.at(0);
    AddControlTensor(somas_send, somas_recv);
  }
  MS_LOG(DEBUG) << "Somas InitEventInfo end.";
}
|
||||
|
||||
// Ascend-specific node post-processing: handles independent-node outputs first, then non-task
// ops, and reports success.
bool AscendSomas::DevSpecNodeProcess(const session::KernelGraph &graph) {
  IndependentNodeOutputProcess(graph);
  NonTaskSplitProcess(graph);
  return true;
}
|
||||
|
||||
// Marks every output tensor of each independent kernel (per AnfAlgo::IsIndependentNode) as
// kLifeLongGraphEnd and logs the summed aligned size of those tensors.
// Kernels without an entry in nodes_map_ are ignored.
void AscendSomas::IndependentNodeOutputProcess(const session::KernelGraph &graph) {
  size_t independent_output_size = 0;
  for (const auto &cnode : graph.execution_order()) {
    if (!AnfAlgo::IsIndependentNode(cnode)) {
      continue;
    }
    auto node_iter = nodes_map_.find(cnode.get());
    if (node_iter == nodes_map_.end()) {
      continue;
    }
    auto &somas_node = node_iter->second.at(0);
    MS_EXCEPTION_IF_NULL(somas_node);
    for (auto &out_tensor : somas_node->output_tensors_) {
      MS_EXCEPTION_IF_NULL(out_tensor);
      independent_output_size += out_tensor->GetAlignedSize();
      out_tensor->lifelong_value_ = LifeLongType::kLifeLongGraphEnd;
    }
  }

  MS_LOG(INFO) << "Special Tensor total size: Independent Node output " << independent_output_size;
}
|
||||
|
||||
// For every non-task op (per AnfAlgo::IsNonTaskOp) in execution order, marks its first input
// tensor and all of its output tensors as TensorType::kUnion and appends their ids, input first,
// as one group to union_tensors_list_.
//
// Raises via MS_LOG(EXCEPTION) when the kernel has no somas node or the node has no input tensor.
// The somas node is looked up with find(): the former nodes_map_[key].at(0) inserted an empty
// entry for an unknown kernel and then failed with an uninformative std::out_of_range.
void AscendSomas::NonTaskSplitProcess(const session::KernelGraph &graph) {
  for (const auto &kernel : graph.execution_order()) {
    if (!common::AnfAlgo::IsNonTaskOp(kernel)) {
      continue;
    }
    const auto op_name = common::AnfAlgo::GetCNodeName(kernel);
    const auto node_iter = nodes_map_.find(kernel.get());
    if (node_iter == nodes_map_.end() || node_iter->second.empty()) {
      MS_LOG(EXCEPTION) << "Can't find somas node for non-task node " << kernel->fullname_with_scope();
    }
    const auto &node = node_iter->second.front();
    MS_EXCEPTION_IF_NULL(node);
    if (node->input_tensors_.empty()) {
      MS_LOG(EXCEPTION) << op_name << " has no input tensor, can not do split non_task process.";
    }

    std::vector<size_t> refnode_input_output;
    const auto &input_tensor = node->input_tensors_[0];
    MS_EXCEPTION_IF_NULL(input_tensor);
    input_tensor->type_ = TensorType::kUnion;
    refnode_input_output.push_back(input_tensor->GetId());

    for (const auto &output_tensor : node->output_tensors_) {
      MS_EXCEPTION_IF_NULL(output_tensor);
      output_tensor->type_ = TensorType::kUnion;
      refnode_input_output.push_back(output_tensor->GetId());
    }
    union_tensors_list_.push_back(refnode_input_output);
  }
}
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,61 @@
|
|||
/**
|
||||
* Copyright 2021-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
#include "backend/common/somas/somas.h"
|
||||
#include "runtime/hardware/device_type.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace ascend {
|
||||
using KernelGraph = session::KernelGraph;
|
||||
using UnReuseType = somas::UnReuseType;
|
||||
class AscendSomas : public somas::Somas {
|
||||
public:
|
||||
#ifndef ENABLE_SECURITY
|
||||
void ConvertToProfilingNode(uint32_t graph_id) const override;
|
||||
#endif
|
||||
private:
|
||||
bool Initialize() override;
|
||||
string GetDeviceName() const override;
|
||||
size_t GetCommunicationReservedSize() const override;
|
||||
size_t GetAlignSize(size_t original_size) const override;
|
||||
|
||||
bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;
|
||||
std::vector<vector<uint32_t>> GetStreamGroupInfo(const session::KernelGraph &graph) const override;
|
||||
std::map<std::string, UnReuseType> GetUnReuseNodeType(const session::KernelGraph &graph) const override;
|
||||
|
||||
bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
|
||||
bool DevSpecNodeProcess(const session::KernelGraph &graph) override;
|
||||
|
||||
void InitEventInfo(const session::KernelGraph &graph);
|
||||
void IndependentNodeOutputProcess(const session::KernelGraph &graph);
|
||||
void NonTaskSplitProcess(const session::KernelGraph &graph);
|
||||
std::map<uint32_t, somas::EventPair> event_map_;
|
||||
};
|
||||
REG_SOMAS(Ascend, DeviceType::kAscend, AscendSomas)
|
||||
} // namespace ascend
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_ASCEND_HAL_HARDWARE_ASCEND_SOMAS_H_
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Copyright 2021-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/cpu/hal/hardware/cpu_somas.h"
|
||||
#include <string>
|
||||
#include "utils/ms_context.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace cpu {
|
||||
// No CPU-specific initialization is performed here; always reports success.
bool CPUSomas::Initialize() {
  return true;
}
|
||||
|
||||
// Returns the device name string of this somas implementation.
std::string CPUSomas::GetDeviceName() const {
  return "CPU";
}
|
||||
|
||||
// Rounds `original_size` up to the next multiple of 512; a size of 0 stays 0.
size_t CPUSomas::GetAlignSize(size_t original_size) const {
  if (original_size == 0) {
    return 0;
  }
  constexpr size_t kAlignment = 512;
  return ((original_size + kAlignment - 1) / kAlignment) * kAlignment;
}
|
||||
|
||||
// Always reports false for CPU; the `graph` argument is not consulted.
bool CPUSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const {
  return false;
}
|
||||
|
||||
// CPU adds no device-specific control tensors; always reports success.
bool CPUSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) {
  return true;
}
|
||||
|
||||
// CPU performs no device-specific node processing; always reports success.
bool CPUSomas::DevSpecNodeProcess(const session::KernelGraph &graph) {
  return true;
}
|
||||
} // namespace cpu
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,43 @@
|
|||
/**
|
||||
* Copyright 2021-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Include guard: upper-case throughout and with a single trailing underscore. Identifiers that
// contain a double underscore are reserved for the implementation.
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H_

#include <string>
#include "backend/common/somas/somas.h"
#include "runtime/hardware/device_type.h"

namespace mindspore {
namespace device {
namespace cpu {
using KernelGraph = session::KernelGraph;

// CPU implementation of the somas::Somas device hooks.
class CPUSomas : public somas::Somas {
 private:
  // Basic device properties.
  bool Initialize() override;
  string GetDeviceName() const override;
  // Rounds a size up to a multiple of 512 (0 stays 0).
  size_t GetAlignSize(size_t original_size) const override;

  // Graph-level hooks; the CPU versions return constants.
  bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;
  bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
  bool DevSpecNodeProcess(const session::KernelGraph &graph) override;
};
REG_SOMAS(CPU, DeviceType::kCPU, CPUSomas)
}  // namespace cpu
}  // namespace device
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_HAL_HARDWARE_CPU_SOMAS_H_
|
|
@ -25,6 +25,7 @@
|
|||
#include "plugin/device/gpu/hal/device/gpu_stream_assign.h"
|
||||
#include "plugin/device/gpu/hal/device/distribution/collective_init.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_device_manager.h"
|
||||
#include "plugin/device/gpu/hal/hardware/gpu_somas.h"
|
||||
#include "runtime/data_queue/data_queue_mgr.h"
|
||||
#include "kernel/common_utils.h"
|
||||
#include "plugin/device/gpu/hal/device/gpu_common.h"
|
||||
|
@ -40,6 +41,7 @@
|
|||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "backend/common/optimizer/common_backend_optimization.h"
|
||||
#include "backend/common/optimizer/dynamic_shape/dynamic_shape_helper.h"
|
||||
#include "include/common/debug/anf_ir_dump.h"
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
#include "include/common/debug/rdr/recorder_manager.h"
|
||||
#include "debug/rdr/mem_address_recorder.h"
|
||||
|
@ -258,6 +260,25 @@ DeviceAddressPtr GPUDeviceResManager::CreateDeviceAddress(void *const device_ptr
|
|||
return device_address;
|
||||
}
|
||||
|
||||
void GPUKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
auto kernel_graph = graph->cast<KernelGraphPtr>();
|
||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
||||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
if (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1) {
|
||||
auto somas = std::make_shared<GPUSomas>();
|
||||
bool ret = somas->Assign(kernel_graph);
|
||||
if (ret) {
|
||||
MS_LOG(INFO) << "Somas allocate success for graph " << kernel_graph->graph_id()
|
||||
<< " somas size: " << kernel_graph->somas_whole_block_size();
|
||||
} else {
|
||||
MS_LOG(WARNING) << "Somas allocate failed for graph " << kernel_graph->graph_id();
|
||||
}
|
||||
}
|
||||
MS_LOG(INFO) << "Status record: end preprocess before run graph. graph id: " << kernel_graph->graph_id();
|
||||
}
|
||||
|
||||
void GPUKernelExecutor::OptimizeGraphWithoutDeviceInfo(const KernelGraphPtr &graph) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
// Operator fusion optimization.
|
||||
|
|
|
@ -82,6 +82,8 @@ class GPUKernelExecutor : public DeprecatedKernelExecutor {
|
|||
|
||||
void CreateKernel(const std::vector<CNodePtr> &nodes) const override;
|
||||
|
||||
void PreprocessBeforeRun(const FuncGraphPtr &graph) const override;
|
||||
|
||||
bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs) const override;
|
||||
|
||||
|
|
|
@ -0,0 +1,141 @@
|
|||
/**
|
||||
* Copyright 2021-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/gpu/hal/hardware/gpu_somas.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "backend/common/optimizer/helper.h"
|
||||
#include "utils/ms_context.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
namespace gpu {
|
||||
// No GPU-specific initialization is performed here; always reports success.
bool GPUSomas::Initialize() {
  return true;
}
|
||||
|
||||
// Returns the device name string of this somas implementation.
std::string GPUSomas::GetDeviceName() const {
  return "GPU";
}
|
||||
|
||||
// Rounds `original_size` up to the next multiple of 512; a size of 0 stays 0.
size_t GPUSomas::GetAlignSize(size_t original_size) const {
  if (original_size == 0) {
    return 0;
  }
  constexpr size_t kAlignment = 512;
  return ((original_size + kAlignment - 1) / kAlignment) * kAlignment;
}
|
||||
|
||||
// Returns true exactly when the context's memory optimize level is kOptimizeO1.
// The `graph` argument is not consulted.
bool GPUSomas::GetDependExecOrderFlag(const session::KernelGraph &graph) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  return ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) == kOptimizeO1;
}
|
||||
|
||||
// Device-specific control tensor setup: runs InitEventInfo on the graph and reports success.
bool GPUSomas::InitDevSpecControlTensors(const session::KernelGraph &graph) {
  InitEventInfo(graph);
  return true;
}
|
||||
|
||||
// Rebuilds event_map_ from the Send/Recv kernels found in the graph's execution order. Send
// kernels are keyed by their kAttrRecordEvent attribute and Recv kernels by kAttrWaitEvent.
// AddControlTensor is then called for every event whose send and recv kernels both have an entry
// in nodes_map_.
//
// Events that have only one side recorded are skipped with a warning. Previously such an event
// left a null send_/recv_ pointer that was dereferenced while building the warning message.
void GPUSomas::InitEventInfo(const session::KernelGraph &graph) {
  event_map_ = {};
  for (const auto &kernel : graph.execution_order()) {
    const auto type = common::AnfAlgo::GetCNodeName(kernel);
    if (type == kSendOpName) {
      const auto event = common::AnfAlgo::GetNodeAttr<uintptr_t>(kernel, kAttrRecordEvent);
      // operator[] value-initializes the EventPair when the event is seen for the first time.
      event_map_[event].send_ = kernel;
    } else if (type == kRecvOpName) {
      const auto event = common::AnfAlgo::GetNodeAttr<uintptr_t>(kernel, kAttrWaitEvent);
      event_map_[event].recv_ = kernel;
    }
  }

  for (const auto &event : event_map_) {
    const auto &pair = event.second;
    // Both kernels are dereferenced below, so an unpaired event cannot be processed.
    if (pair.send_ == nullptr || pair.recv_ == nullptr) {
      MS_LOG(WARNING) << "Event " << event.first << " has no matching "
                      << (pair.send_ == nullptr ? "send" : "recv") << " kernel, skip it.";
      continue;
    }

    auto send_iter = nodes_map_.find(pair.send_.get());
    if (send_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.send_->fullname_with_scope();
      continue;
    }

    auto recv_iter = nodes_map_.find(pair.recv_.get());
    if (recv_iter == nodes_map_.end()) {
      MS_LOG(WARNING) << "Can't find somas node for " << pair.recv_->fullname_with_scope();
      continue;
    }

    // The first somas node registered for each kernel is the one that gets the control tensor.
    auto &somas_send = send_iter->second.at(0);
    auto &somas_recv = recv_iter->second.at(0);
    AddControlTensor(somas_send, somas_recv);
  }
  MS_LOG(DEBUG) << "Somas InitEventInfo end.";
}
|
||||
|
||||
// GPU-specific node processing consists solely of the inplace-node handling.
bool GPUSomas::DevSpecNodeProcess(const session::KernelGraph &graph) {
  return InplaceNodeProcess(graph);
}
|
||||
|
||||
// For every kernel that AnfAlgo::IsInplaceNode(kernel, "skip") accepts, groups all of its input
// and output tensors into one union: each tensor is marked somas::kUnion and the ids are
// appended as a new entry of union_tensors_list_.
// Raises via MS_LOG(EXCEPTION) when the kernel has no somas node or when one of its tensors
// already belongs to a recorded union list. The membership check finishes before any tensor
// is modified. Returns true otherwise.
bool GPUSomas::InplaceNodeProcess(const session::KernelGraph &graph) {
  for (auto &kernel : graph.execution_order()) {
    if (!common::AnfAlgo::IsInplaceNode(kernel, "skip")) {
      continue;
    }
    auto node_iter = nodes_map_.find(kernel.get());
    if (node_iter == nodes_map_.end()) {
      MS_LOG(EXCEPTION) << "Can't find somas node for inplace node " << kernel->fullname_with_scope();
    }
    auto &somas_node = node_iter->second.at(0);
    MS_EXCEPTION_IF_NULL(somas_node);

    // Inputs first, then outputs.
    std::vector<somas::SomasTensorPtr> union_tensors(somas_node->input_tensors_.begin(),
                                                     somas_node->input_tensors_.end());
    union_tensors.insert(union_tensors.end(), somas_node->output_tensors_.begin(),
                         somas_node->output_tensors_.end());

    // check whether the union tensor already in other union tensors
    for (auto &tensor : union_tensors) {
      auto tensor_id = tensor->GetId();
      for (auto &recorded_union : union_tensors_list_) {
        if (std::count(recorded_union.begin(), recorded_union.end(), tensor_id) > 0) {
          MS_LOG(EXCEPTION) << "Inplace node union Tensor " << tensor_id << " already in other union tensor list.";
        }
      }
    }

    std::vector<size_t> inplace_union_ids;
    for (auto &tensor : union_tensors) {
      tensor->type_ = somas::kUnion;
      inplace_union_ids.push_back(tensor->GetId());
    }
    union_tensors_list_.push_back(inplace_union_ids);
  }
  return true;
}
|
||||
} // namespace gpu
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,48 @@
|
|||
/**
|
||||
* Copyright 2021-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Include guard: upper-case throughout and with a single trailing underscore. Identifiers that
// contain a double underscore are reserved for the implementation.
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H_

#include <cstdint>  // uintptr_t, the key type of event_map_
#include <map>
#include <string>
#include "backend/common/somas/somas.h"
#include "runtime/hardware/device_type.h"

namespace mindspore {
namespace device {
namespace gpu {
using KernelGraph = session::KernelGraph;

// GPU implementation of the somas::Somas device hooks.
class GPUSomas : public somas::Somas {
 private:
  // Basic device properties.
  bool Initialize() override;
  string GetDeviceName() const override;
  // Rounds a size up to a multiple of 512 (0 stays 0).
  size_t GetAlignSize(size_t original_size) const override;

  // Graph-level hooks.
  bool GetDependExecOrderFlag(const session::KernelGraph &graph) const override;
  bool InitDevSpecControlTensors(const session::KernelGraph &graph) override;
  bool DevSpecNodeProcess(const session::KernelGraph &graph) override;

  // Helpers used by the overrides above.
  bool InplaceNodeProcess(const session::KernelGraph &graph);
  void InitEventInfo(const session::KernelGraph &graph);

  // Event attribute value -> Send/Recv kernel pair, rebuilt by InitEventInfo.
  std::map<uintptr_t, somas::EventPair> event_map_;
};
REG_SOMAS(GPU, DeviceType::kGPU, GPUSomas)
}  // namespace gpu
}  // namespace device
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_HAL_HARDWARE_GPU_SOMAS_H_
|
|
@ -101,7 +101,8 @@ REGISTER_PYBIND_DEFINE(MsContextPy, ([](const py::module *m) {
|
|||
.value("graph_kernel_flags", MsCtxParam::MS_CTX_GRAPH_KERNEL_FLAGS)
|
||||
.value("grad_for_scalar", MsCtxParam::MS_CTX_GRAD_FOR_SCALAR)
|
||||
.value("pynative_synchronize", MsCtxParam::MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE)
|
||||
.value("disable_format_transform", MsCtxParam::MS_CTX_DISABLE_FORMAT_TRANSFORM);
|
||||
.value("disable_format_transform", MsCtxParam::MS_CTX_DISABLE_FORMAT_TRANSFORM)
|
||||
.value("memory_optimize_level", MsCtxParam::MS_CTX_MEMORY_OPTIMIZE_LEVEL);
|
||||
(void)py::class_<mindspore::MsContext, std::shared_ptr<mindspore::MsContext>>(*m, "MSContext")
|
||||
.def_static("get_instance", &mindspore::MsContext::GetInstance, "Get ms context instance.")
|
||||
.def("get_param", &mindspore::MsCtxGetParameter, "Get value of specified parameter.")
|
||||
|
|
|
@ -3,6 +3,7 @@ file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*
|
|||
"memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" "memory_scheduler.cc"
|
||||
"memory_offload_strategy.cc" "bucket.cc" "launch_kernel.cc" "launch_mul.cc" "tensor_array.cc"
|
||||
"ms_device_shape_transfer.cc" "context_extends.cc" "stream_synchronizer.cc" "tensors_queue.cc" "auto_mem_offload.cc"
|
||||
"common_somas_allocator.cc"
|
||||
)
|
||||
|
||||
if("${ENABLE_HIDDEN}" STREQUAL "OFF")
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "runtime/device/common_somas_allocator.h"
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include "backend/common/optimizer/helper.h"
|
||||
#include "utils/ms_context.h"
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
#include "debug/rdr/string_recorder.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
// Runs the SOMAS solver registered for the current device target on `graph`.
// The target string from the context selects the DeviceType used to look the solver up; any target
// other than Ascend or GPU falls back to the CPU solver.
// Returns the value reported by the solver's Assign(). When that value is true, the allocation info is
// also recorded through RDR (only when ENABLE_DUMP_IR is defined) and converted to profiling nodes
// (only when ENABLE_SECURITY is not defined).
bool CommonSomasAllocator::Assign(const session::KernelGraph &graph) {
  const std::string target = GetTargetFromContext();
  DeviceType device_type = DeviceType::kCPU;
  if (target == kAscendDevice) {
    device_type = DeviceType::kAscend;
  } else if (target == kGPUDevice) {
    device_type = DeviceType::kGPU;
  }

  somas::SomasPtr solver = somas::SomasManager::Instance().GetSomas(device_type);
  MS_EXCEPTION_IF_NULL(solver);

  const bool assigned = solver->Assign(graph);
  if (!assigned) {
    return assigned;
  }

#ifdef ENABLE_DUMP_IR
  // Keep the textual allocation result so it can be inspected after a failure.
  SubModuleId record_module = SubModuleId::SM_OPTIMIZER;
  std::string record_name = "somas_allocate_info." + std::to_string(graph.graph_id());
  (void)mindspore::RDR::RecordString(record_module, record_name, solver->SomasInfo());
#endif
#ifndef ENABLE_SECURITY
  solver->ConvertToProfilingNode(graph.graph_id());
#endif
  return assigned;
}
|
||||
|
||||
uint8_t *CommonSomasAllocator::GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const {
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
auto kernel_info = dynamic_cast<KernelInfo *>(node->kernel_info());
|
||||
MS_EXCEPTION_IF_NULL(kernel_info);
|
||||
if (index >= kernel_info->somas_output_offset_aligned_size_list().size()) {
|
||||
MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:["
|
||||
<< kernel_info->somas_output_offset_aligned_size_list().size() << "]";
|
||||
}
|
||||
auto somas_offset_aligned_size = kernel_info->somas_output_offset_aligned_size_list()[index];
|
||||
if (somas_offset_aligned_size.second == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
auto somas_offset = somas_offset_aligned_size.first;
|
||||
uint8_t *ptr = mem_base_addr_ + somas_offset;
|
||||
return ptr;
|
||||
}
|
||||
|
||||
uint8_t *CommonSomasAllocator::GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const {
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
auto kernel_info = dynamic_cast<KernelInfo *>(node->kernel_info());
|
||||
MS_EXCEPTION_IF_NULL(kernel_info);
|
||||
if (index >= kernel_info->somas_workspace_offset_aligned_size_list().size()) {
|
||||
MS_LOG(EXCEPTION) << "index:[" << index << "] is larger than it's output size:["
|
||||
<< kernel_info->somas_workspace_offset_aligned_size_list().size() << "]";
|
||||
}
|
||||
auto somas_offset_aligned_size = kernel_info->somas_workspace_offset_aligned_size_list()[index];
|
||||
if (somas_offset_aligned_size.second == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
auto somas_offset = somas_offset_aligned_size.first;
|
||||
uint8_t *ptr = mem_base_addr_ + somas_offset;
|
||||
return ptr;
|
||||
}
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H
|
||||
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
#include "backend/common/somas/somas.h"
|
||||
#include "runtime/hardware/device_type.h"
|
||||
#include "utils/ms_context.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
class CommonSomasAllocator {
|
||||
public:
|
||||
void set_mem_base_addr(uint8_t *mem_base_addr) { mem_base_addr_ = mem_base_addr; }
|
||||
static bool Assign(const session::KernelGraph &graph);
|
||||
uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
|
||||
uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;
|
||||
|
||||
private:
|
||||
// Memory base addr
|
||||
uint8_t *mem_base_addr_{nullptr};
|
||||
static std::string GetTargetFromContext() {
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
return context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
|
||||
}
|
||||
};
|
||||
using CommonSomasAllocatorPtr = std::shared_ptr<CommonSomasAllocator>;
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_COMMON_SOMAS_ALLOCATOR_H
|
|
@ -15,6 +15,7 @@
|
|||
*/
|
||||
|
||||
#include "runtime/device/kernel_info.h"
|
||||
#include <utility>
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
|
@ -108,6 +109,13 @@ bool KernelInfo::SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t
|
|||
return true;
|
||||
}
|
||||
|
||||
// Takes ownership of the SOMAS results for this kernel by moving both lists into the KernelInfo.
// Both arguments are vectors of (offset, aligned_size) pairs, one for outputs and one for workspaces.
// Always returns true.
bool KernelInfo::SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
                                std::vector<std::pair<size_t, size_t>> &&workspace_somas_result) {
  // The two assignments are independent of each other.
  somas_workspace_result_ = std::move(workspace_somas_result);
  somas_output_result_ = std::move(output_somas_result);
  return true;
}
|
||||
|
||||
void KernelInfo::set_kernel_mod(const kernel::KernelModPtr &kernel_mod) { kernel_mod_ = kernel_mod; }
|
||||
|
||||
kernel::KernelMod *KernelInfo::MutableKernelMod() const { return kernel_mod_.get(); }
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include "ir/kernel_info_dev.h"
|
||||
#include "kernel/kernel_build_info.h"
|
||||
#include "kernel/kernel.h"
|
||||
|
@ -57,6 +58,8 @@ class KernelInfo : public KernelInfoDevice {
|
|||
DeviceAddressPtr GetMutableWorkspaceAddr(size_t index) const;
|
||||
bool WorkspaceAddrExist(size_t index) const;
|
||||
bool SetWorkspaceAddr(const DeviceAddressPtr &output_address, size_t index);
|
||||
bool SetSomasResult(std::vector<std::pair<size_t, size_t>> &&output_somas_result,
|
||||
std::vector<std::pair<size_t, size_t>> &&workspace_somas_result);
|
||||
void set_kernel_mod(const kernel::KernelModPtr &kernel_mod);
|
||||
kernel::KernelMod *MutableKernelMod() const;
|
||||
const kernel::KernelMod *kernel_mod() const;
|
||||
|
@ -70,6 +73,12 @@ class KernelInfo : public KernelInfoDevice {
|
|||
uint32_t graph_id() const { return graph_id_; }
|
||||
bool operator==(const KernelInfo &other) const;
|
||||
bool is_feature_map() const { return is_feature_map_; }
|
||||
const std::vector<std::pair<size_t, size_t>> &somas_output_offset_aligned_size_list() const {
|
||||
return somas_output_result_;
|
||||
}
|
||||
const std::vector<std::pair<size_t, size_t>> &somas_workspace_offset_aligned_size_list() const {
|
||||
return somas_workspace_result_;
|
||||
}
|
||||
const std::vector<std::shared_ptr<DeviceAddress>> &output_address_list() const { return output_address_list_; }
|
||||
const std::vector<std::shared_ptr<DeviceAddress>> &workspace_address_list() const { return workspace_address_list_; }
|
||||
|
||||
|
@ -83,6 +92,12 @@ class KernelInfo : public KernelInfoDevice {
|
|||
kernel::KernelBuildInfoPtr select_kernel_build_info_;
|
||||
std::vector<std::shared_ptr<DeviceAddress>> output_address_list_;
|
||||
std::vector<std::shared_ptr<DeviceAddress>> workspace_address_list_;
|
||||
// pair<size_t, size_t> : (offset, aligned_size)
|
||||
// aligned_size of 0 means no memory allocation
|
||||
std::vector<std::pair<size_t, size_t>> somas_output_result_;
|
||||
// pair<size_t, size_t> : (offset, aligned_size)
|
||||
// aligned_size of 0 means no memory allocation
|
||||
std::vector<std::pair<size_t, size_t>> somas_workspace_result_;
|
||||
kernel::KernelModPtr kernel_mod_;
|
||||
// stream_id_ is the index of stream object vector
|
||||
uint32_t stream_id_;
|
||||
|
|
|
@ -985,7 +985,12 @@ void KernelRuntime::AssignNodeOutputMem(MemType type, const AnfNodePtr &node, in
|
|||
auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type, {node, i});
|
||||
MS_EXCEPTION_IF_NULL(device_address);
|
||||
uint8_t *ptr = mem_manager_->MallocOutputMem(node, i, type, output_sizes[i], device_address, false);
|
||||
MS_EXCEPTION_IF_NULL(ptr);
|
||||
if (ptr == nullptr && type == kSomasReuseDynamicMem) {
|
||||
MS_LOG(INFO) << "node: " << node->fullname_with_scope() << " could be a RefNode, please check it"
|
||||
<< " output index: " << i << " memory type: " << type;
|
||||
} else {
|
||||
MS_EXCEPTION_IF_NULL(ptr);
|
||||
}
|
||||
device_address->set_host_shape(trans::GetRuntimePaddingShape(node, i));
|
||||
AnfAlgo::SetOutputAddr(device_address, i, node.get());
|
||||
}
|
||||
|
|
|
@ -18,10 +18,6 @@
|
|||
#include <string>
|
||||
#include "backend/common/session/anf_runtime_algorithm.h"
|
||||
#include "include/common/utils/anfalgo.h"
|
||||
#include "include/common/debug/common.h"
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
#include "debug/rdr/string_recorder.h"
|
||||
#endif
|
||||
#include "utils/ms_context.h"
|
||||
|
||||
namespace mindspore {
|
||||
|
@ -37,41 +33,21 @@ size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) {
|
|||
}
|
||||
|
||||
void MemoryManager::MallocSomasDynamicMem(const session::KernelGraph &graph) {
|
||||
SomasPtr somas_reuse_util_ptr = std::make_shared<somas::Somas>();
|
||||
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr);
|
||||
somas_reuse_util_ptr_ = somas_reuse_util_ptr;
|
||||
SomasAllocatorPtr somas_allocator_ptr = std::make_shared<device::CommonSomasAllocator>();
|
||||
MS_EXCEPTION_IF_NULL(somas_allocator_ptr);
|
||||
somas_allocator_ptr_ = somas_allocator_ptr;
|
||||
|
||||
if (!(somas_reuse_util_ptr->Allocate(&graph))) {
|
||||
if (!(somas_allocator_ptr->Assign(graph))) {
|
||||
MS_LOG(EXCEPTION) << "Somas Allocate Failed.";
|
||||
}
|
||||
|
||||
size_t total_allocated_size = somas_reuse_util_ptr->GetTotalMemSize();
|
||||
size_t total_allocated_size = graph.somas_whole_block_size();
|
||||
MS_LOG(INFO) << "Graph " << graph.graph_id() << ": TotalSomasReuseDynamicSize [" << total_allocated_size << "]";
|
||||
if (total_allocated_size > 0) {
|
||||
auto base_ptr = MallocDynamicMem(total_allocated_size, false);
|
||||
MS_LOG(INFO) << "Somas Reuse Memory Base Address [" << static_cast<void *>(base_ptr) << "], End Address ["
|
||||
<< static_cast<void *>(base_ptr + total_allocated_size) << "]";
|
||||
somas_reuse_util_ptr->set_mem_base_addr(base_ptr);
|
||||
}
|
||||
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
#ifdef ENABLE_DUMP_IR
|
||||
SubModuleId module = SubModuleId::SM_OPTIMIZER;
|
||||
|
||||
std::string name = "somas_allocate_info." + std::to_string(graph.graph_id());
|
||||
(void)mindspore::RDR::RecordString(module, name, somas_reuse_util_ptr_->SomasInfo());
|
||||
|
||||
name = "somas_mem_info." + std::to_string(graph.graph_id());
|
||||
(void)mindspore::RDR::RecordString(module, name, somas_reuse_util_ptr_->SomasMemory());
|
||||
#endif
|
||||
bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
|
||||
if (save_graphs) {
|
||||
std::string file_path = GetSaveGraphsPathName("somas_allocate_info_" + std::to_string(graph.graph_id()) + ".ir");
|
||||
somas_reuse_util_ptr_->DumpSomasInfoIR(file_path);
|
||||
|
||||
std::string mem_file_path = GetSaveGraphsPathName("somas_mem_info_" + std::to_string(graph.graph_id()) + ".ir");
|
||||
somas_reuse_util_ptr_->DumpSomasMemoryIR(mem_file_path);
|
||||
somas_allocator_ptr->set_mem_base_addr(base_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -94,8 +70,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
|
|||
address->communication_ptr_ = ptr - kMemAlignSize;
|
||||
}
|
||||
} else if (type == kSomasReuseDynamicMem) {
|
||||
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
|
||||
ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index);
|
||||
MS_EXCEPTION_IF_NULL(somas_allocator_ptr_);
|
||||
ptr = somas_allocator_ptr_->GetNodeOutputPtr(node, index);
|
||||
} else {
|
||||
ptr = MallocDynamicMem(size, communication_mem);
|
||||
}
|
||||
|
@ -109,8 +85,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
|
|||
} else if (type == kDynamicMem) {
|
||||
ptr = MallocDynamicMem(size, false);
|
||||
} else if (type == kSomasReuseDynamicMem) {
|
||||
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
|
||||
ptr = somas_reuse_util_ptr_->GetNodeOutputPtr(node, index);
|
||||
MS_EXCEPTION_IF_NULL(somas_allocator_ptr_);
|
||||
ptr = somas_allocator_ptr_->GetNodeOutputPtr(node, index);
|
||||
}
|
||||
address->ptr_ = ptr;
|
||||
return ptr;
|
||||
|
@ -118,8 +94,8 @@ uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, Me
|
|||
|
||||
uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, MemType type, size_t size) {
|
||||
if (type == kSomasReuseDynamicMem) {
|
||||
MS_EXCEPTION_IF_NULL(somas_reuse_util_ptr_);
|
||||
return somas_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index);
|
||||
MS_EXCEPTION_IF_NULL(somas_allocator_ptr_);
|
||||
return somas_allocator_ptr_->GetNodeWorkSpacePtr(node, index);
|
||||
}
|
||||
return MallocDynamicMem(size, false);
|
||||
}
|
||||
|
|
|
@ -22,14 +22,15 @@
|
|||
#include <map>
|
||||
#include <queue>
|
||||
#include "common/mem_reuse/mem_reuse.h"
|
||||
#include "backend/common/somas/somas.h"
|
||||
#include "runtime/device/common_somas_allocator.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace device {
|
||||
enum MemType { kStaticMem, kDynamicMem, kSomasReuseDynamicMem };
|
||||
constexpr int kGetAllOuts = -1;
|
||||
constexpr uint64_t kMemAlignSize = 512;
|
||||
constexpr uint64_t kTwiceMemAlignSize = kMemAlignSize << 1;
|
||||
using SomasPtr = mindspore::somas::SomasPtr;
|
||||
using SomasAllocatorPtr = mindspore::device::CommonSomasAllocatorPtr;
|
||||
|
||||
class MemoryManager {
|
||||
public:
|
||||
|
@ -80,7 +81,7 @@ class MemoryManager {
|
|||
return MallocStaticMem(size, communication_mem, kInvalidGraphId);
|
||||
}
|
||||
virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem);
|
||||
SomasPtr somas_reuse_util_ptr_{nullptr};
|
||||
SomasAllocatorPtr somas_allocator_ptr_{nullptr};
|
||||
};
|
||||
} // namespace device
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -81,6 +81,10 @@ void MemoryManagerActor::AllocateContinuousMemory(const std::vector<std::vector<
|
|||
auto &size_list = (*size_list_list)[i];
|
||||
auto &device_context = (*device_contexts)[i];
|
||||
MS_EXCEPTION_IF_NULL(device_context);
|
||||
// if the address of continuous tensor has already been allocated, skip the tensor
|
||||
if (alloc_list[0]->GetPtr() != nullptr) {
|
||||
continue;
|
||||
}
|
||||
// Allocate memory through the device context.
|
||||
device::DynamicMemAllocatorDebugInfo::SetDebugInfo(from_aid.Name(), device::AllocatorType::kKernelOutput);
|
||||
auto dev_ptr_list = device_context->device_res_manager_->AllocateContinuousMemory(size_list);
|
||||
|
|
|
@ -102,6 +102,7 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
|
|||
set_param<bool>(MS_CTX_ENABLE_RECOVERY, false);
|
||||
set_param<bool>(MS_CTX_ENABLE_GE_HETEROGENOUS, false);
|
||||
set_param<bool>(MS_CTX_DISABLE_FORMAT_TRANSFORM, false);
|
||||
set_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL, kOptimizeO0);
|
||||
|
||||
uint32_t kDefaultRuntimeNumThreads = 30;
|
||||
uint32_t cpu_core_num = std::thread::hardware_concurrency() - 1;
|
||||
|
|
|
@ -55,6 +55,8 @@ const char kGpuInferenceDevice[] = "GpuInference";
|
|||
const char kDavinciDevice[] = "Davinci";
|
||||
const char KNpuLog[] = "_npu_log";
|
||||
const unsigned int MAX_CALL_DEPTH_DEFAULT = 1000;
|
||||
const int kOptimizeO0 = 0;
|
||||
const int kOptimizeO1 = 1;
|
||||
|
||||
const std::set<std::string> kTargetSet = {kCPUDevice, kGPUDevice, kAscendDevice, kDavinciDevice};
|
||||
// The default max available device memory is 1024GB.
|
||||
|
@ -98,6 +100,7 @@ enum MsCtxParam : unsigned {
|
|||
// parameter of type int
|
||||
MS_CTX_TYPE_INT_BEGIN = MS_CTX_TYPE_BOOL_END,
|
||||
MS_CTX_EXECUTION_MODE = MS_CTX_TYPE_INT_BEGIN,
|
||||
MS_CTX_MEMORY_OPTIMIZE_LEVEL,
|
||||
MS_CTX_TYPE_INT_END,
|
||||
|
||||
// parameter of type uint32
|
||||
|
|
|
@ -98,7 +98,6 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
|
|||
${CCSRC_DIR}/backend/common/somas/somas_solver_alg.cc
|
||||
${CCSRC_DIR}/backend/graph_compiler/graph_partition.cc
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock/segment_runner.cc
|
||||
${CCSRC_DIR}/runtime/device/auto_mem_offload.cc
|
||||
${CCSRC_DIR}/runtime/device/ms_device_shape_transfer.cc
|
||||
${CCSRC_DIR}/runtime/device/kernel_info.cc
|
||||
${CCSRC_DIR}/runtime/device/convert_tensor_utils.cc
|
||||
|
@ -109,6 +108,7 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
|
|||
${CCSRC_DIR}/runtime/device/memory_offload_strategy.cc
|
||||
${CCSRC_DIR}/runtime/device/memory_manager.cc
|
||||
${CCSRC_DIR}/runtime/device/auto_mem_offload.cc
|
||||
${CCSRC_DIR}/runtime/device/common_somas_allocator.cc
|
||||
${CCSRC_DIR}/runtime/pynative/op_executor.cc
|
||||
${CCSRC_DIR}/runtime/pynative/op_runtime_info.cc
|
||||
${CCSRC_DIR}/runtime/hardware/device_type.cc
|
||||
|
@ -117,6 +117,8 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
|
|||
${CCSRC_DIR}/kernel/kernel.cc
|
||||
${CCSRC_DIR}/kernel/kash/kernel_pack.cc
|
||||
${CCSRC_DIR}/kernel/oplib/oplib.cc
|
||||
${CCSRC_DIR}/common/debug/anf_dump_utils.cc
|
||||
${CCSRC_DIR}/common/debug/anf_ir_dump.cc
|
||||
${CCSRC_DIR}/common/debug/common.cc
|
||||
${CCSRC_DIR}/common/debug/env_config_parser.cc
|
||||
${CCSRC_DIR}/common/thread_pool.cc
|
||||
|
|
|
@ -197,6 +197,22 @@ class _Context:
|
|||
f"or context.PYNATIVE_MODE (1), but got {mode}.")
|
||||
self.set_param(ms_ctx_param.mode, mode)
|
||||
|
||||
def set_memory_optimize_level(self, memory_optimize_level):
    """
    Set the memory optimize level, support "O0", "O1".

    Args:
        memory_optimize_level (str): "O0" or "O1". "O0" is stored as 0 and "O1" as 1 in the
            `memory_optimize_level` context parameter.

    Raises:
        ValueError: If `memory_optimize_level` is not one of the supported levels.
    """
    memory_optimize_levels = ["O0", "O1"]
    if memory_optimize_level not in memory_optimize_levels:
        raise ValueError(f"For 'context.set_context', the argument 'memory_optimize_level' must be one of "
                         f"{memory_optimize_levels}, but got {memory_optimize_level}.")
    # Only two levels pass the check above, so anything that is not "O0" is "O1".
    level_value = 0 if memory_optimize_level == "O0" else 1
    self.set_param(ms_ctx_param.memory_optimize_level, level_value)
|
||||
|
||||
def set_backend_policy(self, policy):
|
||||
success = self._context_handle.set_backend_policy(policy)
|
||||
if not success:
|
||||
|
@ -353,7 +369,8 @@ class _Context:
|
|||
'mempool_block_size': set_mempool_block_size,
|
||||
'print_file_path': set_print_file_path,
|
||||
'env_config_path': set_env_config_path,
|
||||
'runtime_num_threads': set_runtime_num_threads
|
||||
'runtime_num_threads': set_runtime_num_threads,
|
||||
'memory_optimize_level': set_memory_optimize_level
|
||||
}
|
||||
|
||||
@property
|
||||
|
|
|
@ -87,3 +87,30 @@ def test_trainTensor(num_classes=10, epoch=15, batch_size=32):
|
|||
loss = train_network(data, label).asnumpy()
|
||||
losses.append(loss)
|
||||
assert losses[-1] < 0.01
|
||||
|
||||
|
||||
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_train_tensor_memory_opt(num_classes=10, epoch=15, batch_size=32):
    """
    Feature: Somas GPU kernel by kernel.
    Description: Train AlexNet in graph mode on GPU with memory_optimize_level 'O1'.
    Expectation: No exception, and the loss of the last step is below 0.01.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    net = AlexNet(num_classes)
    learning_rate = 0.1
    momentum_factor = 0.9
    trainable_params = filter(lambda param: param.requires_grad, net.get_parameters())
    optimizer = Momentum(trainable_params, learning_rate, momentum_factor, weight_decay=0.0001)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    train_network = TrainOneStepCell(WithLossCell(net, criterion), optimizer)
    train_network.set_train()

    losses = []
    for _ in range(epoch):
        # Constant inputs: every step sees the same batch.
        data = Tensor(np.ones([batch_size, 3, 227, 227]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([batch_size]).astype(np.int32))
        losses.append(train_network(data, label).asnumpy())
    assert losses[-1] < 0.01
|
||||
|
|
|
@ -150,6 +150,35 @@ def test_train_lenet():
|
|||
assert losses[-1] < 0.01
|
||||
|
||||
|
||||
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_train_lenet_memory_opt():
    """
    Feature: Somas GPU kernel by kernel.
    Description: Train LeNet in graph mode on GPU with memory_optimize_level 'O1'.
    Expectation: No exception, and the loss of the last step is below 0.01.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    epoch = 100
    net = LeNet()
    momentum_factor = 0.9
    lr_schedule = multisteplr(epoch, 30)

    trainable_params = filter(lambda param: param.requires_grad, net.get_parameters())
    optimizer = Momentum(trainable_params, lr_schedule, momentum_factor)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    train_network = TrainOneStepCell(WithLossCell(net, criterion), optimizer)
    train_network.set_train()

    losses = []
    for _ in range(epoch):
        # Constant inputs: every step sees the same batch.
        data = Tensor(np.ones([net.batch_size, 3, 32, 32]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([net.batch_size]).astype(np.int32))
        losses.append(train_network(data, label).asnumpy())
    assert losses[-1] < 0.01
|
||||
|
||||
|
||||
def create_dataset(data_path, batch_size=32, repeat_size=1,
|
||||
num_parallel_workers=1):
|
||||
"""
|
||||
|
|
|
@ -142,3 +142,48 @@ def test_LSTM():
|
|||
losses.append(loss)
|
||||
print("loss:", loss.asnumpy())
|
||||
assert (losses[-1].asnumpy() < 0.01)
|
||||
|
||||
|
||||
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_lstm_memory_opt():
    """
    Feature: Somas GPU kernel by kernel.
    Description: Train the LSTM based SentimentNet in graph mode on GPU with memory_optimize_level 'O1'.
    Expectation: No exception, and the loss of the last step is below 0.01.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    num_epochs = 5
    embed_size = 100
    num_hiddens = 100
    num_layers = 2
    bidirectional = True
    labels = 2
    vocab_size = 252193
    max_len = 500

    # The embedding table gets one more row than vocab_size.
    weight = np.ones((vocab_size + 1, embed_size)).astype(np.float32)

    # NOTE(review): `batch_size` is not defined in this function; presumably a module-level name - confirm.
    net = SentimentNet(vocab_size=(vocab_size + 1), embed_size=embed_size,
                       num_hiddens=num_hiddens, num_layers=num_layers,
                       bidirectional=bidirectional, weight=weight,
                       labels=labels, batch_size=batch_size)

    learning_rate = 0.1
    momentum_factor = 0.9

    trainable_params = filter(lambda param: param.requires_grad, net.get_parameters())
    optimizer = Momentum(trainable_params, learning_rate, momentum_factor)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    train_network = TrainOneStepCell(WithLossCell(net, criterion), optimizer)
    train_network.set_train()

    train_features = Tensor(np.ones([64, max_len]).astype(np.int32))
    train_labels = Tensor(np.ones([64,]).astype(np.int32)[0:64])
    losses = []
    for _ in range(num_epochs):
        step_loss = train_network(train_features, train_labels)
        losses.append(step_loss)
        print("loss:", step_loss.asnumpy())
    assert (losses[-1].asnumpy() < 0.01)
|
||||
|
|
|
@ -352,6 +352,36 @@ def test_trainTensor(num_classes=10, epoch=8, batch_size=1):
|
|||
assert (losses[-1].asnumpy() < 1)
|
||||
|
||||
|
||||
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_train_tensor_memory_opt(num_classes=10, epoch=8, batch_size=1):
    """
    Feature: Somas GPU kernel by kernel.
    Description: Train ResNet50 in graph mode on GPU with memory_optimize_level 'O1'.
    Expectation: No exception, and the loss of the last step is below 1.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU", memory_optimize_level='O1')
    net = resnet50(num_classes)
    learning_rate = 0.1
    momentum_factor = 0.9
    trainable_params = filter(lambda param: param.requires_grad, net.get_parameters())
    optimizer = Momentum(trainable_params, learning_rate, momentum_factor)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    train_network = TrainOneStepCell(WithLossCell(net, criterion), optimizer)
    train_network.set_train()

    losses = []
    for _ in range(epoch):
        # Constant inputs: every step sees the same batch.
        data = Tensor(np.ones([batch_size, 3, 224, 224]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([batch_size]).astype(np.int32))
        losses.append(train_network(data, label))
    assert (losses[-1].asnumpy() < 1)
|
||||
|
||||
|
||||
@pytest.mark.level2
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
@pytest.mark.env_onecard
|
||||
|
|
Loading…
Reference in New Issue