Improve log messages and log levels (memory statistics, compiler-cache paths, OOM diagnostics)

This commit is contained in:
LaiYongqiang 2021-11-25 11:42:08 +08:00
parent 249fcbf812
commit a42c936df6
11 changed files with 128 additions and 85 deletions

View File

@ -42,7 +42,6 @@ constexpr auto kProcessorUnknown = "unknown";
constexpr auto kJsonSuffix = ".json";
constexpr auto kInfoSuffix = ".info";
constexpr unsigned int AUTODIFF_COMPILE_OVERTIME = 600;
constexpr auto kCOMPILER_CACHE_PATH = "MS_COMPILER_CACHE_PATH";
const std::vector<std::string> support_devices = {"aicore", "aicpu", "cuda"};

View File

@ -58,7 +58,6 @@ constexpr auto kJOpTuneList = "op_tune_list";
constexpr auto kJPassList = "pass_list";
constexpr auto kRankID = "RANK_ID";
constexpr auto kCOMPILER_OP_LEVEL = "MS_COMPILER_OP_LEVEL";
constexpr auto kCOMPILER_CACHE_PATH = "MS_COMPILER_CACHE_PATH";
uintptr_t KernelManager::kernel_stub_gen_ = 0;
std::unordered_map<string, KernelMetaPtr> KernelManager::info_table_ = {};
@ -109,13 +108,7 @@ std::string TbeUtils::GetOpDebugPath() {
if (debug_path != "") {
return debug_path;
}
auto config_path = GetCompilerCachePath();
std::string rank_id_str = common::GetEnv(kRankID);
if (rank_id_str.empty()) {
MS_LOG(DEBUG) << "Environment variable 'RANK_ID' is empty, using the default value: 0";
rank_id_str = "0";
}
debug_path = config_path + "rank_" + rank_id_str + "/";
debug_path = Common::GetCompilerCachePath();
return debug_path;
}

View File

@ -488,7 +488,7 @@ void AscendBackendUBFusionOptimization(const std::shared_ptr<session::KernelGrap
}
if (kernel_graph->is_dynamic_shape()) {
MS_LOG(WARNING) << "Dynamic shape skip fusion";
MS_LOG(INFO) << "Dynamic shape skip fusion";
return;
}
auto pre_build = common::GetEnv("MS_DISABLE_PREBUILD");

View File

@ -125,8 +125,8 @@ bool Somas::LoadSomasCache(const session::KernelGraph *graph) {
bool ret = CalcSomasModelHash(graph);
if (ret) {
std::string filename = GetSaveGraphsPathName(
"/somas_meta/somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json", save_graphs_path_);
std::string filename = Common::GetCompilerCachePath() + "/somas_meta/somas_graph_" +
std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json";
ret = LoadSomasResult(graph, filename);
if (ret) {
MS_LOG(INFO) << "Load Somas Cache file " << filename << " Successfully.";
@ -142,8 +142,8 @@ bool Somas::CalcSomasModelHash(const session::KernelGraph *graph) {
auto model_str = SomasInfo(true);
hash_id_ = std::to_string(std::hash<std::string>()(model_str));
MS_LOG(INFO) << "Graph " << graph->graph_id() << "'s SOMAS Model hash id is " << hash_id_;
std::string filename = GetSaveGraphsPathName(
"/somas_meta/somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".info", save_graphs_path_);
std::string filename = Common::GetCompilerCachePath() + "/somas_meta/somas_graph_" +
std::to_string(graph->graph_id()) + "_" + hash_id_ + ".info";
return Common::SaveStringToFile(filename, model_str);
}
@ -179,17 +179,13 @@ bool Somas::SaveSomasResult(const session::KernelGraph *graph) {
}
somas_json[kTensors] = tensors_json;
std::string filename = GetSaveGraphsPathName(
"/somas_meta/somas_graph" + std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json", save_graphs_path_);
std::string filename = Common::GetCompilerCachePath() + "/somas_meta/somas_graph_" +
std::to_string(graph->graph_id()) + "_" + hash_id_ + ".json";
(void)Common::SaveStringToFile(filename, somas_json.dump());
return true;
}
bool Somas::LoadSomasResult(const session::KernelGraph *graph, const string &filename) {
if (filename.length() <= strlen(".json")) {
MS_LOG(WARNING) << "please check somas cache file path.";
return false;
}
std::ifstream somas_json_fs(filename);
if (!somas_json_fs.is_open()) {
MS_LOG(INFO) << "Open json file: " << filename << " error, Somas Cache Missed.";
@ -200,7 +196,7 @@ bool Somas::LoadSomasResult(const session::KernelGraph *graph, const string &fil
somas_json_fs >> somas_json;
somas_json_fs.close();
} catch (std::exception &e) {
MS_LOG(WARNING) << "Parse json file error: " << filename << ", sleep 500ms and retry again.";
MS_LOG(INFO) << "Parse json file error: " << filename << ", sleep 500ms and retry again.";
somas_json_fs.close();
std::this_thread::sleep_for(std::chrono::milliseconds(kRetryIntervalSeconds));
std::ifstream retry_tmp(filename);

View File

@ -263,7 +263,7 @@ bool FastHeuristic::Eval(vector<BlockTensor> *block_tensors_v, const std::shared
if (p->Next() != nullptr) {
p = p->Next();
} else if (bpushed == false) { // something went wrong
MS_LOG(WARNING) << "Could not allocate memory for tensor: " << tensor->index_;
MS_LOG(WARNING) << "Internal Error: Could not allocate memory for tensor: " << tensor->index_;
return false;
}
}

View File

@ -1305,6 +1305,12 @@ void AscendSession::MemoryAlloc(KernelGraph *kernel_graph) const {
device::KernelAdjust::GetInstance().AssignLoopCtrlMemory(*kernel_graph);
MS_LOG(INFO) << "Status record: end memory alloc. graph id: " << kernel_graph->graph_id()
<< ", Memory Statistics:" << device::ascend::AscendMemAdapter::GetInstance().DevMemStatistics();
MS_LOG(INFO) << "The dynamic memory pool total size is "
<< device::ascend::AscendMemoryPool::GetInstance().total_mem_statistics() / kMBToByte
<< "M, total used size is "
<< device::ascend::AscendMemoryPool::GetInstance().used_mem_statistics() / kMBToByte
<< "M, used peak size is "
<< device::ascend::AscendMemoryPool::GetInstance().used_mem_peak_statistics() / kMBToByte << "M.";
}
void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors,

View File

@ -271,6 +271,31 @@ bool Common::FileExists(const std::string &filepath) {
return cache_file_existed;
}
std::string Common::GetUserDefineCachePath() {
  // Returns the user-defined compiler cache directory, read once from the
  // MS_COMPILER_CACHE_PATH environment variable and normalized to end with
  // '/'. Falls back to "./" when the variable is unset or empty. The result
  // is cached in a function-local static so the environment is only consulted
  // (and the directory only created) on the first call.
  static std::string config_path = "";
  if (config_path != "") {
    return config_path;
  }
  const char *value = ::getenv(kCOMPILER_CACHE_PATH);
  if (value == nullptr || value[0] == '\0') {
    // Unset OR set-but-empty variable: default to the current working
    // directory. The empty check also prevents the out-of-range access below
    // (config_path.back() on an empty string is undefined behavior; the
    // original indexed with length() - 1, which wraps around for length 0).
    config_path = "./";
  } else {
    config_path = std::string(value);
    FileUtils::CreateNotExistDirs(config_path);
    // Normalize: callers concatenate file names directly onto this path.
    if (config_path.back() != '/') {
      config_path += "/";
    }
  }
  return config_path;
}
std::string Common::GetCompilerCachePath() {
  // Full compiler-cache directory for this process:
  // <user-defined cache path>/rank_<rank id>/
  // Rank id is 0 in stand-alone mode, otherwise the distributed rank.
  // Built exactly once (thread-safe static initialization) and reused.
  static const std::string compile_cache_dir = []() {
    const uint32_t rank = IsStandAlone() ? 0 : GetRank();
    return GetUserDefineCachePath() + "rank_" + std::to_string(rank) + "/";
  }();
  return compile_cache_dir;
}
struct GlogLogDirRegister {
GlogLogDirRegister() {
const char *logtostderr = std::getenv("GLOG_logtostderr");

View File

@ -28,6 +28,8 @@ namespace mindspore {
static const int MAX_DIRECTORY_LENGTH = 1024;
static const int MAX_FILENAME_LENGTH = 128;
static const int MAX_OS_FILENAME_LENGTH = 255;
static const char kCOMPILER_CACHE_PATH[] = "MS_COMPILER_CACHE_PATH";
class Common {
public:
Common() = default;
@ -43,9 +45,11 @@ class Common {
static bool SaveStringToFile(const std::string filename, const std::string string_info);
static bool FileExists(const std::string &filepath);
static bool CommonFuncForConfigPath(const std::string &default_path, const std::string &env_path, std::string *value);
static std::string GetCompilerCachePath();
private:
static bool IsEveryFilenameValid(const std::string &path, size_t length_limit, const std::string &error_message);
static string GetUserDefineCachePath();
};
inline std::string GetSaveGraphsPathName(const std::string &file_name, const std::string &save_path = "") {

View File

@ -19,13 +19,14 @@
#include <algorithm>
#include "runtime/mem.h"
#include "utils/ms_context.h"
#include "utils/convert_utils_base.h"
#include "graphengine/inc/external/runtime/rt_error_codes.h"
namespace mindspore {
namespace device {
namespace ascend {
constexpr uint64_t kMemSizeGB = 30;
constexpr uint64_t kMaxAvailableMSHBMSizeThreshold = 30;
constexpr float kMSMemoryRatio = 0.9375; // 15/16
constexpr float kReservedMemoryRatio = 0.0625; // 1/16
bool AscendMemAdapter::Initialize() {
if (initialized_) {
@ -34,35 +35,44 @@ bool AscendMemAdapter::Initialize() {
size_t free_hbm_size = 0;
rtError_t ret = rtMemGetInfoEx(RT_MEMORYINFO_HBM, &free_hbm_size, &device_hbm_size_);
if (ret != RT_ERROR_NONE || device_hbm_size_ == 0) {
MS_LOG(EXCEPTION) << "Get Device HBM memory size failed, ret = " << ret << ", total HBM size :" << device_hbm_size_;
MS_LOG(EXCEPTION) << "Internal Error: Get Device HBM memory size failed, ret = " << ret
<< ", total HBM size :" << device_hbm_size_;
}
// reserved memory for HCCL or other component
auto reserved_mem_size_for_others = device_hbm_size_ * 15 / 16;
reserved_mem_size_for_others =
reserved_mem_size_for_others >= (1 << kMemSizeGB) ? (1 << kMemSizeGB) : reserved_mem_size_for_others;
max_available_ms_hbm_size_ = device_hbm_size_ - reserved_mem_size_for_others;
auto user_define_ms_size_ = GetDeviceMemSizeFromContext();
size_t default_hbm_size;
if (max_available_ms_hbm_size_ >= (kMaxAvailableMSHBMSizeThreshold << kMemSizeGB)) {
// Some HCCL case on 32GB Ascend device will Out Of Memory when reserved only 1GB memory for HCCL, so resize
// default HBM size for MindSpore to 30GB User can use context variable_memory_max_size="31GB" to Allocate 31GB
// HBM for MindSpore in some extreme case.
default_hbm_size = static_cast<size_t>(30) << kMemSizeGB;
MS_LOG(INFO) << "Default HBM size is resize to 30GB from Max available HBM Size for MindSpore "
<< max_available_ms_hbm_size_;
// get user define max backend memory
auto user_define_ms_size = GetDeviceMemSizeFromContext();
auto recommend_mem_size_for_others = FloatToSize(device_hbm_size_ * kReservedMemoryRatio);
size_t reserved_mem_size_for_others;
if (user_define_ms_size == 0) {
ms_used_hbm_size_ = FloatToSize(device_hbm_size_ * kMSMemoryRatio);
reserved_mem_size_for_others = device_hbm_size_ - user_define_ms_size;
} else {
default_hbm_size = max_available_ms_hbm_size_;
}
ms_used_hbm_size_ = user_define_ms_size_ == 0 ? default_hbm_size : user_define_ms_size_;
if (user_define_ms_size >= device_hbm_size_) {
MS_LOG(EXCEPTION)
<< "The Total Device Memory Size is " << (SizeToFloat(device_hbm_size_) / kGBToByte)
<< " GB, variable_memory_max_size/max_device_memory should be in range (0-"
<< (SizeToFloat(device_hbm_size_) / kGBToByte) << "]GB, but got "
<< (SizeToFloat(user_define_ms_size) / kGBToByte)
<< "GB, please set the context key 'variable_memory_max_size'/'max_device_memory' in valid range.";
}
ms_used_hbm_size_ = user_define_ms_size;
reserved_mem_size_for_others = device_hbm_size_ - ms_used_hbm_size_;
if (reserved_mem_size_for_others < recommend_mem_size_for_others) {
MS_LOG(WARNING) << "Reserved memory size for other components(" << reserved_mem_size_for_others
<< ") is less than recommend size(" << recommend_mem_size_for_others
<< "), It may lead to Out Of Memory in HCCL or other components, Please double check context key "
"'variable_memory_max_size'/'max_device_memory'";
}
}
MS_LOG(INFO) << "Device HBM Size:" << device_hbm_size_ / kMBToByte
<< "M, Reserved HBM size for Other Components(HCCL/rts/etc.):"
<< reserved_mem_size_for_others / kMBToByte
<< "M, Recommend Reserved HBM size for Other Components:" << recommend_mem_size_for_others / kMBToByte
<< "M, User define MindSpore HBM Size:" << user_define_ms_size / kGBToByte
<< "G, MindSpore Used HBM Size:" << ms_used_hbm_size_ / kMBToByte << "M.";
MS_LOG(INFO) << "Device HBM Size:" << device_hbm_size_
<< ", Reserved HBM size for Other Components(HCCL/rts/etc.):" << reserved_mem_size_for_others
<< ", Max available HBM Size for MindSpore:" << max_available_ms_hbm_size_
<< ", User define MindSpore HBM Size:" << user_define_ms_size_
<< ", Default MindSpore HBM Size:" << default_hbm_size
<< ", MindSpore Used HBM Size:" << ms_used_hbm_size_;
device_mem_base_addr_ = MallocFromRts(ms_used_hbm_size_);
static_mem_offset_ = ms_used_hbm_size_;
cur_dynamic_mem_offset_ = 0;
@ -103,11 +113,10 @@ uint8_t *AscendMemAdapter::MallocStaticDevMem(size_t size, std::string tag) {
std::lock_guard<std::mutex> locker(mutex_);
auto new_static_offset = static_mem_offset_ - size;
if (new_static_offset < max_dynamic_mem_offset_) {
MS_LOG(ERROR) << "Out of Memory!!! Request memory size: " << size << ", Memory Statistic:" << DevMemStatistics()
<< "Please try to reduce 'batch_size' or check whether exists extra large shape. More "
"details can be found in MindSpore's FAQ with keyword 'Out of Memory'.";
MS_LOG(ERROR) << DevMemDetailInfo();
return nullptr;
MS_LOG(INFO) << DevMemDetailInfo();
MS_LOG(EXCEPTION) << "Out of Memory!!! Request memory size: " << size << ", Memory Statistic:" << DevMemStatistics()
<< "\nPlease try to reduce 'batch_size' or check whether exists extra large shape. For more "
"details, please refer to 'Out of Memory' at https://www.mindspore.cn .";
}
auto memory_block_ptr = device_mem_base_addr_ + new_static_offset;
@ -121,11 +130,10 @@ uint8_t *AscendMemAdapter::MallocDynamicDevMem(size_t size, std::string tag) {
std::lock_guard<std::mutex> locker(mutex_);
auto new_dynamic_offset = cur_dynamic_mem_offset_ + size;
if (new_dynamic_offset > static_mem_offset_) {
MS_LOG(ERROR) << "Out of Memory!!! Request memory size: " << size << ", Memory Statistic:" << DevMemStatistics()
<< "Please try to reduce 'batch_size' or check whether exists extra large shape. More "
"details can be found in MindSpore's FAQ with keyword 'Out of Memory'.";
MS_LOG(ERROR) << DevMemDetailInfo();
return nullptr;
MS_LOG(INFO) << DevMemDetailInfo();
MS_LOG(EXCEPTION) << "Out of Memory!!! Request memory size: " << size << ", Memory Statistic:" << DevMemStatistics()
<< "\nPlease try to reduce 'batch_size' or check whether exists extra large shape. For more "
"details, please refer to 'Out of Memory' at https://www.mindspore.cn .";
}
auto memory_block_ptr = device_mem_base_addr_ + cur_dynamic_mem_offset_;
@ -140,12 +148,12 @@ void AscendMemAdapter::ResetDynamicMemory() { cur_dynamic_mem_offset_ = 0; }
std::string AscendMemAdapter::DevMemStatistics() {
std::ostringstream oss;
oss << "\nDevice HBM memory size: " << device_hbm_size_;
oss << "\nMindSpore Used memory size: " << ms_used_hbm_size_;
oss << "\nDevice HBM memory size: " << device_hbm_size_ / kMBToByte << "M";
oss << "\nMindSpore Used memory size: " << ms_used_hbm_size_ / kMBToByte << "M";
oss << "\nMindSpore memory base address: " << reinterpret_cast<void *>(device_mem_base_addr_);
oss << "\nTotal Static Memory size: " << ms_used_hbm_size_ - static_mem_offset_;
oss << "\nTotal Dynamic memory size: " << max_dynamic_mem_offset_;
oss << "\nDynamic memory size of this graph: " << cur_dynamic_mem_offset_;
oss << "\nTotal Static Memory size: " << (ms_used_hbm_size_ - static_mem_offset_) / kMBToByte << "M";
oss << "\nTotal Dynamic memory size: " << max_dynamic_mem_offset_ / kMBToByte << "M";
oss << "\nDynamic memory size of this graph: " << cur_dynamic_mem_offset_ / kMBToByte << "M";
oss << std::endl;
return oss.str();
}
@ -170,27 +178,28 @@ std::string AscendMemAdapter::DevMemDetailInfo() {
size_t AscendMemAdapter::GetDeviceMemSizeFromContext() {
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
auto variable_memory_max_size = context->get_param<std::string>(MS_CTX_VARIABLE_MEMORY_MAX_SIZE);
if (variable_memory_max_size == "0") {
return 0;
size_t size_from_context;
auto max_device_memory = context->get_param<float>(MS_CTX_MAX_DEVICE_MEMORY);
if (max_device_memory != mindspore::kDefaultMaxDeviceMemory) {
MS_LOG(INFO) << "context max_device_memory:" << max_device_memory;
size_from_context = FloatToSize(max_device_memory * kGBToByte);
} else {
auto variable_memory_max_size = context->get_param<std::string>(MS_CTX_VARIABLE_MEMORY_MAX_SIZE);
if (variable_memory_max_size == "0") {
return 0;
}
MS_LOG(INFO) << "context variable_memory_max_size:" << variable_memory_max_size;
auto pos = variable_memory_max_size.find('*');
if (pos == std::string::npos) {
MS_LOG(EXCEPTION) << "Invalid variable_memory_max_size";
}
auto gb_str = variable_memory_max_size.substr(0, pos);
auto gb_var = std::stoull(gb_str);
MS_LOG(INFO) << "variable_memory_max_size(GB):" << gb_var;
size_from_context = gb_var * kGBToByte;
}
MS_LOG(INFO) << "context variable_memory_max_size:" << variable_memory_max_size;
auto pos = variable_memory_max_size.find('*');
if (pos == std::string::npos) {
MS_LOG(EXCEPTION) << "Invalid variable_memory_max_size";
}
auto gb_str = variable_memory_max_size.substr(0, pos);
auto gb_var = std::stoull(gb_str);
MS_LOG(INFO) << "variable_memory_max_size(GB):" << gb_var;
auto max_hbm_size_for_ms_GB = max_available_ms_hbm_size_ >> kMemSizeGB;
if (gb_var > max_hbm_size_for_ms_GB || gb_var == 0) {
MS_LOG(EXCEPTION) << "The Total Device Memory Size is " << (device_hbm_size_ >> kMemSizeGB)
<< " GB, variable_memory_max_size should be in range (0-" << max_hbm_size_for_ms_GB
<< "]GB, but got " << gb_var
<< "GB, please set the context key 'variable_memory_max_size' in valid range.";
}
return gb_var << kMemSizeGB;
return size_from_context;
}
uint8_t *AscendMemAdapter::MallocFromRts(size_t size) {
@ -211,8 +220,9 @@ uint8_t *AscendMemAdapter::MallocFromRts(size_t size) {
MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << size << "] fail, ret[" << ret << "]";
}
} else {
MS_LOG(INFO) << "Call rtMalloc to allocate device memory Success, size : " << size
<< " bytes , address : " << reinterpret_cast<void *>(ptr);
MS_LOG(INFO) << "Call rtMalloc to allocate device memory Success, size: " << size
<< " bytes, address start: " << reinterpret_cast<void *>(ptr)
<< " end: " << reinterpret_cast<void *>(ptr + size);
}
return ptr;
}

View File

@ -19,6 +19,7 @@
#include "runtime/device/ascend/ascend_memory_adapter.h"
#include "runtime/mem.h"
#include "utils/log_adapter.h"
#include "utils/convert_utils_base.h"
namespace mindspore {
namespace device {
@ -31,6 +32,12 @@ static const size_t ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE_FOR_GRAPH = 8 << 20;
size_t AscendMemoryPool::CalMemBlockAllocSize(size_t size) {
auto device_free_mem_size = free_mem_size();
if (device_free_mem_size < size) {
MS_LOG(WARNING) << "The dynamic memory pool total size is "
<< device::ascend::AscendMemoryPool::GetInstance().total_mem_statistics() / kMBToByte
<< "M, total used size is "
<< device::ascend::AscendMemoryPool::GetInstance().used_mem_statistics() / kMBToByte
<< "M, used peak size is "
<< device::ascend::AscendMemoryPool::GetInstance().used_mem_peak_statistics() / kMBToByte << "M.";
MS_LOG(WARNING) << "Out of Memory. Request memory size: " << size
<< ", Memory Statistic:" << AscendMemAdapter::GetInstance().DevMemStatistics();
return 0;

View File

@ -23,6 +23,9 @@
#include "utils/log_adapter.h"
namespace mindspore {
const size_t kGBToByte = 1024 << 20;
const size_t kMBToByte = 1024 << 10;
inline int SizeToInt(size_t u) {
if (u > static_cast<size_t>((std::numeric_limits<int>::max)())) {
MS_LOG(EXCEPTION) << "The size_t value(" << u << ") exceeds the maximum value of int.";