!26675 Optimize oom log

Merge pull request !26675 from tanghuikang/oom_nullptr_log
This commit is contained in:
i-robot 2021-11-23 13:07:49 +00:00 committed by Gitee
commit df25ee8c68
4 changed files with 10 additions and 8 deletions

View File

@ -131,7 +131,7 @@ bool AscendPsCache::InitDevice(uint32_t device_id, const void *context) {
// Requests `size` from AscendMemoryPool via AllocTensorMem and returns the resulting address.
// A null result from the pool is reported through MS_LOG(EXCEPTION) before the return is reached
// (presumably MS_LOG(EXCEPTION) throws, so callers never receive nullptr; confirm against the macro).
void *AscendPsCache::MallocMemory(size_t size) {
const auto device_addr = device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(size);
if (device_addr == nullptr) {
// NOTE(review): the two statements below appear to be the removed and added versions of a single
// line in this diff view (message wording changed from "tensor memory" to "memory"); only one of
// them is expected to exist in the actual source file.
MS_LOG(EXCEPTION) << "Fail to alloc tensor memory, size: " << size;
MS_LOG(EXCEPTION) << "Fail to alloc memory, size: " << size;
}
return device_addr;
}
@ -139,13 +139,13 @@ void *AscendPsCache::MallocMemory(size_t size) {
bool AscendPsCache::MallocConstantMemory(size_t cache_vocab_size) {
offset_addr_ = reinterpret_cast<int *>(device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(sizeof(int)));
if (offset_addr_ == nullptr) {
MS_LOG(EXCEPTION) << "Fail to alloc tensor memory, size: " << sizeof(int);
MS_LOG(EXCEPTION) << "Fail to alloc memory, size: " << sizeof(int);
}
rtMemset(offset_addr_, sizeof(int), 0, sizeof(int));
cache_vocab_size_addr_ =
reinterpret_cast<int *>(device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(sizeof(int)));
if (cache_vocab_size_addr_ == nullptr) {
MS_LOG(EXCEPTION) << "Fail to alloc tensor memory, size: " << sizeof(int);
MS_LOG(EXCEPTION) << "Fail to alloc memory, size: " << sizeof(int);
}
int copy_value = SizeToInt(cache_vocab_size);
if (!CopyHostMemToDevice(cache_vocab_size_addr_, &copy_value, sizeof(int))) {

View File

@ -28,7 +28,7 @@ size_t AscendLaunchKernel::AlignSizeForLaunchKernel(size_t size) { return Memory
// Requests `size` from AscendMemoryPool via AllocTensorMem and returns the address as uint8_t *.
// A null result from the pool is reported through MS_LOG(EXCEPTION) before the cast and return
// (presumably MS_LOG(EXCEPTION) throws; confirm against the macro).
uint8_t *AscendLaunchKernel::AllocDeviceMem(size_t size) {
auto device_memory = AscendMemoryPool::GetInstance().AllocTensorMem(size);
if (device_memory == nullptr) {
// NOTE(review): the two statements below appear to be the removed and added versions of a single
// line in this diff view (message wording changed from "tensor memory" to "memory"); only one of
// them is expected to exist in the actual source file.
MS_LOG(EXCEPTION) << "Fail to alloc tensor memory, size: " << size;
MS_LOG(EXCEPTION) << "Fail to alloc memory, size: " << size;
}
// The pool's return value is converted with static_cast to the byte-pointer type this function returns.
return static_cast<uint8_t *>(device_memory);
}

View File

@ -50,7 +50,7 @@ void *AscendMemoryManager::MallocMemFromMemPool(size_t size) {
auto align_size = GetCommonAlignSize(size);
const auto device_addr = AscendMemoryPool::GetInstance().AllocTensorMem(align_size);
if (device_addr == nullptr) {
MS_LOG(EXCEPTION) << "Fail to alloc tensor memory, size: " << align_size
MS_LOG(EXCEPTION) << "Fail to alloc memory, size: " << align_size
<< ", memory statistics:" << AscendMemAdapter::GetInstance().DevMemStatistics();
}
return device_addr;
@ -83,7 +83,7 @@ uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_me
uint8_t *alloc_address = reinterpret_cast<uint8_t *>(AscendMemoryPool::GetInstance().AllocTensorMem(align_size));
if (alloc_address == nullptr) {
MS_LOG(EXCEPTION) << "Fail to alloc tensor memory, size: " << align_size
MS_LOG(EXCEPTION) << "Fail to alloc memory, size: " << align_size
<< ", memory statistics:" << AscendMemAdapter::GetInstance().DevMemStatistics();
}
// create protect area [kMemAlignSize -- data -- kMemAlignSize] for communication node memory
@ -121,7 +121,7 @@ uint8_t *AscendMemoryManager::MallocCommunicationMemFromMemPool(size_t size) {
auto align_size = GetCommunicationAlignSize(size);
uint8_t *base_ptr = reinterpret_cast<uint8_t *>(AscendMemoryPool::GetInstance().AllocTensorMem(align_size));
if (base_ptr == nullptr) {
MS_LOG(EXCEPTION) << "Fail to alloc tensor memory, size: " << align_size
MS_LOG(EXCEPTION) << "Fail to alloc memory, size: " << align_size
<< ", memory statistics:" << AscendMemAdapter::GetInstance().DevMemStatistics();
}
return base_ptr + kMemAlignSize;

View File

@ -32,7 +32,9 @@ size_t AscendMemoryPool::CalMemBlockAllocSize(size_t size) {
auto device_free_mem_size = free_mem_size();
if (device_free_mem_size < size) {
MS_LOG(WARNING) << "Out of Memory. Request memory size: " << size
<< ", Memory Statistic:" << AscendMemAdapter::GetInstance().DevMemStatistics();
<< ", Memory Statistic:" << AscendMemAdapter::GetInstance().DevMemStatistics()
<< "Please try to reduce 'batch_size' or check whether exists extra large shape. More "
"details can be found in MindSpore's FAQ with keyword 'Out of Memory'.";
return 0;
}
auto alloc_mem_size = ASCEND_DYNAMIC_MEM_ALLOC_UNIT_SIZE;