add mem analyzer for swap

kswang 2023-01-19 10:52:53 +08:00
parent e12c672e95
commit 0a7533fe35
10 changed files with 462 additions and 10 deletions

View File

@@ -1,4 +1,4 @@
file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc"
file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc" "gsm/*.cc"
"kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc"
"memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc" "memory_scheduler.cc"
"memory_offload_strategy.cc" "launch_kernel.cc" "launch_mul.cc" "tensor_array.cc"

View File

@@ -0,0 +1,202 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/gsm/mem_usage_analyzer.h"
#include <memory>
#include "backend/common/session/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
namespace mindspore {
namespace device {
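// Register the tensor behind (node, index) and return its tensor id. Value-node, parameter,
// output and workspace tensors are deduplicated through separate lookup maps so that the same
// device address is only recorded once.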
size_t MemUsageAnalyzer::AddTensorInfo(const AnfNodePtr &node, size_t index, bool is_workspace) {
auto add_to_container = [this](const AnfNodePtr &node, size_t index,
std::map<AnfNodePtr, std::map<size_t, size_t>> *container, bool is_workspace) {
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(container);
auto iter_node = container->find(node);
if (iter_node != container->end()) {
auto iter_tid = iter_node->second.find(index);
if (iter_tid == iter_node->second.end()) {
iter_node->second[index] = tensor_num_;
} else {
return iter_tid->second;
}
} else {
(*container)[node] = std::map<size_t, size_t>({{index, tensor_num_}});
}
DeviceAddressPtr address = nullptr;
if (is_workspace) {
address = AnfAlgo::GetMutableWorkspaceAddr(node, index);
} else {
address = AnfAlgo::GetMutableOutputAddr(node, index, true);
}
MS_EXCEPTION_IF_NULL(address);
auto info = std::make_shared<MemUsageTensorInfo>();
info->tensor_id_ = tensor_num_;
info->real_tensor_id_ = tensor_num_;
info->tensor_size_ = address->GetSize();
info->node_ = node;
info->index_ = index;
info->is_workspace_ = is_workspace;
info->is_graph_input_ = !(node->isa<CNode>());
info->is_graph_output_ = IsGraphOutput(node, index);
(void)tensor_infos_.emplace_back(info);
++tensor_num_;
return info->tensor_id_;
};
MS_EXCEPTION_IF_NULL(node);
size_t tensor_id = 0;
if (node->isa<ValueNode>()) {
tensor_id = add_to_container(node, index, &kernel_input_value_tid_, false);
} else if (node->isa<Parameter>()) {
tensor_id = add_to_container(node, index, &kernel_input_param_tid_, false);
} else if (is_workspace) {
tensor_id = add_to_container(node, index, &kernel_workspace_tid_, true);
} else {
tensor_id = add_to_container(node, index, &kernel_output_tid_, false);
}
return tensor_id;
}
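// Entry point: collect graph outputs first, then per-kernel tensor usage, and finally the
// fused tensors of communication kernels.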
void MemUsageAnalyzer::Analyze(const KernelGraphPtr &graph) {
AddOutputNodeInfo(graph);
AddKernelAndTensorInfo(graph);
AddFusedTensorInfo();
}
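// Record which (node, output index) pairs are graph outputs, resolving nop nodes to the real
// producer of the data.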
void MemUsageAnalyzer::AddOutputNodeInfo(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
auto outputs = common::AnfAlgo::GetAllOutputWithIndex(graph->output());
for (const auto &output : outputs) {
const auto &output_with_index = common::AnfAlgo::FetchRealNodeSkipMonadControl(output);
auto output_node = output_with_index.first;
MS_EXCEPTION_IF_NULL(output_node);
auto output_index = output_with_index.second;
if (common::AnfAlgo::IsNopNode(output_node)) {
auto real_node_with_index = common::AnfAlgo::GetPrevNodeOutput(output_node, output_index, true);
output_node = real_node_with_index.first;
output_index = real_node_with_index.second;
}
(void)graph_output_nodes_[output_node].insert(output_index);
}
}
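// Whether the tensor produced at (node, index) is a graph output.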
bool MemUsageAnalyzer::IsGraphOutput(const AnfNodePtr &node, size_t index) {
auto iter = graph_output_nodes_.find(node);
if (iter == graph_output_nodes_.end()) {
return false;
}
if (iter->second.find(index) == iter->second.end()) {
return false;
}
return true;
}
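// Group the inputs and the outputs of every communication kernel into an extra fused tensor
// whose size is the sum of its members, so that the group can be swapped as one unit.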
void MemUsageAnalyzer::AddFusedTensorInfo() {
auto add_fused_tensor = [this](const std::vector<size_t> &tensors, size_t kernel_id) {
if (tensors.size() <= 1) {
return;
}
auto info = std::make_shared<MemUsageTensorInfo>();
info->tensor_id_ = tensor_num_;
info->real_tensor_id_ = tensor_num_;
info->tensor_size_ = 0;
info->node_ = nullptr;
info->index_ = 0;
(void)tensor_infos_.emplace_back(info);
++tensor_num_;
for (auto tensor_id : tensors) {
auto tensor_info = GetMemUsageTensorInfo(tensor_id);
tensor_info->real_tensor_id_ = info->tensor_id_;
info->tensor_size_ += tensor_info->tensor_size_;
(void)info->fused_tensor_ids_.emplace_back(tensor_info->tensor_id_);
(void)info->used_by_kernels_.emplace_back(kernel_id);
}
};
for (size_t i = 0; i < kernel_infos_.size(); ++i) {
auto &info = kernel_infos_[i];
MS_EXCEPTION_IF_NULL(info);
if (!info->is_comm_) {
continue;
}
add_fused_tensor(info->input_tensors_, i);
add_fused_tensor(info->output_tensors_, i);
}
}
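// Walk the execution order, register the input/output/workspace tensors of every kernel and
// keep the largest single-kernel footprint as the least device memory required.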
void MemUsageAnalyzer::AddKernelAndTensorInfo(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
auto &exec_order = graph->execution_order();
auto real_kernel_num = exec_order.size();
kernel_infos_.resize(real_kernel_num);
auto add_tensor_usage = [this](size_t tensor_id, size_t kernel_id, size_t *kernel_mem) {
auto tensor_info = GetMemUsageTensorInfo(tensor_id);
(void)tensor_info->used_by_kernels_.emplace_back(kernel_id);
*kernel_mem += tensor_info->tensor_size_;
};
for (size_t i = 0; i < real_kernel_num; ++i) {
const auto &node = exec_order[i];
auto kernel_mod = AnfAlgo::GetKernelMod(node);
MS_EXCEPTION_IF_NULL(kernel_mod);
auto kernel_info = std::make_shared<MemUsageKernelInfo>();
kernel_info->is_comm_ = common::AnfAlgo::IsCommunicationOp(node);
kernel_info->update_input_ = common::AnfAlgo::IsUpdateParameterKernel(node);
// Memory used by this kernel
size_t kernel_mem = 0;
// Add input tensors
const auto input_num = kernel_mod->GetInputSizeList().size();
for (size_t index = 0; index < input_num; ++index) {
const auto &prev_node_output = common::AnfAlgo::GetPrevNodeOutput(node, index, true);
auto tensor_id = AddTensorInfo(prev_node_output.first, prev_node_output.second);
(void)kernel_info->input_tensors_.emplace_back(tensor_id);
add_tensor_usage(tensor_id, i, &kernel_mem);
}
// Add output tensors
const auto output_num = kernel_mod->GetOutputSizeList().size();
for (size_t index = 0; index < output_num; ++index) {
auto tensor_id = AddTensorInfo(node, index);
(void)kernel_info->output_tensors_.emplace_back(tensor_id);
add_tensor_usage(tensor_id, i, &kernel_mem);
}
// Add workspace tensors
const auto workspace_num = kernel_mod->GetWorkspaceSizeList().size();
for (size_t index = 0; index < workspace_num; ++index) {
auto tensor_id = AddTensorInfo(node, index, true);
(void)kernel_info->workspace_tensors_.emplace_back(tensor_id);
add_tensor_usage(tensor_id, i, &kernel_mem);
}
if (kernel_mem > least_mem_) {
least_mem_ = kernel_mem;
}
kernel_infos_[i] = kernel_info;
}
}
} // namespace device
} // namespace mindspore

View File

@@ -0,0 +1,71 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GSM_MEM_USAGE_ANALYZER_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GSM_MEM_USAGE_ANALYZER_H_
#include <memory>
#include <vector>
#include <map>
#include <set>
#include "backend/common/session/kernel_graph.h"
#include "runtime/device/gsm/swap_strategy.h"
namespace mindspore {
namespace device {
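// Analyzes a kernel graph and produces the per-kernel and per-tensor memory usage
// information consumed when building a swap strategy.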
class MemUsageAnalyzer {
public:
MemUsageAnalyzer() = default;
~MemUsageAnalyzer() = default;
void Analyze(const KernelGraphPtr &graph);
const std::vector<std::shared_ptr<MemUsageKernelInfo>> &GetMemUsageKernelInfos() const { return kernel_infos_; }
const std::vector<std::shared_ptr<MemUsageTensorInfo>> &GetMemUsageTensorInfos() const { return tensor_infos_; }
size_t LeastMemNeeded() const { return least_mem_; }
const std::shared_ptr<MemUsageKernelInfo> GetMemUsageKernelInfo(size_t kid) const {
if (kid >= kernel_infos_.size()) {
MS_LOG(EXCEPTION) << "Invalid kernel id!!!";
}
return kernel_infos_[kid];
}
const std::shared_ptr<MemUsageTensorInfo> GetMemUsageTensorInfo(size_t tid) const {
if (tid >= tensor_infos_.size()) {
MS_LOG(EXCEPTION) << "Invalid tensor id!!!";
}
return tensor_infos_[tid];
}
private:
void AddOutputNodeInfo(const KernelGraphPtr &graph);
void AddKernelAndTensorInfo(const KernelGraphPtr &graph);
size_t AddTensorInfo(const AnfNodePtr &node, size_t index, bool is_workspace = false);
void AddFusedTensorInfo();
bool IsGraphOutput(const AnfNodePtr &node, size_t index);
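  // Deduplication maps from node to (output/workspace index -> tensor id).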
std::map<AnfNodePtr, std::map<size_t, size_t>> kernel_input_value_tid_;
std::map<AnfNodePtr, std::map<size_t, size_t>> kernel_input_param_tid_;
std::map<AnfNodePtr, std::map<size_t, size_t>> kernel_output_tid_;
std::map<AnfNodePtr, std::map<size_t, size_t>> kernel_workspace_tid_;
std::map<AnfNodePtr, std::set<size_t>> graph_output_nodes_;
std::vector<std::shared_ptr<MemUsageTensorInfo>> tensor_infos_;
std::vector<std::shared_ptr<MemUsageKernelInfo>> kernel_infos_;
size_t tensor_num_{0};
size_t least_mem_{0};
};
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_GSM_MEM_USAGE_ANALYZER_H_

View File

@@ -0,0 +1,86 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GSM_SWAP_STRATEGY_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GSM_SWAP_STRATEGY_H_
#include <vector>
#include <map>
#include <memory>
#include "ir/anf.h"
namespace mindspore {
namespace device {
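// Memory usage of a single tensor (graph input, kernel output, workspace or fused tensor).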
struct MemUsageTensorInfo {
size_t tensor_id_{0};
size_t real_tensor_id_{0};
size_t tensor_size_{0};
AnfNodePtr node_{nullptr};
size_t index_{0};
bool is_workspace_{false};
bool is_graph_output_{false};
bool is_graph_input_{false};
std::vector<size_t> used_by_kernels_;
std::vector<size_t> fused_tensor_ids_;
};
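// Memory usage of a single kernel, referencing its tensors by id.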
struct MemUsageKernelInfo {
bool is_comm_{false};
bool update_input_{false};
std::vector<size_t> input_tensors_;
std::vector<size_t> output_tensors_;
std::vector<size_t> workspace_tensors_;
};
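// Direction of a swap between device memory (HBM), host memory (DDR) and disk.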
enum class SwapActionType {
kUnDefined,
kHBM2DDR,
kHBM2DISK,
kDDR2HBM,
kDISK2HBM,
kDDR2DISK,
kDISK2DDR,
kAllocHBM,
};
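// A single swap operation applied to one tensor.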
struct TensorAction {
SwapActionType action_;
size_t tensor_id_{0};
// Avoid copying if the data already exists in the target storage and is not updated by the kernel
bool avoid_copy_{false};
};
struct SwapAction {
std::vector<std::shared_ptr<TensorAction>> actions_;
};
struct SwapLink {
SwapLink(size_t from, size_t to) : from_(from), to_(to) {}
~SwapLink() = default;
size_t from_{0};
size_t to_{0};
};
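// Overall swap plan: indexed graph nodes plus the swap actions inserted between them,
// connected by from/to links.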
struct SwapStrategy {
size_t kernel_num_{0};
size_t virtual_node_num_{0};
std::map<size_t, AnfNodePtr> nodes_;
std::map<size_t, std::shared_ptr<SwapAction>> actions_;
std::vector<std::shared_ptr<SwapLink>> links_;
std::vector<std::shared_ptr<MemUsageTensorInfo>> tensor_infos_;
std::vector<std::shared_ptr<MemUsageKernelInfo>> kernel_infos_;
};
} // namespace device
} // namespace mindspore
#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_GSM_SWAP_STRATEGY_H_

View File

@@ -96,6 +96,7 @@ if(MSLITE_ENABLE_CLOUD_FUSION_INFERENCE OR MSLITE_ENABLE_CLOUD_INFERENCE)
${CCSRC_DIR}/runtime/device/memory_offload_strategy.cc
${CCSRC_DIR}/runtime/device/memory_manager.cc
${CCSRC_DIR}/runtime/device/auto_mem_offload.cc
${CCSRC_DIR}/runtime/device/gsm/mem_usage_analyzer.cc
${CCSRC_DIR}/runtime/device/common_somas_allocator.cc
${CCSRC_DIR}/runtime/pynative/op_runtime_info.cc
${CCSRC_DIR}/runtime/hardware/device_type.cc

View File

@@ -77,6 +77,7 @@ if(ENABLE_MINDDATA)
./tbe/*.cc
./mindapi/*.cc
./runtime/graph_scheduler/*.cc
./runtime/device/gsm/*.cc
./plugin/device/cpu/hal/*.cc
./place/*.cc
./ops/test_ops_fake_quant_param.cc

View File

@@ -134,7 +134,7 @@ std::shared_ptr<session::KernelGraph> BackendCommon::Compile(const FuncGraphPtr
func_graph->set_manager(new_manager);
const std::string kDefaultDeviceName = "CPU";
auto graph_partition = std::make_shared<compile::GraphPartition>(compile::GetMsNonlinearOps(), kDefaultDeviceName);
auto graph_partition = std::make_shared<compile::GraphPartition>(compile::GetMsNonlinearOps(), kMsConvert);
bool multi_target = false;
auto segments = graph_partition->Partition(func_graph, &multi_target);
if (segments.empty()) {

View File

@@ -80,7 +80,7 @@ class TestDeviceResManager : public device::DeviceResManager {
virtual DeviceAddressPtr CreateDeviceAddress(void *const device_ptr, size_t device_size, const string &format,
TypeId type_id, const ShapeVector &shape,
const UserDataPtr &user_data = nullptr) const {
return std::make_shared<TestDeviceAddress>(nullptr, 0);
return std::make_shared<TestDeviceAddress>(device_ptr, device_size);
}
};
@@ -128,13 +128,13 @@ class TestKernelExecutor : public device::KernelExecutor {
std::vector<size_t> output_size_list;
size_t input_num = common::AnfAlgo::GetInputTensorNum(node);
for (size_t input_index = 0; input_index < input_num; ++input_index) {
TypeId type_id = AnfAlgo::GetInputDeviceDataType(node, input_index);
size_t type_size = GetTypeByte(TypeIdToType(type_id));
auto shape = AnfAlgo::GetInputDeviceShape(node, input_index);
size_t tensor_size =
shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
tensor_size = std::max(tensor_size, type_size);
auto [input_node, index] = common::AnfAlgo::GetPrevNodeOutput(node, input_index, true);
size_t tensor_size = AnfAlgo::GetOutputTensorMemSize(input_node, index);
(void)input_size_list.emplace_back(tensor_size);
if (AnfAlgo::OutputAddrExist(input_node, index)) {
continue;
}
AnfAlgo::SetOutputAddr(std::make_shared<TestDeviceAddress>(nullptr, tensor_size), index, input_node.get());
}
size_t output_num = AnfAlgo::GetOutputTensorNum(node);
for (size_t output_index = 0; output_index < output_num; ++output_index) {
@@ -143,11 +143,13 @@ class TestKernelExecutor : public device::KernelExecutor {
AnfAlgo::SetOutputAddr(std::make_shared<TestDeviceAddress>(nullptr, tensor_size), output_index, node.get());
}
const size_t kDefaultWorkSpaceSize = 4;
auto kernel_mod_ptr = std::make_shared<TestKernelMod>();
kernel_mod_ptr->SetInputSizeList(input_size_list);
kernel_mod_ptr->SetOutputSizeList(output_size_list);
kernel_mod_ptr->SetWorkspaceSizeList({4});
kernel_mod_ptr->SetWorkspaceSizeList({kDefaultWorkSpaceSize});
AnfAlgo::SetKernelMod(kernel_mod_ptr, node.get());
AnfAlgo::SetWorkspaceAddr(std::make_shared<TestDeviceAddress>(nullptr, kDefaultWorkSpaceSize), 0, node.get());
}
}
};

View File

@@ -0,0 +1,27 @@
# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
from mindspore.ops import operations as P
add = P.Add()
addn = P.AddN()
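# Network consumed by the MemUsageAnalyzer unit test: four chained Add ops followed by an AddN.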
def add_net(x1, x2, x3, x4, x5):
sum1 = add(x1, x2)
sum2 = add(sum1, x3)
sum3 = add(sum2, x4)
sum4 = add(sum3, x5)
ret = addn((sum4, sum1, sum2))
return ret

View File

@@ -0,0 +1,62 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vector>
#include <map>
#include "common/common_test.h"
#include "common/backend_common_test.h"
#include "common/py_func_graph_fetcher.h"
#include "runtime/device/gsm/mem_usage_analyzer.h"
namespace mindspore::device {
class TestMemUsageAnalyzer : public BackendCommon {
public:
TestMemUsageAnalyzer() : get_py_func_("gtest_input.runtime.device.gsm.mem_usage_analyzer_test", true) {}
UT::PyFuncGraphFetcher get_py_func_;
};
/// Feature: MemUsageAnalyzer
/// Description: Test MemUsageAnalyzer interface
/// Expectation: Pass all interface tests
TEST_F(TestMemUsageAnalyzer, test_mem_usage_analyzer) {
auto net = get_py_func_("add_net");
EXPECT_NE(net, nullptr);
std::vector<int64_t> shp_x{1, 2, 2, 2};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
AbstractBasePtrList args_spec_list{x_abstract, x_abstract, x_abstract, x_abstract, x_abstract};
auto func_graph = GetFuncGraph(net, args_spec_list);
auto kernel_graph = Compile(func_graph);
auto analyzer = std::make_shared<MemUsageAnalyzer>();
analyzer->Analyze(kernel_graph);
auto kernel_infos = analyzer->GetMemUsageKernelInfos();
auto tensor_infos = analyzer->GetMemUsageTensorInfos();
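  // 5 kernels (4 Add + 1 AddN); 15 tensors = 5 graph inputs + 5 kernel outputs + 5 workspaces.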
ASSERT_EQ(5, kernel_infos.size());
ASSERT_EQ(15, tensor_infos.size());
for (size_t i = 0; i < kernel_infos.size(); ++i) {
ASSERT_NE(nullptr, analyzer->GetMemUsageKernelInfo(i));
}
for (size_t i = 0; i < tensor_infos.size(); ++i) {
ASSERT_NE(nullptr, analyzer->GetMemUsageTensorInfo(i));
}
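  // The largest single-kernel footprint comes from AddN: (3 inputs + 1 output) * 32 bytes of
  // float32 data plus the 4-byte default workspace = 132 bytes.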
ASSERT_EQ(132, analyzer->LeastMemNeeded());
}
} // namespace mindspore::device