!17434 dynamic shape for unify runtime

From: @zyli2020
Reviewed-by: @limingqi107, @cristoval
Signed-off-by: @cristoval
mindspore-ci-bot authored 2021-06-02 16:38:08 +08:00, committed by Gitee
commit 9aae606590
11 changed files with 127 additions and 23 deletions

View File

@@ -1684,7 +1684,7 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ
}
const compile::ActorInfo &actor_info =
mind_rt_backend->CompileGraph(op_run_info, graph_info, &tensors_mask, &input_tensors);
-outputs = mind_rt_backend->RunGraph(actor_info, &tensors_mask, &input_tensors);
+outputs = mind_rt_backend->RunGraph(actor_info, &op_run_info, &tensors_mask, &input_tensors);
}
if (op_exec_info->is_dynamic_shape) {

View File

@@ -67,6 +67,7 @@ class DeviceAddress : public mindspore::DeviceSync {
virtual ~DeviceAddress() { ptr_ = nullptr; }
const void *GetPtr() const { return ptr_; }
size_t GetSize() const { return size_; }
+void SetSize(size_t size) { size_ = size; }
std::string format() const { return format_; }
TypeId type_id() const { return type_id_; }
void set_host_shape(const ShapeVector &shape) { host_shape_ = shape; }

View File

@@ -72,6 +72,11 @@ void KernelActor::RunOpData(OpData<DeviceTensor> *input_data, OpContext<DeviceTe
input_op_datas_[sequential_num].emplace_back(input_data);
// When all the inputs are collected, then allocate memory and callback launch.
if (CheckLaunchCondition(context)) {
+// Infer kernel shape and update abstract info for dynamic shape kernel.
+if (AnfAlgo::IsDynamicShape(kernel_)) {
+device_context_->UpdateKernelDynamicShape(kernel_);
+}
FetchInputDeviceTensor(context);
FetchOutputDeviceTensor();
SendMemoryAllocReq(context);
@@ -84,6 +89,11 @@ void KernelActor::RunOpControl(AID *input_control, OpContext<DeviceTensor> *cont
input_op_controls_[sequential_num].emplace_back(input_control);
// When all the inputs are collected, then allocate memory and callback launch.
if (CheckLaunchCondition(context)) {
+// Infer kernel shape and update abstract info for dynamic shape kernel.
+if (AnfAlgo::IsDynamicShape(kernel_)) {
+device_context_->UpdateKernelDynamicShape(kernel_);
+}
FetchInputDeviceTensor(context);
FetchOutputDeviceTensor();
SendMemoryAllocReq(context);
@@ -221,9 +231,19 @@ void KernelActor::FetchInputDeviceTensor(OpContext<DeviceTensor> *context) {
void KernelActor::FetchOutputDeviceTensor() {
MS_EXCEPTION_IF_NULL(kernel_info_);
-auto &output_addresss = kernel_info_->output_address_list();
-for (size_t i = 0; i < output_addresss.size(); ++i) {
-auto output_address = output_addresss[i].get();
+auto &output_addresses = kernel_info_->output_address_list();
+const auto &kernel_mod = kernel_info_->kernel_mod();
+MS_EXCEPTION_IF_NULL(kernel_mod);
+const auto &output_size_list = kernel_mod->GetOutputSizeList();
+for (size_t i = 0; i < output_addresses.size(); ++i) {
+auto output_address = output_addresses[i].get();
+if (output_size_list[i] != output_address->GetSize()) {
+// The size of output address may be changed in dynamic shape scenario.
+output_address->SetSize(output_size_list[i]);
+}
+// When the tensor is the output of graph or in dynamic shape scenario, the output tensor may be changed.
if (output_device_tensors_[i] != output_address) {
output_device_tensors_[i] = output_address;
memory_alloc_list_[i] = output_address;
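
The hunk above is the core runtime fix for dynamic shape: after shapes are re-inferred, the output sizes reported by the kernel's KernelMod may no longer match the sizes cached on the output DeviceAddress objects, so the actor copies the fresh sizes over before memory is allocated. Below is a minimal standalone sketch of that size-sync step, using toy stand-in types rather than the real MindSpore classes.

#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

// Toy stand-ins for DeviceAddress and KernelMod; only the members needed for
// the size-sync step are modeled.
struct ToyDeviceAddress {
  size_t size = 0;
  size_t GetSize() const { return size; }
  void SetSize(size_t s) { size = s; }
};

struct ToyKernelMod {
  std::vector<size_t> output_sizes;
  const std::vector<size_t> &GetOutputSizeList() const { return output_sizes; }
};

// Mirrors the logic added to KernelActor::FetchOutputDeviceTensor: refresh each
// cached output address size when dynamic shape inference changed it.
void SyncOutputSizes(const ToyKernelMod &kernel_mod,
                     std::vector<std::shared_ptr<ToyDeviceAddress>> *output_addresses) {
  const auto &output_size_list = kernel_mod.GetOutputSizeList();
  for (size_t i = 0; i < output_addresses->size(); ++i) {
    auto *output_address = (*output_addresses)[i].get();
    if (output_size_list[i] != output_address->GetSize()) {
      output_address->SetSize(output_size_list[i]);  // size changed after re-inference
    }
  }
}

int main() {
  ToyKernelMod kernel_mod{{4096, 256}};  // sizes reported after re-inferring a dynamic shape
  std::vector<std::shared_ptr<ToyDeviceAddress>> addresses{
      std::make_shared<ToyDeviceAddress>(), std::make_shared<ToyDeviceAddress>()};
  addresses[0]->SetSize(1024);  // stale size from the previous launch
  addresses[1]->SetSize(256);
  SyncOutputSizes(kernel_mod, &addresses);
  std::cout << addresses[0]->GetSize() << " " << addresses[1]->GetSize() << "\n";  // prints: 4096 256
  return 0;
}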

View File

@@ -153,7 +153,7 @@ std::vector<AnfNodePtr> ControlNodeParser::FetchFuncGraphOutput(const FuncGraphP
std::vector<AnfNodePtr> *call_nodes) {
std::vector<AnfNodePtr> outputs;
const auto &output = func_graph->output();
-const auto &real_output = AnfAlgo::VisitKernelWithReturnType(output, 0);
+const auto &real_output = AnfAlgo::VisitKernelWithReturnType(output, 0, false, {prim::kPrimTupleGetItem});
if (find((*call_nodes).begin(), (*call_nodes).end(), real_output.first) != (*call_nodes).end()) {
return outputs;
}

View File

@@ -47,6 +47,7 @@ class CPUDeviceContext : public DeviceContext {
void SetOperatorInfo(const std::vector<CNodePtr> &nodes) const override;
void CreateKernel(const std::vector<CNodePtr> &nodes) const override;
bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs) const override;

View File

@@ -86,6 +86,13 @@ class DeviceContext {
// 'KernelMod' is real executive object of kernel.
virtual void CreateKernel(const std::vector<CNodePtr> &nodes) const = 0;
+// Infer kernel shape and update abstract info for dynamic shape kernel.
+virtual void UpdateKernelDynamicShape(const CNodePtr &kernel) const {
+if (AnfAlgo::IsDynamicShape(kernel)) {
+AnfAlgo::InferShape(kernel);
+}
+}
// Launch a kernel via 'KernelMod' of the kernel.
virtual bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs) const = 0;
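
The new virtual UpdateKernelDynamicShape above gives every DeviceContext a hook that KernelActor can call before memory allocation: the base class only re-runs shape inference, while a backend such as the GPU context overrides it (see the GPU hunks below) to also refresh the kernel's launch arguments. A minimal sketch of that override pattern follows, with toy types standing in for the MindSpore classes.

#include <iostream>
#include <memory>

// Toy kernel node; stands in for CNodePtr.
struct ToyKernel {
  bool is_dynamic_shape = false;
};

// Base context: the default hook only re-infers shapes for dynamic shape kernels.
class ToyDeviceContext {
 public:
  virtual ~ToyDeviceContext() = default;
  virtual void UpdateKernelDynamicShape(const std::shared_ptr<ToyKernel> &kernel) const {
    if (kernel->is_dynamic_shape) {
      std::cout << "base: infer shape only\n";
    }
  }
};

// GPU-like context: additionally refreshes the kernel's launch arguments.
class ToyGPUDeviceContext : public ToyDeviceContext {
 public:
  void UpdateKernelDynamicShape(const std::shared_ptr<ToyKernel> &kernel) const override {
    if (kernel->is_dynamic_shape) {
      std::cout << "gpu: infer shape + update kernel args\n";
    }
  }
};

int main() {
  auto kernel = std::make_shared<ToyKernel>();
  kernel->is_dynamic_shape = true;
  std::unique_ptr<ToyDeviceContext> context = std::make_unique<ToyGPUDeviceContext>();
  // The actor only sees the base interface, just as KernelActor calls
  // device_context_->UpdateKernelDynamicShape(kernel_) before allocating memory.
  context->UpdateKernelDynamicShape(kernel);
  return 0;
}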

View File

@@ -34,6 +34,7 @@
#include "profiler/device/gpu/gpu_profiling.h"
#include "profiler/device/gpu/gpu_profiling_utils.h"
#include "backend/session/kernel_graph.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"
namespace mindspore {
namespace device {
@@ -226,6 +227,7 @@ void GPUDeviceContext::OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph)
void GPUDeviceContext::FuseOperators(const KernelGraphPtr &graph) const {
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
+pm->AddPass(std::make_shared<opt::MatMulBiasAddFusion>());
pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>());
pm->AddPass(std::make_shared<opt::AdamFusion>());
pm->AddPass(std::make_shared<opt::ApplyMomentumWeightDecayScaleFusion>());
@@ -238,6 +240,7 @@ void GPUDeviceContext::FuseOperators(const KernelGraphPtr &graph) const {
pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
pm->AddPass(std::make_shared<opt::PrintReduceFusion>("print_reduce"));
+pm->AddPass(std::make_shared<opt::BCEWithLogitsLossFusion>());
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(graph);
graph->SetExecOrderByDefault();
@@ -279,6 +282,23 @@ void GPUDeviceContext::SetOperatorInfo(const std::vector<CNodePtr> &nodes) const
void GPUDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const { CreateGPUKernel(nodes); }
+void GPUDeviceContext::UpdateKernelDynamicShape(const CNodePtr &kernel) const {
+auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
+MS_EXCEPTION_IF_NULL(kernel_mod);
+if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) == KernelType::AKG_KERNEL) {
+MS_LOG(EXCEPTION) << "Akg kernels do not support dynamic shape by now.";
+}
+kernel::GpuKernel *gpu_kernel = dynamic_cast<kernel::GpuKernel *>(kernel_mod);
+MS_EXCEPTION_IF_NULL(gpu_kernel);
+device::DynamicKernelPtr dynamic_kernel = gpu_kernel->DynamicKernel();
+MS_EXCEPTION_IF_NULL(dynamic_kernel);
+dynamic_kernel->InferShape();
+dynamic_kernel->UpdateArgs();
+}
bool GPUDeviceContext::LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) const {
@@ -289,15 +309,30 @@ bool GPUDeviceContext::LaunchKernel(const CNodePtr &kernel, const std::vector<Ad
std::lock_guard<std::mutex> locker(launch_mutex_);
-auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
-MS_EXCEPTION_IF_NULL(profiler_inst);
-if (profiler_inst->GetEnableFlag()) {
-return LaunchKernelWithProfiling(kernel, inputs, workspace, outputs);
-}
+bool ret = true;
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
MS_EXCEPTION_IF_NULL(kernel_mod);
-return DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
+auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
+MS_EXCEPTION_IF_NULL(profiler_inst);
+if (!profiler_inst->GetEnableFlag()) {
+ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
+} else {
+ret = LaunchKernelWithProfiling(kernel, inputs, workspace, outputs);
+}
+if (!ret) {
+MS_LOG(ERROR) << "Launch kernel failed, kernel full name: " << kernel->fullname_with_scope();
+return false;
+}
+// Processing after execution of dynamic kernel to update output shape.
+if (AnfAlgo::IsDynamicShape(kernel)) {
+kernel::GpuKernel *gpu_kernel = dynamic_cast<kernel::GpuKernel *>(kernel_mod);
+MS_EXCEPTION_IF_NULL(gpu_kernel);
+gpu_kernel->PostExecute();
+}
+return ret;
}
bool GPUDeviceContext::LaunchKernelWithProfiling(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
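
Taken together, the GPU hunks above set up the per-launch sequence for a dynamic shape kernel: re-infer shapes and update launch arguments before execution, then run a post-execute step after a successful launch so the real output shape is synced back. A toy sketch of that ordering follows; the types are illustrative only, not the actual GpuKernel/DynamicKernel API.

#include <iostream>
#include <vector>

// Toy dynamic kernel modeling the three phases the GPU context drives.
class ToyDynamicKernel {
 public:
  void SetRuntimeInputLen(size_t len) { runtime_input_len_ = len; }
  void InferShape() { inferred_len_ = runtime_input_len_; }  // shape known only at run time
  void UpdateArgs() { output_.assign(inferred_len_, 0); }    // resize launch buffers
  bool Launch() {
    for (size_t i = 0; i < output_.size(); ++i) output_[i] = static_cast<int>(i);
    return true;
  }
  void PostExecute() { real_output_len_ = output_.size(); }  // sync real output shape back
  size_t real_output_len() const { return real_output_len_; }

 private:
  size_t runtime_input_len_ = 0;
  size_t inferred_len_ = 0;
  size_t real_output_len_ = 0;
  std::vector<int> output_;
};

// Mirrors the order used by GPUDeviceContext: infer + update args before the
// launch, post-execute only after a successful launch.
bool LaunchDynamic(ToyDynamicKernel *kernel) {
  kernel->InferShape();
  kernel->UpdateArgs();
  if (!kernel->Launch()) {
    std::cerr << "Launch kernel failed\n";
    return false;
  }
  kernel->PostExecute();
  return true;
}

int main() {
  ToyDynamicKernel kernel;
  kernel.SetRuntimeInputLen(7);
  if (LaunchDynamic(&kernel)) {
    std::cout << "real output length: " << kernel.real_output_len() << "\n";  // prints 7
  }
  return 0;
}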

View File

@@ -55,6 +55,10 @@ class GPUDeviceContext : public DeviceContext {
void SetOperatorInfo(const std::vector<CNodePtr> &nodes) const override;
void CreateKernel(const std::vector<CNodePtr> &nodes) const override;
+// Infer kernel shape and update abstract info for dynamic shape kernel.
+void UpdateKernelDynamicShape(const CNodePtr &kernel) const override;
bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs) const override;

View File

@@ -47,5 +47,7 @@
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "backend/optimizer/pass/communication_op_fusion.h"
#include "backend/optimizer/pass/getitem_tuple.h"
+#include "backend/optimizer/gpu/matmul_biasadd_fusion.h"
+#include "backend/optimizer/gpu/bce_with_logits_loss_fusion.h"
#endif // MINDSPORE_CCSRC_RUNTIME_HARDWARE_GPU_OPTIMIZER_H_

View File

@@ -163,6 +163,18 @@ void PushTensor(const VectorRef &args, const std::vector<AnfNodePtr> &parameters
auto position = iter - parameters.begin();
PushInputTensor(args[position], input_tensor);
}
+void UpdateOutputAbstract(const KernelGraphPtr &kernel_graph, OpRunInfo *op_run_info) {
+MS_EXCEPTION_IF_NULL(kernel_graph);
+MS_EXCEPTION_IF_NULL(op_run_info);
+const auto &kernels = kernel_graph->execution_order();
+for (const auto &kernel : kernels) {
+MS_EXCEPTION_IF_NULL(kernel);
+if (AnfAlgo::GetCNodeName(kernel) == op_run_info->op_name) {
+op_run_info->abstract = kernel->abstract();
+}
+}
+}
} // namespace
VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args, const std::string &target) {
@@ -380,7 +392,7 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs
const ActorInfo &actor_info =
CompileGraph(op_run_info, graph_info, &input_tensor_info.input_tensors_mask, &input_tensor_info.input_tensors);
VectorRef op_outputs =
-RunGraph(actor_info, &input_tensor_info.input_tensors_mask, &input_tensor_info.input_tensors);
+RunGraph(actor_info, &op_run_info, &input_tensor_info.input_tensors_mask, &input_tensor_info.input_tensors);
std::vector<tensor::TensorPtr> new_output_tensors;
runtime::GraphCompiler::GetInstance().RecoverGraphOutput(kernel, op_outputs, output_indexes, &op_output_map,
@@ -468,8 +480,14 @@ VectorRef MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &
// Fetch outputs.
MS_EXCEPTION_IF_NULL(actor_set->output_actor_);
auto &output_tensors = actor_set->output_actor_->outputs();
-(void)std::transform(output_tensors.begin(), output_tensors.end(), std::back_inserter(outputs.elements_),
-[](tensor::TensorPtr &tensor) { return std::move(tensor); });
+if (output_tensors.size() > 1) {
+VectorRef tmp;
+(void)std::transform(output_tensors.begin(), output_tensors.end(), std::back_inserter(tmp.elements_),
+[](tensor::TensorPtr &tensor) { return std::move(tensor); });
+outputs.emplace_back(std::move(tmp));
+} else if (output_tensors.size() == 1) {
+outputs.emplace_back(std::move(output_tensors.front()));
+}
MS_LOG(INFO) << "Run actor end, actor name: " << actor_info;
runtime::GraphCompiler::GetInstance().Summary(graph_compiler_info.graphs_);
@@ -494,13 +512,22 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(con
const auto &all_branch_output = ControlNodeParser::FetchAllBranchOutputs(root_graph);
for (const auto &branch_output : all_branch_output) {
size_t position = 0;
-const auto &outputs = AnfAlgo::GetAllOutput(branch_output, {prim::kPrimTupleGetItem});
-outputs_num = outputs.size();
+if (AnfAlgo::CheckPrimitiveType(branch_output, prim::kPrimMakeTuple)) {
+const auto &outputs = AnfAlgo::GetAllOutput(branch_output, {prim::kPrimTupleGetItem});
+outputs_num = outputs.size();
-for (const auto &output : outputs) {
-const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(output, 0, false);
-MS_EXCEPTION_IF_NULL(output_with_index.first);
-outputs_order.emplace(output_with_index, position++);
+for (const auto &output : outputs) {
+const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(output, 0, false);
+MS_EXCEPTION_IF_NULL(output_with_index.first);
+outputs_order.emplace(output_with_index, position++);
+}
+} else if (branch_output->isa<CNode>()) {
+outputs_num = AnfAlgo::GetOutputTensorNum(branch_output);
+for (size_t i = 0; i < outputs_num; i++) {
+const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(branch_output, i, false);
+MS_EXCEPTION_IF_NULL(output_with_index.first);
+outputs_order.emplace(output_with_index, position++);
+}
}
}
@@ -547,7 +574,8 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(
outputs_order.size(), name);
}
-VectorRef MindRTBackend::RunGraph(const ActorInfo &actor_info, const std::vector<int64_t> *tensors_mask,
+VectorRef MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info,
+const std::vector<int64_t> *tensors_mask,
const std::vector<tensor::TensorPtr> *input_tensors) {
const auto &graph_iter = actor_to_graph_compiler_info_.find(actor_info);
if (graph_iter == actor_to_graph_compiler_info_.end()) {
@@ -589,6 +617,12 @@ VectorRef MindRTBackend::RunGraph(const ActorInfo &actor_info, const std::vector
VectorRef outputs;
(void)std::transform(output_tensors.begin(), output_tensors.end(), std::back_inserter(outputs.elements_),
[](tensor::TensorPtr &tensor) { return std::move(tensor); });
+// update output abstract of dynamic op to op_run_info
+if (op_run_info->is_dynamic_shape) {
+UpdateOutputAbstract(graph_compiler_info.graphs_.front(), op_run_info);
+}
return outputs;
}
} // namespace compile
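
On the PyNative side, the MindRTBackend changes above thread op_run_info through RunGraph so that, after a dynamic shape op has executed, UpdateOutputAbstract can copy the freshly inferred output abstract of the matching kernel back into the run info. A small standalone sketch of that refresh follows, with toy types in place of the kernel graph and OpRunInfo.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Toy stand-ins: an executed kernel with the output shape it produced at run
// time, and the PyNative-side run info that needs that shape afterwards.
struct ToyExecutedKernel {
  std::string name;
  std::vector<int64_t> inferred_output_shape;
};

struct ToyOpRunInfo {
  std::string op_name;
  bool is_dynamic_shape = false;
  std::vector<int64_t> abstract_shape;  // stands in for op_run_info->abstract
};

// Mirrors UpdateOutputAbstract in the hunk above: find the kernel matching the
// op being run and copy its freshly inferred output description back.
void UpdateOutputAbstract(const std::vector<ToyExecutedKernel> &execution_order, ToyOpRunInfo *op_run_info) {
  for (const auto &kernel : execution_order) {
    if (kernel.name == op_run_info->op_name) {
      op_run_info->abstract_shape = kernel.inferred_output_shape;
    }
  }
}

int main() {
  std::vector<ToyExecutedKernel> kernels{{"Unique", {5}}, {"Cast", {5}}};
  ToyOpRunInfo run_info{"Unique", true, {}};
  if (run_info.is_dynamic_shape) {  // only dynamic shape ops need the refresh
    UpdateOutputAbstract(kernels, &run_info);
  }
  std::cout << "output dim0 = " << run_info.abstract_shape[0] << "\n";  // prints: output dim0 = 5
  return 0;
}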

View File

@@ -113,7 +113,7 @@ class MindRTBackend : public Backend {
VectorRef RunGraph(const ActorInfo &actor_info, const VectorRef &args);
// Run Graph in the pyNative mode.
-VectorRef RunGraph(const ActorInfo &actor_info, const std::vector<int64_t> *tensors_mask,
+VectorRef RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, const std::vector<int64_t> *tensors_mask,
const std::vector<tensor::TensorPtr> *input_tensors);
private: