!17434 dynamic shape for unify runtime
From: @zyli2020 Reviewed-by: @limingqi107,@cristoval Signed-off-by: @cristoval
commit 9aae606590
@@ -1684,7 +1684,7 @@ py::object ForwardExecutor::RunOpInMs(const OpExecInfoPtr &op_exec_info, Pynativ
     }
     const compile::ActorInfo &actor_info =
       mind_rt_backend->CompileGraph(op_run_info, graph_info, &tensors_mask, &input_tensors);
-    outputs = mind_rt_backend->RunGraph(actor_info, &tensors_mask, &input_tensors);
+    outputs = mind_rt_backend->RunGraph(actor_info, &op_run_info, &tensors_mask, &input_tensors);
   }

   if (op_exec_info->is_dynamic_shape) {
@@ -67,6 +67,7 @@ class DeviceAddress : public mindspore::DeviceSync {
   virtual ~DeviceAddress() { ptr_ = nullptr; }
   const void *GetPtr() const { return ptr_; }
   size_t GetSize() const { return size_; }
+  void SetSize(size_t size) { size_ = size; }
   std::string format() const { return format_; }
   TypeId type_id() const { return type_id_; }
   void set_host_shape(const ShapeVector &shape) { host_shape_ = shape; }
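The only behavioural addition in the DeviceAddress hunk is the SetSize mutator. A minimal stand-alone sketch of why a writable size matters once output sizes are only known after run-time shape inference (BufferAddress and every name below are illustrative stand-ins, not the MindSpore classes):

#include <cstddef>
#include <iostream>

// Hypothetical, simplified stand-in for a device-side buffer descriptor.
class BufferAddress {
 public:
  explicit BufferAddress(size_t size) : size_(size) {}
  size_t GetSize() const { return size_; }
  // Without a setter, an address sized for the compile-time (static) shape
  // could not be corrected once the real output shape is inferred at run time.
  void SetSize(size_t size) { size_ = size; }

 private:
  size_t size_ = 0;
};

int main() {
  BufferAddress out_addr(16);         // sized from a placeholder static shape
  size_t inferred_output_bytes = 64;  // size produced by run-time shape inference
  if (inferred_output_bytes != out_addr.GetSize()) {
    out_addr.SetSize(inferred_output_bytes);  // mirrors the resize in FetchOutputDeviceTensor below
  }
  std::cout << out_addr.GetSize() << std::endl;  // 64
  return 0;
}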
@@ -72,6 +72,11 @@ void KernelActor::RunOpData(OpData<DeviceTensor> *input_data, OpContext<DeviceTe
   input_op_datas_[sequential_num].emplace_back(input_data);
   // When all the inputs are collected, then allocate memory and callback launch.
   if (CheckLaunchCondition(context)) {
+    // Infer kernel shape and update abstract info for dynamic shape kernel.
+    if (AnfAlgo::IsDynamicShape(kernel_)) {
+      device_context_->UpdateKernelDynamicShape(kernel_);
+    }
+
     FetchInputDeviceTensor(context);
     FetchOutputDeviceTensor();
     SendMemoryAllocReq(context);
@@ -84,6 +89,11 @@ void KernelActor::RunOpControl(AID *input_control, OpContext<DeviceTensor> *cont
   input_op_controls_[sequential_num].emplace_back(input_control);
   // When all the inputs are collected, then allocate memory and callback launch.
   if (CheckLaunchCondition(context)) {
+    // Infer kernel shape and update abstract info for dynamic shape kernel.
+    if (AnfAlgo::IsDynamicShape(kernel_)) {
+      device_context_->UpdateKernelDynamicShape(kernel_);
+    }
+
     FetchInputDeviceTensor(context);
     FetchOutputDeviceTensor();
     SendMemoryAllocReq(context);
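RunOpData and RunOpControl gain the same pre-launch step: once every input has arrived, a dynamic-shape kernel is re-inferred through the device context before device tensors are fetched and memory is requested. A hedged, self-contained sketch of that ordering; the classes below are simplified stand-ins for the actor and device-context types, not the real ones:

#include <iostream>

// Simplified stand-ins; names and members are illustrative only.
struct Kernel { bool is_dynamic_shape = false; };

struct DeviceContextStub {
  void UpdateKernelDynamicShape(const Kernel &kernel) const {
    if (kernel.is_dynamic_shape) {
      std::cout << "infer shape and refresh kernel args\n";
    }
  }
};

class KernelActorStub {
 public:
  KernelActorStub(Kernel kernel, const DeviceContextStub *ctx) : kernel_(kernel), device_context_(ctx) {}

  void RunOp() {
    if (!CheckLaunchCondition()) {
      return;  // wait until every input data/control arrow has arrived
    }
    // New step in this commit: re-infer shapes for dynamic kernels *before*
    // output addresses are fetched, so their sizes can be corrected.
    device_context_->UpdateKernelDynamicShape(kernel_);
    FetchInputDeviceTensor();
    FetchOutputDeviceTensor();
    SendMemoryAllocReq();
  }

 private:
  bool CheckLaunchCondition() const { return true; }  // all inputs collected (stubbed)
  void FetchInputDeviceTensor() { std::cout << "fetch inputs\n"; }
  void FetchOutputDeviceTensor() { std::cout << "fetch outputs (may resize)\n"; }
  void SendMemoryAllocReq() { std::cout << "request memory, then launch\n"; }

  Kernel kernel_;
  const DeviceContextStub *device_context_;
};

int main() {
  DeviceContextStub ctx;
  KernelActorStub actor({/*is_dynamic_shape=*/true}, &ctx);
  actor.RunOp();
  return 0;
}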
@@ -221,9 +231,19 @@ void KernelActor::FetchInputDeviceTensor(OpContext<DeviceTensor> *context) {

 void KernelActor::FetchOutputDeviceTensor() {
   MS_EXCEPTION_IF_NULL(kernel_info_);
-  auto &output_addresss = kernel_info_->output_address_list();
-  for (size_t i = 0; i < output_addresss.size(); ++i) {
-    auto output_address = output_addresss[i].get();
+  auto &output_addresses = kernel_info_->output_address_list();
+  const auto &kernel_mod = kernel_info_->kernel_mod();
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+  const auto &output_size_list = kernel_mod->GetOutputSizeList();
+
+  for (size_t i = 0; i < output_addresses.size(); ++i) {
+    auto output_address = output_addresses[i].get();
+    if (output_size_list[i] != output_address->GetSize()) {
+      // The size of output address may be changed in dynamic shape scenario.
+      output_address->SetSize(output_size_list[i]);
+    }
+
+    // When the tensor is the output of graph or in dynamic shape scenario, the output tensor may be changed.
     if (output_device_tensors_[i] != output_address) {
       output_device_tensors_[i] = output_address;
       memory_alloc_list_[i] = output_address;
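The rewritten FetchOutputDeviceTensor reconciles each output address with the size list freshly reported by the kernel mod, since a dynamic-shape kernel recomputes those sizes after inference. A small sketch of that reconciliation loop with made-up container types (in the real code the sizes come from kernel_mod->GetOutputSizeList()):

#include <cstddef>
#include <iostream>
#include <vector>

// Illustrative stand-ins only.
struct OutputAddress {
  size_t size = 0;
  size_t GetSize() const { return size; }
  void SetSize(size_t s) { size = s; }
};

int main() {
  std::vector<OutputAddress> output_addresses = {{16}, {32}};
  // Stand-in for the freshly recomputed per-output byte sizes.
  std::vector<size_t> output_size_list = {64, 32};

  for (size_t i = 0; i < output_addresses.size(); ++i) {
    if (output_size_list[i] != output_addresses[i].GetSize()) {
      // The size of an output address may change in the dynamic shape scenario.
      output_addresses[i].SetSize(output_size_list[i]);
    }
  }
  std::cout << output_addresses[0].GetSize() << " " << output_addresses[1].GetSize() << std::endl;  // 64 32
  return 0;
}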
@@ -153,7 +153,7 @@ std::vector<AnfNodePtr> ControlNodeParser::FetchFuncGraphOutput(const FuncGraphP
                                                                 std::vector<AnfNodePtr> *call_nodes) {
   std::vector<AnfNodePtr> outputs;
   const auto &output = func_graph->output();
-  const auto &real_output = AnfAlgo::VisitKernelWithReturnType(output, 0);
+  const auto &real_output = AnfAlgo::VisitKernelWithReturnType(output, 0, false, {prim::kPrimTupleGetItem});
   if (find((*call_nodes).begin(), (*call_nodes).end(), real_output.first) != (*call_nodes).end()) {
     return outputs;
   }
@@ -47,6 +47,7 @@ class CPUDeviceContext : public DeviceContext {

   void SetOperatorInfo(const std::vector<CNodePtr> &nodes) const override;
   void CreateKernel(const std::vector<CNodePtr> &nodes) const override;

   bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
                     const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs) const override;

@@ -86,6 +86,13 @@ class DeviceContext {
   // 'KernelMod' is real executive object of kernel.
   virtual void CreateKernel(const std::vector<CNodePtr> &nodes) const = 0;

+  // Infer kernel shape and update abstract info for dynamic shape kernel.
+  virtual void UpdateKernelDynamicShape(const CNodePtr &kernel) const {
+    if (AnfAlgo::IsDynamicShape(kernel)) {
+      AnfAlgo::InferShape(kernel);
+    }
+  }
+
   // Launch a kernel via 'KernelMod' of the kernel.
   virtual bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
                             const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs) const = 0;
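UpdateKernelDynamicShape enters the DeviceContext interface as a virtual hook with a usable default (re-run AnfAlgo::InferShape for dynamic kernels), so a backend only overrides it when it needs extra device-specific work, as the GPU context does later in this diff. A minimal sketch of that pattern with placeholder types:

#include <iostream>

struct KernelStub { bool dynamic = false; };

// Placeholder for the DeviceContext base class: the hook has a default body,
// so a backend only overrides it when per-device work is required.
class DeviceContextBase {
 public:
  virtual ~DeviceContextBase() = default;
  virtual void UpdateKernelDynamicShape(const KernelStub &kernel) const {
    if (kernel.dynamic) {
      std::cout << "default: re-infer shape only\n";
    }
  }
};

class GpuLikeContext : public DeviceContextBase {
 public:
  void UpdateKernelDynamicShape(const KernelStub &kernel) const override {
    if (!kernel.dynamic) return;
    // A GPU-style backend also refreshes kernel launch arguments
    // (compare the GPUDeviceContext override further down this diff).
    std::cout << "gpu-like: infer shape, then update kernel args\n";
  }
};

int main() {
  KernelStub k{true};
  DeviceContextBase base;
  GpuLikeContext gpu;
  base.UpdateKernelDynamicShape(k);
  gpu.UpdateKernelDynamicShape(k);
  return 0;
}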
@@ -34,6 +34,7 @@
 #include "profiler/device/gpu/gpu_profiling.h"
 #include "profiler/device/gpu/gpu_profiling_utils.h"
 #include "backend/session/kernel_graph.h"
+#include "backend/kernel_compiler/gpu/gpu_kernel.h"

 namespace mindspore {
 namespace device {
@@ -226,6 +227,7 @@ void GPUDeviceContext::OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph)
 void GPUDeviceContext::FuseOperators(const KernelGraphPtr &graph) const {
   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
+  pm->AddPass(std::make_shared<opt::MatMulBiasAddFusion>());
   pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>());
   pm->AddPass(std::make_shared<opt::AdamFusion>());
   pm->AddPass(std::make_shared<opt::ApplyMomentumWeightDecayScaleFusion>());
@@ -238,6 +240,7 @@ void GPUDeviceContext::FuseOperators(const KernelGraphPtr &graph) const {
   pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
   pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
   pm->AddPass(std::make_shared<opt::PrintReduceFusion>("print_reduce"));
+  pm->AddPass(std::make_shared<opt::BCEWithLogitsLossFusion>());
   optimizer->AddPassManager(pm);
   (void)optimizer->Optimize(graph);
   graph->SetExecOrderByDefault();
@@ -279,6 +282,23 @@ void GPUDeviceContext::SetOperatorInfo(const std::vector<CNodePtr> &nodes) const

 void GPUDeviceContext::CreateKernel(const std::vector<CNodePtr> &nodes) const { CreateGPUKernel(nodes); }

+void GPUDeviceContext::UpdateKernelDynamicShape(const CNodePtr &kernel) const {
+  auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
+  MS_EXCEPTION_IF_NULL(kernel_mod);
+
+  if (session::AnfRuntimeAlgorithm::GetKernelType(kernel) == KernelType::AKG_KERNEL) {
+    MS_LOG(EXCEPTION) << "Akg kernels do not support dynamic shape by now.";
+  }
+
+  kernel::GpuKernel *gpu_kernel = dynamic_cast<kernel::GpuKernel *>(kernel_mod);
+  MS_EXCEPTION_IF_NULL(gpu_kernel);
+  device::DynamicKernelPtr dynamic_kernel = gpu_kernel->DynamicKernel();
+  MS_EXCEPTION_IF_NULL(dynamic_kernel);
+
+  dynamic_kernel->InferShape();
+  dynamic_kernel->UpdateArgs();
+}
+
 bool GPUDeviceContext::LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
                                     const std::vector<AddressPtr> &workspace,
                                     const std::vector<AddressPtr> &outputs) const {
@@ -289,15 +309,30 @@ bool GPUDeviceContext::LaunchKernel(const CNodePtr &kernel, const std::vector<Ad

   std::lock_guard<std::mutex> locker(launch_mutex_);

-  auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
-  MS_EXCEPTION_IF_NULL(profiler_inst);
-  if (profiler_inst->GetEnableFlag()) {
-    return LaunchKernelWithProfiling(kernel, inputs, workspace, outputs);
-  }
-
+  bool ret = true;
   auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
   MS_EXCEPTION_IF_NULL(kernel_mod);
-  return DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
+
+  auto profiler_inst = profiler::gpu::GPUProfiler::GetInstance();
+  MS_EXCEPTION_IF_NULL(profiler_inst);
+
+  if (!profiler_inst->GetEnableFlag()) {
+    ret = DoLaunchKernel(kernel_mod, inputs, workspace, outputs);
+  } else {
+    ret = LaunchKernelWithProfiling(kernel, inputs, workspace, outputs);
+  }
+  if (!ret) {
+    MS_LOG(ERROR) << "Launch kernel failed, kernel full name: " << kernel->fullname_with_scope();
+    return false;
+  }
+
+  // Processing after execution of dynamic kernel to update output shape.
+  if (AnfAlgo::IsDynamicShape(kernel)) {
+    kernel::GpuKernel *gpu_kernel = dynamic_cast<kernel::GpuKernel *>(kernel_mod);
+    MS_EXCEPTION_IF_NULL(gpu_kernel);
+    gpu_kernel->PostExecute();
+  }
+  return ret;
 }

 bool GPUDeviceContext::LaunchKernelWithProfiling(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
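Taken together, the two GPU changes give a dynamic-shape kernel a three-phase lifecycle: InferShape and UpdateArgs before launch, the launch itself, and PostExecute afterwards to propagate the actual output shape. A toy end-to-end sketch of that lifecycle (ToyDynamicKernel is not kernel::GpuKernel; it only mirrors the call order):

#include <cstddef>
#include <iostream>

// Toy dynamic kernel; the real one is reached through gpu_kernel->DynamicKernel().
class ToyDynamicKernel {
 public:
  void InferShape() { inferred_elems_ = 8; std::cout << "infer output shape\n"; }
  void UpdateArgs() { std::cout << "resize workspaces / launch args for " << inferred_elems_ << " elements\n"; }
  bool Launch() { std::cout << "launch kernel\n"; return true; }
  void PostExecute() {
    // After execution the device may hold the true output shape (data-dependent ops);
    // copy it back so downstream ops and the frontend see the real shape.
    std::cout << "sync real output shape back to host\n";
  }

 private:
  size_t inferred_elems_ = 0;
};

int main() {
  ToyDynamicKernel kernel;
  kernel.InferShape();   // UpdateKernelDynamicShape: shape inference
  kernel.UpdateArgs();   // UpdateKernelDynamicShape: refresh device-side arguments
  if (!kernel.Launch()) {
    std::cout << "launch failed\n";
    return 1;
  }
  kernel.PostExecute();  // LaunchKernel: update output shape after execution
  return 0;
}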
@@ -55,6 +55,10 @@ class GPUDeviceContext : public DeviceContext {

   void SetOperatorInfo(const std::vector<CNodePtr> &nodes) const override;
   void CreateKernel(const std::vector<CNodePtr> &nodes) const override;

+  // Infer kernel shape and update abstract info for dynamic shape kernel.
+  void UpdateKernelDynamicShape(const CNodePtr &kernel) const override;
+
   bool LaunchKernel(const CNodePtr &kernel, const std::vector<AddressPtr> &inputs,
                     const std::vector<AddressPtr> &workspace, const std::vector<AddressPtr> &outputs) const override;

@@ -47,5 +47,7 @@
 #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
 #include "backend/optimizer/pass/communication_op_fusion.h"
 #include "backend/optimizer/pass/getitem_tuple.h"
+#include "backend/optimizer/gpu/matmul_biasadd_fusion.h"
+#include "backend/optimizer/gpu/bce_with_logits_loss_fusion.h"

 #endif  // MINDSPORE_CCSRC_RUNTIME_HARDWARE_GPU_OPTIMIZER_H_
@@ -163,6 +163,18 @@ void PushTensor(const VectorRef &args, const std::vector<AnfNodePtr> &parameters
   auto position = iter - parameters.begin();
   PushInputTensor(args[position], input_tensor);
 }
+
+void UpdateOutputAbstract(const KernelGraphPtr &kernel_graph, OpRunInfo *op_run_info) {
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  MS_EXCEPTION_IF_NULL(op_run_info);
+  const auto &kernels = kernel_graph->execution_order();
+  for (const auto &kernel : kernels) {
+    MS_EXCEPTION_IF_NULL(kernel);
+    if (AnfAlgo::GetCNodeName(kernel) == op_run_info->op_name) {
+      op_run_info->abstract = kernel->abstract();
+    }
+  }
+}
 }  // namespace

 VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args, const std::string &target) {
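UpdateOutputAbstract walks the single-op graph's execution order and copies the shape-resolved abstract of the matching kernel back into OpRunInfo, which is how the PyNative frontend learns the real output shape of a dynamic op. A small sketch of that lookup with simplified stand-in types:

#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Simplified stand-ins: a kernel node carries a name and an "abstract"
// (shape/type summary), and OpRunInfo wants the abstract of the op it ran.
struct AbstractStub { std::string shape; };
struct KernelNode {
  std::string name;
  std::shared_ptr<AbstractStub> abstract;
};
struct OpRunInfoStub {
  std::string op_name;
  bool is_dynamic_shape = false;
  std::shared_ptr<AbstractStub> abstract;
};

void UpdateOutputAbstract(const std::vector<KernelNode> &execution_order, OpRunInfoStub *op_run_info) {
  for (const auto &kernel : execution_order) {
    if (kernel.name == op_run_info->op_name) {
      // Copy the abstract refreshed by run-time shape inference back to the caller.
      op_run_info->abstract = kernel.abstract;
    }
  }
}

int main() {
  std::vector<KernelNode> order = {{"Unique", std::make_shared<AbstractStub>(AbstractStub{"[5]"})}};
  OpRunInfoStub info{"Unique", /*is_dynamic_shape=*/true, nullptr};
  if (info.is_dynamic_shape) {
    UpdateOutputAbstract(order, &info);
  }
  std::cout << info.abstract->shape << std::endl;  // [5]
  return 0;
}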
@@ -380,7 +392,7 @@ void MindRTBackend::RunGraphBySingleOp(const std::vector<KernelGraphPtr> &graphs
       const ActorInfo &actor_info =
         CompileGraph(op_run_info, graph_info, &input_tensor_info.input_tensors_mask, &input_tensor_info.input_tensors);
       VectorRef op_outputs =
-        RunGraph(actor_info, &input_tensor_info.input_tensors_mask, &input_tensor_info.input_tensors);
+        RunGraph(actor_info, &op_run_info, &input_tensor_info.input_tensors_mask, &input_tensor_info.input_tensors);

       std::vector<tensor::TensorPtr> new_output_tensors;
       runtime::GraphCompiler::GetInstance().RecoverGraphOutput(kernel, op_outputs, output_indexes, &op_output_map,
@@ -468,8 +480,14 @@ VectorRef MindRTBackend::RunGraph(const ActorInfo &actor_info, const VectorRef &
   // Fetch outputs.
   MS_EXCEPTION_IF_NULL(actor_set->output_actor_);
   auto &output_tensors = actor_set->output_actor_->outputs();
-  (void)std::transform(output_tensors.begin(), output_tensors.end(), std::back_inserter(outputs.elements_),
-                       [](tensor::TensorPtr &tensor) { return std::move(tensor); });
+  if (output_tensors.size() > 1) {
+    VectorRef tmp;
+    (void)std::transform(output_tensors.begin(), output_tensors.end(), std::back_inserter(tmp.elements_),
+                         [](tensor::TensorPtr &tensor) { return std::move(tensor); });
+    outputs.emplace_back(std::move(tmp));
+  } else if (output_tensors.size() == 1) {
+    outputs.emplace_back(std::move(output_tensors.front()));
+  }
   MS_LOG(INFO) << "Run actor end, actor name: " << actor_info;

   runtime::GraphCompiler::GetInstance().Summary(graph_compiler_info.graphs_);
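The output-packing change wraps multiple result tensors in an inner container instead of splicing them into the outer one, so a multi-output graph yields a single nested entry while a single-output graph yields the tensor itself. A rough analogue with standard containers (VectorRef itself is not reproduced; std::variant stands in for its element type):

#include <iostream>
#include <string>
#include <variant>
#include <vector>

// Rough analogue of the behaviour: "Tensor" is a string, the outer result holds
// either single tensors or one nested vector of tensors.
using Tensor = std::string;
using Element = std::variant<Tensor, std::vector<Tensor>>;

std::vector<Element> PackOutputs(std::vector<Tensor> output_tensors) {
  std::vector<Element> outputs;
  if (output_tensors.size() > 1) {
    // Multiple outputs: keep them together as one nested element,
    // mirroring the temporary VectorRef "tmp" in the hunk above.
    outputs.emplace_back(std::move(output_tensors));
  } else if (output_tensors.size() == 1) {
    outputs.emplace_back(std::move(output_tensors.front()));
  }
  return outputs;
}

int main() {
  auto multi = PackOutputs({"t0", "t1"});
  auto single = PackOutputs({"t0"});
  std::cout << multi.size() << " outer element(s) for two tensors, "
            << single.size() << " for one tensor\n";  // 1 and 1
  return 0;
}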
@@ -494,13 +512,22 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(con
   const auto &all_branch_output = ControlNodeParser::FetchAllBranchOutputs(root_graph);
   for (const auto &branch_output : all_branch_output) {
     size_t position = 0;
-    const auto &outputs = AnfAlgo::GetAllOutput(branch_output, {prim::kPrimTupleGetItem});
-    outputs_num = outputs.size();
-    for (const auto &output : outputs) {
-      const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(output, 0, false);
-      MS_EXCEPTION_IF_NULL(output_with_index.first);
-      outputs_order.emplace(output_with_index, position++);
-    }
+    if (AnfAlgo::CheckPrimitiveType(branch_output, prim::kPrimMakeTuple)) {
+      const auto &outputs = AnfAlgo::GetAllOutput(branch_output, {prim::kPrimTupleGetItem});
+      outputs_num = outputs.size();
+
+      for (const auto &output : outputs) {
+        const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(output, 0, false);
+        MS_EXCEPTION_IF_NULL(output_with_index.first);
+        outputs_order.emplace(output_with_index, position++);
+      }
+    } else if (branch_output->isa<CNode>()) {
+      outputs_num = AnfAlgo::GetOutputTensorNum(branch_output);
+      for (size_t i = 0; i < outputs_num; i++) {
+        const auto &output_with_index = AnfAlgo::VisitKernelWithReturnType(branch_output, i, false);
+        MS_EXCEPTION_IF_NULL(output_with_index.first);
+        outputs_order.emplace(output_with_index, position++);
+      }
+    }
   }

@@ -547,7 +574,8 @@ std::unique_ptr<GraphCompilerInfo> MindRTBackend::ConstructGraphCompilerInfo(
                                     outputs_order.size(), name);
 }

-VectorRef MindRTBackend::RunGraph(const ActorInfo &actor_info, const std::vector<int64_t> *tensors_mask,
+VectorRef MindRTBackend::RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info,
+                                  const std::vector<int64_t> *tensors_mask,
                                   const std::vector<tensor::TensorPtr> *input_tensors) {
   const auto &graph_iter = actor_to_graph_compiler_info_.find(actor_info);
   if (graph_iter == actor_to_graph_compiler_info_.end()) {
@@ -589,6 +617,12 @@ VectorRef MindRTBackend::RunGraph(const ActorInfo &actor_info, const std::vector
   VectorRef outputs;
   (void)std::transform(output_tensors.begin(), output_tensors.end(), std::back_inserter(outputs.elements_),
                        [](tensor::TensorPtr &tensor) { return std::move(tensor); });
+
+  // update output abstract of dynamic op to op_run_info
+  if (op_run_info->is_dynamic_shape) {
+    UpdateOutputAbstract(graph_compiler_info.graphs_.front(), op_run_info);
+  }
+
   return outputs;
 }
 }  // namespace compile
@@ -113,7 +113,7 @@ class MindRTBackend : public Backend {
   VectorRef RunGraph(const ActorInfo &actor_info, const VectorRef &args);

   // Run Graph in the pyNative mode.
-  VectorRef RunGraph(const ActorInfo &actor_info, const std::vector<int64_t> *tensors_mask,
+  VectorRef RunGraph(const ActorInfo &actor_info, OpRunInfo *op_run_info, const std::vector<int64_t> *tensors_mask,
                      const std::vector<tensor::TensorPtr> *input_tensors);

  private: