diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc
index fd56e9b63af..6a818df9dcb 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc
@@ -27,6 +27,7 @@
 #include "runtime/device/kernel_runtime_manager.h"
 #include "runtime/device/ascend/ascend_event.h"
 #include "runtime/device/ascend/ascend_launch_mul.h"
+#include "runtime/device/ascend/ascend_launch_atomic_clean.h"
 #include "utils/profile.h"
 
 #define CHECK_ASCEND_RT_WITH_EXCEPTION(expression, message) \
@@ -90,16 +91,18 @@ void AscendBucket::FreeAllDeviceMem() {
     ar_output_addr_ = nullptr;
   }
   // clear launch mul device Memory
-  if (launch_kernel != nullptr) {
-    launch_kernel->FreeLaunchDeviceMem();
+  if (launch_mul_ != nullptr) {
+    launch_mul_->FreeLaunchDeviceMem();
+  }
+  // clear launch atomic clean device Memory
+  if (launch_atomic_clean_ != nullptr) {
+    launch_atomic_clean_->FreeLaunchDeviceMem();
   }
 }
 
 void AscendBucket::CopyTensorToContiguousMemory() {
-  // Clean input addr
-  CHECK_ASCEND_RT_WITH_EXCEPTION(rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_),
-                                 "Call rtMemsetAsync failed");
-
+  // clear allreduce input addr
+  CleanAllReduceInputAddr();
   for (size_t i = 0; i < bucket_size_; ++i) {
     MS_EXCEPTION_IF_NULL(memcpy_input_addrs_[i]);
     MS_EXCEPTION_IF_NULL(memcpy_output_addrs_[i]);
@@ -151,15 +154,36 @@ void AscendBucket::LaunchAllReduce() {
   }
 }
 
-std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchKernel() {
+void AscendBucket::CleanAllReduceInputAddr() {
+  if (launch_atomic_clean_ == nullptr) {
+    launch_atomic_clean_ = CreateLaunchAtomicClean();
+    MS_EXCEPTION_IF_NULL(launch_atomic_clean_);
+  }
+  // set atomic clean input addr
+  launch_atomic_clean_->SetInputAddr(ar_input_addr_);
+  // launch atomic clean
+  launch_atomic_clean_->LaunchOpKernel();
+}
+
+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchMul() {
   if (tensor_type_list_.empty()) {
     MS_LOG(ERROR) << "tensor_type_list_ is empty";
   }
-  auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+  auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_);
   MS_EXCEPTION_IF_NULL(launch_mul);
   return launch_mul;
 }
 
+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchAtomicClean() {
+  if (tensor_type_list_.empty()) {
+    MS_LOG(ERROR) << "tensor_type_list_ is empty";
+  }
+  auto launch_atomic_clean =
+    std::make_shared<AscendLaunchAtomicClean>(compute_stream_, tensor_type_list_[0], total_size_);
+  MS_EXCEPTION_IF_NULL(launch_atomic_clean);
+  return launch_atomic_clean;
+}
+
 void AscendBucket::Init() {
   pre_event_ = std::make_shared<AscendEvent>();
   post_event_ = std::make_shared<AscendEvent>();
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h b/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h
index cd3050090bf..af6f770d8d0 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h
@@ -34,7 +34,9 @@ class AscendBucket : public Bucket {
   void FreeDeviceMem(void *dev_ptr) override;
   void CopyTensorToContiguousMemory() override;
   void LaunchAllReduce() override;
-  std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
+  std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
+  std::shared_ptr<LaunchKernel> CreateLaunchAtomicClean();
+  void CleanAllReduceInputAddr();
 };
 }  // namespace mindspore::device::ascend
 #endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc
new file mode 100644
index 00000000000..6abec788283
--- /dev/null
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc
@@ -0,0 +1,114 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "runtime/device/ascend/ascend_launch_atomic_clean.h"
+
+#include <memory>
+#include <vector>
+#include "abstract/utils.h"
+#include "backend/session/single_kernel_graph.h"
+#include "backend/session/anf_runtime_algorithm.h"
+#include "debug/anf_ir_dump.h"
+
+namespace mindspore::device::ascend {
+void AscendLaunchAtomicClean::FreeDeviceMem(void *addr) { AscendLaunchKernel::FreeDeviceMem(addr); }
+
+size_t AscendLaunchAtomicClean::AlignSizeForLaunchKernel(size_t size) {
+  return AscendLaunchKernel::AlignSizeForLaunchKernel(size);
+}
+
+uint8_t *AscendLaunchAtomicClean::AllocDeviceMem(size_t size) { return AscendLaunchKernel::AllocDeviceMem(size); }
+
+void AscendLaunchAtomicClean::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  AscendLaunchKernel::KernelSelect(kernel_graph);
+}
+
+void AscendLaunchAtomicClean::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  AscendLaunchKernel::KernelBuild(kernel_graph);
+}
+
+void AscendLaunchAtomicClean::LaunchOpKernel() {
+  if (atomic_clean_graph_ == nullptr) {
+    // construct atomic clean kernel graph and set attr
+    ConstructKernelGraphAndSetAttr();
+    // kernel build
+    KernelBuild(atomic_clean_graph_);
+  }
+  // obtain kernel_mod
+  if (atomic_clean_graph_->execution_order().size() != 1) {
+    MS_LOG(ERROR) << "the execution order of the atomic clean graph should have only one node";
+  }
+  kernel_mod_ = AnfAlgo::GetKernelMod(atomic_clean_graph_->execution_order()[0]);
+  MS_EXCEPTION_IF_NULL(kernel_mod_);
+  // obtain kernel inputs
+  std::vector<kernel::AddressPtr> kernel_inputs;
+  auto input = std::make_shared<kernel::Address>();
+  MS_EXCEPTION_IF_NULL(input);
+  input->addr = input_addr_;
+  MS_EXCEPTION_IF_NULL(input->addr);
+  input->size = total_size_;
+  kernel_inputs.push_back(input);
+  // obtain kernel outputs
+  auto kernel_outputs = ObtainKernelOutputs(kernel_mod_->GetOutputSizeList());
+  // obtain kernel workspace
+  auto kernel_workspaces = ObtainKernelWorkspaces(kernel_mod_->GetWorkspaceSizeList());
+  // launch
+  auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
+  if (!ret_status) {
+    MS_LOG(ERROR) << "Launch single kernel failed.";
+  }
+}
+
+void AscendLaunchAtomicClean::FreeLaunchDeviceMem() {
+  input_addr_ = nullptr;
+  FreeOutputAndWorkspaceDeviceMem();
+}
+
+std::shared_ptr<session::KernelGraph> AscendLaunchAtomicClean::ObtainAtomicCleanKernelGraph() {
+  std::vector<TypeId> input_dtypes = {dtype_};
+  std::vector<TypeId> output_dtypes = {};
+  // obtain input & output shapes
+  size_t dtype_size = abstract::TypeIdSize(dtype_);
+  int64_t shape = total_size_ / dtype_size;
+  std::vector<std::vector<int64_t>> input_shapes = {{shape}};
+  std::vector<std::vector<size_t>> output_shapes = {};
+  auto atomic_clean_graph = session::SingleKernelGraph::ConstructKernelGraphBasedOnSingleOp(
+    kAtomicAddrCleanOpName, input_dtypes, input_shapes, output_dtypes, output_shapes);
+  MS_EXCEPTION_IF_NULL(atomic_clean_graph);
+  return atomic_clean_graph;
+}
+
+void AscendLaunchAtomicClean::ConstructKernelGraphAndSetAttr() {
+  // construct atomic clean kernel graph
+  atomic_clean_graph_ = ObtainAtomicCleanKernelGraph();
+  MS_EXCEPTION_IF_NULL(atomic_clean_graph_);
+  // set atomic clean attr
+  if (!atomic_clean_graph_->execution_order().empty()) {
+    auto clean_node = atomic_clean_graph_->execution_order()[0];
+    // set abstract
+    AbstractBasePtr abstract = std::make_shared<abstract::AbstractNone>();
+    MS_EXCEPTION_IF_NULL(abstract);
+    clean_node->set_abstract(abstract);
+    // set build info
+    auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
+    builder->SetKernelType(KernelType::TBE_KERNEL);
+    AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clean_node.get());
+    // set attr
+    std::vector<size_t> clean_size = {total_size_};
+    AnfAlgo::SetNodeAttr(kAttrAtomicAddMemSize, MakeValue(clean_size), clean_node);
+  }
+}
+}  // namespace mindspore::device::ascend
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.h b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.h
new file mode 100644
index 00000000000..eaf3b9dcd48
--- /dev/null
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
+#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
+
+#include <memory>
+#include <vector>
+#include "runtime/device/ascend/ascend_launch_kernel.h"
+
+namespace mindspore::device::ascend {
+class AscendLaunchAtomicClean : public AscendLaunchKernel {
+ public:
+  AscendLaunchAtomicClean(void *stream, TypeId dtype, size_t total_size)
+      : AscendLaunchKernel(stream),
+        dtype_(dtype),
+        total_size_(total_size),
+        atomic_clean_graph_(nullptr),
+        input_addr_(nullptr) {}
+  ~AscendLaunchAtomicClean() override = default;
+
+  void SetInputAddr(uint8_t *input_addr) override { input_addr_ = input_addr; }
+  void FreeDeviceMem(void *addr) override;
+  size_t AlignSizeForLaunchKernel(size_t size) override;
+  uint8_t *AllocDeviceMem(size_t size) override;
+  void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+  void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+
+  void LaunchOpKernel() override;
+  void FreeLaunchDeviceMem() override;
+
+ protected:
+  TypeId dtype_;
+  size_t total_size_;
+  std::shared_ptr<session::KernelGraph> atomic_clean_graph_;
+  uint8_t *input_addr_;
+
+ private:
+  std::shared_ptr<session::KernelGraph> ObtainAtomicCleanKernelGraph();
+  void ConstructKernelGraphAndSetAttr();
+};
+}  // namespace mindspore::device::ascend
+
+#endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h
index a228162317f..12561e3ffa4 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h
@@ -33,6 +33,7 @@ class AscendLaunchKernel : public LaunchKernel {
   void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
   void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
 
+  void SetInputAddr(uint8_t *input_addr) override = 0;
   void LaunchOpKernel() override = 0;
   void FreeLaunchDeviceMem() override = 0;
 };
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h
index ab090b95c99..0c8a600e0a6 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h
@@ -25,10 +25,11 @@
 namespace mindspore::device::ascend {
 class AscendLaunchMul : public AscendLaunchKernel, public LaunchMul {
  public:
-  AscendLaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
-      : AscendLaunchKernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+  AscendLaunchMul(void *stream, TypeId dtype, size_t total_size)
+      : AscendLaunchKernel(stream), LaunchMul(dtype, total_size) {}
   ~AscendLaunchMul() override = default;
 
+  void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
   void FreeDeviceMem(void *addr) override;
   size_t AlignSizeForLaunchKernel(size_t size) override;
   uint8_t *AllocDeviceMem(size_t size) override;
diff --git a/mindspore/ccsrc/runtime/device/bucket.cc b/mindspore/ccsrc/runtime/device/bucket.cc
index c354e994dd3..dcf54d76e8a 100644
--- a/mindspore/ccsrc/runtime/device/bucket.cc
+++ b/mindspore/ccsrc/runtime/device/bucket.cc
@@ -94,12 +94,16 @@ void Bucket::CalculateMean() {
   if (!grad_mean) {
     return;
   }
-  launch_kernel = CreateLaunchKernel();
-  MS_EXCEPTION_IF_NULL(launch_kernel);
+  if (launch_mul_ == nullptr) {
+    launch_mul_ = CreateLaunchMul();
+    MS_EXCEPTION_IF_NULL(launch_mul_);
+  }
+  // set mul input1 addr
+  launch_mul_->SetInputAddr(ar_output_addr_);
   // launch mean
-  launch_kernel->LaunchOpKernel();
+  launch_mul_->LaunchOpKernel();
   // store output tensor addr
-  auto launch_output = launch_kernel->GetKernelOutputAddr();
+  auto launch_output = launch_mul_->GetKernelOutputAddr();
   if (launch_output.size() != 1) {
     MS_LOG(ERROR) << "launch mul outputs should have one output";
   }
diff --git a/mindspore/ccsrc/runtime/device/bucket.h b/mindspore/ccsrc/runtime/device/bucket.h
index a2cc276cf45..4bbe8f4a8e4 100644
--- a/mindspore/ccsrc/runtime/device/bucket.h
+++ b/mindspore/ccsrc/runtime/device/bucket.h
@@ -38,7 +38,8 @@ class Bucket {
         compute_stream_(nullptr),
         pre_event_(nullptr),
         post_event_(nullptr),
-        launch_kernel(nullptr),
+        launch_mul_(nullptr),
+        launch_atomic_clean_(nullptr),
         total_size_(0),
         ar_input_addr_(nullptr),
         ar_output_addr_(nullptr) {}
@@ -60,7 +61,8 @@
 
   std::shared_ptr<DeviceEvent> pre_event_;
   std::shared_ptr<DeviceEvent> post_event_;
-  std::shared_ptr<LaunchKernel> launch_kernel;
+  std::shared_ptr<LaunchKernel> launch_mul_;
+  std::shared_ptr<LaunchKernel> launch_atomic_clean_;
 
   size_t total_size_;
   uint8_t *ar_input_addr_;
@@ -77,7 +79,7 @@
   virtual void AllocateAllReduceAddr() = 0;
   void UpdateTensorAddr();
   void CalculateMean();
-  virtual std::shared_ptr<LaunchKernel> CreateLaunchKernel() = 0;
+  virtual std::shared_ptr<LaunchKernel> CreateLaunchMul() = 0;
   virtual void LaunchAllReduce() = 0;
   virtual void FreeAllDeviceMem() = 0;
   virtual void FreeDeviceMem(void *dev_ptr) = 0;
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc
index 2fc4ddbe06b..d6e10114fa8 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc
@@ -92,8 +92,8 @@ void GPUBucket::FreeAllDeviceMem() {
     ar_output_addr_ = nullptr;
   }
   // clear launch mul device memory
-  if (launch_kernel != nullptr) {
-    launch_kernel->FreeLaunchDeviceMem();
+  if (launch_mul_ != nullptr) {
+    launch_mul_->FreeLaunchDeviceMem();
   }
   MS_LOG(INFO) << "end";
 }
@@ -156,11 +156,11 @@ void GPUBucket::LaunchAllReduce() {
   MS_LOG(INFO) << "end";
 }
 
-std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchKernel() {
+std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchMul() {
   if (tensor_type_list_.empty()) {
     MS_LOG(ERROR) << "tensor_type_list_ is empty";
   }
-  auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+  auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_);
   MS_EXCEPTION_IF_NULL(launch_mul);
   return launch_mul;
 }
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h b/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h
index 7f5211dfb78..34fecf14354 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h
@@ -34,7 +34,7 @@ class GPUBucket : public Bucket {
   void FreeDeviceMem(void *dev_ptr) override;
   void CopyTensorToContiguousMemory() override;
   void LaunchAllReduce() override;
-  std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
+  std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
   const void *collective_handle_;
 };
 }  // namespace mindspore::device::gpu
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h b/mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h
index ae3dd575f96..e438ca3f3fb 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h
@@ -33,6 +33,7 @@ class GPULaunchkernel : public LaunchKernel {
   void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
   void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
 
+  void SetInputAddr(uint8_t *input_addr) override = 0;
   void LaunchOpKernel() override = 0;
   void FreeLaunchDeviceMem() override = 0;
 };
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h b/mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h
index e5798a932f0..ff22d041707 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h
@@ -25,10 +25,10 @@
 namespace mindspore::device::gpu {
 class GPULaunchMul : public GPULaunchkernel, public LaunchMul {
  public:
-  GPULaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
-      : GPULaunchkernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+  GPULaunchMul(void *stream, TypeId dtype, size_t total_size) : GPULaunchkernel(stream), LaunchMul(dtype, total_size) {}
   ~GPULaunchMul() override = default;
 
+  void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
   void FreeDeviceMem(void *addr) override;
   size_t AlignSizeForLaunchKernel(size_t size) override;
   uint8_t *AllocDeviceMem(size_t size) override;
diff --git a/mindspore/ccsrc/runtime/device/launch_kernel.cc b/mindspore/ccsrc/runtime/device/launch_kernel.cc
index b90ba1610a5..cf0101b86cf 100644
--- a/mindspore/ccsrc/runtime/device/launch_kernel.cc
+++ b/mindspore/ccsrc/runtime/device/launch_kernel.cc
@@ -83,7 +83,7 @@ void LaunchKernel::LaunchSingleKernel(const std::vector<uint8_t *> &inputs_addr)
   // launch
   auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
   if (!ret_status) {
-    MS_LOG(ERROR) << "Launch mul kernel failed.";
+    MS_LOG(ERROR) << "Launch single kernel failed.";
   }
 }
 
diff --git a/mindspore/ccsrc/runtime/device/launch_kernel.h b/mindspore/ccsrc/runtime/device/launch_kernel.h
index 0f75a9572e9..8e532a75560 100644
--- a/mindspore/ccsrc/runtime/device/launch_kernel.h
+++ b/mindspore/ccsrc/runtime/device/launch_kernel.h
@@ -37,6 +37,7 @@
   virtual void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
   virtual void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
 
+  virtual void SetInputAddr(uint8_t *input_addr) = 0;
   virtual void LaunchOpKernel() = 0;
   virtual void FreeLaunchDeviceMem() = 0;
 
@@ -46,7 +47,6 @@
   std::vector<uint8_t *> outputs_addr_;
   std::vector<uint8_t *> workspaces_addr_;
 
- private:
   std::vector<kernel::AddressPtr> ObtainKernelAddress(const std::vector<size_t> &list, std::vector<uint8_t *> *addr);
   std::vector<kernel::AddressPtr> ObtainKernelInputs(const std::vector<size_t> &inputs_list,
                                                      const std::vector<uint8_t *> &inputs_addr);
diff --git a/mindspore/ccsrc/runtime/device/launch_mul.h b/mindspore/ccsrc/runtime/device/launch_mul.h
index 1b2c4651397..461a91f9735 100644
--- a/mindspore/ccsrc/runtime/device/launch_mul.h
+++ b/mindspore/ccsrc/runtime/device/launch_mul.h
@@ -24,10 +24,10 @@
 namespace mindspore::device {
 class LaunchMul {
  public:
-  LaunchMul(TypeId dtype, size_t total_size, uint8_t *input1_addr)
+  LaunchMul(TypeId dtype, size_t total_size)
     : dtype_(dtype),
       total_size_(total_size),
-      input1_addr_(input1_addr),
+      input1_addr_(nullptr),
       input2_addr_(nullptr),
       input2_value_(0),
       mul_graph_(nullptr) {}
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 42d3fe57983..27aa68c49f6 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -106,6 +106,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc"
         "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.cc"
         "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc" "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_graph_kernel.cc" "../../../mindspore/ccsrc/runtime/device/convert_tensor_utils.cc" "../../../mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc"