add op atomic clean to clear input addr in launch allreduce

lvchangquan 2021-03-25 11:19:23 +08:00
parent b9f87ba48a
commit 0a7df321fe
16 changed files with 236 additions and 29 deletions

mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc

@@ -27,6 +27,7 @@
#include "runtime/device/kernel_runtime_manager.h"
#include "runtime/device/ascend/ascend_event.h"
#include "runtime/device/ascend/ascend_launch_mul.h"
#include "runtime/device/ascend/ascend_launch_atomic_clean.h"
#include "utils/profile.h"
#define CHECK_ASCEND_RT_WITH_EXCEPTION(expression, message) \
@@ -90,16 +91,18 @@ void AscendBucket::FreeAllDeviceMem() {
ar_output_addr_ = nullptr;
}
// clear launch mul device Memory
-if (launch_kernel != nullptr) {
-launch_kernel->FreeLaunchDeviceMem();
+if (launch_mul_ != nullptr) {
+launch_mul_->FreeLaunchDeviceMem();
}
+// clear launch atomic clean device Memory
+if (launch_atomic_clean_ != nullptr) {
+launch_atomic_clean_->FreeLaunchDeviceMem();
+}
}
void AscendBucket::CopyTensorToContiguousMemory() {
-// Clean input addr
-CHECK_ASCEND_RT_WITH_EXCEPTION(rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_),
-"Call rtMemsetAsync failed");
+// clear allreduce input addr
+CleanAllReduceInputAddr();
for (size_t i = 0; i < bucket_size_; ++i) {
MS_EXCEPTION_IF_NULL(memcpy_input_addrs_[i]);
MS_EXCEPTION_IF_NULL(memcpy_output_addrs_[i]);
@@ -151,15 +154,36 @@ void AscendBucket::LaunchAllReduce() {
}
}
-std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchKernel() {
+void AscendBucket::CleanAllReduceInputAddr() {
+if (launch_atomic_clean_ == nullptr) {
+launch_atomic_clean_ = CreateLaunchAtomicClean();
+MS_EXCEPTION_IF_NULL(launch_atomic_clean_);
+}
+// set atomic clean input addr
+launch_atomic_clean_->SetInputAddr(ar_input_addr_);
+// launch atomic clean
+launch_atomic_clean_->LaunchOpKernel();
+}
+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchMul() {
if (tensor_type_list_.empty()) {
MS_LOG(ERROR) << "tensor_type_list_ is empty";
}
-auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_);
MS_EXCEPTION_IF_NULL(launch_mul);
return launch_mul;
}
+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchAtomicClean() {
+if (tensor_type_list_.empty()) {
+MS_LOG(ERROR) << "tensor_type_list_ is empty";
+}
+auto launch_atomic_clean =
+std::make_shared<AscendLaunchAtomicClean>(compute_stream_, tensor_type_list_[0], total_size_);
+MS_EXCEPTION_IF_NULL(launch_atomic_clean);
+return launch_atomic_clean;
+}
void AscendBucket::Init() {
pre_event_ = std::make_shared<AscendEvent>();
post_event_ = std::make_shared<AscendEvent>();
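
Taken together, the hunks above replace a raw runtime memset of the fused allreduce input buffer with a launched AtomicAddrClean kernel. A condensed before/after sketch (not standalone code; it reuses only the members shown in this diff and elides error checks):

```cpp
// Before: zero the fused input buffer with a runtime call on the compute stream.
CHECK_ASCEND_RT_WITH_EXCEPTION(
    rtMemsetAsync(ar_input_addr_, total_size_, 0, total_size_, compute_stream_),
    "Call rtMemsetAsync failed");

// After: zero it by launching an AtomicAddrClean single-op kernel graph.
// The launcher is created once per bucket and rebound to the current buffer
// address on every step via SetInputAddr().
if (launch_atomic_clean_ == nullptr) {
  launch_atomic_clean_ = CreateLaunchAtomicClean();
}
launch_atomic_clean_->SetInputAddr(ar_input_addr_);
launch_atomic_clean_->LaunchOpKernel();
```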

mindspore/ccsrc/runtime/device/ascend/ascend_bucket.h

@@ -34,7 +34,9 @@ class AscendBucket : public Bucket {
void FreeDeviceMem(void *dev_ptr) override;
void CopyTensorToContiguousMemory() override;
void LaunchAllReduce() override;
-std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
+std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
+std::shared_ptr<LaunchKernel> CreateLaunchAtomicClean();
+void CleanAllReduceInputAddr();
};
} // namespace mindspore::device::ascend
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_

mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc

@@ -0,0 +1,114 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "runtime/device/ascend/ascend_launch_atomic_clean.h"
#include <memory>
#include <vector>
#include "abstract/utils.h"
#include "backend/session/single_kernel_graph.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "debug/anf_ir_dump.h"
namespace mindspore::device::ascend {
void AscendLaunchAtomicClean::FreeDeviceMem(void *addr) { AscendLaunchKernel::FreeDeviceMem(addr); }
size_t AscendLaunchAtomicClean::AlignSizeForLaunchKernel(size_t size) {
return AscendLaunchKernel::AlignSizeForLaunchKernel(size);
}
uint8_t *AscendLaunchAtomicClean::AllocDeviceMem(size_t size) { return AscendLaunchKernel::AllocDeviceMem(size); }
void AscendLaunchAtomicClean::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) {
AscendLaunchKernel::KernelSelect(kernel_graph);
}
void AscendLaunchAtomicClean::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) {
AscendLaunchKernel::KernelBuild(kernel_graph);
}
void AscendLaunchAtomicClean::LaunchOpKernel() {
if (atomic_clean_graph_ == nullptr) {
// construct atomic clean kernel graph and set attr
ConstructKernelGraphAndSetAttr();
// kernel build
KernelBuild(atomic_clean_graph_);
}
// obtain kernel_mod
if (atomic_clean_graph_->execution_order().size() != 1) {
MS_LOG(ERROR) << "the execution order of the atomic clean graph should have only one node";
}
kernel_mod_ = AnfAlgo::GetKernelMod(atomic_clean_graph_->execution_order()[0]);
MS_EXCEPTION_IF_NULL(kernel_mod_);
// obtain kernel inputs
std::vector<kernel::AddressPtr> kernel_inputs;
auto input = std::make_shared<kernel::Address>();
MS_EXCEPTION_IF_NULL(input);
input->addr = input_addr_;
MS_EXCEPTION_IF_NULL(input->addr);
input->size = total_size_;
kernel_inputs.push_back(input);
// obtain kernel outputs
auto kernel_outputs = ObtainKernelOutputs(kernel_mod_->GetOutputSizeList());
// obtain kernel workspace
auto kernel_workspaces = ObtainKernelWorkspaces(kernel_mod_->GetWorkspaceSizeList());
// launch
auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
if (!ret_status) {
MS_LOG(ERROR) << "Launch single kernel failed.";
}
}
void AscendLaunchAtomicClean::FreeLaunchDeviceMem() {
input_addr_ = nullptr;
FreeOutputAndWorkspaceDeviceMem();
}
std::shared_ptr<session::KernelGraph> AscendLaunchAtomicClean::ObtainAtomicCleanKernelGraph() {
std::vector<TypeId> input_dtypes = {dtype_};
std::vector<TypeId> output_dtypes = {};
// obtain input & output shapes
size_t dtype_size = abstract::TypeIdSize(dtype_);
int64_t shape = total_size_ / dtype_size;
std::vector<std::vector<int64_t>> input_shapes = {{shape}};
std::vector<std::vector<size_t>> output_shapes = {};
auto atomic_clean_graph = session::SingleKernelGraph::ConstructKernelGraphBasedOnSingleOp(
kAtomicAddrCleanOpName, input_dtypes, input_shapes, output_dtypes, output_shapes);
MS_EXCEPTION_IF_NULL(atomic_clean_graph);
return atomic_clean_graph;
}
void AscendLaunchAtomicClean::ConstructKernelGraphAndSetAttr() {
// construct atomic clean kernel graph
atomic_clean_graph_ = ObtainAtomicCleanKernelGraph();
MS_EXCEPTION_IF_NULL(atomic_clean_graph_);
// set atomic clean attr
if (!atomic_clean_graph_->execution_order().empty()) {
auto clean_node = atomic_clean_graph_->execution_order()[0];
// set abstract
AbstractBasePtr abstract = std::make_shared<abstract::AbstractNone>();
MS_EXCEPTION_IF_NULL(abstract);
clean_node->set_abstract(abstract);
// set build info
auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
builder->SetKernelType(KernelType::TBE_KERNEL);
AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), clean_node.get());
// set attr
std::vector<size_t> clean_size = {total_size_};
AnfAlgo::SetNodeAttr(kAttrAtomicAddMemSize, MakeValue(clean_size), clean_node);
}
}
} // namespace mindspore::device::ascend
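
The launcher's lifecycle can be summarised with a small runnable stand-in. This is a toy model, not MindSpore code: ToyAtomicClean only mimics the contract AscendLaunchAtomicClean exposes in this commit (the kernel graph is built lazily on the first LaunchOpKernel, the input pointer must be rebound before each launch, and FreeLaunchDeviceMem drops the borrowed input pointer; the real class additionally frees its output/workspace device memory).

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Toy stand-in for the launcher; the zero-loop stands in for the device kernel.
class ToyAtomicClean {
 public:
  explicit ToyAtomicClean(std::size_t total_size) : total_size_(total_size) {}
  void SetInputAddr(std::uint8_t *addr) { input_addr_ = addr; }
  void LaunchOpKernel() {
    if (!graph_built_) {
      graph_built_ = true;  // mirrors the lazy ConstructKernelGraphAndSetAttr()
      std::puts("building the atomic-clean graph once");
    }
    // The real kernel zeroes total_size_ bytes at input_addr_ on the stream.
    for (std::size_t i = 0; i < total_size_; ++i) input_addr_[i] = 0;
  }
  void FreeLaunchDeviceMem() { input_addr_ = nullptr; }

 private:
  std::size_t total_size_;
  std::uint8_t *input_addr_ = nullptr;
  bool graph_built_ = false;
};

int main() {
  std::uint8_t fused_buffer[16] = {1, 2, 3, 4};
  ToyAtomicClean clean(sizeof(fused_buffer));
  clean.SetInputAddr(fused_buffer);  // rebind before every launch
  clean.LaunchOpKernel();            // fused_buffer is now all zeros
  clean.FreeLaunchDeviceMem();       // forget the borrowed pointer
  std::printf("fused_buffer[0] after clean = %d\n", fused_buffer[0]);
  return 0;
}
```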

mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.h

@@ -0,0 +1,57 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
#include <vector>
#include <memory>
#include "runtime/device/ascend/ascend_launch_kernel.h"
namespace mindspore::device::ascend {
class AscendLaunchAtomicClean : public AscendLaunchKernel {
public:
AscendLaunchAtomicClean(void *stream, TypeId dtype, size_t total_size)
: AscendLaunchKernel(stream),
dtype_(dtype),
total_size_(total_size),
atomic_clean_graph_(nullptr),
input_addr_(nullptr) {}
~AscendLaunchAtomicClean() override = default;
void SetInputAddr(uint8_t *input_addr) override { input_addr_ = input_addr; }
void FreeDeviceMem(void *addr) override;
size_t AlignSizeForLaunchKernel(size_t size) override;
uint8_t *AllocDeviceMem(size_t size) override;
void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
void LaunchOpKernel() override;
void FreeLaunchDeviceMem() override;
protected:
TypeId dtype_;
size_t total_size_;
std::shared_ptr<session::KernelGraph> atomic_clean_graph_;
uint8_t *input_addr_;
private:
std::shared_ptr<session::KernelGraph> ObtainAtomicCleanKernelGraph();
void ConstructKernelGraphAndSetAttr();
};
} // namespace mindspore::device::ascend
#endif // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_ATOMIC_CLEAN_H_
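
To make the size bookkeeping in ObtainAtomicCleanKernelGraph and ConstructKernelGraphAndSetAttr concrete, here is a standalone example with made-up numbers (a 2 MiB float16 buffer): the byte size of the fused buffer shows up twice, once divided by the element size to form the op's 1-D input shape, and once verbatim as the kAttrAtomicAddMemSize attribute value.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical bucket: a 2 MiB fused float16 gradient buffer.
  const std::size_t total_size = 2 * 1024 * 1024;  // bytes, i.e. total_size_
  const std::size_t dtype_size = 2;                // bytes per float16 element
  // 1-D input shape handed to ConstructKernelGraphBasedOnSingleOp.
  const std::int64_t elements = static_cast<std::int64_t>(total_size / dtype_size);
  const std::vector<std::vector<std::int64_t>> input_shapes = {{elements}};
  // Attribute value stored under kAttrAtomicAddMemSize: the raw byte count.
  const std::vector<std::size_t> clean_size = {total_size};
  std::printf("input shape = {%lld}, bytes to clean = %zu\n",
              static_cast<long long>(input_shapes[0][0]), clean_size[0]);
  return 0;
}
```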

mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.h

@@ -33,6 +33,7 @@ class AscendLaunchKernel : public LaunchKernel {
void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+void SetInputAddr(uint8_t *input_addr) override = 0;
void LaunchOpKernel() override = 0;
void FreeLaunchDeviceMem() override = 0;
};

mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.h

@@ -25,10 +25,11 @@
namespace mindspore::device::ascend {
class AscendLaunchMul : public AscendLaunchKernel, public LaunchMul {
public:
-AscendLaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
-: AscendLaunchKernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+AscendLaunchMul(void *stream, TypeId dtype, size_t total_size)
+: AscendLaunchKernel(stream), LaunchMul(dtype, total_size) {}
~AscendLaunchMul() override = default;
+void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
void FreeDeviceMem(void *addr) override;
size_t AlignSizeForLaunchKernel(size_t size) override;
uint8_t *AllocDeviceMem(size_t size) override;

mindspore/ccsrc/runtime/device/bucket.cc

@@ -94,12 +94,16 @@ void Bucket::CalculateMean() {
if (!grad_mean) {
return;
}
-launch_kernel = CreateLaunchKernel();
-MS_EXCEPTION_IF_NULL(launch_kernel);
+if (launch_mul_ == nullptr) {
+launch_mul_ = CreateLaunchMul();
+MS_EXCEPTION_IF_NULL(launch_mul_);
+}
+// set mul input1 addr
+launch_mul_->SetInputAddr(ar_output_addr_);
// launch mean
-launch_kernel->LaunchOpKernel();
+launch_mul_->LaunchOpKernel();
// store output tensor addr
-auto launch_output = launch_kernel->GetKernelOutputAddr();
+auto launch_output = launch_mul_->GetKernelOutputAddr();
if (launch_output.size() != 1) {
MS_LOG(ERROR) << "launch mul outputs should have one output";
}
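
For context on why a Mul follows the allreduce when grad_mean is set: allreduce yields an element-wise sum across devices, and the cached Mul kernel then scales that sum by a constant, presumably the reciprocal of the device count held in input2_value_ (its assignment is outside this diff). A toy illustration of the arithmetic, with assumed numbers:

```cpp
#include <cstdio>

int main() {
  // Toy numbers: one gradient element contributed by each of 8 devices.
  const float per_device_grad[8] = {0.1f, 0.3f, 0.2f, 0.4f, 0.1f, 0.3f, 0.2f, 0.4f};
  float allreduce_sum = 0.0f;
  for (float g : per_device_grad) allreduce_sum += g;  // what LaunchAllReduce produces
  const float scale = 1.0f / 8.0f;  // assumed value of input2_value_ (1 / device count)
  const float mean = allreduce_sum * scale;  // what the cached Mul kernel computes
  std::printf("sum = %.2f, mean = %.3f\n", allreduce_sum, mean);
  return 0;
}
```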

mindspore/ccsrc/runtime/device/bucket.h

@@ -38,7 +38,8 @@ class Bucket {
compute_stream_(nullptr),
pre_event_(nullptr),
post_event_(nullptr),
-launch_kernel(nullptr),
+launch_mul_(nullptr),
+launch_atomic_clean_(nullptr),
total_size_(0),
ar_input_addr_(nullptr),
ar_output_addr_(nullptr) {}
@@ -60,7 +61,8 @@ class Bucket {
std::shared_ptr<DeviceEvent> pre_event_;
std::shared_ptr<DeviceEvent> post_event_;
-std::shared_ptr<LaunchKernel> launch_kernel;
+std::shared_ptr<LaunchKernel> launch_mul_;
+std::shared_ptr<LaunchKernel> launch_atomic_clean_;
size_t total_size_;
uint8_t *ar_input_addr_;
@@ -77,7 +79,7 @@ class Bucket {
virtual void AllocateAllReduceAddr() = 0;
void UpdateTensorAddr();
void CalculateMean();
-virtual std::shared_ptr<LaunchKernel> CreateLaunchKernel() = 0;
+virtual std::shared_ptr<LaunchKernel> CreateLaunchMul() = 0;
virtual void LaunchAllReduce() = 0;
virtual void FreeAllDeviceMem() = 0;
virtual void FreeDeviceMem(void *dev_ptr) = 0;

mindspore/ccsrc/runtime/device/gpu/gpu_bucket.cc

@@ -92,8 +92,8 @@ void GPUBucket::FreeAllDeviceMem() {
ar_output_addr_ = nullptr;
}
// clear launch mul device memory
-if (launch_kernel != nullptr) {
-launch_kernel->FreeLaunchDeviceMem();
+if (launch_mul_ != nullptr) {
+launch_mul_->FreeLaunchDeviceMem();
}
MS_LOG(INFO) << "end";
}
@@ -156,11 +156,11 @@ void GPUBucket::LaunchAllReduce() {
MS_LOG(INFO) << "end";
}
-std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchKernel() {
+std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchMul() {
if (tensor_type_list_.empty()) {
MS_LOG(ERROR) << "tensor_type_list_ is empty";
}
-auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_);
MS_EXCEPTION_IF_NULL(launch_mul);
return launch_mul;
}

mindspore/ccsrc/runtime/device/gpu/gpu_bucket.h

@@ -34,7 +34,7 @@ class GPUBucket : public Bucket {
void FreeDeviceMem(void *dev_ptr) override;
void CopyTensorToContiguousMemory() override;
void LaunchAllReduce() override;
-std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
+std::shared_ptr<LaunchKernel> CreateLaunchMul() override;
const void *collective_handle_;
};
} // namespace mindspore::device::gpu

mindspore/ccsrc/runtime/device/gpu/gpu_launch_kernel.h

@@ -33,6 +33,7 @@ class GPULaunchkernel : public LaunchKernel {
void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+void SetInputAddr(uint8_t *input_addr) override = 0;
void LaunchOpKernel() override = 0;
void FreeLaunchDeviceMem() override = 0;
};

mindspore/ccsrc/runtime/device/gpu/gpu_launch_mul.h

@@ -25,10 +25,10 @@
namespace mindspore::device::gpu {
class GPULaunchMul : public GPULaunchkernel, public LaunchMul {
public:
-GPULaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
-: GPULaunchkernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+GPULaunchMul(void *stream, TypeId dtype, size_t total_size) : GPULaunchkernel(stream), LaunchMul(dtype, total_size) {}
~GPULaunchMul() override = default;
+void SetInputAddr(uint8_t *input1_addr) override { input1_addr_ = input1_addr; }
void FreeDeviceMem(void *addr) override;
size_t AlignSizeForLaunchKernel(size_t size) override;
uint8_t *AllocDeviceMem(size_t size) override;

mindspore/ccsrc/runtime/device/launch_kernel.cc

@@ -83,7 +83,7 @@ void LaunchKernel::LaunchSingleKernel(const std::vector<uint8_t *> &inputs_addr) {
// launch
auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
if (!ret_status) {
MS_LOG(ERROR) << "Launch mul kernel failed.";
MS_LOG(ERROR) << "Launch single kernel failed.";
}
}

mindspore/ccsrc/runtime/device/launch_kernel.h

@@ -37,6 +37,7 @@ class LaunchKernel {
virtual void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
virtual void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
+virtual void SetInputAddr(uint8_t *input_addr) = 0;
virtual void LaunchOpKernel() = 0;
virtual void FreeLaunchDeviceMem() = 0;
@@ -46,7 +47,6 @@ class LaunchKernel {
std::vector<uint8_t *> outputs_addr_;
std::vector<uint8_t *> workspaces_addr_;
private:
std::vector<kernel::AddressPtr> ObtainKernelAddress(const std::vector<size_t> &list, std::vector<uint8_t *> *addr);
std::vector<kernel::AddressPtr> ObtainKernelInputs(const std::vector<size_t> &inputs_list,
const std::vector<uint8_t *> &inputs_addr);

mindspore/ccsrc/runtime/device/launch_mul.h

@@ -24,10 +24,10 @@
namespace mindspore::device {
class LaunchMul {
public:
-LaunchMul(TypeId dtype, size_t total_size, uint8_t *input1_addr)
+LaunchMul(TypeId dtype, size_t total_size)
: dtype_(dtype),
total_size_(total_size),
-input1_addr_(input1_addr),
+input1_addr_(nullptr),
input2_addr_(nullptr),
input2_value_(0),
mul_graph_(nullptr) {}
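
The constructor change here mirrors AscendLaunchMul and GPULaunchMul above: because buckets now cache one launcher and reuse it, a buffer address baked in at construction would go stale, so input1_addr_ starts as nullptr and must be rebound through SetInputAddr before every launch. A minimal before/after fragment (not standalone; the class and method names are those declared in this diff, the local variables are placeholders):

```cpp
// Before this commit: address fixed at construction, launcher rebuilt per call.
//   auto mul = std::make_shared<AscendLaunchMul>(stream, dtype, total_size, ar_output_addr);
//   mul->LaunchOpKernel();

// After this commit: launcher cached, address rebound before every launch.
auto mul = std::make_shared<AscendLaunchMul>(stream, dtype, total_size);
mul->SetInputAddr(ar_output_addr);  // must precede LaunchOpKernel();
mul->LaunchOpKernel();              // otherwise input1_addr_ is still nullptr
```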

tests/ut/cpp/CMakeLists.txt

@@ -106,6 +106,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_atomic_clean.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_graph_kernel.cc"
"../../../mindspore/ccsrc/runtime/device/convert_tensor_utils.cc"
"../../../mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc"