forked from mindspore-Ecosystem/mindspore

add op_mul fusion based on allreduce fusion in pynative mode

commit 31f9e6a42c (parent 64e83c1f26)
@@ -6,6 +6,7 @@ file(GLOB_RECURSE _SESSION_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
     "executor.cc"
     "executor_manager.cc"
     "anf_runtime_algorithm.cc"
+    "single_kernel_graph.cc"
 )

 if("${ENABLE_HIDDEN}" STREQUAL "OFF")
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/session/single_kernel_graph.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "backend/session/anf_runtime_algorithm.h"
+
+namespace mindspore {
+namespace session {
+std::shared_ptr<session::KernelGraph> SingleKernelGraph::ConstructKernelGraphBasedOnSingleOp(
+  const std::string &op_name, const std::vector<TypeId> &input_dtypes, const std::vector<ShapeVector> &input_shapes,
+  const std::vector<TypeId> &output_dtypes, const std::vector<std::vector<size_t>> &output_shapes) {
+  auto graph = std::make_shared<session::KernelGraph>();
+  std::vector<AnfNodePtr> inputs;
+  // set input[0]
+  PrimitivePtr op_prim = std::make_shared<Primitive>(op_name);
+  MS_EXCEPTION_IF_NULL(op_prim);
+  inputs.push_back(std::make_shared<ValueNode>(op_prim));
+  // construct real input
+  if (input_dtypes.size() != input_shapes.size()) {
+    MS_LOG(EXCEPTION) << "input_dtypes size should be equal to input_shapes size";
+  }
+  auto input_num = input_dtypes.size();
+  for (size_t i = 0; i < input_num; ++i) {
+    auto tensor = std::make_shared<tensor::Tensor>(input_dtypes[i], input_shapes[i]);
+    auto value_node = ConstructRunOpValueNode(graph, tensor);
+    inputs.push_back(value_node);
+  }
+  // obtain cnode
+  auto cnode = graph->NewCNode(inputs);
+  MS_EXCEPTION_IF_NULL(cnode);
+  // set output dynamic shape info
+  AnfAlgo::SetNodeAttr(kAttrOutputIsDynamicShape, MakeValue(false), cnode);
+  if (output_dtypes.size() != output_shapes.size()) {
+    MS_LOG(EXCEPTION) << "output_dtypes size should be equal to output_shapes size";
+  }
+  AnfAlgo::SetOutputInferTypeAndShape(output_dtypes, output_shapes, cnode.get());
+  // set execution order
+  std::vector<CNodePtr> exe_order = {cnode};
+  graph->set_execution_order(exe_order);
+  // set graph output
+  graph->set_output(cnode);
+  graph->SetInputNodes();
+  return graph;
+}
+
+ValueNodePtr SingleKernelGraph::ConstructRunOpValueNode(const std::shared_ptr<session::KernelGraph> &graph,
+                                                        const tensor::TensorPtr &input_tensor) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MS_EXCEPTION_IF_NULL(input_tensor);
+  auto value_node = std::make_shared<ValueNode>(input_tensor);
+  MS_EXCEPTION_IF_NULL(value_node);
+  // construct abstract of value node
+  auto type_of_tensor = input_tensor->Dtype();
+  auto shape_of_tensor = input_tensor->shape();
+  auto abstract = std::make_shared<abstract::AbstractTensor>(type_of_tensor, shape_of_tensor);
+  value_node->set_abstract(abstract);
+  // add value node to graph
+  auto input_value_node = graph->NewValueNode(value_node);
+  graph->AddValueNodeToGraph(input_value_node);
+  return input_value_node;
+}
+}  // namespace session
+}  // namespace mindspore
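For illustration (not part of this commit): a minimal usage sketch of the helper above. It builds a one-node graph for a single op from dtypes and shapes; the Mul call below mirrors how launch_mul.cc later in this diff uses it, with the float16 dtype and the 1024-element shape chosen purely as assumptions.

    // Sketch only: construct a graph holding a single Mul kernel that scales a
    // 1024-element float16 vector by a {1}-shaped scalar.
    #include "backend/session/single_kernel_graph.h"

    std::shared_ptr<mindspore::session::KernelGraph> BuildMulGraphSketch() {
      using mindspore::session::SingleKernelGraph;
      return SingleKernelGraph::ConstructKernelGraphBasedOnSingleOp(
        "Mul", {mindspore::kNumberTypeFloat16, mindspore::kNumberTypeFloat16}, {{1024}, {1}},
        {mindspore::kNumberTypeFloat16}, {{1024}});
    }

The returned KernelGraph carries exactly one CNode in its execution order, which is what the launch-mul code below relies on.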
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_BACKEND_SESSION_SINGLE_KERNEL_GRAPH_H_
+#define MINDSPORE_MINDSPORE_CCSRC_BACKEND_SESSION_SINGLE_KERNEL_GRAPH_H_
+
+#include <string>
+#include <vector>
+#include <memory>
+#include "backend/session/kernel_graph.h"
+
+namespace mindspore {
+namespace session {
+class SingleKernelGraph {
+ public:
+  SingleKernelGraph() = default;
+  ~SingleKernelGraph() = default;
+
+  static std::shared_ptr<session::KernelGraph> ConstructKernelGraphBasedOnSingleOp(
+    const std::string &op_name, const std::vector<TypeId> &input_dtypes, const std::vector<ShapeVector> &input_shapes,
+    const std::vector<TypeId> &output_dtypes, const std::vector<std::vector<size_t>> &output_shapes);
+
+ private:
+  static ValueNodePtr ConstructRunOpValueNode(const std::shared_ptr<session::KernelGraph> &graph,
+                                              const tensor::TensorPtr &input_tensor);
+};
+}  // namespace session
+}  // namespace mindspore
+
+#endif  // MINDSPORE_MINDSPORE_CCSRC_BACKEND_SESSION_SINGLE_KERNEL_GRAPH_H_
@@ -1,7 +1,7 @@
 file(GLOB_RECURSE DEVICE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "common/*.cc"
     "kernel_info.cc" "executor/dynamic_kernel.cc" "executor/executor_callback.cc" "kernel_runtime.cc"
     "memory_manager.cc" "kernel_runtime_manager.cc" "convert_tensor_utils.cc"
-    "bucket.cc"
+    "bucket.cc" "launch_kernel.cc" "launch_mul.cc"
 )

 if("${ENABLE_HIDDEN}" STREQUAL "OFF")
@@ -26,6 +26,7 @@
 #include "runtime/device/memory_manager.h"
 #include "runtime/device/kernel_runtime_manager.h"
 #include "runtime/device/ascend/ascend_event.h"
+#include "runtime/device/ascend/ascend_launch_mul.h"
 #include "utils/profile.h"
 
 #define CHECK_ASCEND_RT_WITH_EXCEPTION(expression, message) \
@@ -45,7 +46,6 @@ void AscendBucket::AllocateAllReduceAddr() {
   }
 
   auto total_size = 0;
-  std::vector<size_t> align_size_list;
   std::vector<size_t> origin_size_list;
   for (auto &tensor : grad_tensor_list_) {
     MS_EXCEPTION_IF_NULL(tensor);
@@ -54,7 +54,7 @@ void AscendBucket::AllocateAllReduceAddr() {
     auto origin_size = device_address->GetSize();
     auto align_size = MemoryManager::GetCommonAlignSize(origin_size);
     origin_size_list.emplace_back(origin_size);
-    align_size_list.emplace_back(align_size);
+    align_size_list_.emplace_back(align_size);
     total_size += align_size;
     memcpy_input_addrs_.emplace_back(std::make_shared<kernel::Address>(
       static_cast<uint8_t *>(device_address->GetMutablePtr()), device_address->GetSize()));
@@ -72,14 +72,7 @@ void AscendBucket::AllocateAllReduceAddr() {
   uint8_t *memcpy_output = ar_input_addr_;
   for (size_t i = 0; i < bucket_size_; ++i) {
     memcpy_output_addrs_.emplace_back(std::make_shared<kernel::Address>(memcpy_output, origin_size_list[i]));
-    memcpy_output += align_size_list[i];
-  }
-
-  // store output tensor addr
-  uint8_t *tensor_output = ar_output_addr_;
-  for (size_t i = 0; i < bucket_size_; ++i) {
-    new_tensor_output_addrs_.emplace_back(tensor_output);
-    tensor_output += align_size_list[i];
+    memcpy_output += align_size_list_[i];
   }
 }
 
@@ -96,6 +89,10 @@ void AscendBucket::FreeAllDeviceMem() {
     FreeDeviceMem(origin_dev_addr);
     ar_output_addr_ = nullptr;
   }
+  // clear launch mul device memory
+  if (launch_kernel != nullptr) {
+    launch_kernel->FreeLaunchDeviceMem();
+  }
 }
 
 void AscendBucket::CopyTensorToContiguousMemory() {
@@ -154,6 +151,15 @@ void AscendBucket::LaunchAllReduce() {
   }
 }
 
+std::shared_ptr<LaunchKernel> AscendBucket::CreateLaunchKernel() {
+  if (tensor_type_list_.empty()) {
+    MS_LOG(ERROR) << "tensor_type_list_ is empty";
+  }
+  auto launch_mul = std::make_shared<AscendLaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+  MS_EXCEPTION_IF_NULL(launch_mul);
+  return launch_mul;
+}
+
 void AscendBucket::Init() {
   pre_event_ = std::make_shared<AscendEvent>();
   post_event_ = std::make_shared<AscendEvent>();
@@ -17,6 +17,7 @@
 #ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_
 #define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_
 
+#include <memory>
 #include "runtime/device/bucket.h"
 
 namespace mindspore::device::ascend {
@@ -33,6 +34,7 @@ class AscendBucket : public Bucket {
   void FreeDeviceMem(void *dev_ptr) override;
   void CopyTensorToContiguousMemory() override;
   void LaunchAllReduce() override;
+  std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
 };
 }  // namespace mindspore::device::ascend
 #endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_BUCKET_H_
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "runtime/device/ascend/ascend_launch_kernel.h"
+
+#include <vector>
+#include <memory>
+#include "runtime/device/memory_manager.h"
+#include "runtime/device/ascend/ascend_memory_pool.h"
+#include "runtime/device/ascend/kernel_build_ascend.h"
+#include "runtime/device/ascend/kernel_select_ascend.h"
+
+namespace mindspore::device::ascend {
+void AscendLaunchKernel::FreeDeviceMem(void *addr) { AscendMemoryPool::GetInstance().FreeTensorMem(addr); }
+
+size_t AscendLaunchKernel::AlignSizeForLaunchKernel(size_t size) { return MemoryManager::GetCommonAlignSize(size); }
+
+uint8_t *AscendLaunchKernel::AllocDeviceMem(size_t size) {
+  auto device_memory = AscendMemoryPool::GetInstance().AllocTensorMem(size);
+  MS_EXCEPTION_IF_NULL(device_memory);
+  return static_cast<uint8_t *>(device_memory);
+}
+
+void AscendLaunchKernel::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  auto node_list = kernel_graph->execution_order();
+  for (size_t i = 0; i < node_list.size(); ++i) {
+    auto status = device::ascend::SelectKernelInfo(node_list[i]);
+    if (status == ascend::kNoMatched) {
+      MS_LOG(ERROR) << "cnode name : " << node_list[i]->fullname_with_scope() << " kernel select failed";
+    }
+  }
+}
+
+void AscendLaunchKernel::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  MS_EXCEPTION_IF_NULL(kernel_graph);
+  device::ascend::KernelBuild(kernel_graph->execution_order());
+}
+}  // namespace mindspore::device::ascend
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_KERNEL_H_
+#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_KERNEL_H_
+
+#include <vector>
+#include <memory>
+#include "runtime/device/launch_kernel.h"
+
+namespace mindspore::device::ascend {
+class AscendLaunchKernel : public LaunchKernel {
+ public:
+  explicit AscendLaunchKernel(void *stream) : LaunchKernel(stream) {}
+  virtual ~AscendLaunchKernel() = default;
+
+  void FreeDeviceMem(void *addr) override;
+  size_t AlignSizeForLaunchKernel(size_t size) override;
+  uint8_t *AllocDeviceMem(size_t size) override;
+  void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+  void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+
+  void LaunchOpKernel() override = 0;
+  void FreeLaunchDeviceMem() override = 0;
+};
+}  // namespace mindspore::device::ascend
+
+#endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_KERNEL_H_
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "runtime/device/ascend/ascend_launch_mul.h"
+
+#include <memory>
+#include "abstract/utils.h"
+#include "runtime/mem.h"
+#include "backend/session/single_kernel_graph.h"
+#include "frontend/parallel/context.h"
+
+namespace mindspore::device::ascend {
+void AscendLaunchMul::FreeDeviceMem(void *addr) { AscendLaunchKernel::FreeDeviceMem(addr); }
+
+size_t AscendLaunchMul::AlignSizeForLaunchKernel(size_t size) {
+  return AscendLaunchKernel::AlignSizeForLaunchKernel(size);
+}
+
+uint8_t *AscendLaunchMul::AllocDeviceMem(size_t size) { return AscendLaunchKernel::AllocDeviceMem(size); }
+
+void AscendLaunchMul::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  AscendLaunchKernel::KernelSelect(kernel_graph);
+}
+
+void AscendLaunchMul::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  AscendLaunchKernel::KernelBuild(kernel_graph);
+}
+
+void AscendLaunchMul::LaunchOpKernel() {
+  kernel_mod_ = ObtainLaunchMulKernelMod();
+  MS_EXCEPTION_IF_NULL(kernel_mod_);
+  // construct mul inputs addr
+  ObtainMulInputsAddr();
+  // launch mul
+  LaunchSingleKernel(inputs_addr_);
+}
+
+void AscendLaunchMul::FreeLaunchDeviceMem() {
+  FreeInputDeviceMemory();
+  FreeOutputAndWorkspaceDeviceMem();
+}
+
+void AscendLaunchMul::CopyHostMemToDevice(size_t origin_size, size_t dst_size) {
+  auto ret = rtMemcpyAsync(input2_addr_, dst_size, &input2_value_, origin_size, RT_MEMCPY_HOST_TO_DEVICE, stream_);
+  if (ret != RT_ERROR_NONE) {
+    MS_LOG(EXCEPTION) << "launch rtMemcpyAsync failed, ret:" << ret;
+  }
+}
+}  // namespace mindspore::device::ascend
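For illustration (not part of this commit): the CopyHostMemToDevice override above moves a single host float (input2_value_, which LaunchMul::ObtainMulInputsAddr later in this diff sets to 1/device_num) into the device buffer that becomes the second Mul input. A rough picture of the sizes involved; the device count is an assumption and the aligned size is whatever AlignSizeForLaunchKernel returns:

    // Illustration only; the numbers are assumptions, not values from the commit.
    float input2_value = 1.0f / 8;           // host scalar, e.g. device_num == 8
    size_t origin_size = sizeof(float);      // 4 bytes copied from the host side
    size_t dst_size = origin_size;           // in the real code: AlignSizeForLaunchKernel(origin_size)
    // rtMemcpyAsync(input2_addr_, dst_size, &input2_value, origin_size,
    //               RT_MEMCPY_HOST_TO_DEVICE, stream_);   // as in the override above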
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_MUL_H_
+#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_MUL_H_
+
+#include <vector>
+#include <memory>
+#include "runtime/device/launch_mul.h"
+#include "runtime/device/ascend/ascend_launch_kernel.h"
+
+namespace mindspore::device::ascend {
+class AscendLaunchMul : public AscendLaunchKernel, public LaunchMul {
+ public:
+  AscendLaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
+      : AscendLaunchKernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+  ~AscendLaunchMul() override = default;
+
+  void FreeDeviceMem(void *addr) override;
+  size_t AlignSizeForLaunchKernel(size_t size) override;
+  uint8_t *AllocDeviceMem(size_t size) override;
+  void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+  void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+
+  void LaunchOpKernel() override;
+  void FreeLaunchDeviceMem() override;
+  void CopyHostMemToDevice(size_t origin_size, size_t dst_size) override;
+};
+}  // namespace mindspore::device::ascend
+
+#endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_LAUNCH_MUL_H_
@@ -18,6 +18,7 @@
 
 #include <memory>
 #include "runtime/device/kernel_runtime_manager.h"
+#include "frontend/parallel/context.h"
 #include "utils/profile.h"
 
 namespace mindspore::device {
@@ -51,6 +52,8 @@ void Bucket::Launch() {
   pre_event_->RecordEvent();
   pre_event_->WaitEvent();
   LaunchAllReduce();
+  // mul fusion
+  CalculateMean();
   post_event_->RecordEvent();
   UpdateTensorAddr();
   // pass event to the tensor
@@ -84,6 +87,29 @@ void Bucket::UpdateTensorAddr() {
   }
 }
 
+void Bucket::CalculateMean() {
+  auto parallel_context = parallel::ParallelContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(parallel_context);
+  auto grad_mean = parallel_context->gradients_mean();
+  if (!grad_mean) {
+    return;
+  }
+  launch_kernel = CreateLaunchKernel();
+  MS_EXCEPTION_IF_NULL(launch_kernel);
+  // launch mean
+  launch_kernel->LaunchOpKernel();
+  // store output tensor addr
+  auto launch_output = launch_kernel->GetKernelOutputAddr();
+  if (launch_output.size() != 1) {
+    MS_LOG(ERROR) << "launch mul should have exactly one output";
+  }
+  uint8_t *tensor_output = launch_output[0];
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    new_tensor_output_addrs_.emplace_back(tensor_output);
+    tensor_output += align_size_list_[i];
+  }
+}
+
 void Bucket::LazyDeleteOldAddr() {
   MS_LOG(INFO) << "Lazy delete old grad address";
   for (auto old_addr : tensor_old_addr_list_) {
@@ -95,6 +121,7 @@ void Bucket::LazyDeleteOldAddr() {
 void Bucket::Release() {
   MS_LOG(INFO) << "Clear bucket:" << id_;
   grad_tensor_list_.clear();
+  align_size_list_.clear();
   new_tensor_output_addrs_.clear();
   memcpy_input_addrs_.clear();
   memcpy_output_addrs_.clear();
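For illustration (not part of this commit): CalculateMean above replaces the old per-tensor mean division on the Python side with one device-side Mul over the whole fused AllReduce output, launched right after LaunchAllReduce. As a plain reference for what that launched kernel computes, written in ordinary host C++ rather than as the actual device kernel:

    #include <cstddef>

    // Reference semantics of the fused mean: scale every element of the
    // contiguous, already AllReduce-summed bucket by 1 / device_num.
    void FusedMeanReference(float *allreduced, std::size_t elem_count, int device_num) {
      const float scale = 1.0f / static_cast<float>(device_num);
      for (std::size_t i = 0; i < elem_count; ++i) {
        allreduced[i] *= scale;
      }
    }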
@@ -23,6 +23,7 @@
 #include <memory>
 #include "ir/anf.h"
 #include "ir/device_event.h"
+#include "runtime/device/launch_kernel.h"
 #include "runtime/device/device_address.h"
 #include "backend/session/kernel_graph.h"
 
@@ -37,6 +38,7 @@ class Bucket {
         compute_stream_(nullptr),
         pre_event_(nullptr),
         post_event_(nullptr),
+        launch_kernel(nullptr),
         total_size_(0),
         ar_input_addr_(nullptr),
         ar_output_addr_(nullptr) {}
@@ -58,11 +60,13 @@ class Bucket {
 
   std::shared_ptr<DeviceEvent> pre_event_;
   std::shared_ptr<DeviceEvent> post_event_;
+  std::shared_ptr<LaunchKernel> launch_kernel;
 
   size_t total_size_;
   uint8_t *ar_input_addr_;
   uint8_t *ar_output_addr_;
   std::string group_;
+  std::vector<size_t> align_size_list_;
   std::vector<tensor::TensorPtr> grad_tensor_list_;
   std::vector<uint8_t *> new_tensor_output_addrs_;
   std::vector<kernel::AddressPtr> memcpy_input_addrs_;
@@ -72,6 +76,8 @@ class Bucket {
 
   virtual void AllocateAllReduceAddr() = 0;
   void UpdateTensorAddr();
+  void CalculateMean();
+  virtual std::shared_ptr<LaunchKernel> CreateLaunchKernel() = 0;
   virtual void LaunchAllReduce() = 0;
   virtual void FreeAllDeviceMem() = 0;
   virtual void FreeDeviceMem(void *dev_ptr) = 0;
@@ -26,6 +26,7 @@
 #include "runtime/device/gpu/gpu_device_manager.h"
 #include "runtime/device/kernel_runtime_manager.h"
 #include "runtime/device/gpu/distribution/collective_init.h"
+#include "runtime/device/gpu/gpu_launch_mul.h"
 #include "backend/kernel_compiler/gpu/nccl/nccl_gpu_kernel.h"
 #include "runtime/device/gpu/gpu_common.h"
 
@@ -52,7 +53,6 @@ void GPUBucket::AllocateAllReduceAddr() {
 
   auto total_size = 0;
   std::vector<size_t> size_list;
-  std::vector<size_t> align_size_list;
   for (auto &tensor : grad_tensor_list_) {
     MS_EXCEPTION_IF_NULL(tensor);
     tensor_type_list_.emplace_back(tensor->data_type());
@@ -61,7 +61,7 @@ void GPUBucket::AllocateAllReduceAddr() {
     auto origin_size = device_address->GetSize();
     auto align_size = AlignMemorySize(origin_size);
     size_list.emplace_back(origin_size);
-    align_size_list.emplace_back(align_size);
+    align_size_list_.emplace_back(align_size);
     total_size += align_size;
     memcpy_input_addrs_.emplace_back(
       std::make_shared<kernel::Address>(static_cast<uint8_t *>(device_address->GetMutablePtr()), origin_size));
@@ -74,13 +74,7 @@ void GPUBucket::AllocateAllReduceAddr() {
   uint8_t *memcpy_output = ar_input_addr_;
   for (size_t i = 0; i < bucket_size_; ++i) {
     memcpy_output_addrs_.emplace_back(std::make_shared<kernel::Address>(memcpy_output, size_list[i]));
-    memcpy_output += align_size_list[i];
-  }
-
-  uint8_t *tensor_output = ar_output_addr_;
-  for (size_t i = 0; i < bucket_size_; ++i) {
-    new_tensor_output_addrs_.emplace_back(tensor_output);
-    tensor_output += align_size_list[i];
+    memcpy_output += align_size_list_[i];
   }
   MS_LOG(INFO) << "end";
 }
@@ -97,6 +91,10 @@ void GPUBucket::FreeAllDeviceMem() {
     FreeDeviceMem(ar_output_addr_);
     ar_output_addr_ = nullptr;
   }
+  // clear launch mul device memory
+  if (launch_kernel != nullptr) {
+    launch_kernel->FreeLaunchDeviceMem();
+  }
   MS_LOG(INFO) << "end";
 }
 
@@ -158,6 +156,15 @@ void GPUBucket::LaunchAllReduce() {
   MS_LOG(INFO) << "end";
 }
 
+std::shared_ptr<LaunchKernel> GPUBucket::CreateLaunchKernel() {
+  if (tensor_type_list_.empty()) {
+    MS_LOG(ERROR) << "tensor_type_list_ is empty";
+  }
+  auto launch_mul = std::make_shared<GPULaunchMul>(stream_, tensor_type_list_[0], total_size_, ar_output_addr_);
+  MS_EXCEPTION_IF_NULL(launch_mul);
+  return launch_mul;
+}
+
 void GPUBucket::Init() {
   pre_event_ = std::make_shared<GpuEvent>();
   post_event_ = std::make_shared<GpuEvent>();
@@ -17,6 +17,7 @@
 #ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_BUCKET_H_
 #define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_BUCKET_H_
 
+#include <memory>
 #include "runtime/device/bucket.h"
 
 namespace mindspore::device::gpu {
@@ -33,7 +34,7 @@ class GPUBucket : public Bucket {
   void FreeDeviceMem(void *dev_ptr) override;
   void CopyTensorToContiguousMemory() override;
   void LaunchAllReduce() override;
-
+  std::shared_ptr<LaunchKernel> CreateLaunchKernel() override;
   const void *collective_handle_;
 };
 }  // namespace mindspore::device::gpu
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "runtime/device/gpu/gpu_launch_kernel.h"
+
+#include <vector>
+#include <memory>
+#include "runtime/device/gpu/gpu_memory_allocator.h"
+#include "runtime/device/gpu/gpu_device_manager.h"
+#include "runtime/device/gpu/kernel_info_setter.h"
+#include "runtime/device/gpu/gpu_kernel_build.h"
+#include "abstract/utils.h"
+
+namespace {
+constexpr size_t kCommunicationMemAlignSize = 16;
+size_t AlignMemorySize(size_t size) {
+  if (size == 0) {
+    return kCommunicationMemAlignSize;
+  }
+  return ((size + kCommunicationMemAlignSize - 1) / kCommunicationMemAlignSize) * kCommunicationMemAlignSize;
+}
+}  // namespace
+
+namespace mindspore::device::gpu {
+void GPULaunchkernel::FreeDeviceMem(void *addr) { GPUMemoryAllocator::GetInstance().FreeTensorMem(addr); }
+
+size_t GPULaunchkernel::AlignSizeForLaunchKernel(size_t size) { return AlignMemorySize(size); }
+
+uint8_t *GPULaunchkernel::AllocDeviceMem(size_t size) {
+  auto device_memory = GPUMemoryAllocator::GetInstance().AllocTensorMem(size);
+  MS_EXCEPTION_IF_NULL(device_memory);
+  return static_cast<uint8_t *>(device_memory);
+}
+
+void GPULaunchkernel::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  auto node_list = kernel_graph->execution_order();
+  for (size_t i = 0; i < node_list.size(); ++i) {
+    device::gpu::SetKernelInfo(node_list[i]);
+  }
+}
+
+void GPULaunchkernel::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  device::gpu::GpuBuild(kernel_graph);
+}
+}  // namespace mindspore::device::gpu
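For illustration (not part of this commit): AlignMemorySize in the anonymous namespace above rounds sizes up to the 16-byte communication alignment and maps 0 to 16. A standalone copy with a few checked values, assuming the constant stays at 16:

    #include <cstddef>

    constexpr std::size_t kAlign = 16;  // mirrors kCommunicationMemAlignSize above
    constexpr std::size_t AlignRef(std::size_t size) {
      return size == 0 ? kAlign : ((size + kAlign - 1) / kAlign) * kAlign;
    }
    static_assert(AlignRef(0) == 16 && AlignRef(1) == 16 && AlignRef(16) == 16 && AlignRef(100) == 112,
                  "sizes are rounded up to multiples of 16");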
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_LAUNCH_KERNEL_H_
+#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_LAUNCH_KERNEL_H_
+
+#include <vector>
+#include <memory>
+#include "runtime/device/launch_kernel.h"
+
+namespace mindspore::device::gpu {
+class GPULaunchkernel : public LaunchKernel {
+ public:
+  explicit GPULaunchkernel(void *stream) : LaunchKernel(stream) {}
+  virtual ~GPULaunchkernel() = default;
+
+  void FreeDeviceMem(void *addr) override;
+  size_t AlignSizeForLaunchKernel(size_t size) override;
+  uint8_t *AllocDeviceMem(size_t size) override;
+  void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+  void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+
+  void LaunchOpKernel() override = 0;
+  void FreeLaunchDeviceMem() override = 0;
+};
+}  // namespace mindspore::device::gpu
+
+#endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_LAUNCH_KERNEL_H_
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "runtime/device/gpu/gpu_launch_mul.h"
+
+#include <vector>
+#include <memory>
+#include "abstract/utils.h"
+#include "runtime/device/gpu/gpu_memory_allocator.h"
+#include "runtime/device/gpu/gpu_device_manager.h"
+#include "backend/session/single_kernel_graph.h"
+#include "frontend/parallel/context.h"
+
+namespace mindspore::device::gpu {
+void GPULaunchMul::FreeDeviceMem(void *addr) { GPULaunchkernel::FreeDeviceMem(addr); }
+
+size_t GPULaunchMul::AlignSizeForLaunchKernel(size_t size) { return GPULaunchkernel::AlignSizeForLaunchKernel(size); }
+
+uint8_t *GPULaunchMul::AllocDeviceMem(size_t size) { return GPULaunchkernel::AllocDeviceMem(size); }
+
+void GPULaunchMul::KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  GPULaunchkernel::KernelSelect(kernel_graph);
+}
+
+void GPULaunchMul::KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) {
+  GPULaunchkernel::KernelBuild(kernel_graph);
+}
+
+void GPULaunchMul::LaunchOpKernel() {
+  kernel_mod_ = ObtainLaunchMulKernelMod();
+  MS_EXCEPTION_IF_NULL(kernel_mod_);
+  // construct mul inputs addr
+  ObtainMulInputsAddr();
+  // launch mul
+  LaunchSingleKernel(inputs_addr_);
+}
+
+void GPULaunchMul::FreeLaunchDeviceMem() {
+  FreeInputDeviceMemory();
+  FreeOutputAndWorkspaceDeviceMem();
+}
+
+void GPULaunchMul::CopyHostMemToDevice(size_t origin_size, size_t) {
+  if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(input2_addr_, &input2_value_, origin_size, stream_)) {
+    MS_LOG(EXCEPTION) << "Copy memory failed";
+  }
+}
+}  // namespace mindspore::device::gpu
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_LAUNCH_MUL_H_
+#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_LAUNCH_MUL_H_
+
+#include <vector>
+#include <memory>
+#include "runtime/device/launch_mul.h"
+#include "runtime/device/gpu/gpu_launch_kernel.h"
+
+namespace mindspore::device::gpu {
+class GPULaunchMul : public GPULaunchkernel, public LaunchMul {
+ public:
+  GPULaunchMul(void *stream, TypeId dtype, size_t total_size, uint8_t *input1_addr)
+      : GPULaunchkernel(stream), LaunchMul(dtype, total_size, input1_addr) {}
+  ~GPULaunchMul() override = default;
+
+  void FreeDeviceMem(void *addr) override;
+  size_t AlignSizeForLaunchKernel(size_t size) override;
+  uint8_t *AllocDeviceMem(size_t size) override;
+  void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+  void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) override;
+
+  void LaunchOpKernel() override;
+  void FreeLaunchDeviceMem() override;
+  void CopyHostMemToDevice(size_t origin_size, size_t) override;
+};
+}  // namespace mindspore::device::gpu
+
+#endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_LAUNCH_MUL_H_
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "runtime/device/launch_kernel.h"
+
+#include <vector>
+#include <memory>
+
+namespace mindspore::device {
+std::vector<kernel::AddressPtr> LaunchKernel::ObtainKernelAddress(const std::vector<size_t> &list,
+                                                                  std::vector<uint8_t *> *addr) {
+  std::vector<kernel::AddressPtr> kernel_address;
+  for (size_t i = 0; i < list.size(); ++i) {
+    auto size = AlignSizeForLaunchKernel(list[i]);
+    (*addr)[i] = AllocDeviceMem(size);
+    auto address = std::make_shared<kernel::Address>();
+    MS_EXCEPTION_IF_NULL(address);
+    address->addr = (*addr)[i];
+    MS_EXCEPTION_IF_NULL(address->addr);
+    address->size = size;
+    kernel_address.push_back(address);
+  }
+  return kernel_address;
+}
+
+std::vector<kernel::AddressPtr> LaunchKernel::ObtainKernelInputs(const std::vector<size_t> &inputs_list,
+                                                                 const std::vector<uint8_t *> &inputs_addr) {
+  std::vector<kernel::AddressPtr> kernel_inputs;
+  if (inputs_list.size() != inputs_addr.size()) {
+    MS_LOG(ERROR) << "inputs_list size should be equal to inputs_addr size";
+  }
+  for (size_t i = 0; i < inputs_list.size(); ++i) {
+    auto input_size = AlignSizeForLaunchKernel(inputs_list[i]);
+    auto input = std::make_shared<kernel::Address>();
+    MS_EXCEPTION_IF_NULL(input);
+    input->addr = inputs_addr[i];
+    MS_EXCEPTION_IF_NULL(input->addr);
+    input->size = input_size;
+    kernel_inputs.push_back(input);
+  }
+  return kernel_inputs;
+}
+
+std::vector<kernel::AddressPtr> LaunchKernel::ObtainKernelOutputs(const std::vector<size_t> &outputs_list) {
+  // init outputs_addr_
+  outputs_addr_ = std::vector<uint8_t *>(outputs_list.size(), nullptr);
+  auto kernel_outputs = ObtainKernelAddress(outputs_list, &outputs_addr_);
+  return kernel_outputs;
+}
+
+std::vector<kernel::AddressPtr> LaunchKernel::ObtainKernelWorkspaces(const std::vector<size_t> &workspaces_list) {
+  std::vector<kernel::AddressPtr> kernel_workspace;
+  if (workspaces_list.empty()) {
+    return kernel_workspace;
+  }
+  // init workspaces_addr_
+  workspaces_addr_ = std::vector<uint8_t *>(workspaces_list.size(), nullptr);
+  kernel_workspace = ObtainKernelAddress(workspaces_list, &workspaces_addr_);
+  return kernel_workspace;
+}
+
+void LaunchKernel::LaunchSingleKernel(const std::vector<uint8_t *> &inputs_addr) {
+  MS_EXCEPTION_IF_NULL(kernel_mod_);
+  // obtain kernel inputs
+  auto kernel_inputs = ObtainKernelInputs(kernel_mod_->GetInputSizeList(), inputs_addr);
+  // obtain kernel outputs
+  auto kernel_outputs = ObtainKernelOutputs(kernel_mod_->GetOutputSizeList());
+  // obtain kernel workspace
+  auto kernel_workspaces = ObtainKernelWorkspaces(kernel_mod_->GetWorkspaceSizeList());
+  // launch
+  auto ret_status = kernel_mod_->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_);
+  if (!ret_status) {
+    MS_LOG(ERROR) << "Launch mul kernel failed.";
+  }
+}
+
+void LaunchKernel::FreeOutputAndWorkspaceDeviceMem() {
+  // free outputs_addr_ and workspaces_addr_
+  for (size_t i = 0; i < outputs_addr_.size(); ++i) {
+    if (outputs_addr_[i] != nullptr) {
+      FreeDeviceMem(outputs_addr_[i]);
+      outputs_addr_[i] = nullptr;
+    }
+  }
+  for (size_t i = 0; i < workspaces_addr_.size(); ++i) {
+    if (workspaces_addr_[i] != nullptr) {
+      FreeDeviceMem(workspaces_addr_[i]);
+      workspaces_addr_[i] = nullptr;
+    }
+  }
+  outputs_addr_.clear();
+  workspaces_addr_.clear();
+}
+}  // namespace mindspore::device
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_LAUNCH_KERNEL_H_
+#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_LAUNCH_KERNEL_H_
+
+#include <vector>
+#include <memory>
+#include "backend/session/kernel_graph.h"
+
+namespace mindspore::device {
+class LaunchKernel {
+ public:
+  explicit LaunchKernel(void *stream) : stream_(stream), kernel_mod_(nullptr) {}
+  virtual ~LaunchKernel() = default;
+
+  std::vector<uint8_t *> GetKernelOutputAddr() { return outputs_addr_; }
+  void LaunchSingleKernel(const std::vector<uint8_t *> &inputs_addr);
+  void FreeOutputAndWorkspaceDeviceMem();
+
+  virtual void FreeDeviceMem(void *addr) = 0;
+  virtual size_t AlignSizeForLaunchKernel(size_t size) = 0;
+  virtual uint8_t *AllocDeviceMem(size_t size) = 0;
+  virtual void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
+  virtual void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
+
+  virtual void LaunchOpKernel() = 0;
+  virtual void FreeLaunchDeviceMem() = 0;
+
+ protected:
+  void *stream_;
+  kernel::KernelMod *kernel_mod_;
+  std::vector<uint8_t *> outputs_addr_;
+  std::vector<uint8_t *> workspaces_addr_;
+
+ private:
+  std::vector<kernel::AddressPtr> ObtainKernelAddress(const std::vector<size_t> &list, std::vector<uint8_t *> *addr);
+  std::vector<kernel::AddressPtr> ObtainKernelInputs(const std::vector<size_t> &inputs_list,
+                                                     const std::vector<uint8_t *> &inputs_addr);
+  std::vector<kernel::AddressPtr> ObtainKernelOutputs(const std::vector<size_t> &outputs_list);
+  std::vector<kernel::AddressPtr> ObtainKernelWorkspaces(const std::vector<size_t> &workspaces_list);
+};
+}  // namespace mindspore::device
+
+#endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_LAUNCH_KERNEL_H_
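For illustration (not part of this commit): LaunchKernel is the device-agnostic half of the new launch machinery. It owns the output/workspace buffers and the single-kernel launch, while subclasses supply allocation, alignment, kernel select/build, and the concrete op. A skeletal subclass is sketched below; every My* name is hypothetical, but AscendLaunchKernel and GPULaunchkernel in this commit follow the same shape with real allocators and builders.

    #include "runtime/device/launch_kernel.h"

    // Hypothetical skeleton, for orientation only.
    class MyLaunchKernel : public mindspore::device::LaunchKernel {
     public:
      explicit MyLaunchKernel(void *stream) : LaunchKernel(stream) {}
      void FreeDeviceMem(void *addr) override { /* device-specific free */ }
      size_t AlignSizeForLaunchKernel(size_t size) override { return size; /* device alignment rule */ }
      uint8_t *AllocDeviceMem(size_t size) override { return nullptr; /* device-specific alloc */ }
      void KernelSelect(std::shared_ptr<mindspore::session::KernelGraph> graph) override {}
      void KernelBuild(std::shared_ptr<mindspore::session::KernelGraph> graph) override {}
      void LaunchOpKernel() override { /* set kernel_mod_, then call LaunchSingleKernel(...) */ }
      void FreeLaunchDeviceMem() override { FreeOutputAndWorkspaceDeviceMem(); }
    };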
@@ -0,0 +1,83 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "runtime/device/launch_mul.h"
+
+#include <vector>
+#include <memory>
+#include "abstract/utils.h"
+#include "backend/session/single_kernel_graph.h"
+#include "frontend/parallel/context.h"
+
+namespace mindspore::device {
+std::shared_ptr<session::KernelGraph> LaunchMul::ObtainMulKernelGraph() {
+  std::vector<TypeId> input_dtypes = {dtype_, dtype_};
+  std::vector<TypeId> output_dtypes = {dtype_};
+  // obtain input & output shapes
+  size_t dtype_size = abstract::TypeIdSize(dtype_);
+  int64_t shape = total_size_ / dtype_size;
+  std::vector<std::vector<int64_t>> input_shapes = {{shape}, {1}};
+  std::vector<std::vector<size_t>> output_shapes = {{static_cast<size_t>(shape)}};
+  auto mul_graph = session::SingleKernelGraph::ConstructKernelGraphBasedOnSingleOp(
+    kMulOpName, input_dtypes, input_shapes, output_dtypes, output_shapes);
+  MS_EXCEPTION_IF_NULL(mul_graph);
+  return mul_graph;
+}
+
+kernel::KernelMod *LaunchMul::ObtainLaunchMulKernelMod() {
+  if (mul_graph_ == nullptr) {
+    // construct mul kernel graph
+    mul_graph_ = ObtainMulKernelGraph();
+    MS_EXCEPTION_IF_NULL(mul_graph_);
+    // kernel select
+    KernelSelect(mul_graph_);
+    // kernel build
+    KernelBuild(mul_graph_);
+  }
+  // obtain kernel_mod
+  if (mul_graph_->execution_order().size() != 1) {
+    MS_LOG(ERROR) << "the execution order of the mul graph should have only one node";
+  }
+  return AnfAlgo::GetKernelMod(mul_graph_->execution_order()[0]);
+}
+
+void LaunchMul::ObtainMulInputsAddr() {
+  inputs_addr_.push_back(input1_addr_);
+
+  auto parallel_context = parallel::ParallelContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(parallel_context);
+  auto device_num = parallel_context->device_num();
+  if (device_num == 0) {
+    MS_LOG(ERROR) << "device num can't be zero";
+  }
+  input2_value_ = 1.0 / device_num;
+  auto size = abstract::TypeIdSize(dtype_);
+  auto input_size = AlignSizeForLaunchKernel(size * 1);
+  // alloc memory
+  input2_addr_ = AllocDeviceMem(input_size);
+  CopyHostMemToDevice(size, input_size);
+  inputs_addr_.push_back(input2_addr_);
+}
+
+void LaunchMul::FreeInputDeviceMemory() {
+  input1_addr_ = nullptr;
+  if (input2_addr_ != nullptr) {
+    FreeDeviceMem(input2_addr_);
+    input2_addr_ = nullptr;
+  }
+  inputs_addr_.clear();
+}
+}  // namespace mindspore::device
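For illustration (not part of this commit): ObtainMulKernelGraph above flattens the whole fused bucket into a single 1-D tensor of total_size_ / dtype_size elements and pairs it with a {1}-shaped second input that ObtainMulInputsAddr fills with 1/device_num. A concrete instance, with the bucket size and device count assumed for the sake of the arithmetic:

    #include <cstddef>

    // Assumed example: a 4 MiB float32 bucket reduced across 8 devices.
    constexpr std::size_t kTotalSizeBytes = 4 * 1024 * 1024;  // plays the role of total_size_
    constexpr std::size_t kDtypeSize = 4;                     // float32 element size
    constexpr std::size_t kElements = kTotalSizeBytes / kDtypeSize;
    static_assert(kElements == 1048576, "input_shapes = {{1048576}, {1}}, output_shapes = {{1048576}}");
    constexpr float kInput2Value = 1.0f / 8;                  // input2_value_ = 1 / device_num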
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_LAUNCH_MUL_H_
+#define MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_LAUNCH_MUL_H_
+
+#include <vector>
+#include <memory>
+#include "backend/session/kernel_graph.h"
+
+namespace mindspore::device {
+class LaunchMul {
+ public:
+  LaunchMul(TypeId dtype, size_t total_size, uint8_t *input1_addr)
+      : dtype_(dtype),
+        total_size_(total_size),
+        input1_addr_(input1_addr),
+        input2_addr_(nullptr),
+        input2_value_(0),
+        mul_graph_(nullptr) {}
+  virtual ~LaunchMul() = default;
+
+  virtual void FreeDeviceMem(void *addr) = 0;
+  virtual size_t AlignSizeForLaunchKernel(size_t size) = 0;
+  virtual uint8_t *AllocDeviceMem(size_t size) = 0;
+  virtual void KernelSelect(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
+  virtual void KernelBuild(std::shared_ptr<session::KernelGraph> kernel_graph) = 0;
+  virtual void CopyHostMemToDevice(size_t origin_size, size_t dst_size) = 0;
+
+  std::shared_ptr<session::KernelGraph> ObtainMulKernelGraph();
+  kernel::KernelMod *ObtainLaunchMulKernelMod();
+  void ObtainMulInputsAddr();
+  void FreeInputDeviceMemory();
+
+ protected:
+  TypeId dtype_;
+  size_t total_size_;
+  uint8_t *input1_addr_;
+  uint8_t *input2_addr_;
+  float input2_value_;
+  std::shared_ptr<session::KernelGraph> mul_graph_;
+  std::vector<uint8_t *> inputs_addr_;
+};
+}  // namespace mindspore::device
+
+#endif  // MINDSPORE_MINDSPORE_CCSRC_RUNTIME_DEVICE_LAUNCH_MUL_H_
@@ -373,7 +373,7 @@ class DistributedGradReducer(Cell):
         datatypes = self.map_(F.partial(_get_datatype), grads)
         grads = self.map_(F.partial(_cast_datatype, mstype.float32), grads)
         if self.mode == context.PYNATIVE_MODE:
-            new_grad = self.map_(F.partial(reduce_opt, self.degree, self.mean), self.allreduce_filter, grads)
+            new_grad = grads
         elif self.split_fusion:
             if self.enable_parameter_server:
                 new_grad = self.map_(F.partial(reduce_opt, self.degree, self.mean, self.allgather),
@@ -98,8 +98,11 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         "../../../mindspore/ccsrc/runtime/device/kernel_runtime_manager.cc"
         "../../../mindspore/ccsrc/runtime/device/kernel_info.cc"
         "../../../mindspore/ccsrc/runtime/device/bucket.cc"
+        "../../../mindspore/ccsrc/runtime/device/launch_kernel.cc"
         "../../../mindspore/ccsrc/runtime/device/ascend/profiling/*.cc"
         "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_ascend.cc"
+        "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_kernel.cc"
+        "../../../mindspore/ccsrc/runtime/device/ascend/ascend_launch_mul.cc"
        "../../../mindspore/ccsrc/runtime/device/ascend/kernel_select_graph_kernel.cc"
         "../../../mindspore/ccsrc/runtime/device/convert_tensor_utils.cc"
         "../../../mindspore/ccsrc/runtime/device/ascend/ascend_bucket.cc"