forked from mindspore-Ecosystem/mindspore
synchronize latest Ascend software suite 18 Jul 2020, and merging branches
This commit is contained in:
commit
859acc6d2a
|
@ -15,4 +15,4 @@
|
|||
url = https://gitee.com/mindspore/akg.git
|
||||
[submodule "graphengine"]
|
||||
path = graphengine
|
||||
url = https://gitee.com/ms-incubator/graphengine.git
|
||||
url = https://gitee.com/mindspore/graphengine.git
|
||||
|
|
|
@ -202,10 +202,10 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm
|
|||
|
||||
### Communication
|
||||
|
||||
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/enQtOTcwMTIxMDI3NjM0LTNkMWM2MzI5NjIyZWU5ZWQ5M2EwMTQ5MWNiYzMxOGM4OWFhZjI4M2E5OGI2YTg3ODU1ODE2Njg1MThiNWI3YmQ) - Communication platform for developers.
|
||||
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/zt-dgk65rli-3ex4xvS4wHX7UDmsQmfu8w) - Communication platform for developers.
|
||||
- IRC channel at `#mindspore` (only for meeting minutes logging purpose)
|
||||
- Video Conferencing: https://meet.jit.si
|
||||
- Mailing-list: https://mailweb.mindspore.cn/postorius/lists
|
||||
- Video Conferencing: TBD
|
||||
- Mailing-list: <https://mailweb.mindspore.cn/postorius/lists>
|
||||
|
||||
## Contributing
|
||||
|
||||
|
|
2
akg
2
akg
|
@ -1 +1 @@
|
|||
Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
|
||||
Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625
|
15
build.sh
15
build.sh
|
@ -24,7 +24,7 @@ usage()
|
|||
{
|
||||
echo "Usage:"
|
||||
echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
|
||||
echo " [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
|
||||
echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
|
||||
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
|
@ -48,7 +48,6 @@ usage()
|
|||
echo " -P Enable dump anf graph to file in ProtoBuffer format, default on"
|
||||
echo " -Q Enable dump memory, default off"
|
||||
echo " -D Enable dumping of function graph ir, default on"
|
||||
echo " -S Enable async data dump, default off"
|
||||
echo " -z Compile dataset & mindrecord, default on"
|
||||
echo " -M Enable MPI and NCCL for GPU training, gpu default on"
|
||||
echo " -V Specify the minimum required cuda version, default CUDA 10.1"
|
||||
|
@ -89,7 +88,6 @@ checkopts()
|
|||
ENABLE_TIMELINE="off"
|
||||
ENABLE_DUMP2PROTO="on"
|
||||
ENABLE_DUMPE2E="off"
|
||||
ENABLE_DATA_DUMP="off"
|
||||
ENABLE_DUMP_IR="on"
|
||||
COMPILE_MINDDATA="on"
|
||||
ENABLE_MPI="off"
|
||||
|
@ -104,7 +102,7 @@ checkopts()
|
|||
ENABLE_PYTHON="on"
|
||||
|
||||
# Process the options
|
||||
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt
|
||||
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt
|
||||
do
|
||||
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
|
||||
case "${opt}" in
|
||||
|
@ -186,6 +184,7 @@ checkopts()
|
|||
elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == "Xascend" ]]; then
|
||||
ENABLE_D="on"
|
||||
ENABLE_CPU="on"
|
||||
ENABLE_SERVING="on"
|
||||
elif [[ "X$OPTARG" == "Xcpu" ]]; then
|
||||
ENABLE_CPU="on"
|
||||
else
|
||||
|
@ -220,11 +219,6 @@ checkopts()
|
|||
ENABLE_DUMPE2E="$OPTARG"
|
||||
echo "enable dump end to end"
|
||||
;;
|
||||
S)
|
||||
check_on_off $OPTARG S
|
||||
ENABLE_DATA_DUMP="$OPTARG"
|
||||
echo "enable data dump"
|
||||
;;
|
||||
D)
|
||||
check_on_off $OPTARG D
|
||||
ENABLE_DUMP_IR="$OPTARG"
|
||||
|
@ -328,9 +322,6 @@ build_mindspore()
|
|||
if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
|
||||
fi
|
||||
if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON"
|
||||
fi
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
|
||||
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
|
||||
if [[ "X$ENABLE_MPI" = "Xon" ]]; then
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS}")
|
||||
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
|
||||
set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
|
||||
mindspore_add_pkg(glog
|
||||
VER 0.4.0
|
||||
|
|
|
@ -116,10 +116,10 @@ if(ENABLE_DUMP_E2E)
|
|||
add_compile_definitions(ENABLE_DUMP_E2E)
|
||||
endif()
|
||||
|
||||
if(ENABLE_DATA_DUMP)
|
||||
add_compile_definitions(ENABLE_DATA_DUMP)
|
||||
endif()
|
||||
|
||||
if(ENABLE_DEBUGGER)
|
||||
add_compile_definitions(ENABLE_DEBUGGER)
|
||||
endif()
|
||||
|
||||
if(ENABLE_TESTCASES)
|
||||
add_compile_definitions(ENABLE_TESTCASES)
|
||||
endif()
|
|
@ -1,13 +1,16 @@
|
|||
# find exec
|
||||
find_package(Python3 3.7 COMPONENTS Interpreter Development)
|
||||
if (NOT Python3_FOUND)
|
||||
message("No python3 found.")
|
||||
return ()
|
||||
message(FATAL_ERROR "No python3 found.")
|
||||
endif ()
|
||||
|
||||
set(PYTHON ${Python3_EXECUTABLE})
|
||||
set(PYTHON_VERSION ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR})
|
||||
|
||||
if (NOT PYTHON_VERSION MATCHES "3.7")
|
||||
message(FATAL_ERROR "FIND PYTHON VERSION ${PYTHON_VERSION} BUT CAN NOT MATCH PYTHON VERSION 3.7")
|
||||
endif ()
|
||||
|
||||
find_package(Git)
|
||||
if (NOT GIT_FOUND)
|
||||
message("No git found.")
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit eee707935c066c16e9b9cd207f8125871b6b97cf
|
||||
Subproject commit 103f2d1019dc50d781d7a964551d9f1f50b3b009
|
|
@ -17,7 +17,7 @@
|
|||
"""Resources for ast tree parse."""
|
||||
import ast
|
||||
import math
|
||||
from mindspore import IndexedSlices
|
||||
from mindspore import IndexedSlices, SparseTensor
|
||||
from mindspore.ops.composite import multitype_ops
|
||||
from mindspore.ops import functional as F, composite as C
|
||||
from . import standard_method as M
|
||||
|
@ -140,4 +140,5 @@ convert_object_map = {
|
|||
|
||||
# user defined
|
||||
IndexedSlices: F.make_indexed_slices,
|
||||
SparseTensor: F.make_sparse_tensor,
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ if(ENABLE_GPU)
|
|||
"backend/kernel_compiler/akg/akg_kernel_attrs_process.cc"
|
||||
)
|
||||
|
||||
list(APPEND CUDA_NVCC_FLAGS -arch=sm_53)
|
||||
list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
|
||||
list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/blocking_queue.cc" "runtime/device/gpu/gpu_buffer_mgr.cc")
|
||||
list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/mpi/mpi_initializer.cc"
|
||||
"runtime/device/gpu/distribution/collective_wrapper.cc"
|
||||
|
|
|
@ -26,14 +26,6 @@ if (ENABLE_CPU)
|
|||
"cpu/*.cc"
|
||||
)
|
||||
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc"
|
||||
"cpu/ps/pull_kernel.cc"
|
||||
"cpu/ps/embedding_look_up_ps_kernel.cc"
|
||||
"cpu/ps/embedding_look_up_proxy_kernel.cc"
|
||||
"cpu/ps/apply_momentum_ps_kernel.cc"
|
||||
"cpu/ps/sparse_apply_adam_ps_kernel.cc"
|
||||
"cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
|
||||
|
||||
if (NOT ENABLE_MPI)
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc")
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc")
|
||||
|
@ -41,6 +33,17 @@ if (ENABLE_CPU)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ENABLE_GE)
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/apply_momentum_ps_kernel.cc")
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_proxy_kernel.cc")
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_ps_kernel.cc")
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pserver_kernel.cc")
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pull_kernel.cc")
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc")
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_adam_ps_kernel.cc")
|
||||
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
|
||||
endif()
|
||||
|
||||
if (ENABLE_GPU)
|
||||
file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
"gpu/*.cu"
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <algorithm>
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "backend/optimizer/common/helper.h"
|
||||
#include "backend/kernel_compiler/common_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -75,15 +76,7 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) {
|
|||
|
||||
std::string dst_type;
|
||||
TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0);
|
||||
if (output_type == kFloat32->type_id()) {
|
||||
dst_type = "float32";
|
||||
} else if (output_type == kFloat16->type_id()) {
|
||||
dst_type = "float16";
|
||||
} else if (output_type == kInt32->type_id()) {
|
||||
dst_type = "int32";
|
||||
} else {
|
||||
MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString();
|
||||
}
|
||||
dst_type = TypeId2String(output_type);
|
||||
AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node);
|
||||
}
|
||||
|
||||
|
|
|
@ -21,9 +21,7 @@
|
|||
#include <memory>
|
||||
#include "framework/ge_runtime/task_info.h"
|
||||
#include "backend/kernel_compiler/kernel.h"
|
||||
#ifdef ENABLE_DATA_DUMP
|
||||
#include "debug/data_dump_parser.h"
|
||||
#endif
|
||||
|
||||
using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;
|
||||
namespace mindspore {
|
||||
|
@ -34,13 +32,7 @@ class AscendKernelMod : public KernelMod {
|
|||
const std::vector<AddressPtr> &, uint32_t) = 0;
|
||||
uint32_t block_dim() { return block_dim_; }
|
||||
uint32_t stream_id() { return stream_id_; }
|
||||
virtual bool NeedDump() {
|
||||
#ifdef ENABLE_DATA_DUMP
|
||||
return DataDumpParser::GetInstance().NeedDump(kernel_name_);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
virtual bool NeedDump() { return DataDumpParser::GetInstance().NeedDump(kernel_name_); }
|
||||
|
||||
protected:
|
||||
uint32_t block_dim_{1};
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include <iostream>
|
||||
#include <utility>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include <thread>
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
|
@ -499,235 +500,329 @@ int Sign(float x) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
|
||||
size_t outer_dim) {
|
||||
MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
|
||||
MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
|
||||
MS_EXCEPTION_IF_NULL(unique_grad);
|
||||
MS_EXCEPTION_IF_NULL(unique_grad->value_);
|
||||
MS_EXCEPTION_IF_NULL(unique_grad->indices_);
|
||||
namespace {
|
||||
struct BucketSparseGradient {
|
||||
float *value_;
|
||||
int *indices_;
|
||||
int *global_indices_;
|
||||
size_t indices_size_;
|
||||
};
|
||||
|
||||
struct MultiThreadReduceSparseGradientParam {
|
||||
SparseGradient *input_grad_{nullptr};
|
||||
SparseGradient *workspace_grad_{nullptr};
|
||||
SparseGradient *output_grad_{nullptr};
|
||||
size_t max_index_{0};
|
||||
size_t value_stride_{0};
|
||||
size_t thread_num_{0};
|
||||
bool use_sort_reduce_{false};
|
||||
};
|
||||
|
||||
void CalculateEachBucketSize(const std::shared_ptr<SparseGradient> &sparse_grad, size_t max_index,
|
||||
std::vector<size_t> *each_bucket_size) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(sparse_grad);
|
||||
MS_EXCEPTION_IF_NULL(sparse_grad->indices_);
|
||||
MS_EXCEPTION_IF_NULL(each_bucket_size);
|
||||
size_t bucket_num = each_bucket_size->size();
|
||||
for (size_t i = 0; i < sparse_grad->indices_size_; ++i) {
|
||||
int index = sparse_grad->indices_[i];
|
||||
if (index >= 0 && IntToSize(index) < max_index) {
|
||||
auto bucket_id = index % bucket_num;
|
||||
each_bucket_size->at(bucket_id)++;
|
||||
}
|
||||
}
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
void SplitAndCalculateSegmentBucketSize(const MultiThreadReduceSparseGradientParam ¶m,
|
||||
std::vector<std::shared_ptr<SparseGradient>> *segments_ptr,
|
||||
std::vector<std::shared_ptr<std::vector<size_t>>> *segment_bucket_sizes_ptr) {
|
||||
MS_EXCEPTION_IF_NULL(param.input_grad_);
|
||||
MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
|
||||
MS_EXCEPTION_IF_NULL(segments_ptr);
|
||||
auto &segments = *segments_ptr;
|
||||
auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
|
||||
auto input_grad = param.input_grad_;
|
||||
if (param.thread_num_ < 1) {
|
||||
MS_EXCEPTION(ArgumentError) << "Input param thread num must > 0!";
|
||||
}
|
||||
size_t thread_indices_size = input_grad->indices_size_ / param.thread_num_;
|
||||
size_t left_indices_size = input_grad->indices_size_ % param.thread_num_;
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(param.thread_num_);
|
||||
segments.reserve(param.thread_num_);
|
||||
|
||||
size_t current_indices_offset = 0;
|
||||
for (size_t i = 0; i < param.thread_num_; ++i) {
|
||||
segment_bucket_sizes.emplace_back(std::make_shared<std::vector<size_t>>(param.thread_num_, 0));
|
||||
size_t indices_size = thread_indices_size;
|
||||
if (i < left_indices_size) {
|
||||
indices_size += 1;
|
||||
}
|
||||
segments.emplace_back(std::make_shared<SparseGradient>());
|
||||
segments[i]->value_ = input_grad->value_ + current_indices_offset * param.value_stride_;
|
||||
segments[i]->indices_ = input_grad->indices_ + current_indices_offset;
|
||||
segments[i]->indices_size_ = indices_size;
|
||||
threads.emplace_back(
|
||||
std::thread(CalculateEachBucketSize, segments[i], param.max_index_, segment_bucket_sizes[i].get()));
|
||||
current_indices_offset += indices_size;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < param.thread_num_; ++i) {
|
||||
threads[i].join();
|
||||
}
|
||||
}
|
||||
|
||||
void CopySegmentIndicesToBucket(const MultiThreadReduceSparseGradientParam ¶m,
|
||||
const std::shared_ptr<SparseGradient> &segment, size_t bucket_offset,
|
||||
const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(segment);
|
||||
MS_EXCEPTION_IF_NULL(segment->indices_);
|
||||
std::vector<size_t> bucket_data_num(param.thread_num_, 0);
|
||||
for (size_t i = 0; i < segment->indices_size_; ++i) {
|
||||
int index = segment->indices_[i];
|
||||
if (index >= 0 && IntToSize(index) < param.max_index_) {
|
||||
auto bucket_id = index % param.thread_num_;
|
||||
auto bucket_index = bucket_data_num[bucket_id];
|
||||
buckets[bucket_id]->indices_[bucket_index] = index;
|
||||
buckets[bucket_id]->global_indices_[bucket_index] = bucket_offset + i;
|
||||
bucket_data_num[bucket_id]++;
|
||||
}
|
||||
}
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
void GatherSegmentIndicesToOutputBucket(const MultiThreadReduceSparseGradientParam ¶m,
|
||||
const std::vector<std::shared_ptr<SparseGradient>> &segments,
|
||||
const std::vector<std::shared_ptr<std::vector<size_t>>> &segment_bucket_sizes,
|
||||
std::vector<std::shared_ptr<BucketSparseGradient>> *buckets_ptr) {
|
||||
MS_EXCEPTION_IF_NULL(param.output_grad_);
|
||||
MS_EXCEPTION_IF_NULL(param.output_grad_->value_);
|
||||
MS_EXCEPTION_IF_NULL(param.output_grad_->indices_);
|
||||
MS_EXCEPTION_IF_NULL(buckets_ptr);
|
||||
auto &buckets = *buckets_ptr;
|
||||
size_t thread_num = param.thread_num_;
|
||||
if (thread_num != segment_bucket_sizes.size()) {
|
||||
MS_EXCEPTION(ArgumentError) << "Input param thread num not equal to segment size!";
|
||||
}
|
||||
std::vector<size_t> bucket_data_size(thread_num, 0);
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
for (size_t j = 0; j < thread_num; ++j) {
|
||||
bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
|
||||
}
|
||||
}
|
||||
size_t current_indices_offset = 0;
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
buckets.emplace_back(std::make_shared<BucketSparseGradient>());
|
||||
buckets[i]->value_ = param.output_grad_->value_ + current_indices_offset * param.value_stride_;
|
||||
buckets[i]->indices_ = param.output_grad_->indices_ + current_indices_offset;
|
||||
buckets[i]->global_indices_ = param.workspace_grad_->indices_ + current_indices_offset;
|
||||
buckets[i]->indices_size_ = bucket_data_size[i];
|
||||
current_indices_offset += bucket_data_size[i];
|
||||
}
|
||||
std::vector<size_t> tmp_bucket_data_size(thread_num, 0);
|
||||
std::vector<std::vector<std::shared_ptr<BucketSparseGradient>>> each_thread_buckets;
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
std::vector<std::shared_ptr<BucketSparseGradient>> thread_buckets;
|
||||
for (size_t j = 0; j < thread_num; ++j) {
|
||||
thread_buckets.emplace_back(std::make_shared<BucketSparseGradient>());
|
||||
thread_buckets[j]->indices_ = buckets[j]->indices_ + tmp_bucket_data_size[j];
|
||||
thread_buckets[j]->global_indices_ = buckets[j]->global_indices_ + tmp_bucket_data_size[j];
|
||||
thread_buckets[j]->value_ = buckets[j]->value_ + tmp_bucket_data_size[j] * param.value_stride_;
|
||||
thread_buckets[j]->indices_size_ = segment_bucket_sizes[i]->at(j);
|
||||
tmp_bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
|
||||
}
|
||||
each_thread_buckets.emplace_back(thread_buckets);
|
||||
}
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(thread_num);
|
||||
current_indices_offset = 0;
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
threads.emplace_back(
|
||||
std::thread(CopySegmentIndicesToBucket, param, segments[i], current_indices_offset, each_thread_buckets[i]));
|
||||
current_indices_offset += segments[i]->indices_size_;
|
||||
}
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
threads[i].join();
|
||||
}
|
||||
}
|
||||
|
||||
void SortAndReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam ¶m,
|
||||
const std::shared_ptr<BucketSparseGradient> &bucket,
|
||||
const std::shared_ptr<SparseGradient> &reduced_bucket) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(bucket);
|
||||
MS_EXCEPTION_IF_NULL(bucket->value_);
|
||||
MS_EXCEPTION_IF_NULL(bucket->indices_);
|
||||
MS_EXCEPTION_IF_NULL(reduced_bucket);
|
||||
MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
|
||||
MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
|
||||
std::vector<std::pair<int, int>> sorted_indices;
|
||||
sorted_indices.reserve(bucket->indices_size_);
|
||||
for (size_t i = 0; i < bucket->indices_size_; ++i) {
|
||||
int index = bucket->indices_[i];
|
||||
int global_index = bucket->global_indices_[i];
|
||||
sorted_indices.emplace_back(std::pair<int, int>(index, global_index));
|
||||
}
|
||||
std::sort(sorted_indices.begin(), sorted_indices.end());
|
||||
|
||||
float *global_value = param.input_grad_->value_;
|
||||
size_t unique_indices_size = 0;
|
||||
size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
|
||||
int last_index{0};
|
||||
size_t value_offset{0};
|
||||
for (size_t i = 0; i < sorted_indices.size(); ++i) {
|
||||
int index = sorted_indices[i].first;
|
||||
int global_index = sorted_indices[i].second;
|
||||
int global_value_offset = global_index * param.value_stride_;
|
||||
if (i == 0 || index != last_index) {
|
||||
if (i != 0) {
|
||||
unique_indices_size++;
|
||||
}
|
||||
reduced_bucket->indices_[unique_indices_size] = index;
|
||||
value_offset = unique_indices_size * param.value_stride_;
|
||||
auto ret_code = memcpy_s(reduced_bucket->value_ + value_offset, (max_length - value_offset) * sizeof(float),
|
||||
global_value + global_value_offset, param.value_stride_ * sizeof(float));
|
||||
if (ret_code != EOK) {
|
||||
MS_LOG(EXCEPTION) << "Failed to copy data!";
|
||||
}
|
||||
} else {
|
||||
for (size_t j = 0; j < param.value_stride_; ++j) {
|
||||
reduced_bucket->value_[value_offset + j] += global_value[global_value_offset + j];
|
||||
}
|
||||
}
|
||||
last_index = index;
|
||||
}
|
||||
reduced_bucket->indices_size_ = unique_indices_size;
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
void ReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam ¶m,
|
||||
const std::shared_ptr<BucketSparseGradient> &bucket,
|
||||
const std::shared_ptr<SparseGradient> &reduced_bucket) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(bucket);
|
||||
MS_EXCEPTION_IF_NULL(bucket->value_);
|
||||
MS_EXCEPTION_IF_NULL(bucket->indices_);
|
||||
MS_EXCEPTION_IF_NULL(reduced_bucket);
|
||||
MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
|
||||
MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
|
||||
|
||||
float *global_value = param.input_grad_->value_;
|
||||
std::unordered_map<int, size_t> index_map;
|
||||
size_t unique_indices_size = 0;
|
||||
for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
|
||||
int index = origin_sparse_grad.indices_[i];
|
||||
if (index < 0 || IntToSize(index) >= first_dim) {
|
||||
continue;
|
||||
}
|
||||
size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
|
||||
for (size_t i = 0; i < bucket->indices_size_; ++i) {
|
||||
int index = bucket->indices_[i];
|
||||
int global_index = bucket->global_indices_[i];
|
||||
auto iter = index_map.find(index);
|
||||
if (iter == index_map.end()) {
|
||||
index_map[index] = unique_indices_size;
|
||||
unique_grad->indices_[unique_indices_size] = index;
|
||||
size_t start_index = unique_indices_size * outer_dim;
|
||||
size_t end_index = start_index + outer_dim;
|
||||
for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
|
||||
unique_grad->value_[j] = origin_sparse_grad.value_[k];
|
||||
reduced_bucket->indices_[unique_indices_size] = index;
|
||||
size_t start_index = unique_indices_size * param.value_stride_;
|
||||
index_map[index] = start_index;
|
||||
auto ret_code = memcpy_s(reduced_bucket->value_ + start_index, (max_length - start_index) * sizeof(float),
|
||||
global_value + global_index * param.value_stride_, param.value_stride_ * sizeof(float));
|
||||
if (ret_code != EOK) {
|
||||
MS_LOG(EXCEPTION) << "Failed to copy data!";
|
||||
}
|
||||
unique_indices_size++;
|
||||
} else {
|
||||
size_t first_index = iter->second;
|
||||
size_t start_index = first_index * outer_dim;
|
||||
size_t end_index = start_index + outer_dim;
|
||||
for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
|
||||
unique_grad->value_[j] += origin_sparse_grad.value_[k];
|
||||
size_t start_index = iter->second;
|
||||
size_t end_index = start_index + param.value_stride_;
|
||||
for (size_t j = start_index, k = global_index * param.value_stride_; j < end_index; ++j, ++k) {
|
||||
reduced_bucket->value_[j] += global_value[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
unique_grad->indices_size_ = unique_indices_size;
|
||||
}
|
||||
|
||||
struct WorkerParamsForReduceSparseGradient {
|
||||
size_t slice_start_{0};
|
||||
size_t slice_end_{0};
|
||||
size_t max_length_{0};
|
||||
size_t outer_dim_{0};
|
||||
std::vector<std::pair<int, size_t>> *sorted_indices_{nullptr};
|
||||
std::vector<size_t> *slice_positions_{nullptr};
|
||||
float *src_value_{nullptr};
|
||||
SparseGradient *unique_grad_{nullptr};
|
||||
};
|
||||
|
||||
void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) {
|
||||
MS_EXCEPTION_IF_NULL(param.sorted_indices_);
|
||||
MS_EXCEPTION_IF_NULL(param.slice_positions_);
|
||||
MS_EXCEPTION_IF_NULL(param.src_value_);
|
||||
MS_EXCEPTION_IF_NULL(param.unique_grad_);
|
||||
auto outer_dim = param.outer_dim_;
|
||||
auto &sorted_indices = *(param.sorted_indices_);
|
||||
auto &slice_positions = *(param.slice_positions_);
|
||||
auto unique_grad = param.unique_grad_;
|
||||
for (size_t slice_id = param.slice_start_; slice_id < param.slice_end_; ++slice_id) {
|
||||
size_t cur_pos = slice_positions[slice_id];
|
||||
int index = sorted_indices[cur_pos].first;
|
||||
unique_grad->indices_[slice_id] = index;
|
||||
size_t start_index = slice_id * outer_dim;
|
||||
auto ret_code = memcpy_s(unique_grad->value_ + start_index, (param.max_length_ - start_index) * sizeof(float),
|
||||
param.src_value_ + sorted_indices[cur_pos].second, outer_dim * sizeof(float));
|
||||
if (ret_code != EOK) {
|
||||
MS_LOG(EXCEPTION) << "Failed to copy data!";
|
||||
}
|
||||
cur_pos++;
|
||||
size_t end_pos;
|
||||
if (slice_id + 1 < slice_positions.size()) {
|
||||
end_pos = slice_positions[slice_id + 1];
|
||||
} else {
|
||||
end_pos = sorted_indices.size();
|
||||
}
|
||||
while (cur_pos < end_pos) {
|
||||
for (size_t i = 0; i < outer_dim; ++i) {
|
||||
unique_grad->value_[start_index + i] += param.src_value_[sorted_indices[cur_pos].second + i];
|
||||
}
|
||||
cur_pos++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
|
||||
size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
|
||||
std::vector<size_t> *slice_positions) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
size_t thread_num = 24;
|
||||
if (slice_positions->size() < thread_num) {
|
||||
thread_num = slice_positions->size();
|
||||
}
|
||||
size_t stride = (slice_positions->size() + thread_num - 1) / thread_num;
|
||||
thread_num = (slice_positions->size() + stride - 1) / stride;
|
||||
std::vector<std::thread> threads;
|
||||
size_t max_length = sorted_indices->size() * outer_dim;
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
size_t slice_start = i * stride;
|
||||
size_t slice_end = 0;
|
||||
if (i == thread_num - 1) {
|
||||
slice_end = slice_positions->size();
|
||||
} else {
|
||||
slice_end = slice_start + stride;
|
||||
}
|
||||
WorkerParamsForReduceSparseGradient params{
|
||||
slice_start, slice_end, max_length, outer_dim, sorted_indices, slice_positions, origin_sparse_grad.value_,
|
||||
unique_grad};
|
||||
threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params));
|
||||
}
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
threads[i].join();
|
||||
}
|
||||
reduced_bucket->indices_size_ = unique_indices_size;
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
|
||||
size_t outer_dim, bool use_multi_threads) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
|
||||
MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
|
||||
MS_EXCEPTION_IF_NULL(unique_grad);
|
||||
MS_EXCEPTION_IF_NULL(unique_grad->value_);
|
||||
MS_EXCEPTION_IF_NULL(unique_grad->indices_);
|
||||
std::vector<std::pair<int, size_t>> sorted_indices;
|
||||
sorted_indices.reserve(origin_sparse_grad.indices_size_);
|
||||
for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
|
||||
int index = origin_sparse_grad.indices_[i];
|
||||
if (index >= 0 && IntToSize(index) < first_dim) {
|
||||
sorted_indices.emplace_back(std::pair<int, size_t>(index, i * outer_dim));
|
||||
}
|
||||
}
|
||||
std::sort(
|
||||
sorted_indices.begin(), sorted_indices.end(),
|
||||
[](const std::pair<int, size_t> &left, const std::pair<int, size_t> &right) { return left.first < right.first; });
|
||||
int last_index = 0;
|
||||
std::vector<size_t> slice_positions;
|
||||
slice_positions.reserve(sorted_indices.size());
|
||||
for (size_t i = 0; i < sorted_indices.size(); ++i) {
|
||||
if (i == 0 || last_index != sorted_indices[i].first) {
|
||||
slice_positions.emplace_back(i);
|
||||
}
|
||||
last_index = sorted_indices[i].first;
|
||||
}
|
||||
if (use_multi_threads) {
|
||||
RunMultiThreadReduceSparseGradient(origin_sparse_grad, unique_grad, outer_dim, &sorted_indices, &slice_positions);
|
||||
} else {
|
||||
size_t max_length = sorted_indices.size() * outer_dim;
|
||||
WorkerParamsForReduceSparseGradient params{0,
|
||||
slice_positions.size(),
|
||||
max_length,
|
||||
outer_dim,
|
||||
&sorted_indices,
|
||||
&slice_positions,
|
||||
origin_sparse_grad.value_,
|
||||
unique_grad};
|
||||
WorkerForReduceSparseGradient(params);
|
||||
}
|
||||
unique_grad->indices_size_ = slice_positions.size();
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
|
||||
SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
|
||||
size_t outer_dim) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
if (unique_slice_grads.empty()) {
|
||||
return;
|
||||
}
|
||||
size_t index_data_size = outer_dim * sizeof(float);
|
||||
size_t unique_indices_size = 0;
|
||||
for (size_t i = 0; i < unique_slice_grads.size(); ++i) {
|
||||
auto &slice_grad = unique_slice_grads[i];
|
||||
auto ret_code = memcpy_s(tmp_grad->value_ + unique_indices_size * outer_dim,
|
||||
(tmp_grad->indices_size_ - unique_indices_size) * index_data_size, slice_grad->value_,
|
||||
slice_grad->indices_size_ * index_data_size);
|
||||
if (ret_code != EOK) {
|
||||
MS_LOG(EXCEPTION) << "Failed to copy data!";
|
||||
}
|
||||
ret_code =
|
||||
memcpy_s(tmp_grad->indices_ + unique_indices_size, (tmp_grad->indices_size_ - unique_indices_size) * sizeof(int),
|
||||
slice_grad->indices_, slice_grad->indices_size_ * sizeof(int));
|
||||
if (ret_code != EOK) {
|
||||
MS_LOG(EXCEPTION) << "Failed to copy data!";
|
||||
}
|
||||
unique_indices_size += slice_grad->indices_size_;
|
||||
}
|
||||
tmp_grad->indices_size_ = unique_indices_size;
|
||||
ReduceSparseGradient(*tmp_grad, unique_grad, first_dim, outer_dim);
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
|
||||
SparseGradient *unique_grad, size_t first_dim, size_t outer_dim) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
|
||||
MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
|
||||
MS_EXCEPTION_IF_NULL(unique_grad);
|
||||
MS_EXCEPTION_IF_NULL(unique_grad->value_);
|
||||
MS_EXCEPTION_IF_NULL(unique_grad->indices_);
|
||||
MS_EXCEPTION_IF_NULL(tmp_grad);
|
||||
MS_EXCEPTION_IF_NULL(tmp_grad->value_);
|
||||
MS_EXCEPTION_IF_NULL(tmp_grad->indices_);
|
||||
size_t thread_num = 24;
|
||||
if (origin_sparse_grad.indices_size_ < thread_num) {
|
||||
thread_num = origin_sparse_grad.indices_size_;
|
||||
}
|
||||
size_t thread_indices_size = origin_sparse_grad.indices_size_ / thread_num;
|
||||
size_t left_indices_size = origin_sparse_grad.indices_size_ % thread_num;
|
||||
void ReduceBucketSparseGradientToWorkspace(const MultiThreadReduceSparseGradientParam ¶m,
|
||||
const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets,
|
||||
std::vector<std::shared_ptr<SparseGradient>> *reduced_buckets_ptr) {
|
||||
MS_EXCEPTION_IF_NULL(param.workspace_grad_);
|
||||
MS_EXCEPTION_IF_NULL(param.workspace_grad_->value_);
|
||||
MS_EXCEPTION_IF_NULL(param.workspace_grad_->indices_);
|
||||
MS_EXCEPTION_IF_NULL(reduced_buckets_ptr);
|
||||
auto &reduced_buckets = *reduced_buckets_ptr;
|
||||
size_t thread_num = buckets.size();
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(thread_num);
|
||||
std::vector<std::shared_ptr<SparseGradient>> unique_slice_grads;
|
||||
|
||||
size_t current_indices_offset = 0;
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
size_t indices_size = thread_indices_size;
|
||||
if (i == thread_num - 1) {
|
||||
indices_size = thread_indices_size + left_indices_size;
|
||||
reduced_buckets.emplace_back(std::make_shared<SparseGradient>());
|
||||
reduced_buckets[i]->value_ = param.workspace_grad_->value_ + current_indices_offset * param.value_stride_;
|
||||
reduced_buckets[i]->indices_ = param.workspace_grad_->indices_ + current_indices_offset;
|
||||
reduced_buckets[i]->indices_size_ = buckets[i]->indices_size_;
|
||||
if (param.use_sort_reduce_) {
|
||||
threads.emplace_back(std::thread(SortAndReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
|
||||
} else {
|
||||
threads.emplace_back(std::thread(ReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
|
||||
}
|
||||
size_t value_offset = i * thread_indices_size * outer_dim;
|
||||
size_t indices_offset = i * thread_indices_size;
|
||||
auto slice_grad = SparseGradient(
|
||||
{origin_sparse_grad.value_ + value_offset, origin_sparse_grad.indices_ + indices_offset, indices_size});
|
||||
unique_slice_grads.emplace_back(std::make_shared<SparseGradient>());
|
||||
unique_slice_grads[i]->value_ = unique_grad->value_ + value_offset;
|
||||
unique_slice_grads[i]->indices_ = unique_grad->indices_ + indices_offset;
|
||||
unique_slice_grads[i]->indices_size_ = indices_size;
|
||||
threads.emplace_back(
|
||||
std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim, false));
|
||||
current_indices_offset += buckets[i]->indices_size_;
|
||||
}
|
||||
for (size_t i = 0; i < thread_num; ++i) {
|
||||
threads[i].join();
|
||||
}
|
||||
ReduceMultiSparseGradient(unique_slice_grads, tmp_grad, unique_grad, first_dim, outer_dim);
|
||||
}
|
||||
|
||||
void MergeReduceSparseGradient(const MultiThreadReduceSparseGradientParam ¶m,
|
||||
const std::vector<std::shared_ptr<SparseGradient>> &reduced_buckets) {
|
||||
MS_EXCEPTION_IF_NULL(param.output_grad_);
|
||||
auto output_grad = param.output_grad_;
|
||||
MS_EXCEPTION_IF_NULL(output_grad->value_);
|
||||
MS_EXCEPTION_IF_NULL(output_grad->indices_);
|
||||
size_t stride_data_size = param.value_stride_ * sizeof(float);
|
||||
size_t unique_indices_size = 0;
|
||||
for (size_t i = 0; i < reduced_buckets.size(); ++i) {
|
||||
auto &bucket = reduced_buckets[i];
|
||||
MS_EXCEPTION_IF_NULL(bucket);
|
||||
if (bucket->indices_size_ == 0) {
|
||||
continue;
|
||||
}
|
||||
auto ret_code = memcpy_s(output_grad->value_ + unique_indices_size * param.value_stride_,
|
||||
(output_grad->indices_size_ - unique_indices_size) * stride_data_size, bucket->value_,
|
||||
bucket->indices_size_ * stride_data_size);
|
||||
if (ret_code != EOK) {
|
||||
MS_LOG(EXCEPTION) << "Failed to copy data!";
|
||||
}
|
||||
ret_code = memcpy_s(output_grad->indices_ + unique_indices_size,
|
||||
(output_grad->indices_size_ - unique_indices_size) * sizeof(int), bucket->indices_,
|
||||
bucket->indices_size_ * sizeof(int));
|
||||
if (ret_code != EOK) {
|
||||
MS_LOG(EXCEPTION) << "Failed to copy data!";
|
||||
}
|
||||
unique_indices_size += bucket->indices_size_;
|
||||
}
|
||||
output_grad->indices_size_ = unique_indices_size;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void BucketReduceSparseGradient(const ReduceSparseGradientParam ¶m) {
|
||||
MS_LOG(DEBUG) << "Start";
|
||||
MS_EXCEPTION_IF_NULL(param.input_grad_);
|
||||
size_t thread_num = 23;
|
||||
if (param.input_grad_->indices_size_ < thread_num) {
|
||||
thread_num = param.input_grad_->indices_size_;
|
||||
}
|
||||
MultiThreadReduceSparseGradientParam multi_thread_param({param.input_grad_, param.workspace_grad_, param.output_grad_,
|
||||
param.max_index_, param.value_stride_, thread_num,
|
||||
param.use_sort_reduce_});
|
||||
std::vector<std::shared_ptr<SparseGradient>> segments;
|
||||
std::vector<std::shared_ptr<std::vector<size_t>>> segment_bucket_sizes;
|
||||
SplitAndCalculateSegmentBucketSize(multi_thread_param, &segments, &segment_bucket_sizes);
|
||||
|
||||
std::vector<std::shared_ptr<BucketSparseGradient>> buckets;
|
||||
GatherSegmentIndicesToOutputBucket(multi_thread_param, segments, segment_bucket_sizes, &buckets);
|
||||
|
||||
std::vector<std::shared_ptr<SparseGradient>> reduced_buckets;
|
||||
ReduceBucketSparseGradientToWorkspace(multi_thread_param, buckets, &reduced_buckets);
|
||||
|
||||
MergeReduceSparseGradient(multi_thread_param, reduced_buckets);
|
||||
MS_LOG(DEBUG) << "End";
|
||||
}
|
||||
|
||||
|
|
|
@ -73,9 +73,18 @@ class KernelMeta {
|
|||
};
|
||||
|
||||
struct SparseGradient {
|
||||
float *value_;
|
||||
int *indices_;
|
||||
size_t indices_size_;
|
||||
float *value_{nullptr};
|
||||
int *indices_{nullptr};
|
||||
size_t indices_size_{0};
|
||||
};
|
||||
|
||||
struct ReduceSparseGradientParam {
|
||||
SparseGradient *input_grad_{nullptr};
|
||||
SparseGradient *workspace_grad_{nullptr};
|
||||
SparseGradient *output_grad_{nullptr};
|
||||
size_t max_index_{0};
|
||||
size_t value_stride_{0};
|
||||
bool use_sort_reduce_{false};
|
||||
};
|
||||
|
||||
struct MultiThreadComputeParams {
|
||||
|
@ -112,10 +121,6 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info);
|
|||
std::string GetProcessor(const AnfNodePtr &anf_node);
|
||||
bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b);
|
||||
int Sign(float x);
|
||||
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
|
||||
size_t outer_dim);
|
||||
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
|
||||
size_t outer_dim, bool use_multi_threads = true);
|
||||
std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
|
||||
std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list,
|
||||
const std::vector<AnfNodePtr> &input_list);
|
||||
|
@ -130,14 +135,7 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<An
|
|||
bool IsWeightBoundary(const AnfNodePtr &node);
|
||||
void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params,
|
||||
size_t total_compute_size);
|
||||
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
|
||||
size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
|
||||
std::vector<size_t> *slice_positions);
|
||||
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
|
||||
SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
|
||||
size_t outer_dim);
|
||||
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
|
||||
SparseGradient *unique_grad, size_t first_dim, size_t outer_dim);
|
||||
void BucketReduceSparseGradient(const ReduceSparseGradientParam ¶m);
|
||||
std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -46,7 +46,7 @@ class EmbeddingLookUpCPUKernel : public CPUKernel {
|
|||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
protected:
|
||||
void LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2,
|
||||
float **output_addr);
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
|
|
|
@ -53,15 +53,15 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &i
|
|||
size_t output_size = outputs[0]->size;
|
||||
|
||||
size_t size = input_size / sizeof(float);
|
||||
::ps::SArray<float> lookup_ids(size, 0);
|
||||
::ps::SArray<int> lookup_ids(size, 0);
|
||||
::ps::SArray<int> lengths{size};
|
||||
::ps::SArray<float> lookup_result;
|
||||
::ps::SArray<float> lookup_result(output_size / sizeof(float), 0);
|
||||
|
||||
auto ret = memcpy_s(lookup_ids.data(), input_size, indices_addr, input_size);
|
||||
if (ret != EOK) {
|
||||
MS_LOG(EXCEPTION) << "Lookup id memcpy failed.";
|
||||
}
|
||||
parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, lookup_result,
|
||||
parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, &lookup_result,
|
||||
parallel::ps::kEmbeddingLookupCmd);
|
||||
|
||||
auto ret2 = memcpy_s(output_addr, output_size, lookup_result.data(), output_size);
|
||||
|
|
|
@ -50,7 +50,7 @@ void EmbeddingLookUpPSKernel::InitKernel(
|
|||
split_num_ = pserver_num_;
|
||||
|
||||
// input shape should be sharded after computing offset_;
|
||||
Shard(input_shape_, axis_);
|
||||
Shard(&input_shape_, axis_);
|
||||
|
||||
size_t output_size =
|
||||
std::accumulate(output_shape_.begin(), output_shape_.end(), sizeof(float), std::multiplies<size_t>());
|
||||
|
|
|
@ -34,5 +34,13 @@ MS_REG_CPU_KERNEL_T(Push,
|
|||
MS_REG_CPU_KERNEL_T(
|
||||
Push, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt64),
|
||||
PushKernel, float);
|
||||
|
||||
MS_REG_CPU_KERNEL_T(Push,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeUInt64),
|
||||
PushKernel, float);
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -43,7 +43,7 @@ class PushKernel : public CPUKernel {
|
|||
sizes.push_back(SizeToInt(input->size) / sizeof(T));
|
||||
}
|
||||
parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes);
|
||||
memcpy(outputs[0]->addr, &key_, sizeof(size_t));
|
||||
memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -75,7 +75,7 @@ void SparseApplyAdamPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
|
|||
|
||||
void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
|
||||
const auto &indices_addr = inputs[10];
|
||||
indices_size_ = indices_addr->size;
|
||||
indices_size_ = indices_addr->size / sizeof(int);
|
||||
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
|
||||
workspace_size_list_[1] = indices_size_ * sizeof(int);
|
||||
}
|
||||
|
|
|
@ -64,7 +64,7 @@ void SparseApplyFtrlPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
|
|||
|
||||
void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
|
||||
const auto &indices_addr = inputs[4];
|
||||
indices_size_ = indices_addr->size;
|
||||
indices_size_ = indices_addr->size / sizeof(int);
|
||||
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
|
||||
workspace_size_list_[1] = indices_size_ * sizeof(int);
|
||||
}
|
||||
|
|
|
@ -81,6 +81,8 @@ void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node)
|
|||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
|
||||
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
|
||||
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
|
||||
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
|
||||
workspace_size_list_.emplace_back(var_first_dim_size_ * var_outer_dim_size_ * sizeof(float));
|
||||
}
|
||||
|
||||
|
@ -142,11 +144,21 @@ bool SparseApplyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
|
|||
auto indices = reinterpret_cast<int *>(inputs[10]->addr);
|
||||
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
|
||||
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
|
||||
auto m_t = reinterpret_cast<float *>(workspace[2]->addr);
|
||||
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
|
||||
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
|
||||
auto m_t = reinterpret_cast<float *>(workspace[4]->addr);
|
||||
|
||||
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
|
||||
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
|
||||
var_outer_dim_size_);
|
||||
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
|
||||
SparseGradient input_sparse_grad({grad, indices, indices_size_});
|
||||
ReduceSparseGradientParam param;
|
||||
param.input_grad_ = &input_sparse_grad;
|
||||
param.workspace_grad_ = &workspace_sparse_grad;
|
||||
param.output_grad_ = &unique_sparse_grad;
|
||||
param.max_index_ = var_first_dim_size_;
|
||||
param.value_stride_ = var_outer_dim_size_;
|
||||
BucketReduceSparseGradient(param);
|
||||
|
||||
size_t total_dim_size = var_first_dim_size_ * var_outer_dim_size_;
|
||||
lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
|
||||
|
||||
|
|
|
@ -132,12 +132,19 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
|
|||
auto indices = reinterpret_cast<int *>(inputs[4]->addr);
|
||||
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
|
||||
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
|
||||
auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
|
||||
auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
|
||||
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
|
||||
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
|
||||
|
||||
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
|
||||
SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
|
||||
TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
|
||||
var_first_dim_size_, var_outer_dim_size_);
|
||||
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
|
||||
SparseGradient input_sparse_grad({grad, indices, indices_size_});
|
||||
ReduceSparseGradientParam param;
|
||||
param.input_grad_ = &input_sparse_grad;
|
||||
param.workspace_grad_ = &workspace_sparse_grad;
|
||||
param.output_grad_ = &unique_sparse_grad;
|
||||
param.max_index_ = var_first_dim_size_;
|
||||
param.value_stride_ = var_outer_dim_size_;
|
||||
BucketReduceSparseGradient(param);
|
||||
|
||||
MultiThreadComputeParams input_params;
|
||||
input_params.var_ = var;
|
||||
|
|
|
@ -123,13 +123,19 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr>
|
|||
auto indices = reinterpret_cast<int *>(inputs[10]->addr);
|
||||
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
|
||||
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
|
||||
auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
|
||||
auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
|
||||
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
|
||||
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
|
||||
|
||||
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
|
||||
SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
|
||||
TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
|
||||
var_first_dim_size_, var_outer_dim_size_);
|
||||
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
|
||||
SparseGradient input_sparse_grad({grad, indices, indices_size_});
|
||||
ReduceSparseGradientParam param;
|
||||
param.input_grad_ = &input_sparse_grad;
|
||||
param.workspace_grad_ = &workspace_sparse_grad;
|
||||
param.output_grad_ = &unique_sparse_grad;
|
||||
param.max_index_ = var_first_dim_size_;
|
||||
param.value_stride_ = var_outer_dim_size_;
|
||||
BucketReduceSparseGradient(param);
|
||||
|
||||
lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
|
||||
MultiThreadComputeParams input_params;
|
||||
|
|
|
@ -61,6 +61,8 @@ void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &ke
|
|||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
|
||||
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
|
||||
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
|
||||
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
|
||||
}
|
||||
|
||||
void SparseApplyProximalAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
|
@ -119,9 +121,19 @@ bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector<kernel::Addre
|
|||
auto indices = reinterpret_cast<int *>(inputs[6]->addr);
|
||||
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
|
||||
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
|
||||
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
|
||||
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
|
||||
|
||||
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
|
||||
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
|
||||
var_outer_dim_size_);
|
||||
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
|
||||
SparseGradient input_sparse_grad({grad, indices, indices_size_});
|
||||
ReduceSparseGradientParam param;
|
||||
param.input_grad_ = &input_sparse_grad;
|
||||
param.workspace_grad_ = &workspace_sparse_grad;
|
||||
param.output_grad_ = &unique_sparse_grad;
|
||||
param.max_index_ = var_first_dim_size_;
|
||||
param.value_stride_ = var_outer_dim_size_;
|
||||
BucketReduceSparseGradient(param);
|
||||
|
||||
MultiThreadComputeParams input_params;
|
||||
input_params.var_ = var;
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
BroadcastToGpuKernel, float)
|
||||
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||
BroadcastToGpuKernel, half)
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,83 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
class BroadcastToGpuKernel : public GpuKernel {
|
||||
public:
|
||||
BroadcastToGpuKernel() {}
|
||||
~BroadcastToGpuKernel() = default;
|
||||
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
T *input_addr = GetDeviceAddress<T>(inputs, 0);
|
||||
T *output_addr = GetDeviceAddress<T>(outputs, 0);
|
||||
|
||||
BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], output_shape_[0], output_shape_[1],
|
||||
output_shape_[2], output_shape_[3], input_addr, output_addr,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
if (input_shapes.size() > 4 || output_shapes.size() > 4) {
|
||||
MS_LOG(EXCEPTION) << "BroadcastTo operation not support dim greater than 4";
|
||||
}
|
||||
|
||||
for (int i = input_shapes.size() - 1; i >= 0; i--) {
|
||||
input_shape_[i] = input_shapes[i];
|
||||
}
|
||||
|
||||
for (int j = output_shapes.size() - 1; j >= 0; j--) {
|
||||
output_shape_[j] = output_shapes[j];
|
||||
}
|
||||
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
void InitSizeLists() override {
|
||||
input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T));
|
||||
output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T));
|
||||
}
|
||||
|
||||
private:
|
||||
int input_shape_[4] = {1, 1, 1, 1};
|
||||
int output_shape_[4] = {1, 1, 1, 1};
|
||||
|
||||
std::vector<size_t> input_size_list_;
|
||||
std::vector<size_t> output_size_list_;
|
||||
std::vector<size_t> workspace_size_list_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
|
|
@ -18,6 +18,7 @@
|
|||
#define MINDSPORE_CCSRC_KERNEL_GPU_CONCATV2_GPU_KERNEL_H
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
|
||||
|
@ -27,40 +28,35 @@ namespace kernel {
|
|||
template <typename T>
|
||||
class ConcatV2GpuFwdKernel : public GpuKernel {
|
||||
public:
|
||||
ConcatV2GpuFwdKernel() : axis_(0), output_size_(0) {}
|
||||
ConcatV2GpuFwdKernel()
|
||||
: axis_(0),
|
||||
input_num_(1),
|
||||
output_size_(0),
|
||||
all_size_before_axis_(1),
|
||||
all_size_axis_(1),
|
||||
inputs_host_(nullptr),
|
||||
len_axis_(nullptr) {}
|
||||
~ConcatV2GpuFwdKernel() override = default;
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
if (inputs.size() == 2) {
|
||||
T *input_0 = GetDeviceAddress<T>(inputs, 0);
|
||||
T *input_1 = GetDeviceAddress<T>(inputs, 1);
|
||||
T *output = GetDeviceAddress<T>(outputs, 0);
|
||||
ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], input_0, input_1, output,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
}
|
||||
|
||||
if (inputs.size() == 3) {
|
||||
T *input_0 = GetDeviceAddress<T>(inputs, 0);
|
||||
T *input_1 = GetDeviceAddress<T>(inputs, 1);
|
||||
T *input_2 = GetDeviceAddress<T>(inputs, 2);
|
||||
T *output = GetDeviceAddress<T>(outputs, 0);
|
||||
ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], input_0, input_1, input_2, output,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
}
|
||||
|
||||
if (inputs.size() == 4) {
|
||||
T *input_0 = GetDeviceAddress<T>(inputs, 0);
|
||||
T *input_1 = GetDeviceAddress<T>(inputs, 1);
|
||||
T *input_2 = GetDeviceAddress<T>(inputs, 2);
|
||||
T *input_3 = GetDeviceAddress<T>(inputs, 3);
|
||||
T *output = GetDeviceAddress<T>(outputs, 0);
|
||||
ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], w_[3], input_0, input_1, input_2, input_3, output,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
T *output = GetDeviceAddress<T>(outputs, 0);
|
||||
T **inputs_device = GetDeviceAddress<T *>(workspace, 0);
|
||||
int *len_axis_device = GetDeviceAddress<int>(workspace, 1);
|
||||
for (size_t i = 0; i < inputs.size(); i++) {
|
||||
inputs_host_[i] = GetDeviceAddress<T>(inputs, i);
|
||||
}
|
||||
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_,
|
||||
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||
"ConcatV2 opt cudaMemcpyAsync inputs failed");
|
||||
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_,
|
||||
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||
"ConcatV2 opt cudaMemcpyAsync length on axis failed");
|
||||
ConcatKernel(output_size_, input_num_, all_size_before_axis_, all_size_axis_, len_axis_device, inputs_device,
|
||||
output, reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
|
@ -74,25 +70,34 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
|
|||
axis_ += SizeToInt(input_shape.size());
|
||||
}
|
||||
|
||||
auto input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
for (size_t i = 0; i < input_num; i++) {
|
||||
auto input_size = sizeof(T);
|
||||
input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node));
|
||||
inputs_host_ = std::make_unique<T *[]>(input_num_);
|
||||
len_axis_ = std::make_unique<int[]>(input_num_);
|
||||
for (int i = 0; i < input_num_; i++) {
|
||||
int input_size = 1;
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
|
||||
for (size_t j = 0; j < input_shape.size(); j++) {
|
||||
input_size *= SizeToInt(input_shape[j]);
|
||||
if (j >= IntToSize(axis_)) {
|
||||
w_[i] *= SizeToInt(input_shape[j]);
|
||||
}
|
||||
input_size_list_.push_back(input_size);
|
||||
}
|
||||
input_size_list_.push_back(IntToSize(input_size * sizeof(T)));
|
||||
len_axis_[i] = SizeToInt(input_shape[axis_]);
|
||||
}
|
||||
workspace_size_list_.push_back(sizeof(T *) * input_num_);
|
||||
workspace_size_list_.push_back(sizeof(int) * input_num_);
|
||||
|
||||
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
output_size_ = sizeof(T);
|
||||
for (size_t i = 0; i < output_shape.size(); i++) {
|
||||
output_size_ = 1;
|
||||
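// all_size_before_axis_ is the product of output dims from axis_ to the end and
// all_size_axis_ is the product of dims after axis_; together they locate the
// coordinate along the concat axis for any flattened output position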
for (int i = 0; i < SizeToInt(output_shape.size()); i++) {
|
||||
output_size_ *= output_shape[i];
|
||||
if (i > axis_) {
|
||||
all_size_before_axis_ *= output_shape[i];
|
||||
all_size_axis_ *= output_shape[i];
|
||||
}
|
||||
if (i == axis_) {
|
||||
all_size_before_axis_ *= output_shape[i];
|
||||
}
|
||||
}
|
||||
output_size_list_.push_back(output_size_);
|
||||
output_size_list_.push_back(IntToSize(output_size_ * sizeof(T)));
|
||||
|
||||
InitSizeLists();
|
||||
return true;
|
||||
|
@ -103,11 +108,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
|
|||
|
||||
private:
|
||||
bool CheckParam(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num < 2 || input_num > 4) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but ConcatV2GpuFwdKernel needs inputs between 2 and 4.";
|
||||
return false;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but ConcatV2GpuFwdKernel needs 1 output.";
|
||||
|
@ -115,9 +115,13 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
int w_[4] = {1, 1, 1, 1};
|
||||
int axis_;
|
||||
size_t output_size_;
|
||||
int input_num_;
|
||||
int output_size_;
|
||||
int all_size_before_axis_;
|
||||
int all_size_axis_;
|
||||
std::unique_ptr<T *[]> inputs_host_;
|
||||
std::unique_ptr<int[]> len_axis_;
|
||||
std::vector<size_t> input_size_list_;
|
||||
std::vector<size_t> output_size_list_;
|
||||
std::vector<size_t> workspace_size_list_;
|
||||
@ -0,0 +1,31 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
MS_REG_GPU_KERNEL_ONE(
|
||||
Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
SplitGpuFwdKernel, float)
|
||||
MS_REG_GPU_KERNEL_ONE(Split,
|
||||
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
|
||||
SplitGpuFwdKernel, int)
|
||||
MS_REG_GPU_KERNEL_ONE(
|
||||
Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||
SplitGpuFwdKernel, half)
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,153 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
class SplitGpuFwdKernel : public GpuKernel {
|
||||
public:
|
||||
SplitGpuFwdKernel()
|
||||
: axis_(0),
|
||||
output_num_(1),
|
||||
input_size_(1),
|
||||
axis_step_(1),
|
||||
all_size_before_axis_(1),
|
||||
all_size_axis_(1),
|
||||
outputs_host_(nullptr) {}
|
||||
~SplitGpuFwdKernel() override = default;
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
T *input = GetDeviceAddress<T>(inputs, 0);
|
||||
T **outputs_device = GetDeviceAddress<T *>(workspace, 0);
|
||||
for (size_t i = 0; i < outputs.size(); i++) {
|
||||
outputs_host_[i] = GetDeviceAddress<T>(outputs, i);
|
||||
}
|
||||
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_,
|
||||
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||
"Split opt cudaMemcpyAsync outputs failed");
|
||||
SplitKernel(input_size_, axis_step_, all_size_before_axis_, all_size_axis_, input, outputs_device,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
axis_ = GetAttr<int>(kernel_node, "axis");
|
||||
if (axis_ < 0) {
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
axis_ += SizeToInt(input_shape.size());
|
||||
}
|
||||
output_num_ = GetAttr<int>(kernel_node, "output_num");
|
||||
|
||||
if (!CheckParam(kernel_node)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
input_size_ = 1;
|
||||
all_size_before_axis_ = 1;
|
||||
all_size_axis_ = 1;
|
||||
|
||||
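// all_size_before_axis_: product of input dims from axis_ to the end;
// all_size_axis_: product of input dims after axis_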
for (int i = 0; i < SizeToInt(input_shape.size()); i++) {
|
||||
input_size_ *= input_shape[i];
|
||||
if (i > axis_) {
|
||||
all_size_before_axis_ *= input_shape[i];
|
||||
all_size_axis_ *= input_shape[i];
|
||||
}
|
||||
if (i == axis_) {
|
||||
all_size_before_axis_ *= input_shape[i];
|
||||
}
|
||||
}
|
||||
input_size_list_.push_back(IntToSize(input_size_ * sizeof(T)));
|
||||
axis_step_ = input_shape[axis_] / output_num_;
|
||||
|
||||
for (int i = 0; i < output_num_; i++) {
|
||||
size_t output_size = 1;
|
||||
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i);
|
||||
for (size_t j = 0; j < output_shape.size(); j++) {
|
||||
output_size *= output_shape[j];
|
||||
}
|
||||
output_size_list_.push_back(output_size * sizeof(T));
|
||||
}
|
||||
workspace_size_list_.push_back(sizeof(T *) * output_num_);
|
||||
InitSizeLists();
|
||||
outputs_host_ = std::make_unique<T *[]>(output_num_);
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
void InitSizeLists() override {}
|
||||
|
||||
private:
|
||||
bool CheckParam(const CNodePtr &kernel_node) {
|
||||
auto input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
int dims = SizeToInt(input_shape.size());
|
||||
int output_num = SizeToInt(AnfAlgo::GetOutputTensorNum(kernel_node));
|
||||
|
||||
if (input_num != 1) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but Split needs 1 input.";
|
||||
return false;
|
||||
}
|
||||
if (dims == 0) {
|
||||
MS_LOG(ERROR) << "Input dims is " << dims << ", scalar is not supported.";
|
||||
return false;
|
||||
}
|
||||
if (axis_ < -dims || axis_ >= dims) {
|
||||
MS_LOG(ERROR) << "Attr axis " << axis_ << " must be in " << -dims << "~" << dims;
|
||||
return false;
|
||||
}
|
||||
if (output_num_ > SizeToInt(input_shape[axis_])) {
|
||||
MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must less than" << input_shape[axis_];
|
||||
return false;
|
||||
}
|
||||
if (input_shape[axis_] % output_num_ != 0) {
|
||||
MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must be divided by" << input_shape[axis_];
|
||||
return false;
|
||||
}
|
||||
if (output_num_ != output_num) {
|
||||
MS_LOG(ERROR) << "Output num is " << output_num << ", but need " << output_num_;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
int axis_;
|
||||
int output_num_;
|
||||
int input_size_;
|
||||
int axis_step_;
|
||||
int all_size_before_axis_;
|
||||
int all_size_axis_;
|
||||
std::unique_ptr<T *[]> outputs_host_;
|
||||
std::vector<size_t> input_size_list_;
|
||||
std::vector<size_t> output_size_list_;
|
||||
std::vector<size_t> workspace_size_list_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
|
|
@ -0,0 +1,29 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
MS_REG_GPU_KERNEL_TWO(TopK,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeInt32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeInt32),
|
||||
TopKGpuKernel, float, int)
|
||||
}  // namespace kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,110 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
|
||||
|
||||
#include <vector>
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T, typename S>
|
||||
class TopKGpuKernel : public GpuKernel {
|
||||
public:
|
||||
TopKGpuKernel() : sorted_(false), outer_size_(1), inner_size_(1), k_(1), use_share_mem_(true), ceil_power2_(0) {}
|
||||
~TopKGpuKernel() override = default;
|
||||
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
T *input_addr = GetDeviceAddress<T>(inputs, 0);
|
||||
S *k = GetDeviceAddress<S>(inputs, 1);
|
||||
T *output_addr = GetDeviceAddress<T>(outputs, 0);
|
||||
S *indices = GetDeviceAddress<S>(outputs, 1);
|
||||
T *data_buff = nullptr;
|
||||
S *index_buff = nullptr;
|
||||
if (use_share_mem_ == false) {
|
||||
data_buff = GetDeviceAddress<T>(workspaces, 0);
|
||||
index_buff = GetDeviceAddress<S>(workspaces, 1);
|
||||
}
|
||||
|
||||
TopK(outer_size_, inner_size_, input_addr, k, output_addr, indices, data_buff, index_buff,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
|
||||
if (sorted_ == false) {
|
||||
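// sorted=false: re-sort the selected top-k entries by index so they come back in the
// original element order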
std::cout << "================BitonicSortByKey" << std::endl;
|
||||
BitonicSortByKey(outer_size_, k_, output_addr, indices, data_buff, index_buff,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
for (size_t i = 0; i < input_shapes.size() - 1; i++) {
|
||||
outer_size_ *= input_shapes[i];
|
||||
}
|
||||
inner_size_ = input_shapes[input_shapes.size() - 1];
|
||||
k_ = output_shapes[output_shapes.size() - 1];
|
||||
|
||||
sorted_ = GetAttr<bool>(kernel_node, "sorted");
|
||||
|
||||
ceil_power2_ = RoundUpPower2(inner_size_);
|
||||
size_t buffer_size = ceil_power2_ * (sizeof(T) + sizeof(S));
|
||||
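// each CUDA block sorts one row of inner_size_ elements padded up to ceil_power2_;
// if the padded row does not fit in shared memory, fall back to global-memory workspaces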
if (buffer_size > SHARED_MEM_PER_BLOCK) {
|
||||
use_share_mem_ = false;
|
||||
MS_LOG(WARNING) << "CUDA share memory not enough, sort with RAM";
|
||||
}
|
||||
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
void InitSizeLists() override {
|
||||
input_size_list_.push_back(outer_size_ * inner_size_ * sizeof(T));
|
||||
input_size_list_.push_back(sizeof(S));
|
||||
output_size_list_.push_back(outer_size_ * k_ * sizeof(T));
|
||||
output_size_list_.push_back(outer_size_ * k_ * sizeof(S));
|
||||
if (use_share_mem_ == false) {
|
||||
workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(T));
|
||||
workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(S));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
bool sorted_;
|
||||
int outer_size_;
|
||||
int inner_size_;
|
||||
int k_;
|
||||
bool use_share_mem_;
|
||||
int ceil_power2_;
|
||||
|
||||
std::vector<size_t> input_size_list_;
|
||||
std::vector<size_t> output_size_list_;
|
||||
std::vector<size_t> workspace_size_list_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
|
|
@ -116,16 +116,16 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const
|
|||
output);
|
||||
case BROADCAST_TYPE_REALDIV:
|
||||
return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
|
||||
output);
|
||||
output);
|
||||
case BROADCAST_TYPE_MUL:
|
||||
return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
|
||||
output);
|
||||
output);
|
||||
case BROADCAST_TYPE_SUB:
|
||||
return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
|
||||
output);
|
||||
output);
|
||||
case BROADCAST_TYPE_ADD:
|
||||
return BroadcastOperator<T, S, AddFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
|
||||
output);
|
||||
output);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -176,6 +176,28 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons
|
|||
NoBroadcastKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(nums, op, input0, input1, output);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0,
|
||||
const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) {
|
||||
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) {
|
||||
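// decompose the flat output position into 4-D coordinates (i, j, k, l); Index() maps a
// coordinate to 0 whenever the corresponding input dimension is 1, which realises the broadcast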
int i = pos / (o1 * o2 * o3) % o0;
|
||||
int j = pos / (o2 * o3) % o1;
|
||||
int k = pos / o3 % o2;
|
||||
int l = pos % o3;
|
||||
|
||||
int input_idx = Index(i, i0) * i1 * i2 * i3 + Index(j, i1) * i2 * i3 + Index(k, i2) * i3 + Index(l, i3);
|
||||
output_addr[pos] = input_addr[input_idx];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
|
||||
const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) {
|
||||
int nums = o0 * o1 * o2 * o3;
|
||||
BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr,
|
||||
output_addr);
|
||||
}
|
||||
|
||||
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
|
||||
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
|
||||
enum BroadcastOpType op, const float *input0, const float *input1, bool *output,
|
||||
|
@ -204,5 +226,11 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *
|
|||
bool *output, cudaStream_t stream);
|
||||
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
|
||||
half *output, cudaStream_t stream);
|
||||
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
|
||||
int *output, cudaStream_t stream);
|
||||
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output,
|
||||
cudaStream_t stream);
|
||||
|
||||
template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
|
||||
const int &o2, const int &o3, const float *input_addr, float *output_addr,
|
||||
cudaStream_t stream);
|
||||
template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
|
||||
const int &o2, const int &o3, const half *input_addr, half *output_addr, cudaStream_t stream);
|
||||
@ -41,4 +41,8 @@ template <typename T, typename S>
|
|||
void NoBroadcast(const int &size, enum BroadcastOpType op, const T *input0, const T *input1, S *output,
|
||||
cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
|
||||
const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
|
||||
|
|
|
@ -19,90 +19,51 @@
|
|||
#include <cuda_runtime.h>
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
|
||||
template <typename T>
|
||||
__global__ void Concat(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output) {
|
||||
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
|
||||
int n = pos / (w1 + w2);
|
||||
int m = pos % (w1 + w2);
|
||||
output[pos] = m >= w1 ? input_2[n * w2 + m - w1] : input_1[n * w1 + m];
|
||||
__global__ void Concat(const int size, const int input_num,
|
||||
const int all_size_before_axis, const int all_size_axis,
|
||||
int* len_axis, T** inputs, T* output) {
|
||||
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
|
||||
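// num is the output coordinate along the concat axis (all_size_before_axis = product of
// output dims from the axis on, all_size_axis = product of dims after it); the loop below
// walks len_axis to find which input block that coordinate falls into.
// e.g. concatenating (2, 3, 4) and (2, 5, 4) on axis 1: all_size_axis = 4,
// all_size_before_axis = 8 * 4 = 32, len_axis = {3, 5}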
int num = pos % all_size_before_axis / all_size_axis;
|
||||
int block = -1;
|
||||
int axis_inc = 0;
|
||||
int block_len = 0;
|
||||
for (int i = 0; i < input_num; i++) {
|
||||
if (axis_inc <= num) {
|
||||
block++;
|
||||
axis_inc += len_axis[i];
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
block_len = len_axis[block];
|
||||
axis_inc -= len_axis[block];
|
||||
int block_pos = pos / all_size_before_axis * block_len * all_size_axis +
|
||||
(num - axis_inc) * all_size_axis + pos % all_size_axis;
|
||||
output[pos] = inputs[block][block_pos];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void Concat(const size_t size, const int w1, const int w2, const int w3,
|
||||
const T* input_1, const T* input_2, const T* input_3, T* output) {
|
||||
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
|
||||
int n = pos / (w1 + w2 + w3);
|
||||
int m = pos % (w1 + w2 + w3);
|
||||
output[pos] = m < w1 ? input_1[n * w1 + m] :
|
||||
m < w1 + w2 ? input_2[n * w2 + m - w1] :
|
||||
input_3[n * w3 + m - w1 - w2];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void Concat(const size_t size, const int w1, const int w2, const int w3, const int w4,
|
||||
const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output) {
|
||||
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
|
||||
int n = pos / (w1 + w2 + w3 + w4);
|
||||
int m = pos % (w1 + w2 + w3 + w4);
|
||||
output[pos] = m < w1 ? input_1[n * w1 + m] :
|
||||
m < w1 + w2 ? input_2[n * w2 + m - w1]:
|
||||
m < w1 + w2 + w3 ? input_3[n * w3 + m - w1 - w2]:
|
||||
input_4[n * w4 + m - w1 - w2 - w3];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output,
|
||||
cudaStream_t cuda_stream) {
|
||||
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, input_1, input_2, output);
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
|
||||
const T* input_1, const T* input_2, const T* input_3, T* output,
|
||||
void ConcatKernel(const int size, const int input_num,
|
||||
const int all_size_before_axis, const int all_size_axis,
|
||||
int* len_axis, T** inputs, T* output,
|
||||
cudaStream_t cuda_stream) {
|
||||
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, input_1, input_2, input_3, output);
|
||||
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num,
|
||||
all_size_before_axis, all_size_axis,
|
||||
len_axis, inputs, output);
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
|
||||
const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output,
|
||||
cudaStream_t cuda_stream) {
|
||||
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, w4, input_1,
|
||||
input_2, input_3, input_4, output);
|
||||
return;
|
||||
}
|
||||
|
||||
template void ConcatKernel(const size_t size, const int w1, const int w2, const float* input_1, const float* input_2,
|
||||
float* output, cudaStream_t cuda_stream);
|
||||
template void ConcatKernel(const size_t size, const int w1, const int w2, const int* input_1, const int* input_2,
|
||||
int* output, cudaStream_t cuda_stream);
|
||||
template void ConcatKernel(const size_t size, const int w1, const int w2, const half* input_1, const half* input_2,
|
||||
half* output, cudaStream_t cuda_stream);
|
||||
|
||||
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
|
||||
const float* input_1, const float* input_2, const float* input_3,
|
||||
float* output, cudaStream_t cuda_stream);
|
||||
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
|
||||
const int* input_1, const int* input_2, const int* input_3,
|
||||
int* output, cudaStream_t cuda_stream);
|
||||
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
|
||||
const half* input_1, const half* input_2, const half* input_3,
|
||||
half* output, cudaStream_t cuda_stream);
|
||||
|
||||
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
|
||||
const float* input_1, const float* input_2, const float* input_3, const float* input_4,
|
||||
float* output, cudaStream_t cuda_stream);
|
||||
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
|
||||
const int* input_1, const int* input_2, const int* input_3, const int* input_4,
|
||||
int* output, cudaStream_t cuda_stream);
|
||||
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
|
||||
const half* input_1, const half* input_2, const half* input_3, const half* input_4,
|
||||
half* output, cudaStream_t cuda_stream);
|
||||
|
||||
template void ConcatKernel(const int size, const int input_num,
|
||||
const int all_size_before_axis, const int all_size_axis,
|
||||
int* len_axis, float** inputs, float* output,
|
||||
cudaStream_t cuda_stream);
|
||||
template void ConcatKernel(const int size, const int input_num,
|
||||
const int all_size_before_axis, const int all_size_axis,
|
||||
int* len_axis, int** inputs, int* output,
|
||||
cudaStream_t cuda_stream);
|
||||
template void ConcatKernel(const int size, const int input_num,
|
||||
const int all_size_before_axis, const int all_size_axis,
|
||||
int* len_axis, half** inputs, half* output,
|
||||
cudaStream_t cuda_stream);
|
||||
|
|
|
@ -19,13 +19,8 @@
|
|||
|
||||
#include "runtime/device/gpu/cuda_common.h"
|
||||
template <typename T>
|
||||
void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output,
|
||||
cudaStream_t cuda_stream);
|
||||
template <typename T>
|
||||
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
|
||||
const T* input_1, const T* input_2, const T* input_3, T* output, cudaStream_t cuda_stream);
|
||||
template <typename T>
|
||||
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
|
||||
const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output,
|
||||
void ConcatKernel(const int size, const int input_num,
|
||||
const int all_size_before_axis, const int all_size_axis,
|
||||
int* len_axis, T** inputs, T* output,
|
||||
cudaStream_t cuda_stream);
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CONCATV2IMPL_H_
|
||||
|
|
|
@ -15,9 +15,9 @@
|
|||
*/
|
||||
|
||||
#include "momentum_impl.cuh"
|
||||
template <typename T, typename S>
|
||||
template <typename T, typename S, typename G>
|
||||
__global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate,
|
||||
const T *gradient, const S *momentum) {
|
||||
const G *gradient, const S *momentum) {
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
|
||||
accumulation[i] = momentum[0] * accumulation[i] + gradient[i];
|
||||
variable[i] -= learning_rate[0] * accumulation[i];
|
||||
|
@ -34,19 +34,32 @@ __global__ void MomentumUpdateVariableKernel(const size_t size, half *variable,
|
|||
}
|
||||
return;
|
||||
}
|
||||
template <typename T, typename S>
|
||||
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient,
|
||||
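// mixed-precision specialization: fp16 gradients are converted to fp32 before being
// accumulated into the fp32 variable and accumulation buffers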
template <>
|
||||
__global__ void MomentumUpdateVariableKernel(const size_t size, float *variable, float *accumulation,
|
||||
const float *learning_rate, const half *gradient,
|
||||
const float *momentum) {
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
|
||||
accumulation[i] = momentum[0] * accumulation[i] + __half2float(gradient[i]);
|
||||
variable[i] -= learning_rate[0] * accumulation[i];
|
||||
}
|
||||
return;
|
||||
}
|
||||
template <typename T, typename S, typename G>
|
||||
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
|
||||
const S *momentum, cudaStream_t cuda_stream) {
|
||||
MomentumUpdateVariableKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, variable, accumulation,
|
||||
learning_rate, gradient, momentum);
|
||||
return;
|
||||
}
|
||||
template void MomentumUpdateVariable<float, float>(const size_t size, float *variable, float *accumulation,
|
||||
template void MomentumUpdateVariable<float, float, float>(const size_t size, float *variable, float *accumulation,
|
||||
const float *learning_rate, const float *gradient,
|
||||
const float *momentum, cudaStream_t cuda_stream);
|
||||
template void MomentumUpdateVariable<half, half>(const size_t size, half *variable, half *accumulation,
|
||||
template void MomentumUpdateVariable<half, half, half>(const size_t size, half *variable, half *accumulation,
|
||||
const half *learning_rate, const half *gradient,
|
||||
const half *momentum, cudaStream_t cuda_stream);
|
||||
template void MomentumUpdateVariable<half, float>(const size_t size, half *variable, half *accumulation,
|
||||
template void MomentumUpdateVariable<half, float, half>(const size_t size, half *variable, half *accumulation,
|
||||
const float *learning_rate, const half *gradient,
|
||||
const float *momentum, cudaStream_t cuda_stream);
|
||||
template void MomentumUpdateVariable<float, float, half>(const size_t size, float *variable, float *accumulation,
|
||||
const float *learning_rate, const half *gradient,
|
||||
const float *momentum, cudaStream_t cuda_stream);
|
||||
|
|
|
@ -18,8 +18,8 @@
|
|||
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_
|
||||
|
||||
#include "runtime/device/gpu/cuda_common.h"
|
||||
template <typename T, typename S>
|
||||
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient,
|
||||
template <typename T, typename S, typename G>
|
||||
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
|
||||
const S *momentum, cudaStream_t cuda_stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh"
|
||||
template <typename T>
|
||||
__global__ void Split(const int size, const int axis_step, const int all_size_before_axis,
|
||||
const int all_size_axis, const T* input, T** outputs) {
|
||||
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
|
||||
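// num is the input coordinate along the split axis; each output receives axis_step
// consecutive slices, so block picks the output tensor and block_pos is the flattened
// position inside it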
int num = pos % all_size_before_axis / all_size_axis;
|
||||
int block = num / axis_step;
|
||||
int block_pos = pos / all_size_before_axis * axis_step * all_size_axis +
|
||||
num % axis_step * all_size_axis + pos % all_size_axis;
|
||||
outputs[block][block_pos] = input[pos];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
|
||||
const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) {
|
||||
Split<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, axis_step, all_size_before_axis,
|
||||
all_size_axis, input, outputs);
|
||||
return;
|
||||
}
|
||||
|
||||
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
|
||||
const int all_size_axis, const float* input, float** outputs,
|
||||
cudaStream_t cuda_stream);
|
||||
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
|
||||
const int all_size_axis, const int* input, int** outputs,
|
||||
cudaStream_t cuda_stream);
|
||||
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
|
||||
const int all_size_axis, const half* input, half** outputs,
|
||||
cudaStream_t cuda_stream);
|
|
@ -0,0 +1,24 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
|
||||
|
||||
#include "runtime/device/gpu/cuda_common.h"
|
||||
template <typename T>
|
||||
void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
|
||||
const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream);
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
|
|
@ -0,0 +1,162 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
|
||||
#include <limits>
|
||||
#include <algorithm>
|
||||
|
||||
int RoundUpPower2(int v) {
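// smear the highest set bit to round v up to the next power of two,
// e.g. 5 -> 8, 8 -> 8 (assumes v > 0 and the result fits in an int)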
|
||||
v--;
|
||||
v |= v >> 1;
|
||||
v |= v >> 2;
|
||||
v |= v >> 4;
|
||||
v |= v >> 8;
|
||||
v |= v >> 16;
|
||||
v++;
|
||||
return v;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__inline__ __device__ void Swap(T *lhs, T *rhs) {
|
||||
T tmp = lhs[0];
|
||||
lhs[0] = rhs[0];
|
||||
rhs[0] = tmp;
|
||||
}
|
||||
|
||||
template <typename T, typename S>
|
||||
__global__ void TopkKernel(const int outer, const int inner, const int ceil_power2, const T *input, const S *k,
|
||||
T *output, S *indices, T *data_buff, S *index_buff) {
|
||||
// default: sort in shared memory
|
||||
extern __shared__ T share_mem[];
|
||||
T *data_arr = share_mem;
|
||||
S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2);
|
||||
// otherwise sort in the caller-provided global-memory buffers
|
||||
if (data_buff != nullptr && index_buff != nullptr) {
|
||||
data_arr = data_buff + blockIdx.x * ceil_power2;
|
||||
index_arr = index_buff + blockIdx.x * ceil_power2;
|
||||
}
|
||||
|
||||
for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) {
|
||||
data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max();
|
||||
index_arr[i] = i;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
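// in-place bitonic sort over the padded buffer: stage i sorts bitonic runs of length i,
// pass j compares elements a distance j apart, and (tid & i) picks the direction so the
// whole row ends up in ascending order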
for (size_t i = 2; i <= ceil_power2; i <<= 1) {
|
||||
for (size_t j = (i >> 1); j > 0; j >>= 1) {
|
||||
for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) {
|
||||
size_t tid_comp = tid ^ j;
|
||||
if (tid_comp > tid) {
|
||||
if ((tid & i) == 0) {
|
||||
if (data_arr[tid] > data_arr[tid_comp]) {
|
||||
Swap(&data_arr[tid], &data_arr[tid_comp]);
|
||||
Swap(&index_arr[tid], &index_arr[tid_comp]);
|
||||
}
|
||||
} else {
|
||||
if (data_arr[tid] < data_arr[tid_comp]) {
|
||||
Swap(&data_arr[tid], &data_arr[tid_comp]);
|
||||
Swap(&index_arr[tid], &index_arr[tid_comp]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
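// the padding (max values) now occupies positions [inner, ceil_power2), so the k largest
// real elements are read from position inner - 1 downwards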
for (size_t tid = threadIdx.x; tid < k[0]; tid += blockDim.x) {
|
||||
output[blockIdx.x * k[0] + tid] = data_arr[inner - tid - 1];
|
||||
indices[blockIdx.x * k[0] + tid] = index_arr[inner - tid - 1];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename S>
|
||||
void TopK(const int &outer, const int &inner, const T *input, const S *k, T *output, S *indices, T *data_buff,
|
||||
S *index_buff, cudaStream_t stream) {
|
||||
int ceil_power2 = RoundUpPower2(inner);
|
||||
int share_mem = (data_buff == nullptr) ? ceil_power2 * (sizeof(T) + sizeof(S)) : 0;
|
||||
int thread = std::min(ceil_power2, GET_THREADS);
|
||||
TopkKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, k, output, indices, data_buff,
|
||||
index_buff);
|
||||
}
|
||||
|
||||
template <typename T, typename S>
|
||||
__global__ void BitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input,
|
||||
S *indices, T *data_buff, S *index_buff) {
|
||||
// default: sort in shared memory
|
||||
extern __shared__ T share_mem[];
|
||||
T *data_arr = share_mem;
|
||||
S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2);
|
||||
// otherwise sort in the caller-provided global-memory buffers
|
||||
if (data_buff != nullptr && index_buff != nullptr) {
|
||||
data_arr = data_buff + blockIdx.x * ceil_power2;
|
||||
index_arr = index_buff + blockIdx.x * ceil_power2;
|
||||
}
|
||||
|
||||
for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) {
|
||||
data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max();
|
||||
index_arr[i] = (i < inner) ? indices[blockIdx.x * inner + i] : std::numeric_limits<S>::max();
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (size_t i = 2; i <= ceil_power2; i <<= 1) {
|
||||
for (size_t j = (i >> 1); j > 0; j >>= 1) {
|
||||
for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) {
|
||||
size_t tid_comp = tid ^ j;
|
||||
if (tid_comp > tid) {
|
||||
if ((tid & i) == 0) {
|
||||
if (index_arr[tid] > index_arr[tid_comp]) {
|
||||
Swap(&data_arr[tid], &data_arr[tid_comp]);
|
||||
Swap(&index_arr[tid], &index_arr[tid_comp]);
|
||||
}
|
||||
} else {
|
||||
if (index_arr[tid] < index_arr[tid_comp]) {
|
||||
Swap(&data_arr[tid], &data_arr[tid_comp]);
|
||||
Swap(&index_arr[tid], &index_arr[tid_comp]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t tid = threadIdx.x; tid < inner; tid += blockDim.x) {
|
||||
input[blockIdx.x * inner + tid] = data_arr[tid];
|
||||
indices[blockIdx.x * inner + tid] = index_arr[tid];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename S>
|
||||
void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff,
|
||||
cudaStream_t stream) {
|
||||
int ceil_power2 = RoundUpPower2(inner);
|
||||
size_t share_mem = ceil_power2 * (sizeof(T) + sizeof(S));
|
||||
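// prefer dynamic shared memory; if the padded row does not fit, launch with no shared
// memory and keep the caller-provided global buffers instead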
if (share_mem > SHARED_MEM_PER_BLOCK) {
|
||||
share_mem = 0;
|
||||
} else {
|
||||
data_buff = nullptr;
|
||||
index_buff = nullptr;
|
||||
}
|
||||
int thread = std::min(ceil_power2, GET_THREADS);
|
||||
BitonicSortByKeyKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, indices, data_buff,
|
||||
index_buff);
|
||||
}
|
||||
|
||||
template void TopK(const int &outer, const int &inner, const float *input_addr, const int *k, float *output,
|
||||
int *indices, float *data_buff, int *index_buff, cudaStream_t stream);
|
||||
template void BitonicSortByKey(const int &outer, const int &inner, float *input, int *indices, float *data_buff,
|
||||
int *index_buff, cudaStream_t stream);
|
|
@ -0,0 +1,32 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
|
||||
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include "runtime/device/gpu/cuda_common.h"
|
||||
|
||||
template <typename T, typename S>
|
||||
void TopK(const int &outer, const int &inner, const T *input_addr, const S *k, T *output, S *indices, T *data_buff,
|
||||
S *index_buff, cudaStream_t stream);
|
||||
|
||||
template <typename T, typename S>
|
||||
void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff,
|
||||
cudaStream_t stream);
|
||||
int RoundUpPower2(int v);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
|
|
@ -103,6 +103,35 @@ __global__ void ZeroslikeKernel(T *output, size_t count) {
|
|||
return;
|
||||
}
|
||||
template <typename T>
|
||||
__global__ void AbsKernel(T *input, T *output, size_t count) {
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
output[i] = abs(input[i]);
|
||||
}
|
||||
return;
|
||||
}
|
||||
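// specialization for half: abs() is not overloaded for half here, so compare against
// zero and negate explicitly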
template <>
|
||||
__global__ void AbsKernel(half *input, half *output, size_t count) {
|
||||
half zero = 0.0;
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
output[i] = input[i] < zero ? -input[i] : input[i];
|
||||
}
|
||||
return;
|
||||
}
|
||||
template <typename T>
|
||||
__global__ void FloorKernel(T *input, T *output, size_t count) {
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
output[i] = floor(input[i]);
|
||||
}
|
||||
return;
|
||||
}
|
||||
template <>
|
||||
__global__ void FloorKernel(half *input, half *output, size_t count) {
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
output[i] = hfloor(input[i]);
|
||||
}
|
||||
return;
|
||||
}
|
||||
template <typename T>
|
||||
void Exponential(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
|
||||
ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
|
||||
return;
|
||||
|
@ -147,6 +176,16 @@ void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream) {
|
|||
ZeroslikeKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(output, count);
|
||||
return;
|
||||
}
|
||||
template <typename T>
|
||||
void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
|
||||
AbsKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
|
||||
return;
|
||||
}
|
||||
template <typename T>
|
||||
void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
|
||||
FloorKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
|
||||
return;
|
||||
}
|
||||
|
||||
template void Exponential<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Logarithm<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
|
||||
|
@ -156,6 +195,8 @@ template void Square<float>(float *input, float *output, size_t count, cudaStrea
|
|||
template void Sqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Rsqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Zeroslike<float>(float *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Abs<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Floor<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Exponential<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Logarithm<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Negative<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
|
||||
|
@ -164,3 +205,5 @@ template void Square<half>(half *input, half *output, size_t count, cudaStream_t
|
|||
template void Sqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Rsqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Zeroslike<half>(half *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Abs<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
|
||||
template void Floor<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
|
||||
@ -34,5 +34,9 @@ template <typename T>
|
|||
void Rsqrt(T *input, T *output, size_t count, cudaStream_t cuda_stream);
|
||||
template <typename T>
|
||||
void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream);
|
||||
template <typename T>
|
||||
void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream);
|
||||
template <typename T>
|
||||
void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream);
|
||||
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_
|
||||
|
|
|
@ -88,6 +88,12 @@ class GpuKernelRegister {
|
|||
static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \
|
||||
static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR, \
|
||||
[]() { return new OPCLASS<T, S>(); });
|
||||
|
||||
// registers mixed-precision kernels whose templates take three type parameters
|
||||
#define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \
|
||||
static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \
|
||||
static const GpuKernelRegister g_##OPNAME##_##T##_##S##_##G##_gpu_kernel_reg( \
|
||||
#OPNAME, ATTR, []() { return new OPCLASS<T, S, G>(); });
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_KERNEL_GPU_GPUKERNELFACTORY_H_
|
||||
|
|
|
@ -46,5 +46,13 @@ MS_REG_GPU_KERNEL_ONE(Sqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOut
|
|||
UnaryOpGpuKernel, float)
|
||||
MS_REG_GPU_KERNEL_ONE(Rsqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
UnaryOpGpuKernel, float)
|
||||
MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
UnaryOpGpuKernel, float)
|
||||
MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||
UnaryOpGpuKernel, half)
|
||||
MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
UnaryOpGpuKernel, float)
|
||||
MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||
UnaryOpGpuKernel, half)
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -36,6 +36,8 @@ enum UnaryOptype {
|
|||
UNARY_OP_SQUARE,
|
||||
UNARY_OP_SQRT,
|
||||
UNARY_OP_RSQRT,
|
||||
UNARY_OP_ABS,
|
||||
UNARY_OP_FLOOR,
|
||||
UNARY_OP_INVALID_TYPE = 255
|
||||
};
|
||||
static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP},
|
||||
|
@ -45,7 +47,9 @@ static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY
|
|||
{"ZerosLike", UNARY_OP_ZEROSLIKE},
|
||||
{"Square", UNARY_OP_SQUARE},
|
||||
{"Sqrt", UNARY_OP_SQRT},
|
||||
{"Rsqrt", UNARY_OP_RSQRT}};
|
||||
{"Rsqrt", UNARY_OP_RSQRT},
|
||||
{"Abs", UNARY_OP_ABS},
|
||||
{"Floor", UNARY_OP_FLOOR}};
|
||||
template <typename T>
|
||||
class UnaryOpGpuKernel : public GpuKernel {
|
||||
public:
|
||||
|
@ -100,6 +104,14 @@ class UnaryOpGpuKernel : public GpuKernel {
|
|||
Zeroslike(output_addr, output_size_ / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
return true;
|
||||
}
|
||||
case UNARY_OP_ABS: {
|
||||
Abs(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
break;
|
||||
}
|
||||
case UNARY_OP_FLOOR: {
|
||||
Floor(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported.";
|
||||
}
|
||||
|
|
|
@ -34,15 +34,15 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNorm,
|
|||
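// for fp16 inputs the scale, bias, mean and variance tensors stay in fp32, which is what
// cuDNN expects for half-precision batch normalization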
MS_REG_GPU_KERNEL_ONE(FusedBatchNorm,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
FusedBatchNormGpuKernel, half)
|
||||
MS_REG_GPU_KERNEL_ONE(BatchNorm,
|
||||
KernelAttr()
|
||||
|
@ -60,15 +60,15 @@ MS_REG_GPU_KERNEL_ONE(BatchNorm,
|
|||
MS_REG_GPU_KERNEL_ONE(BatchNorm,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
FusedBatchNormGpuKernel, half)
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -56,17 +56,17 @@ class FusedBatchNormGpuKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
auto x = GetDeviceAddress<T>(inputs, 0);
|
||||
auto scale = GetDeviceAddress<T>(inputs, 1);
|
||||
auto bias = GetDeviceAddress<T>(inputs, 2);
|
||||
auto runing_mean = GetDeviceAddress<T>(inputs, 3);
|
||||
auto runnig_variance = GetDeviceAddress<T>(inputs, 4);
|
||||
auto scale = GetDeviceAddress<float>(inputs, 1);
|
||||
auto bias = GetDeviceAddress<float>(inputs, 2);
|
||||
auto runing_mean = GetDeviceAddress<float>(inputs, 3);
|
||||
auto runnig_variance = GetDeviceAddress<float>(inputs, 4);
|
||||
auto y = GetDeviceAddress<T>(outputs, 0);
|
||||
|
||||
const float alpha = 1;
|
||||
const float beta = 0;
|
||||
if (is_train_) {
|
||||
auto save_mean = GetDeviceAddress<T>(outputs, 3);
|
||||
auto save_variance = GetDeviceAddress<T>(outputs, 4);
|
||||
auto save_mean = GetDeviceAddress<float>(outputs, 3);
|
||||
auto save_variance = GetDeviceAddress<float>(outputs, 4);
|
||||
CHECK_CUDNN_RET_WITH_EXCEPT(
|
||||
cudnnBatchNormalizationForwardTraining(handle_, mode_, &alpha, &beta, x_desc_, x, y_desc_, y,
|
||||
scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean,
|
||||
|
|
|
@ -33,12 +33,12 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNormGrad,
|
|||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
.AddOutputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
FusedBatchNormGradGpuKernel, half)
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -55,12 +55,12 @@ class FusedBatchNormGradGpuKernel : public GpuKernel {
|
|||
}
|
||||
auto dy = GetDeviceAddress<T>(inputs, 0);
|
||||
auto x = GetDeviceAddress<T>(inputs, 1);
|
||||
auto scale = GetDeviceAddress<T>(inputs, 2);
|
||||
auto save_mean = GetDeviceAddress<T>(inputs, 3);
|
||||
auto save_variance = GetDeviceAddress<T>(inputs, 4);
|
||||
auto scale = GetDeviceAddress<float>(inputs, 2);
|
||||
auto save_mean = GetDeviceAddress<float>(inputs, 3);
|
||||
auto save_variance = GetDeviceAddress<float>(inputs, 4);
|
||||
auto dx = GetDeviceAddress<T>(outputs, 0);
|
||||
auto bn_scale = GetDeviceAddress<T>(outputs, 1);
|
||||
auto bn_bias = GetDeviceAddress<T>(outputs, 2);
|
||||
auto bn_scale = GetDeviceAddress<float>(outputs, 1);
|
||||
auto bn_bias = GetDeviceAddress<float>(outputs, 2);
|
||||
|
||||
const float alpha_data_diff = 1;
|
||||
const float beta_data_diff = 0;
|
||||
|
|
|
@ -18,32 +18,41 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
MomentumGpuKernel, float, float)
|
||||
MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
MomentumGpuKernel, half, half)
|
||||
MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
MomentumGpuKernel, half, float)
|
||||
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
MomentumGpuKernel, float, float, float)
|
||||
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
MomentumGpuKernel, half, half, half)
|
||||
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat16),
|
||||
MomentumGpuKernel, half, float, half)
|
||||
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
|
||||
KernelAttr()
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddInputAttr(kNumberTypeFloat16)
|
||||
.AddInputAttr(kNumberTypeFloat32)
|
||||
.AddOutputAttr(kNumberTypeFloat32),
|
||||
MomentumGpuKernel, float, float, half)
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
#include "backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T, typename S>
|
||||
template <typename T, typename S, typename G>
|
||||
class MomentumGpuKernel : public GpuKernel {
|
||||
public:
|
||||
MomentumGpuKernel()
|
||||
|
@ -38,7 +38,7 @@ class MomentumGpuKernel : public GpuKernel {
|
|||
T *variable = GetDeviceAddress<T>(inputs, 0);
|
||||
T *accumulation = GetDeviceAddress<T>(inputs, 1);
|
||||
S *learning_rate = GetDeviceAddress<S>(inputs, 2);
|
||||
T *gradient = GetDeviceAddress<T>(inputs, 3);
|
||||
G *gradient = GetDeviceAddress<G>(inputs, 3);
|
||||
S *momentum = GetDeviceAddress<S>(inputs, 4);
|
||||
MomentumUpdateVariable(inputs[0]->size / sizeof(T), variable, accumulation, learning_rate, gradient, momentum,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
|
@ -54,7 +54,7 @@ class MomentumGpuKernel : public GpuKernel {
|
|||
variable_size_ = sizeof(T);
|
||||
accumulation_size_ = sizeof(T);
|
||||
learning_rate_size_ = sizeof(S);
|
||||
gradient_size_ = sizeof(T);
|
||||
gradient_size_ = sizeof(G);
|
||||
momentum_size_ = sizeof(S);
|
||||
|
||||
auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
|
|
|
@ -81,6 +81,7 @@ static std::map<string, string> tbe_func_adapter_map = {
|
|||
{"sparse_apply_proximal_adagrad", "sparse_apply_proximal_adagrad_d"},
|
||||
{"apply_add_sign", "apply_add_sign_d"},
|
||||
{"apply_power_sign", "apply_power_sign_d"},
|
||||
{"apply_centered_rms_prop", "apply_centered_rms_prop_d"},
|
||||
{"transpose", "transpose_d"},
|
||||
{"fill", "fill_d"},
|
||||
{"unsorted_segment_sum", "unsorted_segment_sum_d"},
|
||||
|
|
|
@ -43,6 +43,7 @@ constexpr auto kJInputs = "inputs";
|
|||
constexpr auto kJOutputs = "outputs";
|
||||
constexpr auto kJAttrs = "attrs";
|
||||
constexpr auto kJKernelName = "kernel_name";
|
||||
constexpr auto kJFullName = "full_name";
|
||||
constexpr auto kJOpInfo = "op_info";
|
||||
constexpr auto kJDtype = "dtype";
|
||||
constexpr auto kJtype = "type";
|
||||
|
@ -125,6 +126,7 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr<mindspor
|
|||
op_info_json[kJKernelName] = json_name_;
|
||||
}
|
||||
(*kernel_json)[kJOpInfo] = op_info_json;
|
||||
(*kernel_json)[kJFullName] = anf_node->fullname_with_scope();
|
||||
if (creater_type_ == SINGLE_BUILD) {
|
||||
TbeUtils::SaveJsonInfo(json_name_, json_info_);
|
||||
}
|
||||
|
|
|
@ -97,6 +97,7 @@
|
|||
#include "backend/optimizer/ascend/format_type/modify_ops_attrs.h"
|
||||
#include "backend/optimizer/ascend/format_type/remove_no_use_reshape_op.h"
|
||||
#include "backend/optimizer/ascend/ir_fusion/add_input_to_output.h"
|
||||
#include "backend/optimizer/ascend/format_type/remove_internal_output.h"
|
||||
#include "utils/context/ms_context.h"
|
||||
#include "utils/config_manager.h"
|
||||
#include "debug/anf_ir_dump.h"
|
||||
|
@ -201,6 +202,7 @@ void AscendDataLayout(const std::shared_ptr<session::KernelGraph> &kernel_graph)
|
|||
data_layout_pm->AddPass(std::make_shared<OptimizeDependence>());
|
||||
data_layout_pm->AddPass(std::make_shared<TransDataSplit>());
|
||||
data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>());
|
||||
data_layout_pm->AddPass(std::make_shared<RemoveInternalOutputTransOp>());
|
||||
optimizer->AddPassManager(data_layout_pm);
|
||||
(void)optimizer->Optimize(kernel_graph);
|
||||
kernel_graph->SetExecOrderByDefault();
|
||||
|
@ -222,6 +224,7 @@ void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_grap
|
|||
mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>());
|
||||
mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
|
||||
mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>());
|
||||
mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>());
|
||||
optimizer->AddPassManager(mixed_precision_pm);
|
||||
(void)optimizer->Optimize(kernel_graph);
|
||||
kernel_graph->SetExecOrderByDefault();
|
||||
|
|
|
@ -142,6 +142,7 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const
|
|||
MS_EXCEPTION_IF_NULL(node);
|
||||
std::vector<AnfNodePtr> make_tuple_inputs;
|
||||
make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple));
|
||||
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
|
||||
for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(node); ++output_idx) {
|
||||
std::string output_format = AnfAlgo::GetOutputFormat(node, output_idx);
|
||||
if (output_format == kOpFormat_NC1KHKWHWC0) {
|
||||
|
@ -151,7 +152,11 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const
|
|||
auto tuple_getitem = CreatTupleGetItemNode(func_graph, node, output_idx);
|
||||
std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx);
|
||||
if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) {
|
||||
make_tuple_inputs.emplace_back(AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false));
|
||||
auto trans_op = AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false);
|
||||
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
|
||||
kernel_graph->ReplaceInternalOutput(node, trans_op, output_idx, 0);
|
||||
}
|
||||
make_tuple_inputs.emplace_back(trans_op);
|
||||
} else {
|
||||
// No need insert trans op.
|
||||
make_tuple_inputs.push_back(tuple_getitem);
|
||||
|
@ -249,9 +254,14 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP
|
|||
if (outputs_num == 0) {
|
||||
return node;
|
||||
}
|
||||
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
|
||||
// Single output
|
||||
if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) {
|
||||
return InsertTransOpForSingleOutput(func_graph, node, kernel_select);
|
||||
auto new_node = InsertTransOpForSingleOutput(func_graph, node, kernel_select);
|
||||
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
|
||||
kernel_graph->ReplaceInternalOutput(node, new_node);
|
||||
}
|
||||
return new_node;
|
||||
}
|
||||
// Multiple output
|
||||
return InsertTransOpForMultipleOutput(func_graph, node, kernel_select);
|
||||
|
|
|
@ -40,6 +40,7 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
|
|||
std::vector<AnfNodePtr> make_tuple_inputs;
|
||||
AbstractBasePtrList abstract_list;
|
||||
make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple));
|
||||
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
|
||||
for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(cnode); ++output_idx) {
|
||||
AnfNodePtr replace_node = nullptr;
|
||||
const auto origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx);
|
||||
|
@ -64,6 +65,9 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
|
|||
MS_EXCEPTION_IF_NULL(replace_node);
|
||||
replace_node->set_scope(cnode->scope());
|
||||
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
|
||||
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) {
|
||||
kernel_graph->ReplaceInternalOutput(cnode, replace_node, output_idx, 0);
|
||||
}
|
||||
} else {
|
||||
replace_node = getitem;
|
||||
}
|
||||
|
@ -87,6 +91,7 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
|
|||
return cnode;
|
||||
}
|
||||
MS_EXCEPTION_IF_NULL(cnode->Type());
|
||||
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
|
||||
// Single output
|
||||
if (!cnode->Type()->isa<Tuple>()) {
|
||||
if (!need_insert_cast[0]) {
|
||||
|
@ -109,6 +114,9 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
|
|||
MS_EXCEPTION_IF_NULL(replace_node);
|
||||
replace_node->set_scope(cnode->scope());
|
||||
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
|
||||
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) {
|
||||
kernel_graph->ReplaceInternalOutput(cnode, replace_node);
|
||||
}
|
||||
}
|
||||
return replace_node;
|
||||
}
|
||||
|
@ -188,6 +196,10 @@ const AnfNodePtr InsertCast::Process(const FuncGraphPtr &func_graph, const AnfNo
|
|||
CNodePtr cnode = node->cast<CNodePtr>();
|
||||
MS_EXCEPTION_IF_NULL(cnode);
|
||||
auto new_node = InsertCastForInput(func_graph, cnode);
|
||||
auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>();
|
||||
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
|
||||
kernel_graph->ReplaceInternalOutput(node, new_node);
|
||||
}
|
||||
// process output
|
||||
return InsertCastForOutput(func_graph, new_node, std::vector<bool>(AnfAlgo::GetOutputTensorNum(new_node), true));
|
||||
}
|
||||
|
|
|
@ -46,14 +46,13 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An
|
|||
if (node == nullptr || !AnfAlgo::IsRealKernel(node)) {
|
||||
return nullptr;
|
||||
}
|
||||
AnfNodePtr front_node;
|
||||
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node);
|
||||
MS_LOG(DEBUG) << "process op: " << node->DebugString();
|
||||
AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_);
|
||||
auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>();
|
||||
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
|
||||
front_node = kernel_graph->GetFrontNodeByInternalOutput(node);
|
||||
kernel_graph->ReplaceInternalOutput(node, new_node);
|
||||
}
|
||||
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node);
|
||||
MS_LOG(DEBUG) << "====process op: " << node->DebugString();
|
||||
AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_);
|
||||
auto ms_context = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ms_context);
|
||||
if (ms_context->execution_mode() == kPynativeMode && !ms_context->enable_pynative_hook()) {
|
||||
|
@ -61,12 +60,7 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An
|
|||
return new_node;
|
||||
}
|
||||
}
|
||||
auto final_node = InsertTransOpForOutput(func_graph, new_node, kernel_select_);
|
||||
if (kernel_graph != nullptr && front_node != nullptr) {
|
||||
auto old_node = kernel_graph->GetInternalOutputByFrontNode(front_node);
|
||||
kernel_graph->ReplaceInternalOutput(old_node, final_node);
|
||||
}
|
||||
return final_node;
|
||||
return InsertTransOpForOutput(func_graph, new_node, kernel_select_);
|
||||
}
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -0,0 +1,83 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "backend/optimizer/ascend/format_type/remove_internal_output.h"
|
||||
#include <memory>
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
namespace {
|
||||
bool UsedForOutputOnly(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
|
||||
MS_EXCEPTION_IF_NULL(func_graph);
|
||||
auto manager = func_graph->manager();
|
||||
MS_EXCEPTION_IF_NULL(manager);
|
||||
auto &node_users = manager->node_users();
|
||||
auto iter = node_users.find(node);
|
||||
if (iter == node_users.end()) {
|
||||
return false;
|
||||
}
|
||||
const auto &node_set = iter->second;
|
||||
for (const auto &node_index : node_set) {
|
||||
if (!AnfAlgo::CheckPrimitiveType(node_index.first, prim::kPrimMakeTuple)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace
|
||||
const BaseRef RemoveInternalOutputTransOp::DefinePattern() const {
|
||||
VarPtr X = std::make_shared<Var>();
|
||||
auto prim = std::make_shared<Primitive>(kTransDataOpName);
|
||||
return VectorRef({prim, X});
|
||||
}
|
||||
|
||||
const BaseRef RemoveInternalOutputCast::DefinePattern() const {
|
||||
VarPtr X = std::make_shared<Var>();
|
||||
return VectorRef({prim::kPrimCast, X});
|
||||
}
|
||||
|
||||
const AnfNodePtr RemoveInternalOutput::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
|
||||
const EquivPtr &) const {
|
||||
MS_EXCEPTION_IF_NULL(func_graph);
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
|
||||
if (kernel_graph == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!kernel_graph->IsInternalOutput(node)) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!UsedForOutputOnly(func_graph, node)) {
|
||||
return nullptr;
|
||||
}
|
||||
auto cnode = node->cast<CNodePtr>();
|
||||
MS_EXCEPTION_IF_NULL(cnode);
|
||||
CheckCNodeInputSize(cnode, kTransOpInputNum);
|
||||
auto input_node = cnode->input(1);
|
||||
if (!AnfAlgo::CheckPrimitiveType(input_node, prim::kPrimTupleGetItem)) {
|
||||
kernel_graph->ReplaceInternalOutput(node, input_node);
|
||||
} else {
|
||||
auto tuple_getitem = input_node->cast<CNodePtr>();
|
||||
MS_EXCEPTION_IF_NULL(tuple_getitem);
|
||||
int idx = AnfAlgo::GetTupleGetItemOutIndex(tuple_getitem);
|
||||
AnfNodePtr real_input_node = AnfAlgo::GetTupleGetItemRealInput(tuple_getitem);
|
||||
kernel_graph->ReplaceInternalOutput(node, real_input_node, 0, idx);
|
||||
}
|
||||
return input_node;
|
||||
}
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,51 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
|
||||
#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
|
||||
|
||||
#include <string>
|
||||
#include "backend/optimizer/common/optimizer.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
class RemoveInternalOutput : public PatternProcessPass {
|
||||
public:
|
||||
explicit RemoveInternalOutput(const std::string &name, bool multigraph = true)
|
||||
: PatternProcessPass(name, multigraph) {}
|
||||
~RemoveInternalOutput() override = default;
|
||||
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
|
||||
};
|
||||
|
||||
class RemoveInternalOutputTransOp : public RemoveInternalOutput {
|
||||
public:
|
||||
explicit RemoveInternalOutputTransOp(bool multigraph = true)
|
||||
: RemoveInternalOutput("remove_internal_output_trans_op", multigraph) {}
|
||||
~RemoveInternalOutputTransOp() override = default;
|
||||
const BaseRef DefinePattern() const override;
|
||||
};
|
||||
|
||||
class RemoveInternalOutputCast : public RemoveInternalOutput {
|
||||
public:
|
||||
explicit RemoveInternalOutputCast(bool multigraph = true)
|
||||
: RemoveInternalOutput("remove_internal_output_cast", multigraph) {}
|
||||
~RemoveInternalOutputCast() override = default;
|
||||
const BaseRef DefinePattern() const override;
|
||||
};
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
|
|
@ -13,8 +13,8 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_
|
||||
#define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_
|
||||
|
||||
#include <memory>
|
||||
#include "backend/optimizer/common/optimizer.h"
|
||||
|
@ -53,4 +53,4 @@ class AdamFusion : public PatternProcessPass {
|
|||
};
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_
|
||||
|
|
|
@ -13,8 +13,8 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_
|
||||
#define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_
|
||||
|
||||
#include <memory>
|
||||
#include "backend/optimizer/common/optimizer.h"
|
||||
|
@ -55,4 +55,4 @@ class AdamWeightDecayFusion : public PatternProcessPass {
|
|||
};
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/optimizer/gpu/replace_addn_fusion.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "ir/primitive.h"
|
||||
#include "utils/utils.h"
|
||||
#include "backend/optimizer/common/helper.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
const BaseRef ReplaceAddNFusion::DefinePattern() const {
|
||||
VectorRef addn = VectorRef({prim::kPrimAddN, A, B});
|
||||
return addn;
|
||||
}
|
||||
|
||||
const AnfNodePtr ReplaceAddNFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
|
||||
const EquivPtr &equiv) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
MS_EXCEPTION_IF_NULL(equiv);
|
||||
|
||||
auto A = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
|
||||
auto B = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 1);
|
||||
MS_EXCEPTION_IF_NULL(A);
|
||||
MS_EXCEPTION_IF_NULL(B);
|
||||
int num_input = AnfAlgo::GetNodeAttr<int>(node, "n");
|
||||
|
||||
if (num_input == 2) {
|
||||
auto prim = std::make_shared<Primitive>(prim::kPrimTensorAdd->name());
|
||||
MS_EXCEPTION_IF_NULL(prim);
|
||||
std::vector<AnfNodePtr> inputs = {NewValueNode(prim), A, B};
|
||||
auto add_new = graph->NewCNode(inputs);
|
||||
std::vector<TypeId> outputs_type;
|
||||
std::vector<std::vector<size_t>> outputs_shape;
|
||||
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(A, 0));
|
||||
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(A, 0));
|
||||
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, add_new.get());
|
||||
auto manager = graph->manager();
|
||||
MS_EXCEPTION_IF_NULL(manager);
|
||||
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(add_new));
|
||||
return add_new;
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_
|
||||
|
||||
#include <memory>
|
||||
#include "backend/optimizer/common/optimizer.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
class ReplaceAddNFusion : public PatternProcessPass {
|
||||
public:
|
||||
explicit ReplaceAddNFusion(bool multigraph = true) : PatternProcessPass("replace_addn", multigraph) {
|
||||
A = std::make_shared<Var>();
|
||||
B = std::make_shared<Var>();
|
||||
}
|
||||
~ReplaceAddNFusion() override = default;
|
||||
const BaseRef DefinePattern() const override;
|
||||
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
|
||||
|
||||
private:
|
||||
VarPtr A;
|
||||
VarPtr B;
|
||||
};
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_
|
|
@ -0,0 +1,92 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "ir/primitive.h"
|
||||
#include "utils/utils.h"
|
||||
#include "backend/optimizer/common/helper.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
const BaseRef ReplaceBNCastFusion::DefinePattern() const {
|
||||
VectorRef in_cast = VectorRef({prim::kPrimCast, x_});
|
||||
VectorRef fbn2 = VectorRef({prim::kPrimFusedBatchNorm, in_cast, scale_, bias_, mean_, var_});
|
||||
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2, index_});
|
||||
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
|
||||
return out_cast;
|
||||
}
|
||||
|
||||
const AnfNodePtr ReplaceBNCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
|
||||
const EquivPtr &equiv) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
MS_EXCEPTION_IF_NULL(equiv);
|
||||
|
||||
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
|
||||
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
|
||||
MS_EXCEPTION_IF_NULL(index_node);
|
||||
auto value_node = index_node->cast<ValueNodePtr>();
|
||||
MS_EXCEPTION_IF_NULL(value_node);
|
||||
int item_idx = GetValue<int>(value_node->value());
|
||||
|
||||
auto fbn2 = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
|
||||
auto x_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 0);
|
||||
auto x_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(x_after), 0);
|
||||
if (item_idx != 0) {
|
||||
return nullptr;
|
||||
}
|
||||
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 1);
|
||||
auto bias = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 2);
|
||||
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 3);
|
||||
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 4);
|
||||
|
||||
MS_EXCEPTION_IF_NULL(fbn2);
|
||||
MS_EXCEPTION_IF_NULL(x_after);
|
||||
MS_EXCEPTION_IF_NULL(x_before);
|
||||
MS_EXCEPTION_IF_NULL(scale);
|
||||
MS_EXCEPTION_IF_NULL(bias);
|
||||
MS_EXCEPTION_IF_NULL(mean);
|
||||
MS_EXCEPTION_IF_NULL(var);
|
||||
|
||||
auto manager = graph->manager();
|
||||
MS_EXCEPTION_IF_NULL(manager);
|
||||
manager->Replace(utils::cast<CNodePtr>(x_after), utils::cast<CNodePtr>(x_before));
|
||||
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
|
||||
|
||||
std::vector<TypeId> outputs_type;
|
||||
std::vector<std::vector<size_t>> outputs_shape;
|
||||
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2);
|
||||
for (size_t i = 0; i < output_num; i++) {
|
||||
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2, i));
|
||||
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2, i));
|
||||
}
|
||||
outputs_type[0] = kNumberTypeFloat16;
|
||||
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2.get());
|
||||
|
||||
outputs_type.clear();
|
||||
outputs_shape.clear();
|
||||
outputs_type.push_back(kNumberTypeFloat16);
|
||||
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
|
||||
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
|
||||
return tuple;
|
||||
}
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,58 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
|
||||
|
||||
#include <memory>
|
||||
#include "backend/optimizer/common/optimizer.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
class ReplaceBNCastFusion : public PatternProcessPass {
|
||||
public:
|
||||
explicit ReplaceBNCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_cast", multigraph) {
|
||||
x_ = std::make_shared<Var>();
|
||||
scale_ = std::make_shared<Var>();
|
||||
bias_ = std::make_shared<Var>();
|
||||
mean_ = std::make_shared<Var>();
|
||||
var_ = std::make_shared<Var>();
|
||||
y_ = std::make_shared<Var>();
|
||||
running_mean_ = std::make_shared<Var>();
|
||||
running_var_ = std::make_shared<Var>();
|
||||
save_mean_ = std::make_shared<Var>();
|
||||
save_var_ = std::make_shared<Var>();
|
||||
index_ = std::make_shared<Var>();
|
||||
}
|
||||
~ReplaceBNCastFusion() override = default;
|
||||
const BaseRef DefinePattern() const override;
|
||||
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
|
||||
|
||||
private:
|
||||
VarPtr x_;
|
||||
VarPtr scale_;
|
||||
VarPtr bias_;
|
||||
VarPtr mean_;
|
||||
VarPtr var_;
|
||||
VarPtr y_;
|
||||
VarPtr running_mean_;
|
||||
VarPtr running_var_;
|
||||
VarPtr save_mean_;
|
||||
VarPtr save_var_;
|
||||
VarPtr index_;
|
||||
};
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
|
|
@ -0,0 +1,88 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "ir/primitive.h"
|
||||
#include "utils/utils.h"
|
||||
#include "backend/optimizer/common/helper.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
const BaseRef ReplaceBNGradCast2Fusion::DefinePattern() const {
|
||||
VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_, x_, scale_, mean_, var_});
|
||||
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_});
|
||||
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
|
||||
return out_cast;
|
||||
}
|
||||
|
||||
const AnfNodePtr ReplaceBNGradCast2Fusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
|
||||
const EquivPtr &equiv) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
MS_EXCEPTION_IF_NULL(equiv);
|
||||
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
|
||||
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
|
||||
MS_EXCEPTION_IF_NULL(index_node);
|
||||
auto value_node = index_node->cast<ValueNodePtr>();
|
||||
MS_EXCEPTION_IF_NULL(value_node);
|
||||
int item_idx = GetValue<int>(value_node->value());
|
||||
if (item_idx != 0) {
|
||||
return nullptr;
|
||||
}
|
||||
auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
|
||||
|
||||
auto dy_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0);
|
||||
auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
|
||||
|
||||
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2);
|
||||
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3);
|
||||
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4);
|
||||
|
||||
MS_EXCEPTION_IF_NULL(fbn2g);
|
||||
MS_EXCEPTION_IF_NULL(dy_);
|
||||
MS_EXCEPTION_IF_NULL(scale);
|
||||
MS_EXCEPTION_IF_NULL(x_);
|
||||
MS_EXCEPTION_IF_NULL(mean);
|
||||
MS_EXCEPTION_IF_NULL(var);
|
||||
|
||||
auto manager = graph->manager();
|
||||
MS_EXCEPTION_IF_NULL(manager);
|
||||
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
|
||||
std::vector<TypeId> outputs_type;
|
||||
std::vector<std::vector<size_t>> outputs_shape;
|
||||
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
|
||||
for (size_t i = 0; i < output_num; i++) {
|
||||
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
|
||||
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
|
||||
}
|
||||
outputs_type[0] = AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0);
|
||||
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
|
||||
|
||||
outputs_type.clear();
|
||||
outputs_shape.clear();
|
||||
outputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0));
|
||||
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
|
||||
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
|
||||
|
||||
return tuple;
|
||||
}
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,54 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
|
||||
|
||||
#include <memory>
|
||||
#include "backend/optimizer/common/optimizer.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
class ReplaceBNGradCast2Fusion : public PatternProcessPass {
|
||||
public:
|
||||
explicit ReplaceBNGradCast2Fusion(bool multigraph = true) : PatternProcessPass("replace_grad_cast2", multigraph) {
|
||||
dy_ = std::make_shared<Var>();
|
||||
x_ = std::make_shared<Var>();
|
||||
scale_ = std::make_shared<Var>();
|
||||
mean_ = std::make_shared<Var>();
|
||||
var_ = std::make_shared<Var>();
|
||||
dx_ = std::make_shared<Var>();
|
||||
bn_scale_ = std::make_shared<Var>();
|
||||
bn_bias_ = std::make_shared<Var>();
|
||||
index_ = std::make_shared<Var>();
|
||||
}
|
||||
~ReplaceBNGradCast2Fusion() override = default;
|
||||
const BaseRef DefinePattern() const override;
|
||||
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
|
||||
|
||||
private:
|
||||
VarPtr dy_;
|
||||
VarPtr x_;
|
||||
VarPtr scale_;
|
||||
VarPtr mean_;
|
||||
VarPtr var_;
|
||||
VarPtr dx_;
|
||||
VarPtr bn_scale_;
|
||||
VarPtr bn_bias_;
|
||||
VarPtr index_;
|
||||
};
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
|
|
@ -0,0 +1,91 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "ir/primitive.h"
|
||||
#include "utils/utils.h"
|
||||
#include "backend/optimizer/common/helper.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
const BaseRef ReplaceBNGradCastFusion::DefinePattern() const {
|
||||
VectorRef dy_cast = VectorRef({prim::kPrimCast, dy_});
|
||||
VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_cast, x_, scale_, mean_, var_});
|
||||
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_});
|
||||
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
|
||||
return out_cast;
|
||||
}
|
||||
|
||||
const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
|
||||
const EquivPtr &equiv) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
MS_EXCEPTION_IF_NULL(equiv);
|
||||
|
||||
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
|
||||
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
|
||||
MS_EXCEPTION_IF_NULL(index_node);
|
||||
auto value_node = index_node->cast<ValueNodePtr>();
|
||||
MS_EXCEPTION_IF_NULL(value_node);
|
||||
int item_idx = GetValue<int>(value_node->value());
|
||||
if (item_idx != 0) {
|
||||
return nullptr;
|
||||
}
|
||||
auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
|
||||
|
||||
auto dy_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0);
|
||||
auto dy_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(dy_after), 0);
|
||||
auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
|
||||
|
||||
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2);
|
||||
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3);
|
||||
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4);
|
||||
|
||||
MS_EXCEPTION_IF_NULL(fbn2g);
|
||||
MS_EXCEPTION_IF_NULL(dy_after);
|
||||
MS_EXCEPTION_IF_NULL(dy_before);
|
||||
MS_EXCEPTION_IF_NULL(scale);
|
||||
MS_EXCEPTION_IF_NULL(x_);
|
||||
MS_EXCEPTION_IF_NULL(mean);
|
||||
MS_EXCEPTION_IF_NULL(var);
|
||||
|
||||
auto manager = graph->manager();
|
||||
MS_EXCEPTION_IF_NULL(manager);
|
||||
manager->Replace(utils::cast<CNodePtr>(dy_after), utils::cast<CNodePtr>(dy_before));
|
||||
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
|
||||
std::vector<TypeId> outputs_type;
|
||||
std::vector<std::vector<size_t>> outputs_shape;
|
||||
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
|
||||
for (size_t i = 0; i < output_num; i++) {
|
||||
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
|
||||
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
|
||||
}
|
||||
outputs_type[0] = kNumberTypeFloat16;
|
||||
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
|
||||
outputs_type.clear();
|
||||
outputs_shape.clear();
|
||||
outputs_type.push_back(kNumberTypeFloat16);
|
||||
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
|
||||
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
|
||||
return tuple;
|
||||
}
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,54 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
|
||||
|
||||
#include <memory>
|
||||
#include "backend/optimizer/common/optimizer.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
class ReplaceBNGradCastFusion : public PatternProcessPass {
|
||||
public:
|
||||
explicit ReplaceBNGradCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_grad_cast", multigraph) {
|
||||
dy_ = std::make_shared<Var>();
|
||||
x_ = std::make_shared<Var>();
|
||||
scale_ = std::make_shared<Var>();
|
||||
mean_ = std::make_shared<Var>();
|
||||
var_ = std::make_shared<Var>();
|
||||
dx_ = std::make_shared<Var>();
|
||||
bn_scale_ = std::make_shared<Var>();
|
||||
bn_bias_ = std::make_shared<Var>();
|
||||
index_ = std::make_shared<Var>();
|
||||
}
|
||||
~ReplaceBNGradCastFusion() override = default;
|
||||
const BaseRef DefinePattern() const override;
|
||||
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
|
||||
|
||||
private:
|
||||
VarPtr dy_;
|
||||
VarPtr x_;
|
||||
VarPtr scale_;
|
||||
VarPtr mean_;
|
||||
VarPtr var_;
|
||||
VarPtr dx_;
|
||||
VarPtr bn_scale_;
|
||||
VarPtr bn_bias_;
|
||||
VarPtr index_;
|
||||
};
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
|
|
@ -0,0 +1,63 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "ir/primitive.h"
|
||||
#include "utils/utils.h"
|
||||
#include "backend/optimizer/common/helper.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
const BaseRef ReplaceMomentumCastFusion::DefinePattern() const {
|
||||
VectorRef grad_cast = VectorRef({prim::kPrimCast, grad_});
|
||||
VectorRef momentum = VectorRef({prim::kPrimApplyMomentum, var_, acc_, lr_, grad_cast, mom_});
|
||||
return momentum;
|
||||
}
|
||||
|
||||
const AnfNodePtr ReplaceMomentumCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
|
||||
const EquivPtr &equiv) const {
|
||||
MS_EXCEPTION_IF_NULL(graph);
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
MS_EXCEPTION_IF_NULL(equiv);
|
||||
|
||||
auto grad_cast = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 3);
|
||||
auto grad = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(grad_cast), 0);
|
||||
MS_EXCEPTION_IF_NULL(grad_cast);
|
||||
MS_EXCEPTION_IF_NULL(grad);
|
||||
|
||||
auto manager = graph->manager();
|
||||
MS_EXCEPTION_IF_NULL(manager);
|
||||
manager->Replace(utils::cast<CNodePtr>(grad_cast), utils::cast<CNodePtr>(grad));
|
||||
std::vector<TypeId> outputs_type;
|
||||
std::vector<std::vector<size_t>> outputs_shape;
|
||||
auto output_num = AnfAlgo::GetOutputTensorNum(node);
|
||||
for (size_t i = 0; i < output_num; i++) {
|
||||
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(node, i));
|
||||
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(node, i));
|
||||
}
|
||||
outputs_type[3] = AnfAlgo::GetPrevNodeOutputInferDataType(grad_cast, 0);
|
||||
|
||||
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, node.get());
|
||||
|
||||
return node;
|
||||
}
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,46 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_
|
||||
|
||||
#include <memory>
|
||||
#include "backend/optimizer/common/optimizer.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
class ReplaceMomentumCastFusion : public PatternProcessPass {
|
||||
public:
|
||||
explicit ReplaceMomentumCastFusion(bool multigraph = true) : PatternProcessPass("replace_momentum_cast", multigraph) {
|
||||
var_ = std::make_shared<Var>();
|
||||
acc_ = std::make_shared<Var>();
|
||||
lr_ = std::make_shared<Var>();
|
||||
grad_ = std::make_shared<Var>();
|
||||
mom_ = std::make_shared<Var>();
|
||||
}
|
||||
~ReplaceMomentumCastFusion() override = default;
|
||||
const BaseRef DefinePattern() const override;
|
||||
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
|
||||
|
||||
private:
|
||||
VarPtr var_;
|
||||
VarPtr acc_;
|
||||
VarPtr lr_;
|
||||
VarPtr grad_;
|
||||
VarPtr mom_;
|
||||
};
|
||||
} // namespace opt
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_
|
|
@ -25,7 +25,8 @@
|
|||
namespace mindspore {
|
||||
namespace memreuse {
|
||||
enum RefCountType { kDynamicRefCount, kStaticRefCount };
|
||||
enum NodeType { NORMAL, SPECIAL };
|
||||
enum NodeType { COMMON_NODE, COMMUNICATION_NODE };
|
||||
enum KernelRefType { COMMON, REFNODE_OUTPUT, COMM_NOTREUSE, COMM_REUSE, SUMMARY };
|
||||
static constexpr int kInitIndex = -1;
|
||||
class KernelRefCount {
|
||||
public:
|
||||
|
@ -36,6 +37,7 @@ class KernelRefCount {
|
|||
size_t offset_;
|
||||
size_t size_;
|
||||
int index_;
|
||||
KernelRefType type_;
|
||||
// remember to reset offset
|
||||
KernelRefCount()
|
||||
: stream_id_(0),
|
||||
|
@ -44,6 +46,7 @@ class KernelRefCount {
|
|||
offset_(0),
|
||||
size_(0),
|
||||
index_(kInitIndex),
|
||||
type_(COMMON),
|
||||
reftype_(kStaticRefCount) {}
|
||||
~KernelRefCount() = default;
|
||||
void SetKernelRefCountInfo(int index, size_t size, RefCountType reftype);
|
||||
|
@ -65,7 +68,7 @@ class KernelDef {
|
|||
KernelMap inputs_;
|
||||
KernelMap outputs_;
|
||||
KernelMap wk_space_;
|
||||
NodeType dirty = NORMAL;
|
||||
NodeType type_ = COMMON_NODE;
|
||||
KernelDef() = default;
|
||||
~KernelDef() = default;
|
||||
void set_input_refs(const KernelRefCountPtrList &kernelRefPtrList) { input_refs_ = kernelRefPtrList; }
|
||||
|
|
|
@ -46,6 +46,8 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() {
|
|||
if (iter == kernel_output_refs_.end()) {
|
||||
auto output_sizes = kernel_mod->GetOutputSizeList();
|
||||
KernelRefCountPtrList kernel_refs;
|
||||
bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel_cnode);
|
||||
size_t output_index = 0;
|
||||
for (auto size : output_sizes) {
|
||||
total_dy_size_ += size;
|
||||
// do not MallocDynamicMem just record this
|
||||
|
@ -54,9 +56,20 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() {
|
|||
auto curr_stream_id = AnfAlgo::GetStreamId(kernel_cnode);
|
||||
kernel_ref->stream_id_ = curr_stream_id;
|
||||
kernel_ref->SetKernelRefCountInfo(index, size, kDynamicRefCount);
|
||||
if (is_comm_op) {
|
||||
kernel_ref->type_ = COMM_REUSE;
|
||||
} else {
|
||||
session::AnfWithOutIndex out_pair(kernel_cnode, output_index);
|
||||
if (graph_->IsInRefOutputMap(out_pair)) {
|
||||
kernel_ref->type_ = REFNODE_OUTPUT;
|
||||
} else {
|
||||
kernel_ref->type_ = COMMON;
|
||||
}
|
||||
}
|
||||
kernel_refs.push_back(kernel_ref);
|
||||
kernel_out_ref_num++;
|
||||
total_refs_list_.push_back(kernel_ref);
|
||||
output_index++;
|
||||
}
|
||||
if (!kernel_refs.empty()) {
|
||||
kernel_output_refs_[key] = kernel_refs;
|
||||
|
@ -155,9 +168,19 @@ void MemReuseUtil::SetInputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr
|
|||
MS_EXCEPTION_IF_NULL(kernel);
|
||||
MS_EXCEPTION_IF_NULL(kernel_def_ptr);
|
||||
auto key = kernel.get();
|
||||
for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
|
||||
bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel);
|
||||
size_t input_tensor_num = AnfAlgo::GetInputTensorNum(kernel);
|
||||
for (size_t i = 0; i < input_tensor_num; ++i) {
|
||||
auto ref_ptr = GetKernelInputRef(kernel, i);
|
||||
if (ref_ptr != nullptr) {
|
||||
if (is_comm_op) {
|
||||
if (input_tensor_num == 1) {
|
||||
ref_ptr->type_ = COMM_REUSE;
|
||||
} else {
|
||||
ref_ptr->type_ = COMM_NOTREUSE;
|
||||
}
|
||||
}
|
||||
|
||||
if (ref_ptr->reftype() == kStaticRefCount) {
|
||||
continue;
|
||||
} else if (ref_ptr->reftype() == kDynamicRefCount) {
|
||||
|
@ -258,6 +281,11 @@ void MemReuseUtil::SetKernelDefMap() {
|
|||
auto key = kernel.get();
|
||||
kernel_def_ptr->set_input_refs(kernel_def_ptr->inputs_[key]);
|
||||
kernel_def_ptr->set_output_refs(kernel_def_ptr->outputs_[key]);
|
||||
if (AnfAlgo::IsCommunicationOp(kernel)) {
|
||||
kernel_def_ptr->type_ = COMMUNICATION_NODE;
|
||||
} else {
|
||||
kernel_def_ptr->type_ = COMMON_NODE;
|
||||
}
|
||||
kernel_def_ptr_list_.push_back(kernel_def_ptr);
|
||||
kernel_map_[key] = kernel_def_ptr;
|
||||
}
|
||||
|
@ -337,6 +365,7 @@ void MemReuseUtil::SetSummaryNodesRefCount() {
|
|||
KernelRefCountPtr kernel_ref = kernel_output_refs_[node.get()][index];
|
||||
kernel_ref->ref_count_ = kMaxRefCount;
|
||||
kernel_ref->ref_count_dynamic_use_ = kMaxRefCount;
|
||||
kernel_ref->type_ = SUMMARY;
|
||||
total_summary_size += kernel_ref->size_;
|
||||
MS_LOG(INFO) << "Set summary node's ref count, node: " << node->fullname_with_scope() << " index: " << index;
|
||||
} else {
|
||||
|
|
|
@ -83,6 +83,7 @@ class MemReuseUtil {
|
|||
void set_mem_base(uint8_t *mem_base) { mem_base_ = mem_base; }
|
||||
uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
|
||||
uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;
|
||||
bool is_all_nop_node() const { return is_all_nop_node_; }
|
||||
|
||||
private:
|
||||
int util_index_;
|
||||
|
|
|
@ -33,11 +33,11 @@ void BestFitMemReuse::InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr) {
|
|||
set_op_ptr_list(mem_reuse_util_ptr->kernel_def_ptr_list());
|
||||
// check info Correctness
|
||||
for (auto &tensor : tensor_ptr_list_) {
|
||||
tensor->size_ = AlignMemorySize(tensor->size_);
|
||||
tensor->size_ = AlignCommonMemorySize(tensor->size_);
|
||||
}
|
||||
// align wk size to 512 && refcount == 1
|
||||
for (auto &wk : wk_tensor_list_) {
|
||||
wk->size_ = AlignMemorySize(wk->size_);
wk->size_ = AlignCommonMemorySize(wk->size_);
wk->ref_count_ = 1;
}
#ifdef ENABLE_D
@ -135,11 +135,23 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
return false;
}

void BestFitMemReuse::AssignNodeOutputOffset() {
void BestFitMemReuse::AssignCommonNodeOutputOffset() {
MS_EXCEPTION_IF_NULL(current_kernel_);
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (tensor_desc->type_ == REFNODE_OUTPUT) {
total_refoutput_size += tensor_desc->size_;
continue;
} else if (tensor_desc->type_ == COMM_NOTREUSE) {
total_comm_not_reuse_size += tensor_desc->size_;
} else if (tensor_desc->type_ == COMM_REUSE) {
// get align size for communication op's single input
tensor_desc->size_ = AlignCommunicationMemorySize(tensor_desc->size_);
total_comm_reuse_size += tensor_desc->size_;
}

auto reusable_membuf_map = GetReusableMembufMap(tensor_desc->size_);
if (!reusable_membuf_map.empty()) {
auto membuf_index = reusable_membuf_map.begin()->second;
@ -152,6 +164,93 @@ void BestFitMemReuse::AssignNodeOutputOffset() {
MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
}
// skip the left align border so the communication op's single input can be used
if (tensor_desc->type_ == COMM_REUSE) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
}
}

void BestFitMemReuse::AssignCommunicationNodeOutputOffset() {
size_t total_kernel_output_size = 0;
size_t output_num = 0;
// get all output size
MS_EXCEPTION_IF_NULL(current_kernel_);
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (tensor_desc->type_ == COMM_REUSE) {
total_comm_reuse_size += tensor_desc->size_;
total_comm_output_reuse_size += tensor_desc->size_;
total_kernel_output_size += tensor_desc->size_;
} else {
MS_LOG(ERROR) << "All outputs of a communication op should be memory reused, Kernel:"
<< current_kernel_->scope_full_name();
continue;
}
}
total_kernel_output_size = AlignCommunicationMemorySize(total_kernel_output_size);

// add a left align border to the first output and a right align border to the last output to allocate the border memory
size_t output_index = 0;
auto output_ref_indexes = current_kernel_->GetOutputRefIndexs();
for (auto &tensor_idx : output_ref_indexes) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (output_index == 0 || output_index == output_num - 1) {
tensor_desc->size_ += kDefaultMemAlignSize;
}

if ((output_index == 0) && (output_ref_indexes.size() == 1)) {
// add right align border for single output
tensor_desc->size_ += kDefaultMemAlignSize;
}

output_index++;
}

auto reusable_membuf_map = GetReusableMembufMap(total_kernel_output_size);
if (!reusable_membuf_map.empty()) {
auto membuf_index = reusable_membuf_map.begin()->second;
output_index = 0;
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
ReuseExistMembuf(tensor_desc.get(), membuf_index + output_index, kDynamicMem);
// skip the left align border so the communication op's first output can be used
if (output_index == 0) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
output_index++;
}
} else {
// no membuf can be reused, add a new membuf at the end of membuf_ptr_list
output_index = 0;
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
AddNewMembufPtr(tensor_desc.get(), kDynamicMem);
// skip the align border offset so the first output can be used
if (output_index == 0) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
output_index++;
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
}
}
}

void BestFitMemReuse::AssignNodeOutputOffset() {
if (current_kernel_->type_ == COMMUNICATION_NODE) {
AssignCommunicationNodeOutputOffset();
} else {
AssignCommonNodeOutputOffset();
}
}
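Note: the communication-node path above packs all outputs of one kernel into a single block, pads that block with an alignment border on each side, and then makes the first output's offset skip the left border. A minimal standalone sketch of that layout, not the BestFitMemReuse code itself, assuming kDefaultMemAlignSize is the 512 bytes named in the comments; the output sizes below are made up:

#include <cstddef>
#include <iostream>
#include <vector>

constexpr size_t kAlign = 512;  // assumed value of kDefaultMemAlignSize

// Round a size up to the next multiple of the alignment unit.
size_t AlignUp(size_t size) { return (size + kAlign - 1) / kAlign * kAlign; }

int main() {
  std::vector<size_t> output_sizes = {1000, 2048, 300};  // hypothetical outputs of one communication kernel
  size_t offset = kAlign;  // skip the left align border, as the first output does above
  for (size_t i = 0; i < output_sizes.size(); ++i) {
    std::cout << "output " << i << " starts at offset " << offset << '\n';
    offset += AlignUp(output_sizes[i]);
  }
  size_t total = offset + kAlign;  // right align border after the last output
  std::cout << "whole block: " << total << " bytes\n";  // 512 + (1024 + 2048 + 512) + 512 = 4608
  return 0;
}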

@ -319,11 +418,17 @@ void BestFitMemReuse::ReleaseMembuf(size_t tensor_index, int flag) {
}
}

size_t BestFitMemReuse::AlignMemorySize(size_t size) const {
size_t BestFitMemReuse::AlignCommonMemorySize(size_t size) const {
// memory size 512 align
return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}

size_t BestFitMemReuse::AlignCommunicationMemorySize(size_t size) const {
// memory size 512 align and add communication memory: left align border memory - data - right align border memory
return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
kDefaultMemAlignSize;
}
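A quick standalone check of the arithmetic in the two helpers above, taking kDefaultMemAlignSize as 512 per the comment; the value of kAttAlignSize is not shown in this hunk, so 32 below is only a placeholder:

#include <cstddef>
#include <iostream>

constexpr size_t kDefaultMemAlignSize = 512;
constexpr size_t kAttAlignSize = 32;  // placeholder, the real constant is defined elsewhere

// Mirrors AlignCommonMemorySize: add the pads, then truncate to a 512 multiple.
size_t AlignCommon(size_t size) {
  return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}

// Mirrors AlignCommunicationMemorySize: 512 border + rounded-up data + 512 border.
size_t AlignCommunication(size_t size) {
  return kDefaultMemAlignSize +
         (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize + kDefaultMemAlignSize;
}

int main() {
  std::cout << AlignCommon(1000) << '\n';         // (1000 + 512 + 32) / 512 * 512 = 1536
  std::cout << AlignCommunication(1000) << '\n';  // 512 + 1024 + 512 = 2048
  return 0;
}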

size_t BestFitMemReuse::GetAllocatedSize() {
size_t AllocatedSize = kTotalSize;
if (membuf_ptr_list_.empty()) {
@ -412,6 +517,9 @@ void BestFitMemReuse::Reuse(const MemReuseUtil *mem_reuse_util_ptr) {
++op_num;
#endif
}
MS_LOG(INFO) << "Special Tensor total size: RefOutput: " << total_refoutput_size
<< " CommReuse: " << total_comm_reuse_size << " CommOutputReuse: " << total_comm_output_reuse_size
<< " CommNotReuse: " << total_comm_not_reuse_size;
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().ExportMembufInfoIR();
MemReuseChecker::GetInstance().ExportAddNewMmebufIR();

@ -74,6 +74,14 @@ class BestFitMemReuse {
* Assign output tensor memory offset of current kernel
*/
void AssignNodeOutputOffset();
/**
* Assign output tensor memory offset of common kernel
*/
void AssignCommonNodeOutputOffset();
/**
* Assign output tensor memory offset of communication kernel
*/
void AssignCommunicationNodeOutputOffset();
/**
* Update input tensor's status of current kernel, and the status of membuf used by current kernel
*/
@ -110,8 +118,10 @@ class BestFitMemReuse {
void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag);
// Merge unused membuf
void ReleaseMembuf(size_t tensor_index, int flag);
// Memory address alignment 512
size_t AlignMemorySize(size_t size) const;
// Memory address alignment for common memory
size_t AlignCommonMemorySize(size_t size) const;
// Memory address alignment for communication used memory
size_t AlignCommunicationMemorySize(size_t size) const;
int GetRealIndex(size_t index, int flag = kDynamicMem) const;
size_t GetTensorIndex(int index) const;
size_t GetWorkspaceIndex(int index) const;
@ -153,6 +163,10 @@ class BestFitMemReuse {
// kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def
std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_;
std::vector<std::vector<uint32_t>> stream_groups_;
size_t total_refoutput_size{0};
size_t total_comm_reuse_size{0};
size_t total_comm_output_reuse_size{0};
size_t total_comm_not_reuse_size{0};
};
} // namespace memreuse
} // namespace mindspore

@ -170,12 +170,14 @@ void MemReuseChecker::CheckMemReuseIR(const KernelRefCountPtrList &total_refs_li
ofs << "all_tensor_refs:\n";
ofs << "index:"
<< "\tsize:"
<< "\trefcount:\n";
<< "\trefcount:"
<< "\ttype:\n";
for (auto &ref : total_refs_list) {
ofs << "%" << ref->index_ << "T"
<< "\t"
<< "#" << ref->size_ << "S"
<< "\t" << ref->ref_count_ << "C"
<< "\t" << ref->type_ << "t"
<< "\n";
}
ofs << "kernel_def exc_order:\n";
@ -241,7 +243,7 @@ bool MemReuseChecker::CheckGraphOutputAssigned(const session::KernelGraph *graph
void MemReuseChecker::ExportMemOpIr(const KernelDef *def, std::ofstream &ofs, int def_idx) {
auto scope_name = def->scope_full_name();
std::string split_name = GetSplitName(scope_name);
ofs << "$" << def_idx << "\t" << split_name << "\t";
ofs << "$" << def_idx << "\t" << split_name << "\t" << static_cast<int>(def->type_) << "\t";
ofs << "inputs[";
for (auto &in : def->inputs_) {
for (auto &in_ref : in.second) {

@ -100,7 +100,10 @@ bool CommunicationOpFusion::GetSplitSegments(const CommunicationOpInfo &communic

auto parallel_context = parallel::ParallelContext::GetInstance();
MS_EXCEPTION_IF_NULL(parallel_context);
const auto &split_indices = parallel_context->GetAllReduceFusionSplitIndices(group);
std::vector<uint32_t> split_indices;
if (!parallel_context->enable_parallel_optimizer()) {
split_indices = parallel_context->GetAllReduceFusionSplitIndices(group);
}

size_t segments = 0;
if (split_indices.size() != 0) {

@ -71,7 +71,6 @@ bool ReplaceNodeByProxy::Run(const FuncGraphPtr &func_graph) {

AbstractBasePtrList abstract_list;
AnfAlgo::CopyNodeAttr(kAttrPsKey, cnode, proxy_node);
AnfAlgo::CopyNodeAttr("reduce_scatter_flag", cnode, proxy_node);
AnfAlgo::CopyNodeAttr("offset", cnode, proxy_node);
abstract_list.push_back(cnode->abstract());
auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(abstract_list);

@ -18,9 +18,12 @@
#include <utility>
#include <memory>
#include <algorithm>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/union_find_set.h"
#include "runtime/device/ascend/ascend_label_assign.h"
#include "utils/context/ms_context.h"
#include "debug/anf_ir_dump.h"

static constexpr size_t kCNodePrim = 0;
static constexpr size_t kCNodeCallArg = 1;
@ -104,7 +107,7 @@ static void ReuseParameter(NotNull<KernelGraphPtr> root_kg,

static CNodePtr GetNextRealKernel(const std::vector<CNodePtr> &list, size_t start) {
for (size_t i = start; i < list.size() - 1; ++i) {
if (!IsPrimitiveCNode(list[i], prim::kPrimPartial) && AnfAlgo::IsRealKernel(list[i])) {
if (AnfAlgo::IsRealKernel(list[i])) {
return list[i];
}
}
@ -168,18 +171,43 @@ static void EraseNodeFromExecOrder(const AnfNodePtr &node, const NotNull<std::ve
exec_order->erase(exec_iter);
}

void AscendControlParser::AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph,
const NotNull<std::set<KernelGraphPtr> *> memo) {
if (memo->find(graph) != memo->end()) {
return;
}
memo->insert(graph.get());
const std::vector<std::shared_ptr<KernelGraph>> &child_graph_order = graph->child_graph_order();
if (child_graph_order.empty()) {
return;
}

std::vector<AnfNodePtr> depend_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimPartial->name()))};
for (auto &cg : child_graph_order) {
MS_EXCEPTION_IF_NULL(cg);
auto fg = cg->cast<FuncGraphPtr>();
MS_EXCEPTION_IF_NULL(fg);
depend_inputs.emplace_back(NewValueNode(fg));
AttachChildGraphToReturnNode(NOT_NULL(cg), memo);
}
auto child_graphs = graph->NewCNode(depend_inputs);
InsertDependToGraph(graph, NOT_NULL(child_graphs));
}

void AscendControlParser::LinkGraph(NotNull<KernelGraphPtr> kg) {
std::set<KernelGraphPtr> memo;
std::vector<std::pair<AnfNodePtr, AnfNodePtr>> link_list;
// Insert Assign
ChildGraphDataAssign(kg, NOT_NULL(&link_list), NOT_NULL(&memo));
memo.clear();
// Reuse Parameter
ReuseParameter(kg, link_list);
// replace call by label goto / label switch
memo.clear();
(void)ProcessKernelGraph(kg, nullptr, nullptr, NOT_NULL(&memo));
memo.clear();
// assign label resource
device::ascend::AscendLabelAssign::GetInstance().AssignLabel(kg);
AttachChildGraphToReturnNode(kg, NOT_NULL(&memo));
}

void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
@ -248,10 +276,14 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
}
MS_LOG(INFO) << "Erase " << assign_node->DebugString(5);
EraseNodeFromExecOrder(assign_node, NOT_NULL(&exec_order));

auto source = AnfAlgo::VisitKernelWithReturnType(assign_node->input(kCNodeAssignSource), 0).first;
parameter_count.AddReadCount(source, -1);
auto source = assign_node->input(kCNodeAssignSource);
MS_EXCEPTION_IF_NULL(source);
auto visit_source = AnfAlgo::VisitKernelWithReturnType(source, 0).first;
parameter_count.AddWriteCount(para, -1);
parameter_count.AddReadCount(para, -1);
if (visit_source->isa<Parameter>()) {
parameter_count.AddReadCount(visit_source, read - 1);
}
for (auto &node : all_nodes) {
for (size_t i = 0; i < node->size(); ++i) {
if (node->input(i) == para) {
@ -260,8 +292,6 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
}
}
}
parameter_count.AddReadCount(source, 1);
parameter_count.AddReadCount(para, -1);
}
root_graph->set_execution_order(exec_order);
}
@ -318,6 +348,17 @@ void AscendControlParser::ExecutorValidate(NotNull<KernelGraphPtr> root_graph) {
(void)RecurseGraph(root_graph, NOT_NULL(&memo));
EraseParameter(root_graph, memo);
EraseLabel(root_graph);

auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto save_graphs_path = context_ptr->save_graphs_path();
if (save_graphs_path.empty()) {
save_graphs_path = ".";
}
if (context_ptr->save_graphs_flag()) {
std::string file_path = save_graphs_path + "/after_erase_label_and_parameter.ir";
DumpIR(file_path, root_graph.get());
}
}

std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> AscendControlParser::ParseCallNode(

@ -66,7 +66,8 @@ class AscendControlParser {
static AnfNodePtr InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to);
static std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> ParseCallNode(NotNull<CNodePtr> call_node);
static std::tuple<KernelGraphPtr, std::vector<AnfNodePtr>> ParsePartial(NotNull<AnfNodePtr> node);

static void AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph,
const NotNull<std::set<KernelGraphPtr> *> memo);
// root graph order
static bool CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cnode,
NotNull<KernelGraphPtr> graph);

@ -353,6 +353,10 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) {
RootGraphExecutorValidate(NOT_NULL(root_graph));
// adjust kernel
AdjustKernel(root_graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Assign parameter keys.
AssignParamKey(root_graph);
#endif
// assign stream
AssignStream(NOT_NULL(root_graph));
// insert profiling point
@ -511,6 +515,12 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
}
// load input data from user input
LoadInputData(kernel_graph, inputs);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
#endif
// convert inputs to model
predictmodel::StepConvertWeight(inputs);
{

@ -16,6 +16,7 @@

#include "backend/session/cpu_session.h"
#include <algorithm>
#include <sstream>
#include "ir/tensor.h"
#include "ir/anf.h"
#include "backend/kernel_compiler/kernel.h"
@ -25,9 +26,15 @@
#include "predict/predict.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "runtime/device/cpu/kernel_select_cpu.h"
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"
#include "backend/optimizer/pass/replace_node_by_proxy.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
#include "frontend/parallel/ps/util.h"
#endif

namespace mindspore {
namespace session {
@ -49,12 +56,29 @@ ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf,
return new_parameter;
}

void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
std::string pass_name = "replace_node_by_proxy";
pass_name.append(std::to_string(graph_sum_));
pm->AddPass(std::make_shared<opt::ReplaceNodeByProxy>(pass_name));
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
}

GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
auto graph_id = graph_sum_;
auto graph = ConstructKernelGraph(lst, outputs);
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Set kernel info";
SetKernelInfo(graph.get());
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
AssignParamKey(graph);
if (parallel::ps::Util::IsRoleOfWorker()) {
Optimize(graph);
}
#endif
predictmodel::StepConvertGraph(graph);
MS_LOG(INFO) << "Build kernel";
BuildKernel(graph.get());
@ -66,6 +90,12 @@ GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
auto &kernel_graph = graphs_[graph_id];
MS_EXCEPTION_IF_NULL(kernel_graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
#endif
MS_LOG(INFO) << "Bind input output address";
std::vector<tensor::TensorPtr> need_sync_outputs;
runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs);
@ -119,6 +149,48 @@ void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) {
}
}

namespace {
void KernelNotSupportException(const AnfNodePtr &kernel_node) {
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
std::stringstream operator_info;
operator_info << "Operator[" << kernel_name << "] ";
auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_node->kernel_info());
if (kernel_info == nullptr) {
operator_info << "is not supported.";
MS_LOG(EXCEPTION) << operator_info.str();
}
auto kernel_build_Info = kernel_info->select_kernel_build_info();
if (kernel_build_Info == nullptr) {
operator_info << "is not supported.";
MS_LOG(EXCEPTION) << operator_info.str();
}
size_t input_num = kernel_build_Info->GetInputNum();
if (input_num > 0) {
operator_info << " input(";
for (size_t i = 0; i < input_num; ++i) {
operator_info << TypeIdLabel(kernel_build_Info->GetInputDeviceType(i));
if (i != input_num - 1) {
operator_info << ",";
}
}
operator_info << ") ";
}
size_t output_num = kernel_build_Info->GetOutputNum();
if (output_num > 0) {
operator_info << "output(";
for (size_t i = 0; i < output_num; ++i) {
operator_info << TypeIdLabel(kernel_build_Info->GetOutputDeviceType(i));
if (i != kernel_build_Info->GetOutputNum() - 1) {
operator_info << ",";
}
}
operator_info << ") ";
}
operator_info << "is not supported.";
MS_LOG(EXCEPTION) << operator_info.str();
}
} // namespace

void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
MS_EXCEPTION_IF_NULL(kernel_graph);
auto &kernel_nodes = kernel_graph->execution_order();
@ -129,7 +201,7 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
std::shared_ptr<kernel::CPUKernel> cpu_kernel =
kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node);
if (cpu_kernel == nullptr) {
MS_LOG(EXCEPTION) << "Operator[" << kernel_name << "] is not support.";
KernelNotSupportException(kernel_node);
}
cpu_kernel->Init(kernel_node);
AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get());

@ -37,6 +37,7 @@ class CPUSession : public SessionBasic {

protected:
ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) override;
void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph);

private:
void SetKernelInfo(const KernelGraph *kernel_graph);

@ -25,6 +25,11 @@
#include "backend/optimizer/pass/getitem_tuple.h"
#include "backend/optimizer/gpu/adam_weight_decay_fusion.h"
#include "backend/optimizer/gpu/adam_fusion.h"
#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
#include "backend/optimizer/gpu/replace_addn_fusion.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "predict/predict.h"
#include "common/utils.h"
@ -59,6 +64,11 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
auto pm = std::make_shared<opt::PassManager>();
pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>());
pm->AddPass(std::make_shared<opt::AdamFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNGradCast2Fusion>());
pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
@ -167,6 +177,10 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
Optimize(graph);
// Select kernel build info
SelectKernel(graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Assign parameter keys.
AssignParamKey(graph);
#endif
// Convert kernel Graph to model
predictmodel::StepConvertGraph(graph);
// Start gpu kernel runtime
@ -204,6 +218,10 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
auto &kernel_graph = graphs_[graph_id];
// Load input data from user input
LoadInputData(kernel_graph, inputs);
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
MS_EXCEPTION_IF_NULL(kernel_graph);
// Convert inputs to model
predictmodel::StepConvertWeight(inputs);

@ -307,7 +307,7 @@ CNodePtr KernelGraph::NewCNode(const std::vector<AnfNodePtr> &inputs) {
if (inputs.size() == 1 || !feature_map_input_indexs.empty()) {
kernel_info->SetFeatureMapFlag(true);
}
if (AnfAlgo::IsRealCNodeKernel(cnode)) {
if (AnfAlgo::IsRealKernel(cnode)) {
AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode);
AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode);
}
@ -929,10 +929,15 @@ void KernelGraph::AddInternalOutput(const AnfNodePtr &front_node, const AnfNodeP
}
MS_LOG(INFO) << "Add internal node " << node->DebugString() << " with front node " << front_node->DebugString();
front_to_internal_outputs_map_[front_node] = node;
internal_outputs_to_front_map_[node] = front_node;
int output_idx = 0;
if (AnfAlgo::CheckPrimitiveType(front_node, prim::kPrimTupleGetItem)) {
output_idx = AnfAlgo::GetTupleGetItemOutIndex(front_node->cast<CNodePtr>());
}
internal_outputs_to_front_map_[node][output_idx] = front_node;
}

void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node) {
void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx,
int dst_output_idx) {
if (new_node == nullptr || node == nullptr) {
MS_LOG(INFO) << "New node or node is nullptr";
return;
@ -947,9 +952,30 @@ void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr
return;
}
MS_LOG(INFO) << "Replace internal node " << node->DebugString() << " To " << new_node->DebugString();
internal_outputs_to_front_map_[new_node] = iter->second;
front_to_internal_outputs_map_[iter->second] = new_node;
internal_outputs_to_front_map_.erase(iter);
auto &front_nodes = iter->second;
// Move all front nodes to new node mapping
if (src_output_idx == -1) {
internal_outputs_to_front_map_[new_node] = front_nodes;
for (const auto &front_node_iter : front_nodes) {
front_to_internal_outputs_map_[front_node_iter.second] = new_node;
}
internal_outputs_to_front_map_.erase(iter);
return;
}
// Move specified front node to new node mapping
int index = SizeToInt(src_output_idx);
auto front_node_iter = front_nodes.find(index);
if (front_node_iter == front_nodes.end()) {
MS_LOG(INFO) << "The output " << src_output_idx << " of node " << node->DebugString() << " is not an internal node";
return;
}
auto front_node = front_node_iter->second;
internal_outputs_to_front_map_[new_node][dst_output_idx] = front_node;
front_to_internal_outputs_map_[front_node] = new_node;
front_nodes.erase(index);
if (front_nodes.empty()) {
internal_outputs_to_front_map_.erase(iter);
}
}
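With internal_outputs_to_front_map_ now keyed per output index (see the header change further down in this diff), ReplaceInternalOutput either moves the whole per-node map (src_output_idx == -1) or relocates a single index into the new node's map and drops the old entry once it is empty. A small standalone sketch of that bookkeeping, using plain strings instead of AnfNodePtr:

#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

using Node = std::string;  // stand-in for AnfNodePtr
using OutputMap = std::unordered_map<Node, std::unordered_map<int, Node>>;

// Move the mapping for one output index (or all of them when src_idx == -1)
// from `node` to `new_node`, erasing `node` once nothing is left.
void Replace(OutputMap *map, const Node &node, const Node &new_node, int src_idx, int dst_idx) {
  auto iter = map->find(node);
  if (iter == map->end()) {
    return;
  }
  if (src_idx == -1) {
    auto moved = iter->second;
    map->erase(iter);
    (*map)[new_node] = std::move(moved);
    return;
  }
  auto &front_nodes = iter->second;
  auto front_iter = front_nodes.find(src_idx);
  if (front_iter == front_nodes.end()) {
    return;
  }
  Node front = front_iter->second;
  front_nodes.erase(front_iter);
  if (front_nodes.empty()) {
    map->erase(iter);  // no insertion has happened yet, so the iterator is still valid
  }
  (*map)[new_node][dst_idx] = front;
}

int main() {
  OutputMap map;
  map["fused_kernel"] = {{0, "front_a"}, {1, "front_b"}};  // made-up node names
  Replace(&map, "fused_kernel", "tuple_getitem_1", /*src_idx=*/1, /*dst_idx=*/0);
  for (const auto &entry : map) {
    for (const auto &out : entry.second) {
      std::cout << entry.first << "[" << out.first << "] -> " << out.second << '\n';
    }
  }
  return 0;
}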

AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const {
@ -967,14 +993,6 @@ bool KernelGraph::IsInternalOutput(const AnfNodePtr &node) const {
return false;
}

AnfNodePtr KernelGraph::GetFrontNodeByInternalOutput(const AnfNodePtr &node) const {
auto iter = internal_outputs_to_front_map_.find(node);
if (iter != internal_outputs_to_front_map_.end()) {
return iter->second;
}
return nullptr;
}

void KernelGraph::AddFinalOutputKernel(const AnfNodePtr &node) {
if (node == nullptr) {
return;

@ -148,10 +148,10 @@ class KernelGraph : public FuncGraph {
const std::map<std::string, std::pair<AnfNodePtr, int>> &summary_nodes() const { return summary_nodes_; }
void set_summary_nodes(const std::map<std::string, std::pair<AnfNodePtr, int>> &nodes) { summary_nodes_ = nodes; }
void AddInternalOutput(const AnfNodePtr &front_node, const AnfNodePtr &node);
void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node);
void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx = -1,
int dst_output_idx = -1);
AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const;
bool IsInternalOutput(const AnfNodePtr &node) const;
AnfNodePtr GetFrontNodeByInternalOutput(const AnfNodePtr &node) const;
void AddFinalOutputKernel(const AnfNodePtr &node);
bool IsFinalOutputKernel(const AnfNodePtr &node) const;
uint32_t current_epoch() const { return current_epoch_; }
@ -223,7 +223,7 @@ class KernelGraph : public FuncGraph {
CNodePtr end_goto_;
bool null_output_;
std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_;
std::unordered_map<AnfNodePtr, AnfNodePtr> internal_outputs_to_front_map_;
std::unordered_map<AnfNodePtr, std::unordered_map<int, AnfNodePtr>> internal_outputs_to_front_map_;
std::set<AnfNodePtr> final_output_kernels_;
uint32_t current_epoch_;
};

@ -35,6 +35,11 @@
#include "ir/dtype.h"
#include "ir/anf.h"
#include "ir/func_graph_cloner.h"
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
#include "frontend/parallel/ps/worker.h"
#include "frontend/parallel/ps/common.h"
#include "frontend/parallel/ps/util.h"
#endif

namespace mindspore {
namespace session {
@ -295,7 +300,11 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const
MS_LOG(INFO) << "No corresponding internal output for output node";
return;
}
auto real_kernel = AnfAlgo::VisitKernel(ref_node, 0);
size_t output_idx = 0;
if (AnfAlgo::CheckPrimitiveType(out_node, prim::kPrimTupleGetItem)) {
output_idx = AnfAlgo::GetTupleGetItemOutIndex(out_node->cast<CNodePtr>());
}
auto real_kernel = AnfAlgo::VisitKernel(ref_node, output_idx);
auto ref_real_node = real_kernel.first;
auto ref_real_node_index = real_kernel.second;
if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node) &&
@ -320,6 +329,7 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const
builder.SetOutputsFormat({format});
d_kernel_info->set_select_kernel_build_info(builder.Build());
AnfAlgo::SetOutputAddr(address, 0, parameter.get());
AnfAlgo::SetOutputInferTypeAndShape({type}, {AnfAlgo::GetOutputInferShape(parameter, 0)}, parameter.get());
}
}

@ -973,6 +983,16 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std:
bool internal_output = true;
std::string kernel_target = GetCNodeTarget(front_real_kernel.first);
for (auto user : users) {
auto cnode = user.first->cast<CNodePtr>();
if (cnode == nullptr) {
internal_output = false;
break;
}
auto prim = cnode->input(kAnfPrimitiveIndex);
if (prim == nullptr || !prim->isa<ValueNode>()) {
internal_output = false;
break;
}
if (!AnfAlgo::IsRealKernel(user.first) || kernel_target != GetCNodeTarget(user.first)) {
internal_output = false;
break;
@ -1097,5 +1117,92 @@ KernelGraphPtr SessionBasic::NewKernelGraph() {
graphs_[graph_sum_++] = graph;
return graph;
}

AnfNodePtr SessionBasic::FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list) {
MS_EXCEPTION_IF_NULL(push_node);
for (auto &node : node_list) {
if (node != nullptr && node->isa<CNode>()) {
for (auto input : node->cast<CNodePtr>()->inputs()) {
if (push_node == AnfAlgo::VisitKernel(input, 0).first) {
if (AnfAlgo::GetCNodeName(node) != kPullOpName) {
MS_LOG(EXCEPTION) << "The edge between Push and Pull node is invalid.";
}
return node;
}
}
}
}
return nullptr;
}

#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
void SessionBasic::AssignParamKey(const KernelGraphPtr &kernel_graph) {
if (!parallel::ps::Util::IsRoleOfWorker()) {
MS_LOG(INFO) << "Not parameter server mode.";
return;
}
MS_EXCEPTION_IF_NULL(kernel_graph);
std::vector<AnfNodePtr> node_list = TopoSort(kernel_graph->get_return());
for (auto &node : node_list) {
if (node != nullptr && node->isa<CNode>()) {
// Assign key for forward kernel EmbeddingLookup.
// The key will be assigned to the embedding table and the Push kernel as well.
if (AnfAlgo::GetCNodeName(node) == kEmbeddingLookupOpName) {
size_t embedding_table_idx = 0;
auto embedding_table = AnfAlgo::GetInputNode(node->cast<CNodePtr>(), embedding_table_idx);
size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(embedding_table->fullname_with_scope());
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node);
} else if (AnfAlgo::GetCNodeName(node) == kPushOpName) {
auto pull_node = FindPullNode(node, node_list);
if (!pull_node) {
MS_LOG(EXCEPTION) << "Assigning parameter key failed: can't find Pull node of the Push node.";
}

// Second input of Pull node is the trainable parameter.
size_t parameter_index = 1;
auto parameter_node = AnfAlgo::GetInputNode(pull_node->cast<CNodePtr>(), parameter_index);
size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(parameter_node->fullname_with_scope());
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node);
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), pull_node);

std::string optimizer_name = AnfAlgo::GetNodeAttr<std::string>(node, kAttrOptimizerType);
parallel::ps::Worker<float>::GetInstance().SetKeyOptimId(key, optimizer_name);
}
}
}
}

void SessionBasic::InitPSParamAndOptim(const KernelGraphPtr &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs_const) {
if (!parallel::ps::Util::IsRoleOfWorker()) {
return;
}
std::vector<tensor::TensorPtr> inputs(inputs_const);
size_t input_ctrl_size = 1;
MS_EXCEPTION_IF_NULL(kernel_graph);
if (kernel_graph->input_ctrl_tensors()) {
input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs);
}
auto input_nodes = kernel_graph->inputs();
if ((inputs.size() + input_ctrl_size) - 1 != input_nodes.size()) {
MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " does not equal graph inputs:" << input_nodes.size()
<< ", input_ctrl_size:" << input_ctrl_size;
}
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
for (size_t i = 0; i < inputs.size(); ++i) {
auto tensor = inputs[i];
MS_EXCEPTION_IF_NULL(tensor);
auto input_node = input_nodes[i];
MS_EXCEPTION_IF_NULL(input_node);
if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
auto pk_node = input_node->cast<ParameterPtr>();
mindspore::parallel::ps::Worker<float>::GetInstance().InitPSParamAndOptim(
pk_node->fullname_with_scope(), tensor->data_c(), LongToSize(tensor->data().nbytes()));
}
}
ps_init_ = true;
}
#endif
} // namespace session
} // namespace mindspore

@ -51,7 +51,7 @@ using OpRunInfoPtr = std::shared_ptr<OpRunInfo>;

class SessionBasic {
public:
SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0) {
SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0), ps_init_(false) {
#ifdef ENABLE_DEBUGGER
debugger_ = nullptr;
#endif
@ -104,6 +104,8 @@ class SessionBasic {
virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; }
virtual void SetActive(GraphId, GraphId) {}
virtual void GetSummaryNodes(KernelGraph *graph);
void AssignParamKey(const KernelGraphPtr &kernel_graph);
void InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, const std::vector<tensor::TensorPtr> &inputs_const);

#ifdef ENABLE_DEBUGGER
// set debugger
@ -140,6 +142,7 @@ class SessionBasic {
AnfNodePtr CreateNewParameterFromCNode(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph);
void AddParameterToGraphInputs(const std::vector<AnfNodePtr> &parameters, KernelGraph *graph);
void InitInternalOutputParameter(const AnfNodePtr &out_node, const AnfNodePtr &parameter);
AnfNodePtr FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list);

std::unordered_map<GraphId, std::shared_ptr<KernelGraph>> graphs_;
std::unordered_map<GraphInfo, std::shared_ptr<KernelGraph>> run_op_graphs_;
@ -148,6 +151,7 @@ class SessionBasic {
CallBackFunc summary_callback_;
static GraphId graph_sum_;
uint32_t device_id_;
bool ps_init_;
#ifdef ENABLE_DEBUGGER
std::shared_ptr<Debugger> debugger_;
#endif

@ -23,9 +23,7 @@ if (ENABLE_D)
list(APPEND _DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
)
if (ENABLE_DATA_DUMP)
list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc")
endif(ENABLE_DATA_DUMP)
list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc")
endif()

if (ENABLE_DUMP_E2E)

@ -120,6 +120,10 @@ std::optional<std::string> Common::GetConfigFile(const std::string &env) {
MS_LOG(ERROR) << dump_config_file << " does not exist.";
return {};
}
auto suffix = dump_config_file.substr(dump_config_file.find_last_of('.') + 1);
if (suffix != "json") {
MS_LOG(EXCEPTION) << "[DataDump] dump config file suffix only supports json! But got: " << suffix;
}
return dump_config_file;
}
} // namespace mindspore

@ -29,13 +29,13 @@ void DataDumpParser::ResetParam() {
net_name_.clear();
dump_mode_ = 0;
dump_step_ = 0;
kernel_set_.clear();
kernel_map_.clear();
}

bool DataDumpParser::DumpEnabled() const {
auto enable_dump = std::getenv(kEnableDataDump);
if (!enable_dump) {
MS_LOG(WARNING) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP";
MS_LOG(INFO) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP";
return false;
}

@ -60,9 +60,18 @@ std::optional<std::string> DataDumpParser::GetDumpPath() const {
return {};
}
std::string dump_path_str(dump_path);
if (!std::all_of(dump_path_str.begin(), dump_path_str.end(), ::isalpha)) {
MS_LOG(EXCEPTION) << "[DataDump] dump path only supports alphas, but got:" << dump_path_str;
}
return dump_path_str;
}

std::string GetIfstreamString(const std::ifstream &ifstream) {
std::stringstream buffer;
buffer << ifstream.rdbuf();
return buffer.str();
}

void DataDumpParser::ParseDumpConfig() {
std::lock_guard<std::mutex> guard(lock_);
MS_LOG(INFO) << "[DataDump] parse start";
@ -84,7 +93,12 @@ void DataDumpParser::ParseDumpConfig() {
}

nlohmann::json j;
json_file >> j;
try {
json_file >> j;
} catch (nlohmann::json::parse_error &e) {
MS_LOG(ERROR) << "[DataDump] json contents:" << GetIfstreamString(json_file);
MS_LOG(EXCEPTION) << "[DataDump] parse json failed, error:" << e.what();
}
if (j.find("DumpSettings") == j.end()) {
MS_LOG(EXCEPTION) << "[DataDump] DumpSettings does not exist.";
}
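The guarded parse above keeps a malformed dump config from aborting with an opaque nlohmann exception and logs the raw file contents instead. A self-contained sketch of the same pattern; the file name below is only illustrative, while "DumpSettings" is the top-level key the parser itself checks for:

#include <fstream>
#include <iostream>
#include <string>
#include "nlohmann/json.hpp"

int main() {
  std::ifstream json_file("data_dump.json");  // illustrative path, not a fixed MindSpore location
  if (!json_file.is_open()) {
    std::cerr << "config file not found\n";
    return 1;
  }
  nlohmann::json j;
  try {
    json_file >> j;  // throws nlohmann::json::parse_error on malformed input
  } catch (const nlohmann::json::parse_error &e) {
    std::cerr << "parse json failed: " << e.what() << '\n';
    return 1;
  }
  if (j.find("DumpSettings") == j.end()) {
    std::cerr << "DumpSettings does not exist\n";
    return 1;
  }
  std::cout << j["DumpSettings"].dump() << '\n';
  return 0;
}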
@ -111,8 +125,8 @@ bool DataDumpParser::NeedDump(const std::string &op_full_name) const {
if (dump_mode_ == 0) {
return true;
}
auto iter = kernel_set_.find(op_full_name);
return iter != kernel_set_.end();
auto iter = kernel_map_.find(op_full_name);
return iter != kernel_map_.end();
}

bool DataDumpParser::IsConfigExist(const nlohmann::json &dump_settings) const {
@ -145,8 +159,25 @@ bool DataDumpParser::ParseDumpSetting(const nlohmann::json &dump_settings) {
auto kernel_str = kernel.dump();
kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end());
MS_LOG(INFO) << "[DataDump] Need dump kernel:" << kernel_str;
kernel_set_.insert(kernel_str);
kernel_map_.insert({kernel_str, 0});
}
return true;
}

void DataDumpParser::MatchKernel(const std::string &kernel_name) {
auto iter = kernel_map_.find(kernel_name);
if (iter == kernel_map_.end()) {
return;
}
iter->second = iter->second + 1;
MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second;
}

void DataDumpParser::PrintUnusedKernel() {
for (const auto &iter : kernel_map_) {
if (iter.second == 0) {
MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
}
}
}
} // namespace mindspore
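The switch from kernel_set_ to kernel_map_ lets the parser count how many times each configured kernel name actually matched during execution, so entries that never matched (typically misspelled scope names in the json) can be reported at the end. The counting idea in isolation; the kernel names below are made up:

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Kernels requested in the dump config, each paired with a match counter.
  std::map<std::string, uint32_t> kernel_map = {
      {"Default/network/Conv2D-op1", 0},
      {"Default/network/ReLU-op2", 0},
  };
  // Kernel names observed while the graph runs.
  std::vector<std::string> executed = {"Default/network/Conv2D-op1", "Default/network/Conv2D-op1"};
  for (const auto &name : executed) {
    auto iter = kernel_map.find(name);
    if (iter != kernel_map.end()) {
      ++iter->second;  // MatchKernel
    }
  }
  for (const auto &entry : kernel_map) {
    if (entry.second == 0) {
      std::cout << "unused kernel in json: " << entry.first << '\n';  // PrintUnusedKernel
    }
  }
  return 0;
}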

@ -18,7 +18,7 @@
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_

#include <string>
#include <set>
#include <map>
#include <mutex>
#include <optional>
#include "nlohmann/json.hpp"
@ -39,7 +39,8 @@ class DataDumpParser {
const std::string &net_name() const { return net_name_; }
uint32_t dump_mode() const { return dump_mode_; }
uint32_t dump_step() const { return dump_step_; }
const std::set<std::string> &kernel_set() const { return kernel_set_; }
void MatchKernel(const std::string &kernel_name);
void PrintUnusedKernel();

private:
DataDumpParser() = default;
@ -55,7 +56,7 @@ class DataDumpParser {
std::string net_name_;
uint32_t dump_mode_{0};
uint32_t dump_step_{0};
std::set<std::string> kernel_set_;
std::map<std::string, uint32_t> kernel_map_;
};
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_

@ -124,6 +124,8 @@ void ProtoExporter::SetNodeOutputType(const TypePtr &type, const BaseShapePtr &s
// Do Nothing
} else if (type->isa<UndeterminedType>()) {
// Do Nothing
} else if (type->isa<SparseTensorType>()) {
// Do Nothing
} else if (type->isa<Tuple>()) {
TuplePtr tuple_type = dyn_cast<Tuple>(type);
type_proto->set_data_type(irpb::DT_TUPLE);