Synchronize the latest Ascend software suite (18 Jul 2020) and merge branches

commit 859acc6d2a
Author: yanghaoran
Date: 2020-07-18 16:10:19 +08:00

414 changed files with 10415 additions and 2035 deletions

.gitmodules (vendored)

@@ -15,4 +15,4 @@
 	url = https://gitee.com/mindspore/akg.git
 [submodule "graphengine"]
 	path = graphengine
-	url = https://gitee.com/ms-incubator/graphengine.git
+	url = https://gitee.com/mindspore/graphengine.git


@@ -202,10 +202,10 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm
 ### Communication
 
-- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/enQtOTcwMTIxMDI3NjM0LTNkMWM2MzI5NjIyZWU5ZWQ5M2EwMTQ5MWNiYzMxOGM4OWFhZjI4M2E5OGI2YTg3ODU1ODE2Njg1MThiNWI3YmQ) - Communication platform for developers.
+- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/zt-dgk65rli-3ex4xvS4wHX7UDmsQmfu8w) - Communication platform for developers.
 - IRC channel at `#mindspore` (only for meeting minutes logging purpose)
-- Video Conferencing: https://meet.jit.si
+- Video Conferencing: TBD
-- Mailing-list: https://mailweb.mindspore.cn/postorius/lists
+- Mailing-list: <https://mailweb.mindspore.cn/postorius/lists>
 
 ## Contributing

akg (submodule)

@@ -1 +1 @@
-Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
+Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625


@@ -24,7 +24,7 @@ usage()
 {
   echo "Usage:"
   echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
-  echo "              [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
+  echo "              [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
   echo "              [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
   echo ""
   echo "Options:"
@@ -48,7 +48,6 @@ usage()
   echo "    -P Enable dump anf graph to file in ProtoBuffer format, default on"
   echo "    -Q Enable dump memory, default off"
   echo "    -D Enable dumping of function graph ir, default on"
-  echo "    -S Enable async data dump, default off"
   echo "    -z Compile dataset & mindrecord, default on"
   echo "    -M Enable MPI and NCCL for GPU training, gpu default on"
   echo "    -V Specify the minimum required cuda version, default CUDA 10.1"
@@ -89,7 +88,6 @@ checkopts()
   ENABLE_TIMELINE="off"
   ENABLE_DUMP2PROTO="on"
   ENABLE_DUMPE2E="off"
-  ENABLE_DATA_DUMP="off"
   ENABLE_DUMP_IR="on"
   COMPILE_MINDDATA="on"
   ENABLE_MPI="off"
@@ -104,7 +102,7 @@ checkopts()
   ENABLE_PYTHON="on"
   # Process the options
-  while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt
+  while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt
   do
     OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
     case "${opt}" in
@@ -186,6 +184,7 @@ checkopts()
       elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == "Xascend" ]]; then
         ENABLE_D="on"
         ENABLE_CPU="on"
+        ENABLE_SERVING="on"
       elif [[ "X$OPTARG" == "Xcpu" ]]; then
         ENABLE_CPU="on"
       else
@@ -220,11 +219,6 @@ checkopts()
         ENABLE_DUMPE2E="$OPTARG"
         echo "enable dump end to end"
         ;;
-      S)
-        check_on_off $OPTARG S
-        ENABLE_DATA_DUMP="$OPTARG"
-        echo "enable data dump"
-        ;;
       D)
         check_on_off $OPTARG D
         ENABLE_DUMP_IR="$OPTARG"
@@ -328,9 +322,6 @@ build_mindspore()
   if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
     CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
   fi
-  if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then
-    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON"
-  fi
   CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
   CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
   if [[ "X$ENABLE_MPI" = "Xon" ]]; then


@@ -1,4 +1,4 @@
-set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS}")
+set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
 set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
 mindspore_add_pkg(glog
     VER 0.4.0


@@ -116,10 +116,10 @@ if(ENABLE_DUMP_E2E)
     add_compile_definitions(ENABLE_DUMP_E2E)
 endif()
 
-if(ENABLE_DATA_DUMP)
-    add_compile_definitions(ENABLE_DATA_DUMP)
-endif()
-
 if(ENABLE_DEBUGGER)
     add_compile_definitions(ENABLE_DEBUGGER)
 endif()
+
+if(ENABLE_TESTCASES)
+    add_compile_definitions(ENABLE_TESTCASES)
+endif()


@@ -1,13 +1,16 @@
 # find exec
 find_package(Python3 3.7 COMPONENTS Interpreter Development)
 if (NOT Python3_FOUND)
-    message("No python3 found.")
-    return ()
+    message(FATAL_ERROR "No python3 found.")
 endif ()
 
 set(PYTHON ${Python3_EXECUTABLE})
 set(PYTHON_VERSION ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR})
+if (NOT PYTHON_VERSION MATCHES "3.7")
+    message(FATAL_ERROR "FIND PYTHON VERSION ${PYTHON_VERSION} BUT CAN NOT MATCH PYTHON VERSION 3.7")
+endif ()
 
 find_package(Git)
 if (NOT GIT_FOUND)
     message("No git found.")

graphengine (submodule)

@@ -1 +1 @@
-Subproject commit eee707935c066c16e9b9cd207f8125871b6b97cf
+Subproject commit 103f2d1019dc50d781d7a964551d9f1f50b3b009

hub/docs/.gitkeep (new empty file)

hub/images/.gitkeep (new empty file)

hub/scripts/.gitkeep (new empty file)


@@ -17,7 +17,7 @@
 """Resources for ast tree parse."""
 import ast
 import math
-from mindspore import IndexedSlices
+from mindspore import IndexedSlices, SparseTensor
 from mindspore.ops.composite import multitype_ops
 from mindspore.ops import functional as F, composite as C
 from . import standard_method as M
@@ -140,4 +140,5 @@ convert_object_map = {
     # user defined
     IndexedSlices: F.make_indexed_slices,
+    SparseTensor: F.make_sparse_tensor,
 }
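With `SparseTensor` registered in `convert_object_map`, the parser can resolve a direct `SparseTensor(...)` construction inside a compiled graph to `F.make_sparse_tensor`. A minimal sketch of the user-visible effect; the shapes and values are invented for illustration, and the `SparseTensor(indices, values, dense_shape)` form follows the API of this period:

```python
import mindspore as ms
from mindspore import Tensor, SparseTensor, nn

class MakeSparse(nn.Cell):
    """Constructs a SparseTensor in construct(); the call is rewritten
    through convert_object_map to F.make_sparse_tensor in graph mode."""
    def construct(self, indices, values):
        return SparseTensor(indices, values, (3, 4))

indices = Tensor([[0, 1], [1, 2]], ms.int32)  # positions of the non-zero entries
values = Tensor([1.0, 2.0], ms.float32)       # the non-zero entries themselves
net = MakeSparse()
```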


@@ -44,7 +44,7 @@ if(ENABLE_GPU)
         "backend/kernel_compiler/akg/akg_kernel_attrs_process.cc"
         )
 
-    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53)
+    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
     list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/blocking_queue.cc" "runtime/device/gpu/gpu_buffer_mgr.cc")
     list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/mpi/mpi_initializer.cc"
                                   "runtime/device/gpu/distribution/collective_wrapper.cc"


@@ -26,14 +26,6 @@ if (ENABLE_CPU)
         "cpu/*.cc"
     )
 
-    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc"
-                                  "cpu/ps/pull_kernel.cc"
-                                  "cpu/ps/embedding_look_up_ps_kernel.cc"
-                                  "cpu/ps/embedding_look_up_proxy_kernel.cc"
-                                  "cpu/ps/apply_momentum_ps_kernel.cc"
-                                  "cpu/ps/sparse_apply_adam_ps_kernel.cc"
-                                  "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
-
     if (NOT ENABLE_MPI)
         list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc")
         list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc")
@@ -41,6 +33,17 @@ if (ENABLE_CPU)
     endif ()
 endif ()
 
+if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ENABLE_GE)
+    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/apply_momentum_ps_kernel.cc")
+    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_proxy_kernel.cc")
+    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_ps_kernel.cc")
+    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pserver_kernel.cc")
+    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pull_kernel.cc")
+    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc")
+    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_adam_ps_kernel.cc")
+    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
+endif()
+
 if (ENABLE_GPU)
     file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         "gpu/*.cu"


@@ -18,6 +18,7 @@
 #include <algorithm>
 #include "backend/session/anf_runtime_algorithm.h"
 #include "backend/optimizer/common/helper.h"
+#include "backend/kernel_compiler/common_utils.h"
 
 namespace mindspore {
 namespace kernel {
@@ -75,15 +76,7 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) {
   std::string dst_type;
   TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0);
-  if (output_type == kFloat32->type_id()) {
-    dst_type = "float32";
-  } else if (output_type == kFloat16->type_id()) {
-    dst_type = "float16";
-  } else if (output_type == kInt32->type_id()) {
-    dst_type = "int32";
-  } else {
-    MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString();
-  }
+  dst_type = TypeId2String(output_type);
   AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node);
 }


@@ -21,9 +21,7 @@
 #include <memory>
 #include "framework/ge_runtime/task_info.h"
 #include "backend/kernel_compiler/kernel.h"
-#ifdef ENABLE_DATA_DUMP
 #include "debug/data_dump_parser.h"
-#endif
 
 using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;
 namespace mindspore {
@@ -34,13 +32,7 @@ class AscendKernelMod : public KernelMod {
                       const std::vector<AddressPtr> &, uint32_t) = 0;
   uint32_t block_dim() { return block_dim_; }
   uint32_t stream_id() { return stream_id_; }
-  virtual bool NeedDump() {
-#ifdef ENABLE_DATA_DUMP
-    return DataDumpParser::GetInstance().NeedDump(kernel_name_);
-#else
-    return false;
-#endif
-  }
+  virtual bool NeedDump() { return DataDumpParser::GetInstance().NeedDump(kernel_name_); }
 
  protected:
   uint32_t block_dim_{1};


@@ -20,6 +20,7 @@
 #include <iostream>
 #include <utility>
 #include <fstream>
+#include <algorithm>
 #include <thread>
 #include "nlohmann/json.hpp"
 #include "backend/session/anf_runtime_algorithm.h"
@@ -499,235 +500,329 @@ int Sign(float x) {
   return 0;
 }
 
-void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
-                              size_t outer_dim) {
-  MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
-  MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
-  MS_EXCEPTION_IF_NULL(unique_grad);
-  MS_EXCEPTION_IF_NULL(unique_grad->value_);
-  MS_EXCEPTION_IF_NULL(unique_grad->indices_);
-  std::unordered_map<int, size_t> index_map;
-  size_t unique_indices_size = 0;
-  for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
-    int index = origin_sparse_grad.indices_[i];
-    if (index < 0 || IntToSize(index) >= first_dim) {
-      continue;
-    }
-    auto iter = index_map.find(index);
-    if (iter == index_map.end()) {
-      index_map[index] = unique_indices_size;
-      unique_grad->indices_[unique_indices_size] = index;
-      size_t start_index = unique_indices_size * outer_dim;
-      size_t end_index = start_index + outer_dim;
-      for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
-        unique_grad->value_[j] = origin_sparse_grad.value_[k];
-      }
-      unique_indices_size++;
-    } else {
-      size_t first_index = iter->second;
-      size_t start_index = first_index * outer_dim;
-      size_t end_index = start_index + outer_dim;
-      for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
-        unique_grad->value_[j] += origin_sparse_grad.value_[k];
-      }
-    }
-  }
-  unique_grad->indices_size_ = unique_indices_size;
-}
-
-struct WorkerParamsForReduceSparseGradient {
-  size_t slice_start_{0};
-  size_t slice_end_{0};
-  size_t max_length_{0};
-  size_t outer_dim_{0};
-  std::vector<std::pair<int, size_t>> *sorted_indices_{nullptr};
-  std::vector<size_t> *slice_positions_{nullptr};
-  float *src_value_{nullptr};
-  SparseGradient *unique_grad_{nullptr};
-};
-
-void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) {
-  MS_EXCEPTION_IF_NULL(param.sorted_indices_);
-  MS_EXCEPTION_IF_NULL(param.slice_positions_);
-  MS_EXCEPTION_IF_NULL(param.src_value_);
-  MS_EXCEPTION_IF_NULL(param.unique_grad_);
-  auto outer_dim = param.outer_dim_;
-  auto &sorted_indices = *(param.sorted_indices_);
-  auto &slice_positions = *(param.slice_positions_);
-  auto unique_grad = param.unique_grad_;
-  for (size_t slice_id = param.slice_start_; slice_id < param.slice_end_; ++slice_id) {
-    size_t cur_pos = slice_positions[slice_id];
-    int index = sorted_indices[cur_pos].first;
-    unique_grad->indices_[slice_id] = index;
-    size_t start_index = slice_id * outer_dim;
-    auto ret_code = memcpy_s(unique_grad->value_ + start_index, (param.max_length_ - start_index) * sizeof(float),
-                             param.src_value_ + sorted_indices[cur_pos].second, outer_dim * sizeof(float));
-    if (ret_code != EOK) {
-      MS_LOG(EXCEPTION) << "Failed to copy data!";
-    }
-    cur_pos++;
-    size_t end_pos;
-    if (slice_id + 1 < slice_positions.size()) {
-      end_pos = slice_positions[slice_id + 1];
-    } else {
-      end_pos = sorted_indices.size();
-    }
-    while (cur_pos < end_pos) {
-      for (size_t i = 0; i < outer_dim; ++i) {
-        unique_grad->value_[start_index + i] += param.src_value_[sorted_indices[cur_pos].second + i];
-      }
-      cur_pos++;
-    }
-  }
-}
-
-void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
-                                        size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
-                                        std::vector<size_t> *slice_positions) {
-  MS_LOG(DEBUG) << "Start";
-  size_t thread_num = 24;
-  if (slice_positions->size() < thread_num) {
-    thread_num = slice_positions->size();
-  }
-  size_t stride = (slice_positions->size() + thread_num - 1) / thread_num;
-  thread_num = (slice_positions->size() + stride - 1) / stride;
-  std::vector<std::thread> threads;
-  size_t max_length = sorted_indices->size() * outer_dim;
-  for (size_t i = 0; i < thread_num; ++i) {
-    size_t slice_start = i * stride;
-    size_t slice_end = 0;
-    if (i == thread_num - 1) {
-      slice_end = slice_positions->size();
-    } else {
-      slice_end = slice_start + stride;
-    }
-    WorkerParamsForReduceSparseGradient params{
-      slice_start, slice_end, max_length, outer_dim, sorted_indices, slice_positions, origin_sparse_grad.value_,
-      unique_grad};
-    threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params));
-  }
-  for (size_t i = 0; i < thread_num; ++i) {
-    threads[i].join();
-  }
-  MS_LOG(DEBUG) << "End";
-}
-
-void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
-                          size_t outer_dim, bool use_multi_threads) {
-  MS_LOG(DEBUG) << "Start";
-  MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
-  MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
-  MS_EXCEPTION_IF_NULL(unique_grad);
-  MS_EXCEPTION_IF_NULL(unique_grad->value_);
-  MS_EXCEPTION_IF_NULL(unique_grad->indices_);
-  std::vector<std::pair<int, size_t>> sorted_indices;
-  sorted_indices.reserve(origin_sparse_grad.indices_size_);
-  for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
-    int index = origin_sparse_grad.indices_[i];
-    if (index >= 0 && IntToSize(index) < first_dim) {
-      sorted_indices.emplace_back(std::pair<int, size_t>(index, i * outer_dim));
-    }
-  }
-  std::sort(
-    sorted_indices.begin(), sorted_indices.end(),
-    [](const std::pair<int, size_t> &left, const std::pair<int, size_t> &right) { return left.first < right.first; });
-  int last_index = 0;
-  std::vector<size_t> slice_positions;
-  slice_positions.reserve(sorted_indices.size());
-  for (size_t i = 0; i < sorted_indices.size(); ++i) {
-    if (i == 0 || last_index != sorted_indices[i].first) {
-      slice_positions.emplace_back(i);
-    }
-    last_index = sorted_indices[i].first;
-  }
-  if (use_multi_threads) {
-    RunMultiThreadReduceSparseGradient(origin_sparse_grad, unique_grad, outer_dim, &sorted_indices, &slice_positions);
-  } else {
-    size_t max_length = sorted_indices.size() * outer_dim;
-    WorkerParamsForReduceSparseGradient params{0,
-                                               slice_positions.size(),
-                                               max_length,
-                                               outer_dim,
-                                               &sorted_indices,
-                                               &slice_positions,
-                                               origin_sparse_grad.value_,
-                                               unique_grad};
-    WorkerForReduceSparseGradient(params);
-  }
-  unique_grad->indices_size_ = slice_positions.size();
-  MS_LOG(DEBUG) << "End";
-}
-
-void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
-                               SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
-                               size_t outer_dim) {
-  MS_LOG(DEBUG) << "Start";
-  if (unique_slice_grads.empty()) {
-    return;
-  }
-  size_t index_data_size = outer_dim * sizeof(float);
-  size_t unique_indices_size = 0;
-  for (size_t i = 0; i < unique_slice_grads.size(); ++i) {
-    auto &slice_grad = unique_slice_grads[i];
-    auto ret_code = memcpy_s(tmp_grad->value_ + unique_indices_size * outer_dim,
-                             (tmp_grad->indices_size_ - unique_indices_size) * index_data_size, slice_grad->value_,
-                             slice_grad->indices_size_ * index_data_size);
-    if (ret_code != EOK) {
-      MS_LOG(EXCEPTION) << "Failed to copy data!";
-    }
-    ret_code =
-      memcpy_s(tmp_grad->indices_ + unique_indices_size, (tmp_grad->indices_size_ - unique_indices_size) * sizeof(int),
-               slice_grad->indices_, slice_grad->indices_size_ * sizeof(int));
-    if (ret_code != EOK) {
-      MS_LOG(EXCEPTION) << "Failed to copy data!";
-    }
-    unique_indices_size += slice_grad->indices_size_;
-  }
-  tmp_grad->indices_size_ = unique_indices_size;
-  ReduceSparseGradient(*tmp_grad, unique_grad, first_dim, outer_dim);
-  MS_LOG(DEBUG) << "End";
-}
-
-void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
-                                  SparseGradient *unique_grad, size_t first_dim, size_t outer_dim) {
-  MS_LOG(DEBUG) << "Start";
-  MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
-  MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
-  MS_EXCEPTION_IF_NULL(unique_grad);
-  MS_EXCEPTION_IF_NULL(unique_grad->value_);
-  MS_EXCEPTION_IF_NULL(unique_grad->indices_);
-  MS_EXCEPTION_IF_NULL(tmp_grad);
-  MS_EXCEPTION_IF_NULL(tmp_grad->value_);
-  MS_EXCEPTION_IF_NULL(tmp_grad->indices_);
-  size_t thread_num = 24;
-  if (origin_sparse_grad.indices_size_ < thread_num) {
-    thread_num = origin_sparse_grad.indices_size_;
-  }
-  size_t thread_indices_size = origin_sparse_grad.indices_size_ / thread_num;
-  size_t left_indices_size = origin_sparse_grad.indices_size_ % thread_num;
-  std::vector<std::thread> threads;
-  threads.reserve(thread_num);
-  std::vector<std::shared_ptr<SparseGradient>> unique_slice_grads;
-  for (size_t i = 0; i < thread_num; ++i) {
-    size_t indices_size = thread_indices_size;
-    if (i == thread_num - 1) {
-      indices_size = thread_indices_size + left_indices_size;
-    }
-    size_t value_offset = i * thread_indices_size * outer_dim;
-    size_t indices_offset = i * thread_indices_size;
-    auto slice_grad = SparseGradient(
-      {origin_sparse_grad.value_ + value_offset, origin_sparse_grad.indices_ + indices_offset, indices_size});
-    unique_slice_grads.emplace_back(std::make_shared<SparseGradient>());
-    unique_slice_grads[i]->value_ = unique_grad->value_ + value_offset;
-    unique_slice_grads[i]->indices_ = unique_grad->indices_ + indices_offset;
-    unique_slice_grads[i]->indices_size_ = indices_size;
-    threads.emplace_back(
-      std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim, false));
-  }
-  for (size_t i = 0; i < thread_num; ++i) {
-    threads[i].join();
-  }
-  ReduceMultiSparseGradient(unique_slice_grads, tmp_grad, unique_grad, first_dim, outer_dim);
-  MS_LOG(DEBUG) << "End";
-}
+namespace {
+struct BucketSparseGradient {
+  float *value_;
+  int *indices_;
+  int *global_indices_;
+  size_t indices_size_;
+};
+
+struct MultiThreadReduceSparseGradientParam {
+  SparseGradient *input_grad_{nullptr};
+  SparseGradient *workspace_grad_{nullptr};
+  SparseGradient *output_grad_{nullptr};
+  size_t max_index_{0};
+  size_t value_stride_{0};
+  size_t thread_num_{0};
+  bool use_sort_reduce_{false};
+};
+
+void CalculateEachBucketSize(const std::shared_ptr<SparseGradient> &sparse_grad, size_t max_index,
+                             std::vector<size_t> *each_bucket_size) {
+  MS_LOG(DEBUG) << "Start";
+  MS_EXCEPTION_IF_NULL(sparse_grad);
+  MS_EXCEPTION_IF_NULL(sparse_grad->indices_);
+  MS_EXCEPTION_IF_NULL(each_bucket_size);
+  size_t bucket_num = each_bucket_size->size();
+  for (size_t i = 0; i < sparse_grad->indices_size_; ++i) {
+    int index = sparse_grad->indices_[i];
+    if (index >= 0 && IntToSize(index) < max_index) {
+      auto bucket_id = index % bucket_num;
+      each_bucket_size->at(bucket_id)++;
+    }
+  }
+  MS_LOG(DEBUG) << "End";
+}
+
+void SplitAndCalculateSegmentBucketSize(const MultiThreadReduceSparseGradientParam &param,
+                                        std::vector<std::shared_ptr<SparseGradient>> *segments_ptr,
+                                        std::vector<std::shared_ptr<std::vector<size_t>>> *segment_bucket_sizes_ptr) {
+  MS_EXCEPTION_IF_NULL(param.input_grad_);
+  MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
+  MS_EXCEPTION_IF_NULL(segments_ptr);
+  auto &segments = *segments_ptr;
+  auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
+  auto input_grad = param.input_grad_;
+  if (param.thread_num_ < 1) {
+    MS_EXCEPTION(ArgumentError) << "Input param thread num must > 0!";
+  }
+  size_t thread_indices_size = input_grad->indices_size_ / param.thread_num_;
+  size_t left_indices_size = input_grad->indices_size_ % param.thread_num_;
+  std::vector<std::thread> threads;
+  threads.reserve(param.thread_num_);
+  segments.reserve(param.thread_num_);
+  size_t current_indices_offset = 0;
+  for (size_t i = 0; i < param.thread_num_; ++i) {
+    segment_bucket_sizes.emplace_back(std::make_shared<std::vector<size_t>>(param.thread_num_, 0));
+    size_t indices_size = thread_indices_size;
+    if (i < left_indices_size) {
+      indices_size += 1;
+    }
+    segments.emplace_back(std::make_shared<SparseGradient>());
+    segments[i]->value_ = input_grad->value_ + current_indices_offset * param.value_stride_;
+    segments[i]->indices_ = input_grad->indices_ + current_indices_offset;
+    segments[i]->indices_size_ = indices_size;
+    threads.emplace_back(
+      std::thread(CalculateEachBucketSize, segments[i], param.max_index_, segment_bucket_sizes[i].get()));
+    current_indices_offset += indices_size;
+  }
+  for (size_t i = 0; i < param.thread_num_; ++i) {
+    threads[i].join();
+  }
+}
+
+void CopySegmentIndicesToBucket(const MultiThreadReduceSparseGradientParam &param,
+                                const std::shared_ptr<SparseGradient> &segment, size_t bucket_offset,
+                                const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets) {
+  MS_LOG(DEBUG) << "Start";
+  MS_EXCEPTION_IF_NULL(segment);
+  MS_EXCEPTION_IF_NULL(segment->indices_);
+  std::vector<size_t> bucket_data_num(param.thread_num_, 0);
+  for (size_t i = 0; i < segment->indices_size_; ++i) {
+    int index = segment->indices_[i];
+    if (index >= 0 && IntToSize(index) < param.max_index_) {
+      auto bucket_id = index % param.thread_num_;
+      auto bucket_index = bucket_data_num[bucket_id];
+      buckets[bucket_id]->indices_[bucket_index] = index;
+      buckets[bucket_id]->global_indices_[bucket_index] = bucket_offset + i;
+      bucket_data_num[bucket_id]++;
+    }
+  }
+  MS_LOG(DEBUG) << "End";
+}
+
+void GatherSegmentIndicesToOutputBucket(const MultiThreadReduceSparseGradientParam &param,
+                                        const std::vector<std::shared_ptr<SparseGradient>> &segments,
+                                        const std::vector<std::shared_ptr<std::vector<size_t>>> &segment_bucket_sizes,
+                                        std::vector<std::shared_ptr<BucketSparseGradient>> *buckets_ptr) {
+  MS_EXCEPTION_IF_NULL(param.output_grad_);
+  MS_EXCEPTION_IF_NULL(param.output_grad_->value_);
+  MS_EXCEPTION_IF_NULL(param.output_grad_->indices_);
+  MS_EXCEPTION_IF_NULL(buckets_ptr);
+  auto &buckets = *buckets_ptr;
+  size_t thread_num = param.thread_num_;
+  if (thread_num != segment_bucket_sizes.size()) {
+    MS_EXCEPTION(ArgumentError) << "Input param thread num not equal to segment size!";
+  }
+  std::vector<size_t> bucket_data_size(thread_num, 0);
+  for (size_t i = 0; i < thread_num; ++i) {
+    for (size_t j = 0; j < thread_num; ++j) {
+      bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
+    }
+  }
+  size_t current_indices_offset = 0;
+  for (size_t i = 0; i < thread_num; ++i) {
+    buckets.emplace_back(std::make_shared<BucketSparseGradient>());
+    buckets[i]->value_ = param.output_grad_->value_ + current_indices_offset * param.value_stride_;
+    buckets[i]->indices_ = param.output_grad_->indices_ + current_indices_offset;
+    buckets[i]->global_indices_ = param.workspace_grad_->indices_ + current_indices_offset;
+    buckets[i]->indices_size_ = bucket_data_size[i];
+    current_indices_offset += bucket_data_size[i];
+  }
+  std::vector<size_t> tmp_bucket_data_size(thread_num, 0);
+  std::vector<std::vector<std::shared_ptr<BucketSparseGradient>>> each_thread_buckets;
+  for (size_t i = 0; i < thread_num; ++i) {
+    std::vector<std::shared_ptr<BucketSparseGradient>> thread_buckets;
+    for (size_t j = 0; j < thread_num; ++j) {
+      thread_buckets.emplace_back(std::make_shared<BucketSparseGradient>());
+      thread_buckets[j]->indices_ = buckets[j]->indices_ + tmp_bucket_data_size[j];
+      thread_buckets[j]->global_indices_ = buckets[j]->global_indices_ + tmp_bucket_data_size[j];
+      thread_buckets[j]->value_ = buckets[j]->value_ + tmp_bucket_data_size[j] * param.value_stride_;
+      thread_buckets[j]->indices_size_ = segment_bucket_sizes[i]->at(j);
+      tmp_bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
+    }
+    each_thread_buckets.emplace_back(thread_buckets);
+  }
+  std::vector<std::thread> threads;
+  threads.reserve(thread_num);
+  current_indices_offset = 0;
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads.emplace_back(
+      std::thread(CopySegmentIndicesToBucket, param, segments[i], current_indices_offset, each_thread_buckets[i]));
+    current_indices_offset += segments[i]->indices_size_;
+  }
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads[i].join();
+  }
+}
+
+void SortAndReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
+                                       const std::shared_ptr<BucketSparseGradient> &bucket,
+                                       const std::shared_ptr<SparseGradient> &reduced_bucket) {
+  MS_LOG(DEBUG) << "Start";
+  MS_EXCEPTION_IF_NULL(bucket);
+  MS_EXCEPTION_IF_NULL(bucket->value_);
+  MS_EXCEPTION_IF_NULL(bucket->indices_);
+  MS_EXCEPTION_IF_NULL(reduced_bucket);
+  MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
+  MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
+  std::vector<std::pair<int, int>> sorted_indices;
+  sorted_indices.reserve(bucket->indices_size_);
+  for (size_t i = 0; i < bucket->indices_size_; ++i) {
+    int index = bucket->indices_[i];
+    int global_index = bucket->global_indices_[i];
+    sorted_indices.emplace_back(std::pair<int, int>(index, global_index));
+  }
+  std::sort(sorted_indices.begin(), sorted_indices.end());
+  float *global_value = param.input_grad_->value_;
+  size_t unique_indices_size = 0;
+  size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
+  int last_index{0};
+  size_t value_offset{0};
+  for (size_t i = 0; i < sorted_indices.size(); ++i) {
+    int index = sorted_indices[i].first;
+    int global_index = sorted_indices[i].second;
+    int global_value_offset = global_index * param.value_stride_;
+    if (i == 0 || index != last_index) {
+      if (i != 0) {
+        unique_indices_size++;
+      }
+      reduced_bucket->indices_[unique_indices_size] = index;
+      value_offset = unique_indices_size * param.value_stride_;
+      auto ret_code = memcpy_s(reduced_bucket->value_ + value_offset, (max_length - value_offset) * sizeof(float),
+                               global_value + global_value_offset, param.value_stride_ * sizeof(float));
+      if (ret_code != EOK) {
+        MS_LOG(EXCEPTION) << "Failed to copy data!";
+      }
+    } else {
+      for (size_t j = 0; j < param.value_stride_; ++j) {
+        reduced_bucket->value_[value_offset + j] += global_value[global_value_offset + j];
+      }
+    }
+    last_index = index;
+  }
+  reduced_bucket->indices_size_ = unique_indices_size;
+  MS_LOG(DEBUG) << "End";
+}
+
+void ReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
+                                const std::shared_ptr<BucketSparseGradient> &bucket,
+                                const std::shared_ptr<SparseGradient> &reduced_bucket) {
+  MS_LOG(DEBUG) << "Start";
+  MS_EXCEPTION_IF_NULL(bucket);
+  MS_EXCEPTION_IF_NULL(bucket->value_);
+  MS_EXCEPTION_IF_NULL(bucket->indices_);
+  MS_EXCEPTION_IF_NULL(reduced_bucket);
+  MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
+  MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
+  float *global_value = param.input_grad_->value_;
+  std::unordered_map<int, size_t> index_map;
+  size_t unique_indices_size = 0;
+  size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
+  for (size_t i = 0; i < bucket->indices_size_; ++i) {
+    int index = bucket->indices_[i];
+    int global_index = bucket->global_indices_[i];
+    auto iter = index_map.find(index);
+    if (iter == index_map.end()) {
+      reduced_bucket->indices_[unique_indices_size] = index;
+      size_t start_index = unique_indices_size * param.value_stride_;
+      index_map[index] = start_index;
+      auto ret_code = memcpy_s(reduced_bucket->value_ + start_index, (max_length - start_index) * sizeof(float),
+                               global_value + global_index * param.value_stride_, param.value_stride_ * sizeof(float));
+      if (ret_code != EOK) {
+        MS_LOG(EXCEPTION) << "Failed to copy data!";
+      }
+      unique_indices_size++;
+    } else {
+      size_t start_index = iter->second;
+      size_t end_index = start_index + param.value_stride_;
+      for (size_t j = start_index, k = global_index * param.value_stride_; j < end_index; ++j, ++k) {
+        reduced_bucket->value_[j] += global_value[k];
+      }
+    }
+  }
+  reduced_bucket->indices_size_ = unique_indices_size;
+  MS_LOG(DEBUG) << "End";
+}
+
+void ReduceBucketSparseGradientToWorkspace(const MultiThreadReduceSparseGradientParam &param,
+                                           const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets,
+                                           std::vector<std::shared_ptr<SparseGradient>> *reduced_buckets_ptr) {
+  MS_EXCEPTION_IF_NULL(param.workspace_grad_);
+  MS_EXCEPTION_IF_NULL(param.workspace_grad_->value_);
+  MS_EXCEPTION_IF_NULL(param.workspace_grad_->indices_);
+  MS_EXCEPTION_IF_NULL(reduced_buckets_ptr);
+  auto &reduced_buckets = *reduced_buckets_ptr;
+  size_t thread_num = buckets.size();
+  std::vector<std::thread> threads;
+  threads.reserve(thread_num);
+  size_t current_indices_offset = 0;
+  for (size_t i = 0; i < thread_num; ++i) {
+    reduced_buckets.emplace_back(std::make_shared<SparseGradient>());
+    reduced_buckets[i]->value_ = param.workspace_grad_->value_ + current_indices_offset * param.value_stride_;
+    reduced_buckets[i]->indices_ = param.workspace_grad_->indices_ + current_indices_offset;
+    reduced_buckets[i]->indices_size_ = buckets[i]->indices_size_;
+    if (param.use_sort_reduce_) {
+      threads.emplace_back(std::thread(SortAndReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
+    } else {
+      threads.emplace_back(std::thread(ReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
+    }
+    current_indices_offset += buckets[i]->indices_size_;
+  }
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads[i].join();
+  }
+}
+
+void MergeReduceSparseGradient(const MultiThreadReduceSparseGradientParam &param,
+                               const std::vector<std::shared_ptr<SparseGradient>> &reduced_buckets) {
+  MS_EXCEPTION_IF_NULL(param.output_grad_);
+  auto output_grad = param.output_grad_;
+  MS_EXCEPTION_IF_NULL(output_grad->value_);
+  MS_EXCEPTION_IF_NULL(output_grad->indices_);
+  size_t stride_data_size = param.value_stride_ * sizeof(float);
+  size_t unique_indices_size = 0;
+  for (size_t i = 0; i < reduced_buckets.size(); ++i) {
+    auto &bucket = reduced_buckets[i];
+    MS_EXCEPTION_IF_NULL(bucket);
+    if (bucket->indices_size_ == 0) {
+      continue;
+    }
+    auto ret_code = memcpy_s(output_grad->value_ + unique_indices_size * param.value_stride_,
+                             (output_grad->indices_size_ - unique_indices_size) * stride_data_size, bucket->value_,
+                             bucket->indices_size_ * stride_data_size);
+    if (ret_code != EOK) {
+      MS_LOG(EXCEPTION) << "Failed to copy data!";
+    }
+    ret_code = memcpy_s(output_grad->indices_ + unique_indices_size,
+                        (output_grad->indices_size_ - unique_indices_size) * sizeof(int), bucket->indices_,
+                        bucket->indices_size_ * sizeof(int));
+    if (ret_code != EOK) {
+      MS_LOG(EXCEPTION) << "Failed to copy data!";
+    }
+    unique_indices_size += bucket->indices_size_;
+  }
+  output_grad->indices_size_ = unique_indices_size;
+}
+}  // namespace
+
+void BucketReduceSparseGradient(const ReduceSparseGradientParam &param) {
+  MS_LOG(DEBUG) << "Start";
+  MS_EXCEPTION_IF_NULL(param.input_grad_);
+  size_t thread_num = 23;
+  if (param.input_grad_->indices_size_ < thread_num) {
+    thread_num = param.input_grad_->indices_size_;
+  }
+  MultiThreadReduceSparseGradientParam multi_thread_param({param.input_grad_, param.workspace_grad_, param.output_grad_,
+                                                           param.max_index_, param.value_stride_, thread_num,
+                                                           param.use_sort_reduce_});
+  std::vector<std::shared_ptr<SparseGradient>> segments;
+  std::vector<std::shared_ptr<std::vector<size_t>>> segment_bucket_sizes;
+  SplitAndCalculateSegmentBucketSize(multi_thread_param, &segments, &segment_bucket_sizes);
+  std::vector<std::shared_ptr<BucketSparseGradient>> buckets;
+  GatherSegmentIndicesToOutputBucket(multi_thread_param, segments, segment_bucket_sizes, &buckets);
+  std::vector<std::shared_ptr<SparseGradient>> reduced_buckets;
+  ReduceBucketSparseGradientToWorkspace(multi_thread_param, buckets, &reduced_buckets);
+  MergeReduceSparseGradient(multi_thread_param, reduced_buckets);
+  MS_LOG(DEBUG) << "End";
+}
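In outline, `BucketReduceSparseGradient` deduplicates a row-sparse gradient in parallel stages: split the indices into per-thread segments, count how many of each segment's indices fall into each `index % thread_num` bucket, scatter indices (with their positions in the original gradient) into those buckets, reduce every bucket concurrently (hash-map accumulation, or sort-then-accumulate when `use_sort_reduce_` is set), and finally concatenate the reduced buckets into the output. The following pure-Python model mirrors only the observable behavior, not the threading or memory layout; it is an illustrative sketch, not the implementation:

```python
import numpy as np

def bucket_reduce_sparse_gradient(values, indices, max_index, bucket_num=4):
    """Single-threaded model of BucketReduceSparseGradient's behavior:
    rows sharing an index are summed, rows whose index falls outside
    [0, max_index) are dropped. The C++ version does the same work in
    parallel, one thread per `index % bucket_num` bucket."""
    buckets = [{} for _ in range(bucket_num)]  # per bucket: index -> accumulated row
    for row, index in zip(values, indices):
        if 0 <= index < max_index:
            bucket = buckets[int(index) % bucket_num]
            bucket[int(index)] = bucket.get(int(index), 0.0) + row
    out_values, out_indices = [], []
    for bucket in buckets:  # MergeReduceSparseGradient: concatenate reduced buckets
        for index, row in bucket.items():
            out_indices.append(index)
            out_values.append(row)
    return np.asarray(out_values), np.asarray(out_indices)

grad = np.array([[1., 1.], [2., 2.], [3., 3.]])  # three rows of a row-sparse gradient
idx = np.array([1, 1, 0])                        # the first two rows both update index 1
vals, uniq = bucket_reduce_sparse_gradient(grad, idx, max_index=4)
# uniq -> [0, 1], vals -> [[3., 3.], [3., 3.]]
```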


@@ -73,9 +73,18 @@ class KernelMeta {
 };
 
 struct SparseGradient {
-  float *value_;
-  int *indices_;
-  size_t indices_size_;
+  float *value_{nullptr};
+  int *indices_{nullptr};
+  size_t indices_size_{0};
+};
+
+struct ReduceSparseGradientParam {
+  SparseGradient *input_grad_{nullptr};
+  SparseGradient *workspace_grad_{nullptr};
+  SparseGradient *output_grad_{nullptr};
+  size_t max_index_{0};
+  size_t value_stride_{0};
+  bool use_sort_reduce_{false};
 };
 
 struct MultiThreadComputeParams {
@@ -112,10 +121,6 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info);
 std::string GetProcessor(const AnfNodePtr &anf_node);
 bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b);
 int Sign(float x);
-void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
-                              size_t outer_dim);
-void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
-                          size_t outer_dim, bool use_multi_threads = true);
 std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
 std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list,
                                                                             const std::vector<AnfNodePtr> &input_list);
@@ -130,14 +135,7 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<An
 bool IsWeightBoundary(const AnfNodePtr &node);
 void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params,
                         size_t total_compute_size);
-void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
-                                        size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
-                                        std::vector<size_t> *slice_positions);
-void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
-                               SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
-                               size_t outer_dim);
-void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
-                                  SparseGradient *unique_grad, size_t first_dim, size_t outer_dim);
+void BucketReduceSparseGradient(const ReduceSparseGradientParam &param);
 std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode);
 } // namespace kernel
 } // namespace mindspore


@@ -46,7 +46,7 @@ class EmbeddingLookUpCPUKernel : public CPUKernel {
   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
               const std::vector<AddressPtr> &outputs) override;
 
- private:
+ protected:
   void LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2,
                    float **output_addr);
   void CheckParam(const CNodePtr &kernel_node);


@@ -53,15 +53,15 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &i
   size_t output_size = outputs[0]->size;
 
   size_t size = input_size / sizeof(float);
-  ::ps::SArray<float> lookup_ids(size, 0);
+  ::ps::SArray<int> lookup_ids(size, 0);
   ::ps::SArray<int> lengths{size};
-  ::ps::SArray<float> lookup_result;
+  ::ps::SArray<float> lookup_result(output_size / sizeof(float), 0);
 
   auto ret = memcpy_s(lookup_ids.data(), input_size, indices_addr, input_size);
   if (ret != EOK) {
     MS_LOG(EXCEPTION) << "Lookup id memcpy failed.";
   }
-  parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, lookup_result,
+  parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, &lookup_result,
                                                                  parallel::ps::kEmbeddingLookupCmd);
   auto ret2 = memcpy_s(output_addr, output_size, lookup_result.data(), output_size);


@@ -50,7 +50,7 @@ void EmbeddingLookUpPSKernel::InitKernel(
   split_num_ = pserver_num_;
 
   // input shape should be sharded after computing offset_;
-  Shard(input_shape_, axis_);
+  Shard(&input_shape_, axis_);
 
   size_t output_size =
     std::accumulate(output_shape_.begin(), output_shape_.end(), sizeof(float), std::multiplies<size_t>());


@@ -34,5 +34,13 @@ MS_REG_CPU_KERNEL_T(Push,
 MS_REG_CPU_KERNEL_T(
   Push, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt64),
   PushKernel, float);
+MS_REG_CPU_KERNEL_T(Push,
+                    KernelAttr()
+                      .AddInputAttr(kNumberTypeFloat32)
+                      .AddInputAttr(kNumberTypeFloat32)
+                      .AddInputAttr(kNumberTypeFloat32)
+                      .AddOutputAttr(kNumberTypeUInt64),
+                    PushKernel, float);
 } // namespace kernel
 } // namespace mindspore


@@ -43,7 +43,7 @@ class PushKernel : public CPUKernel {
       sizes.push_back(SizeToInt(input->size) / sizeof(T));
     }
     parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes);
-    memcpy(outputs[0]->addr, &key_, sizeof(size_t));
+    memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
     return true;
   }


@@ -75,7 +75,7 @@ void SparseApplyAdamPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
 void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
   const auto &indices_addr = inputs[10];
-  indices_size_ = indices_addr->size;
+  indices_size_ = indices_addr->size / sizeof(int);
   workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
   workspace_size_list_[1] = indices_size_ * sizeof(int);
 }


@@ -64,7 +64,7 @@ void SparseApplyFtrlPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
 void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
   const auto &indices_addr = inputs[4];
-  indices_size_ = indices_addr->size;
+  indices_size_ = indices_addr->size / sizeof(int);
   workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
   workspace_size_list_[1] = indices_size_ * sizeof(int);
 }


@@ -81,6 +81,8 @@ void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node)
   MS_EXCEPTION_IF_NULL(kernel_node);
   workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
   workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
+  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
+  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
   workspace_size_list_.emplace_back(var_first_dim_size_ * var_outer_dim_size_ * sizeof(float));
 }
@@ -142,11 +144,21 @@ bool SparseApplyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
   auto indices = reinterpret_cast<int *>(inputs[10]->addr);
   auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
   auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
-  auto m_t = reinterpret_cast<float *>(workspace[2]->addr);
+  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
+  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
+  auto m_t = reinterpret_cast<float *>(workspace[4]->addr);
 
   SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
-  ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
-                       var_outer_dim_size_);
+  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
+  SparseGradient input_sparse_grad({grad, indices, indices_size_});
+  ReduceSparseGradientParam param;
+  param.input_grad_ = &input_sparse_grad;
+  param.workspace_grad_ = &workspace_sparse_grad;
+  param.output_grad_ = &unique_sparse_grad;
+  param.max_index_ = var_first_dim_size_;
+  param.value_stride_ = var_outer_dim_size_;
+  BucketReduceSparseGradient(param);
 
   size_t total_dim_size = var_first_dim_size_ * var_outer_dim_size_;
   lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);


@@ -132,12 +132,19 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
   auto indices = reinterpret_cast<int *>(inputs[4]->addr);
   auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
   auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
-  auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
-  auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
+  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
+  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
 
   SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
-  SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
-  TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
-                               var_first_dim_size_, var_outer_dim_size_);
+  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
+  SparseGradient input_sparse_grad({grad, indices, indices_size_});
+  ReduceSparseGradientParam param;
+  param.input_grad_ = &input_sparse_grad;
+  param.workspace_grad_ = &workspace_sparse_grad;
+  param.output_grad_ = &unique_sparse_grad;
+  param.max_index_ = var_first_dim_size_;
+  param.value_stride_ = var_outer_dim_size_;
+  BucketReduceSparseGradient(param);
 
   MultiThreadComputeParams input_params;
   input_params.var_ = var;


@@ -123,13 +123,19 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr>
   auto indices = reinterpret_cast<int *>(inputs[10]->addr);
   auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
   auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
-  auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
-  auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
+  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
+  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
 
   SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
-  SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
-  TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
-                               var_first_dim_size_, var_outer_dim_size_);
+  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
+  SparseGradient input_sparse_grad({grad, indices, indices_size_});
+  ReduceSparseGradientParam param;
+  param.input_grad_ = &input_sparse_grad;
+  param.workspace_grad_ = &workspace_sparse_grad;
+  param.output_grad_ = &unique_sparse_grad;
+  param.max_index_ = var_first_dim_size_;
+  param.value_stride_ = var_outer_dim_size_;
+  BucketReduceSparseGradient(param);
 
   lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
   MultiThreadComputeParams input_params;


@@ -61,6 +61,8 @@ void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &ke
   MS_EXCEPTION_IF_NULL(kernel_node);
   workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
   workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
+  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
+  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
 }
 
 void SparseApplyProximalAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
@@ -119,9 +121,19 @@ bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector<kernel::Addre
   auto indices = reinterpret_cast<int *>(inputs[6]->addr);
   auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
   auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
+  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
+  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
 
   SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
-  ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
-                       var_outer_dim_size_);
+  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
+  SparseGradient input_sparse_grad({grad, indices, indices_size_});
+  ReduceSparseGradientParam param;
+  param.input_grad_ = &input_sparse_grad;
+  param.workspace_grad_ = &workspace_sparse_grad;
+  param.output_grad_ = &unique_sparse_grad;
+  param.max_index_ = var_first_dim_size_;
+  param.value_stride_ = var_outer_dim_size_;
+  BucketReduceSparseGradient(param);
 
   MultiThreadComputeParams input_params;
   input_params.var_ = var;


@@ -0,0 +1,26 @@ (new file)
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
BroadcastToGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BroadcastToGpuKernel, half)
} // namespace kernel
} // namespace mindspore


@@ -0,0 +1,83 @@ (new file)
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class BroadcastToGpuKernel : public GpuKernel {
public:
BroadcastToGpuKernel() {}
~BroadcastToGpuKernel() = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], output_shape_[0], output_shape_[1],
output_shape_[2], output_shape_[3], input_addr, output_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
if (input_shapes.size() > 4 || output_shapes.size() > 4) {
MS_LOG(EXCEPTION) << "BroadcastTo operation not support dim greater than 4";
}
for (int i = input_shapes.size() - 1; i >= 0; i--) {
input_shape_[i] = input_shapes[i];
}
for (int j = output_shapes.size() - 1; j >= 0; j--) {
output_shape_[j] = output_shapes[j];
}
InitSizeLists();
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T));
output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T));
}
private:
int input_shape_[4] = {1, 1, 1, 1};
int output_shape_[4] = {1, 1, 1, 1};
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
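The two registrations above back the BroadcastTo primitive on GPU for float32 and float16, with input and output shapes padded to at most four dimensions. A usage sketch, assuming the op is exposed as `P.BroadcastTo` as in MindSpore releases of this period; the shapes are invented for illustration:

```python
import mindspore as ms
from mindspore import Tensor, context
from mindspore.ops import operations as P

context.set_context(device_target="GPU")

broadcast_to = P.BroadcastTo((2, 3))    # target shape; this kernel rejects rank > 4
x = Tensor([[1.0], [2.0]], ms.float32)  # shape (2, 1) broadcasts along axis 1
y = broadcast_to(x)                     # shape (2, 3), served by BroadcastToGpuKernel
```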


@ -18,6 +18,7 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CONCATV2_GPU_KERNEL_H
 #include <vector>
+#include <memory>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
@ -27,40 +28,35 @@ namespace kernel {
 template <typename T>
 class ConcatV2GpuFwdKernel : public GpuKernel {
  public:
-  ConcatV2GpuFwdKernel() : axis_(0), output_size_(0) {}
+  ConcatV2GpuFwdKernel()
+      : axis_(0),
+        input_num_(1),
+        output_size_(0),
+        all_size_before_axis_(1),
+        all_size_axis_(1),
+        inputs_host_(nullptr),
+        len_axis_(nullptr) {}
   ~ConcatV2GpuFwdKernel() override = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
   const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
   const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
-  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    if (inputs.size() == 2) {
-      T *input_0 = GetDeviceAddress<T>(inputs, 0);
-      T *input_1 = GetDeviceAddress<T>(inputs, 1);
-      T *output = GetDeviceAddress<T>(outputs, 0);
-      ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], input_0, input_1, output,
-                   reinterpret_cast<cudaStream_t>(stream_ptr));
-    }
-    if (inputs.size() == 3) {
-      T *input_0 = GetDeviceAddress<T>(inputs, 0);
-      T *input_1 = GetDeviceAddress<T>(inputs, 1);
-      T *input_2 = GetDeviceAddress<T>(inputs, 2);
-      T *output = GetDeviceAddress<T>(outputs, 0);
-      ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], input_0, input_1, input_2, output,
-                   reinterpret_cast<cudaStream_t>(stream_ptr));
-    }
-    if (inputs.size() == 4) {
-      T *input_0 = GetDeviceAddress<T>(inputs, 0);
-      T *input_1 = GetDeviceAddress<T>(inputs, 1);
-      T *input_2 = GetDeviceAddress<T>(inputs, 2);
-      T *input_3 = GetDeviceAddress<T>(inputs, 3);
-      T *output = GetDeviceAddress<T>(outputs, 0);
-      ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], w_[3], input_0, input_1, input_2, input_3, output,
-                   reinterpret_cast<cudaStream_t>(stream_ptr));
-    }
+    T *output = GetDeviceAddress<T>(outputs, 0);
+    T **inputs_device = GetDeviceAddress<T *>(workspace, 0);
+    int *len_axis_device = GetDeviceAddress<int>(workspace, 1);
+    for (size_t i = 0; i < inputs.size(); i++) {
+      inputs_host_[i] = GetDeviceAddress<T>(inputs, i);
+    }
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_,
+                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                               "ConcatV2 opt cudaMemcpyAsync inputs failed");
+    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_,
+                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                               "ConcatV2 opt cudaMemcpyAsync length on axis failed");
+    ConcatKernel(output_size_, input_num_, all_size_before_axis_, all_size_axis_, len_axis_device, inputs_device,
+                 output, reinterpret_cast<cudaStream_t>(stream_ptr));
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
@ -74,25 +70,34 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
       axis_ += SizeToInt(input_shape.size());
     }
-    auto input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-    for (size_t i = 0; i < input_num; i++) {
-      auto input_size = sizeof(T);
+    input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node));
+    inputs_host_ = std::make_unique<T *[]>(input_num_);
+    len_axis_ = std::make_unique<int[]>(input_num_);
+    for (int i = 0; i < input_num_; i++) {
+      int input_size = 1;
       auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
       for (size_t j = 0; j < input_shape.size(); j++) {
         input_size *= SizeToInt(input_shape[j]);
-        if (j >= IntToSize(axis_)) {
-          w_[i] *= SizeToInt(input_shape[j]);
-        }
-        input_size_list_.push_back(input_size);
       }
+      input_size_list_.push_back(IntToSize(input_size * sizeof(T)));
+      len_axis_[i] = SizeToInt(input_shape[axis_]);
     }
+    workspace_size_list_.push_back(sizeof(T *) * input_num_);
+    workspace_size_list_.push_back(sizeof(int) * input_num_);
     auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
-    output_size_ = sizeof(T);
-    for (size_t i = 0; i < output_shape.size(); i++) {
+    output_size_ = 1;
+    for (int i = 0; i < SizeToInt(output_shape.size()); i++) {
       output_size_ *= output_shape[i];
+      if (i > axis_) {
+        all_size_before_axis_ *= output_shape[i];
+        all_size_axis_ *= output_shape[i];
+      }
+      if (i == axis_) {
+        all_size_before_axis_ *= output_shape[i];
+      }
     }
-    output_size_list_.push_back(output_size_);
+    output_size_list_.push_back(IntToSize(output_size_ * sizeof(T)));
     InitSizeLists();
     return true;
@ -103,11 +108,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
  private:
   bool CheckParam(const CNodePtr &kernel_node) {
-    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-    if (input_num < 2 || input_num > 4) {
-      MS_LOG(ERROR) << "Input number is " << input_num << ", but ConcatV2GpuFwdKernel needs inputs between 2 and 4.";
-      return false;
-    }
     size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
     if (output_num != 1) {
       MS_LOG(ERROR) << "Output number is " << output_num << ", but ConcatV2GpuFwdKernel needs 1 output.";
@ -115,9 +115,13 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     }
     return true;
   }
-  int w_[4] = {1, 1, 1, 1};
   int axis_;
-  size_t output_size_;
+  int input_num_;
+  int output_size_;
+  int all_size_before_axis_;
+  int all_size_axis_;
+  std::unique_ptr<T *[]> inputs_host_;
+  std::unique_ptr<int[]> len_axis_;
   std::vector<size_t> input_size_list_;
   std::vector<size_t> output_size_list_;
   std::vector<size_t> workspace_size_list_;
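The new ConcatV2 path locates, for every output element, the input block and offset it comes from. In the terms Init() computes above, all_size_axis is the product of the output dimensions after the concat axis, and all_size_before_axis is that product times the output's length along the axis. A host-side sketch of the same arithmetic (our illustration), concatenating shapes [2,2] and [2,3] along axis 1:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> len_axis = {2, 3};  // per-input length along the concat axis
  int all_size_axis = 1;               // product of dims after the axis (none here)
  int all_size_before_axis = 5 * 1;    // output axis length (2 + 3) * all_size_axis
  for (int pos = 0; pos < 2 * 5; ++pos) {
    int num = pos % all_size_before_axis / all_size_axis;  // coordinate along the axis
    int block = -1, axis_inc = 0;
    for (size_t i = 0; i < len_axis.size() && axis_inc <= num; ++i) {
      ++block;                 // advance to the input block covering `num`
      axis_inc += len_axis[i];
    }
    axis_inc -= len_axis[block];  // start coordinate of that block on the axis
    int block_pos = pos / all_size_before_axis * len_axis[block] * all_size_axis +
                    (num - axis_inc) * all_size_axis + pos % all_size_axis;
    printf("out[%d] <- input%d[%d]\n", pos, block, block_pos);
  }
  return 0;
}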


@ -0,0 +1,31 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(
Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
SplitGpuFwdKernel, float)
MS_REG_GPU_KERNEL_ONE(Split,
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
SplitGpuFwdKernel, int)
MS_REG_GPU_KERNEL_ONE(
Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
SplitGpuFwdKernel, half)
} // namespace kernel
} // namespace mindspore


@ -0,0 +1,153 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
#define MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
#include <vector>
#include <memory>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class SplitGpuFwdKernel : public GpuKernel {
public:
SplitGpuFwdKernel()
: axis_(0),
output_num_(1),
input_size_(1),
axis_step_(1),
all_size_before_axis_(1),
all_size_axis_(1),
outputs_host_(nullptr) {}
~SplitGpuFwdKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input = GetDeviceAddress<T>(inputs, 0);
T **outputs_device = GetDeviceAddress<T *>(workspace, 0);
for (size_t i = 0; i < outputs.size(); i++) {
outputs_host_[i] = GetDeviceAddress<T>(outputs, i);
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Split opt cudaMemcpyAsync outputs failed");
SplitKernel(input_size_, axis_step_, all_size_before_axis_, all_size_axis_, input, outputs_device,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
axis_ = GetAttr<int>(kernel_node, "axis");
if (axis_ < 0) {
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
axis_ += SizeToInt(input_shape.size());
}
output_num_ = GetAttr<int>(kernel_node, "output_num");
if (!CheckParam(kernel_node)) {
return false;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
input_size_ = 1;
all_size_before_axis_ = 1;
all_size_axis_ = 1;
for (int i = 0; i < SizeToInt(input_shape.size()); i++) {
input_size_ *= input_shape[i];
if (i > axis_) {
all_size_before_axis_ *= input_shape[i];
all_size_axis_ *= input_shape[i];
}
if (i == axis_) {
all_size_before_axis_ *= input_shape[i];
}
}
input_size_list_.push_back(IntToSize(input_size_ * sizeof(T)));
axis_step_ = input_shape[axis_] / output_num_;
for (int i = 0; i < output_num_; i++) {
size_t output_size = 1;
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i);
for (size_t j = 0; j < output_shape.size(); j++) {
output_size *= output_shape[j];
}
output_size_list_.push_back(output_size * sizeof(T));
}
workspace_size_list_.push_back(sizeof(T *) * output_num_);
InitSizeLists();
outputs_host_ = std::make_unique<T *[]>(output_num_);
return true;
}
protected:
void InitSizeLists() override {}
private:
bool CheckParam(const CNodePtr &kernel_node) {
auto input_num = AnfAlgo::GetInputTensorNum(kernel_node);
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
int dims = SizeToInt(input_shape.size());
int output_num = SizeToInt(AnfAlgo::GetOutputTensorNum(kernel_node));
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but Split needs 1 input.";
return false;
}
if (dims == 0) {
MS_LOG(ERROR) << "Input dims is " << dims << ", scalar is not supported.";
return false;
}
if (axis_ < -dims || axis_ >= dims) {
MS_LOG(ERROR) << "Attr axis " << axis_ << " must be in " << -dims << "~" << dims;
return false;
}
if (output_num_ > SizeToInt(input_shape[axis_])) {
MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must less than" << input_shape[axis_];
return false;
}
if (input_shape[axis_] % output_num_ != 0) {
MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must be divided by" << input_shape[axis_];
return false;
}
if (output_num_ != output_num) {
MS_LOG(ERROR) << "Output num is " << output_num << ", but need " << output_num_;
return false;
}
return true;
}
int axis_;
int output_num_;
int input_size_;
int axis_step_;
int all_size_before_axis_;
int all_size_axis_;
std::unique_ptr<T *[]> outputs_host_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
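CheckParam above boils down to a handful of shape rules. A condensed host-side restatement (our sketch; it omits the input/output-count checks against the node itself):

#include <cstdio>
#include <vector>

bool SplitParamsValid(const std::vector<int> &shape, int axis, int output_num) {
  int dims = static_cast<int>(shape.size());
  if (dims == 0) return false;                     // scalar input is rejected
  if (axis < -dims || axis >= dims) return false;  // axis out of range
  if (axis < 0) axis += dims;                      // normalize, as Init() does
  if (output_num <= 0 || output_num > shape[axis]) return false;
  return shape[axis] % output_num == 0;            // only equal-sized splits
}

int main() {
  printf("%d\n", SplitParamsValid({2, 4}, 1, 2));   // 1: dim 4 splits into 2 blocks of 2
  printf("%d\n", SplitParamsValid({2, 4}, 1, 3));   // 0: 3 does not divide 4
  printf("%d\n", SplitParamsValid({2, 4}, -3, 2));  // 0: axis out of range
  return 0;
}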


@ -0,0 +1,29 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(TopK,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeInt32),
TopKGpuKernel, float, int)
} // namespace kernel
} // namespace mindspore


@ -0,0 +1,110 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>
class TopKGpuKernel : public GpuKernel {
public:
TopKGpuKernel() : sorted_(false), outer_size_(1), inner_size_(1), k_(1), use_share_mem_(true), ceil_power2_(0) {}
~TopKGpuKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
S *k = GetDeviceAddress<S>(inputs, 1);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
S *indices = GetDeviceAddress<S>(outputs, 1);
T *data_buff = nullptr;
S *index_buff = nullptr;
if (use_share_mem_ == false) {
data_buff = GetDeviceAddress<T>(workspaces, 0);
index_buff = GetDeviceAddress<S>(workspaces, 1);
}
TopK(outer_size_, inner_size_, input_addr, k, output_addr, indices, data_buff, index_buff,
reinterpret_cast<cudaStream_t>(stream_ptr));
if (sorted_ == false) {
std::cout << "================BitonicSortByKey" << std::endl;
BitonicSortByKey(outer_size_, k_, output_addr, indices, data_buff, index_buff,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
return true;
}
bool Init(const CNodePtr &kernel_node) override {
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
for (size_t i = 0; i < input_shapes.size() - 1; i++) {
outer_size_ *= input_shapes[i];
}
inner_size_ = input_shapes[input_shapes.size() - 1];
k_ = output_shapes[output_shapes.size() - 1];
sorted_ = GetAttr<bool>(kernel_node, "sorted");
ceil_power2_ = RoundUpPower2(inner_size_);
size_t buffer_size = ceil_power2_ * (sizeof(T) + sizeof(S));
if (buffer_size > SHARED_MEM_PER_BLOCK) {
use_share_mem_ = false;
MS_LOG(WARNING) << "CUDA share memory not enough, sort with RAM";
}
InitSizeLists();
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(outer_size_ * inner_size_ * sizeof(T));
input_size_list_.push_back(sizeof(S));
output_size_list_.push_back(outer_size_ * k_ * sizeof(T));
output_size_list_.push_back(outer_size_ * k_ * sizeof(S));
if (use_share_mem_ == false) {
workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(T));
workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(S));
}
}
private:
bool sorted_;
int outer_size_;
int inner_size_;
int k_;
bool use_share_mem_;
int ceil_power2_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
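The Init()/InitSizeLists() logic above hinges on padding each sorted row to a power of two and falling back to a global-memory workspace when the padded row no longer fits in shared memory. A host-side sketch of that sizing decision (our illustration; the 48 KB budget is an assumed stand-in for SHARED_MEM_PER_BLOCK, which the runtime takes from the device properties):

#include <cstdio>

int RoundUpPower2(int v) {  // same bit trick as in topk_impl.cu below
  v--;
  v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16;
  return v + 1;
}

int main() {
  const int kSharedMemPerBlock = 48 * 1024;  // assumed per-block budget
  int inner = 5000;                          // elements per sorted row
  int ceil_power2 = RoundUpPower2(inner);    // 8192
  size_t bytes = ceil_power2 * (sizeof(float) + sizeof(int));  // data + index buffers
  printf("padded=%d bytes=%zu use_shared=%d\n", ceil_power2, bytes, bytes <= kSharedMemPerBlock);
  return 0;  // prints: padded=8192 bytes=65536 use_shared=0 -> sort in global memory
}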


@ -116,16 +116,16 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const
(re-indentation only; both sides of this hunk are textually identical)
                                                        output);
    case BROADCAST_TYPE_REALDIV:
      return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                        output);
    case BROADCAST_TYPE_MUL:
      return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                    output);
    case BROADCAST_TYPE_SUB:
      return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                    output);
    case BROADCAST_TYPE_ADD:
      return BroadcastOperator<T, S, AddFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1,
                                                    output);
  }
}
@ -176,6 +176,28 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons
  NoBroadcastKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(nums, op, input0, input1, output);
}
+template <typename T>
+__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0,
+                                  const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) {
+    int i = pos / (o1 * o2 * o3) % o0;
+    int j = pos / (o2 * o3) % o1;
+    int k = pos / o3 % o2;
+    int l = pos % o3;
+    int input_idx = Index(i, i0) * i1 * i2 * i3 + Index(j, i1) * i2 * i3 + Index(k, i2) * i3 + Index(l, i3);
+    output_addr[pos] = input_addr[input_idx];
+  }
+}
+template <typename T>
+void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
+                 const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) {
+  int nums = o0 * o1 * o2 * o3;
+  BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr,
+                                                                  output_addr);
+}
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
                        const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
                        enum BroadcastOpType op, const float *input0, const float *input1, bool *output,
@ -204,5 +226,11 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *
                          bool *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
                          half *output, cudaStream_t stream);
-template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
-                          int *output, cudaStream_t stream);
+template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output,
+                          cudaStream_t stream);
+template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
+                          const int &o2, const int &o3, const float *input_addr, float *output_addr,
+                          cudaStream_t stream);
+template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
+                          const int &o2, const int &o3, const half *input_addr, half *output_addr, cudaStream_t stream);


@ -41,4 +41,8 @@ template <typename T, typename S>
void NoBroadcast(const int &size, enum BroadcastOpType op, const T *input0, const T *input1, S *output,
                 cudaStream_t stream);
+template <typename T>
+void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
+                 const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_


@ -19,90 +19,51 @@
#include <cuda_runtime.h>
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
template <typename T>
-__global__ void Concat(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
-    int n = pos / (w1 + w2);
-    int m = pos % (w1 + w2);
-    output[pos] = m >= w1 ? input_2[n * w2 + m - w1] : input_1[n * w1 + m];
-  }
-  return;
-}
-template <typename T>
-__global__ void Concat(const size_t size, const int w1, const int w2, const int w3,
-                       const T* input_1, const T* input_2, const T* input_3, T* output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
-    int n = pos / (w1 + w2 + w3);
-    int m = pos % (w1 + w2 + w3);
-    output[pos] = m < w1 ? input_1[n * w1 + m] :
-                  m < w1 + w2 ? input_2[n * w2 + m - w1] :
-                  input_3[n * w3 + m - w1 - w2];
-  }
-  return;
-}
-template <typename T>
-__global__ void Concat(const size_t size, const int w1, const int w2, const int w3, const int w4,
-                       const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
-    int n = pos / (w1 + w2 + w3 + w4);
-    int m = pos % (w1 + w2 + w3 + w4);
-    output[pos] = m < w1 ? input_1[n * w1 + m] :
-                  m < w1 + w2 ? input_2[n * w2 + m - w1]:
-                  m < w1 + w2 + w3 ? input_3[n * w3 + m - w1 - w2]:
-                  input_4[n * w4 + m - w1 - w2 - w3];
-  }
-  return;
-}
-template <typename T>
-void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output,
-                  cudaStream_t cuda_stream) {
-  Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, input_1, input_2, output);
-  return;
-}
-template <typename T>
-void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
-                  const T* input_1, const T* input_2, const T* input_3, T* output,
-                  cudaStream_t cuda_stream) {
-  Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, input_1, input_2, input_3, output);
-  return;
-}
-template <typename T>
-void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
-                  const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output,
-                  cudaStream_t cuda_stream) {
-  Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, w4, input_1,
-                                                            input_2, input_3, input_4, output);
-  return;
-}
-template void ConcatKernel(const size_t size, const int w1, const int w2, const float* input_1, const float* input_2,
-                           float* output, cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int w1, const int w2, const int* input_1, const int* input_2,
-                           int* output, cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int w1, const int w2, const half* input_1, const half* input_2,
-                           half* output, cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
-                           const float* input_1, const float* input_2, const float* input_3,
-                           float* output, cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
-                           const int* input_1, const int* input_2, const int* input_3,
-                           int* output, cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
-                           const half* input_1, const half* input_2, const half* input_3,
-                           half* output, cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
-                           const float* input_1, const float* input_2, const float* input_3, const float* input_4,
-                           float* output, cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
-                           const int* input_1, const int* input_2, const int* input_3, const int* input_4,
-                           int* output, cudaStream_t cuda_stream);
-template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
-                           const half* input_1, const half* input_2, const half* input_3, const half* input_4,
-                           half* output, cudaStream_t cuda_stream);
+__global__ void Concat(const int size, const int input_num,
+                       const int all_size_before_axis, const int all_size_axis,
+                       int* len_axis, T** inputs, T* output) {
+  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
+    int num = pos % all_size_before_axis / all_size_axis;
+    int block = -1;
+    int axis_inc = 0;
+    int block_len = 0;
+    for (int i = 0; i < input_num; i++) {
+      if (axis_inc <= num) {
+        block++;
+        axis_inc += len_axis[i];
+      } else {
+        break;
+      }
+    }
+    block_len = len_axis[block];
+    axis_inc -= len_axis[block];
+    int block_pos = pos / all_size_before_axis * block_len * all_size_axis +
+                    (num - axis_inc) * all_size_axis + pos % all_size_axis;
+    output[pos] = inputs[block][block_pos];
+  }
+  return;
+}
+template <typename T>
+void ConcatKernel(const int size, const int input_num,
+                  const int all_size_before_axis, const int all_size_axis,
+                  int* len_axis, T** inputs, T* output,
+                  cudaStream_t cuda_stream) {
+  Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num,
+                                                            all_size_before_axis, all_size_axis,
+                                                            len_axis, inputs, output);
+  return;
+}
+template void ConcatKernel(const int size, const int input_num,
+                           const int all_size_before_axis, const int all_size_axis,
+                           int* len_axis, float** inputs, float* output,
+                           cudaStream_t cuda_stream);
+template void ConcatKernel(const int size, const int input_num,
+                           const int all_size_before_axis, const int all_size_axis,
+                           int* len_axis, int** inputs, int* output,
+                           cudaStream_t cuda_stream);
+template void ConcatKernel(const int size, const int input_num,
+                           const int all_size_before_axis, const int all_size_axis,
+                           int* len_axis, half** inputs, half* output,
+                           cudaStream_t cuda_stream);


@ -19,13 +19,8 @@
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
-void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output,
-                  cudaStream_t cuda_stream);
-template <typename T>
-void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
-                  const T* input_1, const T* input_2, const T* input_3, T* output,
-                  cudaStream_t cuda_stream);
-template <typename T>
-void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
-                  const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output,
-                  cudaStream_t cuda_stream);
+void ConcatKernel(const int size, const int input_num,
+                  const int all_size_before_axis, const int all_size_axis,
+                  int* len_axis, T** inputs, T* output,
+                  cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CONCATV2IMPL_H_


@ -15,9 +15,9 @@
 */
#include "momentum_impl.cuh"
-template <typename T, typename S>
+template <typename T, typename S, typename G>
__global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate,
-                                             const T *gradient, const S *momentum) {
+                                             const G *gradient, const S *momentum) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
    accumulation[i] = momentum[0] * accumulation[i] + gradient[i];
    variable[i] -= learning_rate[0] * accumulation[i];
@ -34,19 +34,32 @@ __global__ void MomentumUpdateVariableKernel(const size_t size, half *variable,
  }
  return;
}
-template <typename T, typename S>
-void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient,
+template <>
+__global__ void MomentumUpdateVariableKernel(const size_t size, float *variable, float *accumulation,
+                                             const float *learning_rate, const half *gradient,
+                                             const float *momentum) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
+    accumulation[i] = momentum[0] * accumulation[i] + __half2float(gradient[i]);
+    variable[i] -= learning_rate[0] * accumulation[i];
+  }
+  return;
+}
+template <typename T, typename S, typename G>
+void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
                            const S *momentum, cudaStream_t cuda_stream) {
  MomentumUpdateVariableKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, variable, accumulation,
                                                                                 learning_rate, gradient, momentum);
  return;
}
-template void MomentumUpdateVariable<float, float>(const size_t size, float *variable, float *accumulation,
-                                                   const float *learning_rate, const float *gradient,
-                                                   const float *momentum, cudaStream_t cuda_stream);
-template void MomentumUpdateVariable<half, half>(const size_t size, half *variable, half *accumulation,
-                                                 const half *learning_rate, const half *gradient,
-                                                 const half *momentum, cudaStream_t cuda_stream);
-template void MomentumUpdateVariable<half, float>(const size_t size, half *variable, half *accumulation,
-                                                  const float *learning_rate, const half *gradient,
-                                                  const float *momentum, cudaStream_t cuda_stream);
+template void MomentumUpdateVariable<float, float, float>(const size_t size, float *variable, float *accumulation,
+                                                          const float *learning_rate, const float *gradient,
+                                                          const float *momentum, cudaStream_t cuda_stream);
+template void MomentumUpdateVariable<half, half, half>(const size_t size, half *variable, half *accumulation,
+                                                       const half *learning_rate, const half *gradient,
+                                                       const half *momentum, cudaStream_t cuda_stream);
+template void MomentumUpdateVariable<half, float, half>(const size_t size, half *variable, half *accumulation,
+                                                        const float *learning_rate, const half *gradient,
+                                                        const float *momentum, cudaStream_t cuda_stream);
+template void MomentumUpdateVariable<float, float, half>(const size_t size, float *variable, float *accumulation,
+                                                         const float *learning_rate, const half *gradient,
+                                                         const float *momentum, cudaStream_t cuda_stream);
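The newly added <float, float, half> specialization widens each fp16 gradient with __half2float before accumulating, so the optimizer state stays in fp32. A scalar host-side sketch of one update step (our illustration, not part of the patch):

#include <cstdio>

int main() {
  float variable = 1.0f, accumulation = 0.5f;
  const float lr = 0.1f, momentum = 0.9f;
  const float gradient = 0.25f;  // stands in for __half2float(gradient[i])
  accumulation = momentum * accumulation + gradient;  // 0.7
  variable -= lr * accumulation;                      // 0.93
  printf("accum=%g var=%g\n", accumulation, variable);
  return 0;
}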


@ -18,8 +18,8 @@
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_
#include "runtime/device/gpu/cuda_common.h"
-template <typename T, typename S>
-void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient,
+template <typename T, typename S, typename G>
+void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
                            const S *momentum, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_


@ -0,0 +1,50 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh"
template <typename T>
__global__ void Split(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const T* input, T** outputs) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
int num = pos % all_size_before_axis / all_size_axis;
int block = num / axis_step;
int block_pos = pos / all_size_before_axis * axis_step * all_size_axis +
num % axis_step * all_size_axis + pos % all_size_axis;
outputs[block][block_pos] = input[pos];
}
return;
}
template <typename T>
void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) {
Split<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, axis_step, all_size_before_axis,
all_size_axis, input, outputs);
return;
}
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const float* input, float** outputs,
cudaStream_t cuda_stream);
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const int* input, int** outputs,
cudaStream_t cuda_stream);
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const half* input, half** outputs,
cudaStream_t cuda_stream);
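SplitKernel is the inverse of the new Concat mapping: num is the coordinate along the split axis, num / axis_step picks the output block, and num % axis_step is the coordinate inside it. A host-side sketch (our illustration) splitting a [2,4] tensor into two along axis 1:

#include <cstdio>

int main() {
  int axis_step = 4 / 2;             // input_shape[axis] / output_num, as in Init()
  int all_size_axis = 1;             // product of dims after the axis (none here)
  int all_size_before_axis = 4 * 1;  // axis length * all_size_axis
  for (int pos = 0; pos < 2 * 4; ++pos) {
    int num = pos % all_size_before_axis / all_size_axis;  // coordinate along the axis
    int block = num / axis_step;                           // which output tensor
    int block_pos = pos / all_size_before_axis * axis_step * all_size_axis +
                    num % axis_step * all_size_axis + pos % all_size_axis;
    printf("in[%d] -> out%d[%d]\n", pos, block, block_pos);
  }
  return 0;
}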


@ -0,0 +1,24 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_


@ -0,0 +1,162 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
#include <limits>
#include <algorithm>
int RoundUpPower2(int v) {
v--;
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
v++;
return v;
}
template <typename T>
__inline__ __device__ void Swap(T *lhs, T *rhs) {
T tmp = lhs[0];
lhs[0] = rhs[0];
rhs[0] = tmp;
}
template <typename T, typename S>
__global__ void TopkKernel(const int outer, const int inner, const int ceil_power2, const T *input, const S *k,
T *output, S *indices, T *data_buff, S *index_buff) {
// default: sort with share memory
extern __shared__ T share_mem[];
T *data_arr = share_mem;
S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2);
// sort with RAM
if (data_buff != nullptr && index_buff != nullptr) {
data_arr = data_buff + blockIdx.x * ceil_power2;
index_arr = index_buff + blockIdx.x * ceil_power2;
}
for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) {
data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max();
index_arr[i] = i;
}
__syncthreads();
for (size_t i = 2; i <= ceil_power2; i <<= 1) {
for (size_t j = (i >> 1); j > 0; j >>= 1) {
for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) {
size_t tid_comp = tid ^ j;
if (tid_comp > tid) {
if ((tid & i) == 0) {
if (data_arr[tid] > data_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
} else {
if (data_arr[tid] < data_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
}
}
}
__syncthreads();
}
}
for (size_t tid = threadIdx.x; tid < k[0]; tid += blockDim.x) {
output[blockIdx.x * k[0] + tid] = data_arr[inner - tid - 1];
indices[blockIdx.x * k[0] + tid] = index_arr[inner - tid - 1];
}
}
template <typename T, typename S>
void TopK(const int &outer, const int &inner, const T *input, const S *k, T *output, S *indices, T *data_buff,
S *index_buff, cudaStream_t stream) {
int ceil_power2 = RoundUpPower2(inner);
int share_mem = (data_buff == nullptr) ? ceil_power2 * (sizeof(T) + sizeof(S)) : 0;
int thread = std::min(ceil_power2, GET_THREADS);
TopkKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, k, output, indices, data_buff,
index_buff);
}
template <typename T, typename S>
__global__ void BitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input,
S *indices, T *data_buff, S *index_buff) {
// default: sort with share memory
extern __shared__ T share_mem[];
T *data_arr = share_mem;
S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2);
// sort with RAM
if (data_buff != nullptr && index_buff != nullptr) {
data_arr = data_buff + blockIdx.x * ceil_power2;
index_arr = index_buff + blockIdx.x * ceil_power2;
}
for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) {
data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max();
index_arr[i] = (i < inner) ? indices[blockIdx.x * inner + i] : std::numeric_limits<S>::max();
}
__syncthreads();
for (size_t i = 2; i <= ceil_power2; i <<= 1) {
for (size_t j = (i >> 1); j > 0; j >>= 1) {
for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) {
size_t tid_comp = tid ^ j;
if (tid_comp > tid) {
if ((tid & i) == 0) {
if (index_arr[tid] > index_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
} else {
if (index_arr[tid] < index_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
}
}
}
__syncthreads();
}
}
for (size_t tid = threadIdx.x; tid < inner; tid += blockDim.x) {
input[blockIdx.x * inner + tid] = data_arr[tid];
indices[blockIdx.x * inner + tid] = index_arr[tid];
}
}
template <typename T, typename S>
void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff,
cudaStream_t stream) {
int ceil_power2 = RoundUpPower2(inner);
size_t share_mem = ceil_power2 * (sizeof(T) + sizeof(S));
if (share_mem > SHARED_MEM_PER_BLOCK) {
share_mem = 0;
} else {
data_buff = nullptr;
index_buff = nullptr;
}
int thread = std::min(ceil_power2, GET_THREADS);
BitonicSortByKeyKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, indices, data_buff,
index_buff);
}
template void TopK(const int &outer, const int &inner, const float *input_addr, const int *k, float *output,
int *indices, float *data_buff, int *index_buff, cudaStream_t stream);
template void BitonicSortByKey(const int &outer, const int &inner, float *input, int *indices, float *data_buff,
int *index_buff, cudaStream_t stream);
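The two kernels above share one bitonic network: the (i, j, tid ^ j) loop nest, with (tid & i) choosing the direction of each compare-exchange, which is why rows are padded to a power of two. A serial host-side rendering of the same network (our sketch, not part of the patch), sorting ascending:

#include <cstdio>
#include <utility>

int main() {
  int data[8] = {5, 1, 7, 3, 8, 2, 6, 4};  // length must be a power of two
  const int n = 8;
  for (int i = 2; i <= n; i <<= 1) {       // stage: size of the bitonic sequences
    for (int j = i >> 1; j > 0; j >>= 1) { // substage: compare-exchange distance
      for (int tid = 0; tid < n; ++tid) {
        int tid_comp = tid ^ j;
        if (tid_comp > tid) {
          bool ascending = (tid & i) == 0;  // direction of this subsequence
          if ((data[tid] > data[tid_comp]) == ascending) std::swap(data[tid], data[tid_comp]);
        }
      }
    }
  }
  for (int v : data) printf("%d ", v);  // prints: 1 2 3 4 5 6 7 8
  printf("\n");
  return 0;
}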


@ -0,0 +1,32 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
#include <cuda_runtime.h>
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void TopK(const int &outer, const int &inner, const T *input_addr, const S *k, T *output, S *indices, T *data_buff,
S *index_buff, cudaStream_t stream);
template <typename T, typename S>
void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff,
cudaStream_t stream);
int RoundUpPower2(int v);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_


@ -103,6 +103,35 @@ __global__ void ZeroslikeKernel(T *output, size_t count) {
  return;
}
template <typename T>
+__global__ void AbsKernel(T *input, T *output, size_t count) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
+    output[i] = abs(input[i]);
+  }
+  return;
+}
+template <>
+__global__ void AbsKernel(half *input, half *output, size_t count) {
+  half zero = 0.0;
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
+    output[i] = input[i] < zero ? -input[i] : input[i];
+  }
+  return;
+}
+template <typename T>
+__global__ void FloorKernel(T *input, T *output, size_t count) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
+    output[i] = floor(input[i]);
+  }
+  return;
+}
+template <>
+__global__ void FloorKernel(half *input, half *output, size_t count) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
+    output[i] = hfloor(input[i]);
+  }
+  return;
+}
+template <typename T>
void Exponential(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
  ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
  return;
@ -147,6 +176,16 @@ void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream) {
  ZeroslikeKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(output, count);
  return;
}
+template <typename T>
+void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
+  AbsKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
+  return;
+}
+template <typename T>
+void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
+  FloorKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
+  return;
+}
template void Exponential<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Logarithm<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
@ -156,6 +195,8 @@ template void Square<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Sqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Rsqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Zeroslike<float>(float *output, size_t count, cudaStream_t cuda_stream);
+template void Abs<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
+template void Floor<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Exponential<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Logarithm<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Negative<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
@ -164,3 +205,5 @@ template void Square<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Sqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Rsqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Zeroslike<half>(half *output, size_t count, cudaStream_t cuda_stream);
+template void Abs<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
+template void Floor<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);


@ -34,5 +34,9 @@ template <typename T>
void Rsqrt(T *input, T *output, size_t count, cudaStream_t cuda_stream);
template <typename T>
void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream);
+template <typename T>
+void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream);
+template <typename T>
+void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_


@ -88,6 +88,12 @@ class GpuKernelRegister {
  static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \
  static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR, \
                                                                         []() { return new OPCLASS<T, S>(); });
+// register of mixed accuracy kernels which use template and maintain three typenames
+#define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \
+  static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \
+  static const GpuKernelRegister g_##OPNAME##_##T##_##S##_##G##_gpu_kernel_reg( \
+    #OPNAME, ATTR, []() { return new OPCLASS<T, S, G>(); });
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_GPUKERNELFACTORY_H_
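For reference, a simplified sketch of what one MS_REG_GPU_KERNEL_THREE line expands to (our illustration; `attr` stands in for the KernelAttr expression, and the registrar name is built from the op and the three type arguments):

// MS_REG_GPU_KERNEL_THREE(ApplyMomentum, attr, MomentumGpuKernel, float, float, half)
// expands, roughly, to:
static_assert(std::is_base_of<GpuKernel, MomentumGpuKernel<float, float, half>>::value, " must be base of GpuKernel");
static const GpuKernelRegister g_ApplyMomentum_float_float_half_gpu_kernel_reg(
  "ApplyMomentum", attr, []() { return new MomentumGpuKernel<float, float, half>(); });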


@ -46,5 +46,13 @@ MS_REG_GPU_KERNEL_ONE(Sqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                      UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Rsqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                      UnaryOpGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+                      UnaryOpGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+                      UnaryOpGpuKernel, half)
+MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+                      UnaryOpGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+                      UnaryOpGpuKernel, half)
} // namespace kernel
} // namespace mindspore


@ -36,6 +36,8 @@ enum UnaryOptype {
  UNARY_OP_SQUARE,
  UNARY_OP_SQRT,
  UNARY_OP_RSQRT,
+  UNARY_OP_ABS,
+  UNARY_OP_FLOOR,
  UNARY_OP_INVALID_TYPE = 255
};
static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP},
@ -45,7 +47,9 @@ static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP},
                                                                    {"ZerosLike", UNARY_OP_ZEROSLIKE},
                                                                    {"Square", UNARY_OP_SQUARE},
                                                                    {"Sqrt", UNARY_OP_SQRT},
-                                                                    {"Rsqrt", UNARY_OP_RSQRT}};
+                                                                    {"Rsqrt", UNARY_OP_RSQRT},
+                                                                    {"Abs", UNARY_OP_ABS},
+                                                                    {"Floor", UNARY_OP_FLOOR}};
template <typename T>
class UnaryOpGpuKernel : public GpuKernel {
 public:
@ -100,6 +104,14 @@ class UnaryOpGpuKernel : public GpuKernel {
      Zeroslike(output_addr, output_size_ / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
      return true;
    }
+    case UNARY_OP_ABS: {
+      Abs(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
+      break;
+    }
+    case UNARY_OP_FLOOR: {
+      Floor(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
+      break;
+    }
    default: {
      MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported.";
    }


@ -34,15 +34,15 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNorm,
MS_REG_GPU_KERNEL_ONE(FusedBatchNorm,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16),
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
                      FusedBatchNormGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(BatchNorm,
                      KernelAttr()
@ -60,15 +60,15 @@ MS_REG_GPU_KERNEL_ONE(BatchNorm,
MS_REG_GPU_KERNEL_ONE(BatchNorm,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16),
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
                      FusedBatchNormGpuKernel, half)
} // namespace kernel
} // namespace mindspore
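Context for the type changes above (our inference, not stated in the patch): cuDNN's batch-normalization routines expect the scale, bias, mean, and variance tensors in fp32 even when the data tensor is fp16, so only the first input and output stay Float16 here; the GetDeviceAddress<float> changes in the kernels below follow the same rule.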


@ -56,17 +56,17 @@ class FusedBatchNormGpuKernel : public GpuKernel {
      return true;
    }
    auto x = GetDeviceAddress<T>(inputs, 0);
-    auto scale = GetDeviceAddress<T>(inputs, 1);
-    auto bias = GetDeviceAddress<T>(inputs, 2);
-    auto runing_mean = GetDeviceAddress<T>(inputs, 3);
-    auto runnig_variance = GetDeviceAddress<T>(inputs, 4);
+    auto scale = GetDeviceAddress<float>(inputs, 1);
+    auto bias = GetDeviceAddress<float>(inputs, 2);
+    auto runing_mean = GetDeviceAddress<float>(inputs, 3);
+    auto runnig_variance = GetDeviceAddress<float>(inputs, 4);
    auto y = GetDeviceAddress<T>(outputs, 0);
    const float alpha = 1;
    const float beta = 0;
    if (is_train_) {
-      auto save_mean = GetDeviceAddress<T>(outputs, 3);
-      auto save_variance = GetDeviceAddress<T>(outputs, 4);
+      auto save_mean = GetDeviceAddress<float>(outputs, 3);
+      auto save_variance = GetDeviceAddress<float>(outputs, 4);
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnBatchNormalizationForwardTraining(handle_, mode_, &alpha, &beta, x_desc_, x, y_desc_, y,
                                               scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean,


@ -33,12 +33,12 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNormGrad,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat16)
                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
+                        .AddInputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16),
+                        .AddOutputAttr(kNumberTypeFloat32)
+                        .AddOutputAttr(kNumberTypeFloat32),
                      FusedBatchNormGradGpuKernel, half)
} // namespace kernel
} // namespace mindspore


@ -55,12 +55,12 @@ class FusedBatchNormGradGpuKernel : public GpuKernel {
    }
    auto dy = GetDeviceAddress<T>(inputs, 0);
    auto x = GetDeviceAddress<T>(inputs, 1);
-    auto scale = GetDeviceAddress<T>(inputs, 2);
-    auto save_mean = GetDeviceAddress<T>(inputs, 3);
-    auto save_variance = GetDeviceAddress<T>(inputs, 4);
+    auto scale = GetDeviceAddress<float>(inputs, 2);
+    auto save_mean = GetDeviceAddress<float>(inputs, 3);
+    auto save_variance = GetDeviceAddress<float>(inputs, 4);
    auto dx = GetDeviceAddress<T>(outputs, 0);
-    auto bn_scale = GetDeviceAddress<T>(outputs, 1);
-    auto bn_bias = GetDeviceAddress<T>(outputs, 2);
+    auto bn_scale = GetDeviceAddress<float>(outputs, 1);
+    auto bn_bias = GetDeviceAddress<float>(outputs, 2);
    const float alpha_data_diff = 1;
    const float beta_data_diff = 0;


@ -18,32 +18,41 @@
namespace mindspore {
namespace kernel {
-MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddOutputAttr(kNumberTypeFloat32),
-                      MomentumGpuKernel, float, float)
-MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddOutputAttr(kNumberTypeFloat16),
-                      MomentumGpuKernel, half, half)
-MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
-                      KernelAttr()
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddInputAttr(kNumberTypeFloat16)
-                        .AddInputAttr(kNumberTypeFloat32)
-                        .AddOutputAttr(kNumberTypeFloat16),
-                      MomentumGpuKernel, half, float)
+MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
+                        KernelAttr()
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddOutputAttr(kNumberTypeFloat32),
+                        MomentumGpuKernel, float, float, float)
+MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
+                        KernelAttr()
+                          .AddInputAttr(kNumberTypeFloat16)
+                          .AddInputAttr(kNumberTypeFloat16)
+                          .AddInputAttr(kNumberTypeFloat16)
+                          .AddInputAttr(kNumberTypeFloat16)
+                          .AddInputAttr(kNumberTypeFloat16)
+                          .AddOutputAttr(kNumberTypeFloat16),
+                        MomentumGpuKernel, half, half, half)
+MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
+                        KernelAttr()
+                          .AddInputAttr(kNumberTypeFloat16)
+                          .AddInputAttr(kNumberTypeFloat16)
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddInputAttr(kNumberTypeFloat16)
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddOutputAttr(kNumberTypeFloat16),
+                        MomentumGpuKernel, half, float, half)
+MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
+                        KernelAttr()
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddInputAttr(kNumberTypeFloat16)
+                          .AddInputAttr(kNumberTypeFloat32)
+                          .AddOutputAttr(kNumberTypeFloat32),
+                        MomentumGpuKernel, float, float, half)
} // namespace kernel
} // namespace mindspore

View File

@ -23,7 +23,7 @@
#include "backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh" #include "backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh"
namespace mindspore { namespace mindspore {
namespace kernel { namespace kernel {
template <typename T, typename S> template <typename T, typename S, typename G>
class MomentumGpuKernel : public GpuKernel { class MomentumGpuKernel : public GpuKernel {
public: public:
MomentumGpuKernel() MomentumGpuKernel()
@ -38,7 +38,7 @@ class MomentumGpuKernel : public GpuKernel {
T *variable = GetDeviceAddress<T>(inputs, 0); T *variable = GetDeviceAddress<T>(inputs, 0);
T *accumulation = GetDeviceAddress<T>(inputs, 1); T *accumulation = GetDeviceAddress<T>(inputs, 1);
S *learning_rate = GetDeviceAddress<S>(inputs, 2); S *learning_rate = GetDeviceAddress<S>(inputs, 2);
T *gradient = GetDeviceAddress<T>(inputs, 3); G *gradient = GetDeviceAddress<G>(inputs, 3);
S *momentum = GetDeviceAddress<S>(inputs, 4); S *momentum = GetDeviceAddress<S>(inputs, 4);
MomentumUpdateVariable(inputs[0]->size / sizeof(T), variable, accumulation, learning_rate, gradient, momentum, MomentumUpdateVariable(inputs[0]->size / sizeof(T), variable, accumulation, learning_rate, gradient, momentum,
reinterpret_cast<cudaStream_t>(stream_ptr)); reinterpret_cast<cudaStream_t>(stream_ptr));
@ -54,7 +54,7 @@ class MomentumGpuKernel : public GpuKernel {
variable_size_ = sizeof(T); variable_size_ = sizeof(T);
accumulation_size_ = sizeof(T); accumulation_size_ = sizeof(T);
learning_rate_size_ = sizeof(S); learning_rate_size_ = sizeof(S);
gradient_size_ = sizeof(T); gradient_size_ = sizeof(G);
momentum_size_ = sizeof(S); momentum_size_ = sizeof(S);
auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
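The switch from MS_REG_GPU_KERNEL_TWO to MS_REG_GPU_KERNEL_THREE gives ApplyMomentum a third template parameter: T types the variable/accumulator, S the learning-rate and momentum scalars, and G the gradient, which is what lets a float32 variable consume a float16 gradient (the newly added last registration). A host-side sketch of the element-wise update, assuming the usual non-Nesterov momentum formula (the real implementation is the CUDA kernel behind MomentumUpdateVariable in momentum_impl.cuh):

```cpp
#include <cstddef>

// accumulation = momentum * accumulation + gradient
// variable    -= learning_rate * accumulation
// T: variable/accumulation, S: scalars, G: gradient; all math is promoted to T.
template <typename T, typename S, typename G>
void MomentumUpdateSketch(std::size_t n, T *variable, T *accumulation,
                          const S *learning_rate, const G *gradient,
                          const S *momentum) {
  for (std::size_t i = 0; i < n; ++i) {
    accumulation[i] = static_cast<T>(*momentum) * accumulation[i] +
                      static_cast<T>(gradient[i]);
    variable[i] -= static_cast<T>(*learning_rate) * accumulation[i];
  }
}
```

Note that `inputs[0]->size / sizeof(T)` in Launch stays correct: n is the element count of the variable, independent of the gradient type G.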

View File

@ -81,6 +81,7 @@ static std::map<string, string> tbe_func_adapter_map = {
{"sparse_apply_proximal_adagrad", "sparse_apply_proximal_adagrad_d"}, {"sparse_apply_proximal_adagrad", "sparse_apply_proximal_adagrad_d"},
{"apply_add_sign", "apply_add_sign_d"}, {"apply_add_sign", "apply_add_sign_d"},
{"apply_power_sign", "apply_power_sign_d"}, {"apply_power_sign", "apply_power_sign_d"},
{"apply_centered_rms_prop", "apply_centered_rms_prop_d"},
{"transpose", "transpose_d"}, {"transpose", "transpose_d"},
{"fill", "fill_d"}, {"fill", "fill_d"},
{"unsorted_segment_sum", "unsorted_segment_sum_d"}, {"unsorted_segment_sum", "unsorted_segment_sum_d"},

View File

@ -43,6 +43,7 @@ constexpr auto kJInputs = "inputs";
constexpr auto kJOutputs = "outputs"; constexpr auto kJOutputs = "outputs";
constexpr auto kJAttrs = "attrs"; constexpr auto kJAttrs = "attrs";
constexpr auto kJKernelName = "kernel_name"; constexpr auto kJKernelName = "kernel_name";
constexpr auto kJFullName = "full_name";
constexpr auto kJOpInfo = "op_info"; constexpr auto kJOpInfo = "op_info";
constexpr auto kJDtype = "dtype"; constexpr auto kJDtype = "dtype";
constexpr auto kJtype = "type"; constexpr auto kJtype = "type";
@ -125,6 +126,7 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr<mindspor
op_info_json[kJKernelName] = json_name_; op_info_json[kJKernelName] = json_name_;
} }
(*kernel_json)[kJOpInfo] = op_info_json; (*kernel_json)[kJOpInfo] = op_info_json;
(*kernel_json)[kJFullName] = anf_node->fullname_with_scope();
if (creater_type_ == SINGLE_BUILD) { if (creater_type_ == SINGLE_BUILD) {
TbeUtils::SaveJsonInfo(json_name_, json_info_); TbeUtils::SaveJsonInfo(json_name_, json_info_);
} }
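With kJFullName in place, every single-kernel JSON handed to the TBE compiler carries the node's fullname_with_scope next to the hashed kernel name, so a compiled kernel can be traced back to the graph node that produced it. A hedged sketch of the resulting layout (the kernel JSON in this file is an nlohmann::json; the concrete values below are invented):

```cpp
#include <iostream>
#include <nlohmann/json.hpp>

int main() {
  nlohmann::json kernel_json;
  kernel_json["op_info"]["kernel_name"] = "matmul_1a2b3c";          // hashed json_name_
  kernel_json["full_name"] = "Default/network/dense1/MatMul-op17";  // new field
  std::cout << kernel_json.dump(2) << std::endl;
  return 0;
}
```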

View File

@ -97,6 +97,7 @@
#include "backend/optimizer/ascend/format_type/modify_ops_attrs.h" #include "backend/optimizer/ascend/format_type/modify_ops_attrs.h"
#include "backend/optimizer/ascend/format_type/remove_no_use_reshape_op.h" #include "backend/optimizer/ascend/format_type/remove_no_use_reshape_op.h"
#include "backend/optimizer/ascend/ir_fusion/add_input_to_output.h" #include "backend/optimizer/ascend/ir_fusion/add_input_to_output.h"
#include "backend/optimizer/ascend/format_type/remove_internal_output.h"
#include "utils/context/ms_context.h" #include "utils/context/ms_context.h"
#include "utils/config_manager.h" #include "utils/config_manager.h"
#include "debug/anf_ir_dump.h" #include "debug/anf_ir_dump.h"
@ -201,6 +202,7 @@ void AscendDataLayout(const std::shared_ptr<session::KernelGraph> &kernel_graph)
data_layout_pm->AddPass(std::make_shared<OptimizeDependence>()); data_layout_pm->AddPass(std::make_shared<OptimizeDependence>());
data_layout_pm->AddPass(std::make_shared<TransDataSplit>()); data_layout_pm->AddPass(std::make_shared<TransDataSplit>());
data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>()); data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>());
data_layout_pm->AddPass(std::make_shared<RemoveInternalOutputTransOp>());
optimizer->AddPassManager(data_layout_pm); optimizer->AddPassManager(data_layout_pm);
(void)optimizer->Optimize(kernel_graph); (void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault(); kernel_graph->SetExecOrderByDefault();
@ -222,6 +224,7 @@ void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_grap
mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>()); mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>());
mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>()); mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>()); mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>());
mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>());
optimizer->AddPassManager(mixed_precision_pm); optimizer->AddPassManager(mixed_precision_pm);
(void)optimizer->Optimize(kernel_graph); (void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault(); kernel_graph->SetExecOrderByDefault();

View File

@ -142,6 +142,7 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const
MS_EXCEPTION_IF_NULL(node); MS_EXCEPTION_IF_NULL(node);
std::vector<AnfNodePtr> make_tuple_inputs; std::vector<AnfNodePtr> make_tuple_inputs;
make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple));
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(node); ++output_idx) { for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(node); ++output_idx) {
std::string output_format = AnfAlgo::GetOutputFormat(node, output_idx); std::string output_format = AnfAlgo::GetOutputFormat(node, output_idx);
if (output_format == kOpFormat_NC1KHKWHWC0) { if (output_format == kOpFormat_NC1KHKWHWC0) {
@ -151,7 +152,11 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const
auto tuple_getitem = CreatTupleGetItemNode(func_graph, node, output_idx); auto tuple_getitem = CreatTupleGetItemNode(func_graph, node, output_idx);
std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx); std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx);
if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) { if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) {
make_tuple_inputs.emplace_back(AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false)); auto trans_op = AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
kernel_graph->ReplaceInternalOutput(node, trans_op, output_idx, 0);
}
make_tuple_inputs.emplace_back(trans_op);
} else { } else {
// No need insert trans op. // No need insert trans op.
make_tuple_inputs.push_back(tuple_getitem); make_tuple_inputs.push_back(tuple_getitem);
@ -249,9 +254,14 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP
if (outputs_num == 0) { if (outputs_num == 0) {
return node; return node;
} }
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
// Single output // Single output
if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) { if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) {
return InsertTransOpForSingleOutput(func_graph, node, kernel_select); auto new_node = InsertTransOpForSingleOutput(func_graph, node, kernel_select);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
kernel_graph->ReplaceInternalOutput(node, new_node);
}
return new_node;
} }
// Multiple output // Multiple output
return InsertTransOpForMultipleOutput(func_graph, node, kernel_select); return InsertTransOpForMultipleOutput(func_graph, node, kernel_select);

View File

@ -40,6 +40,7 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
std::vector<AnfNodePtr> make_tuple_inputs; std::vector<AnfNodePtr> make_tuple_inputs;
AbstractBasePtrList abstract_list; AbstractBasePtrList abstract_list;
make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple));
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(cnode); ++output_idx) { for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(cnode); ++output_idx) {
AnfNodePtr replace_node = nullptr; AnfNodePtr replace_node = nullptr;
const auto origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx); const auto origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx);
@ -64,6 +65,9 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
MS_EXCEPTION_IF_NULL(replace_node); MS_EXCEPTION_IF_NULL(replace_node);
replace_node->set_scope(cnode->scope()); replace_node->set_scope(cnode->scope());
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) {
kernel_graph->ReplaceInternalOutput(cnode, replace_node, output_idx, 0);
}
} else { } else {
replace_node = getitem; replace_node = getitem;
} }
@ -87,6 +91,7 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
return cnode; return cnode;
} }
MS_EXCEPTION_IF_NULL(cnode->Type()); MS_EXCEPTION_IF_NULL(cnode->Type());
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
// Single output // Single output
if (!cnode->Type()->isa<Tuple>()) { if (!cnode->Type()->isa<Tuple>()) {
if (!need_insert_cast[0]) { if (!need_insert_cast[0]) {
@ -109,6 +114,9 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
MS_EXCEPTION_IF_NULL(replace_node); MS_EXCEPTION_IF_NULL(replace_node);
replace_node->set_scope(cnode->scope()); replace_node->set_scope(cnode->scope());
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) {
kernel_graph->ReplaceInternalOutput(cnode, replace_node);
}
} }
return replace_node; return replace_node;
} }
@ -188,6 +196,10 @@ const AnfNodePtr InsertCast::Process(const FuncGraphPtr &func_graph, const AnfNo
CNodePtr cnode = node->cast<CNodePtr>(); CNodePtr cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode); MS_EXCEPTION_IF_NULL(cnode);
auto new_node = InsertCastForInput(func_graph, cnode); auto new_node = InsertCastForInput(func_graph, cnode);
auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>();
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
kernel_graph->ReplaceInternalOutput(node, new_node);
}
// process output // process output
return InsertCastForOutput(func_graph, new_node, std::vector<bool>(AnfAlgo::GetOutputTensorNum(new_node), true)); return InsertCastForOutput(func_graph, new_node, std::vector<bool>(AnfAlgo::GetOutputTensorNum(new_node), true));
} }

View File

@ -46,14 +46,13 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An
if (node == nullptr || !AnfAlgo::IsRealKernel(node)) { if (node == nullptr || !AnfAlgo::IsRealKernel(node)) {
return nullptr; return nullptr;
} }
AnfNodePtr front_node; AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node);
MS_LOG(DEBUG) << "process op: " << node->DebugString();
AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_);
auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>(); auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>();
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
front_node = kernel_graph->GetFrontNodeByInternalOutput(node); kernel_graph->ReplaceInternalOutput(node, new_node);
} }
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node);
MS_LOG(DEBUG) << "====process op: " << node->DebugString();
AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_);
auto ms_context = MsContext::GetInstance(); auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context); MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->execution_mode() == kPynativeMode && !ms_context->enable_pynative_hook()) { if (ms_context->execution_mode() == kPynativeMode && !ms_context->enable_pynative_hook()) {
@ -61,12 +60,7 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An
return new_node; return new_node;
} }
} }
auto final_node = InsertTransOpForOutput(func_graph, new_node, kernel_select_); return InsertTransOpForOutput(func_graph, new_node, kernel_select_);
if (kernel_graph != nullptr && front_node != nullptr) {
auto old_node = kernel_graph->GetInternalOutputByFrontNode(front_node);
kernel_graph->ReplaceInternalOutput(old_node, final_node);
}
return final_node;
} }
} // namespace opt } // namespace opt
} // namespace mindspore } // namespace mindspore
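The three hunks above repeat one bookkeeping step: when a pass wraps a node that is recorded as an internal output (one whose value the front end fetches directly) in a new Cast or TransData, the kernel graph's internal-output record has to be re-pointed at the wrapper, or the front end keeps reading the stale node. Distilled into a helper (the helper name is ours; the KernelGraph calls are the ones used above):

```cpp
#include "backend/session/kernel_graph.h"

namespace mindspore {
// Re-point the internal-output record from `node` to its wrapper `new_node`.
void RebindInternalOutput(const KernelGraphPtr &kernel_graph, const AnfNodePtr &node,
                          const AnfNodePtr &new_node) {
  if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
    kernel_graph->ReplaceInternalOutput(node, new_node);
  }
}
}  // namespace mindspore
```

The four-argument overload used on the tuple-output paths additionally remaps a source output index to a destination index.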

View File

@ -0,0 +1,83 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/ascend/format_type/remove_internal_output.h"
#include <memory>
#include "backend/session/anf_runtime_algorithm.h"
namespace mindspore {
namespace opt {
namespace {
bool UsedForOutputOnly(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(func_graph);
auto manager = func_graph->manager();
MS_EXCEPTION_IF_NULL(manager);
auto &node_users = manager->node_users();
auto iter = node_users.find(node);
if (iter == node_users.end()) {
return false;
}
const auto &node_set = iter->second;
for (const auto &node_index : node_set) {
if (!AnfAlgo::CheckPrimitiveType(node_index.first, prim::kPrimMakeTuple)) {
return false;
}
}
return true;
}
} // namespace
const BaseRef RemoveInternalOutputTransOp::DefinePattern() const {
VarPtr X = std::make_shared<Var>();
auto prim = std::make_shared<Primitive>(kTransDataOpName);
return VectorRef({prim, X});
}
const BaseRef RemoveInternalOutputCast::DefinePattern() const {
VarPtr X = std::make_shared<Var>();
return VectorRef({prim::kPrimCast, X});
}
const AnfNodePtr RemoveInternalOutput::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
MS_EXCEPTION_IF_NULL(func_graph);
MS_EXCEPTION_IF_NULL(node);
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
if (kernel_graph == nullptr) {
return nullptr;
}
if (!kernel_graph->IsInternalOutput(node)) {
return nullptr;
}
if (!UsedForOutputOnly(func_graph, node)) {
return nullptr;
}
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
CheckCNodeInputSize(cnode, kTransOpInputNum);
auto input_node = cnode->input(1);
if (!AnfAlgo::CheckPrimitiveType(input_node, prim::kPrimTupleGetItem)) {
kernel_graph->ReplaceInternalOutput(node, input_node);
} else {
auto tuple_getitem = input_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(tuple_getitem);
int idx = AnfAlgo::GetTupleGetItemOutIndex(tuple_getitem);
AnfNodePtr real_input_node = AnfAlgo::GetTupleGetItemRealInput(tuple_getitem);
kernel_graph->ReplaceInternalOutput(node, real_input_node, 0, idx);
}
return input_node;
}
} // namespace opt
} // namespace mindspore
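RemoveInternalOutput is the inverse operation: after kernel selection, a TransData or Cast that an internal output only needs for the graph's MakeTuple result is bypassed in the internal-output map, so the front end can read the producer directly and skip a device-side format or type conversion. The subtle part is recovering the real producer and its output index through a TupleGetItem, sketched here under the same API assumptions (the helper name is ours):

```cpp
#include <utility>
#include "backend/session/anf_runtime_algorithm.h"

namespace mindspore {
namespace opt {
// Resolve the single input of a TransData/Cast to (real node, output index),
// looking through an intervening TupleGetItem exactly as Process() does above.
std::pair<AnfNodePtr, int> PeelToRealOutput(const CNodePtr &trans_op) {
  auto input_node = trans_op->input(1);
  if (!AnfAlgo::CheckPrimitiveType(input_node, prim::kPrimTupleGetItem)) {
    return {input_node, 0};
  }
  auto tuple_getitem = input_node->cast<CNodePtr>();
  return {AnfAlgo::GetTupleGetItemRealInput(tuple_getitem),
          AnfAlgo::GetTupleGetItemOutIndex(tuple_getitem)};
}
}  // namespace opt
}  // namespace mindspore
```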

View File

@ -0,0 +1,51 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
#include <string>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class RemoveInternalOutput : public PatternProcessPass {
public:
explicit RemoveInternalOutput(const std::string &name, bool multigraph = true)
: PatternProcessPass(name, multigraph) {}
~RemoveInternalOutput() override = default;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
};
class RemoveInternalOutputTransOp : public RemoveInternalOutput {
public:
explicit RemoveInternalOutputTransOp(bool multigraph = true)
: RemoveInternalOutput("remove_internal_output_trans_op", multigraph) {}
~RemoveInternalOutputTransOp() override = default;
const BaseRef DefinePattern() const override;
};
class RemoveInternalOutputCast : public RemoveInternalOutput {
public:
explicit RemoveInternalOutputCast(bool multigraph = true)
: RemoveInternalOutput("remove_internal_output_cast", multigraph) {}
~RemoveInternalOutputCast() override = default;
const BaseRef DefinePattern() const override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_

View File

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_
#include <memory> #include <memory>
#include "backend/optimizer/common/optimizer.h" #include "backend/optimizer/common/optimizer.h"
@ -53,4 +53,4 @@ class AdamFusion : public PatternProcessPass {
}; };
} // namespace opt } // namespace opt
} // namespace mindspore } // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_

View File

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_
#include <memory> #include <memory>
#include "backend/optimizer/common/optimizer.h" #include "backend/optimizer/common/optimizer.h"
@ -55,4 +55,4 @@ class AdamWeightDecayFusion : public PatternProcessPass {
}; };
} // namespace opt } // namespace opt
} // namespace mindspore } // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_

View File

@ -0,0 +1,65 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_addn_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceAddNFusion::DefinePattern() const {
VectorRef addn = VectorRef({prim::kPrimAddN, A, B});
return addn;
}
const AnfNodePtr ReplaceAddNFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto A = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto B = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 1);
MS_EXCEPTION_IF_NULL(A);
MS_EXCEPTION_IF_NULL(B);
int num_input = AnfAlgo::GetNodeAttr<int>(node, "n");
if (num_input == 2) {
auto prim = std::make_shared<Primitive>(prim::kPrimTensorAdd->name());
MS_EXCEPTION_IF_NULL(prim);
std::vector<AnfNodePtr> inputs = {NewValueNode(prim), A, B};
auto add_new = graph->NewCNode(inputs);
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(A, 0));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(A, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, add_new.get());
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(add_new));
return add_new;
} else {
return nullptr;
}
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,40 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceAddNFusion : public PatternProcessPass {
public:
explicit ReplaceAddNFusion(bool multigraph = true) : PatternProcessPass("replace_addn", multigraph) {
A = std::make_shared<Var>();
B = std::make_shared<Var>();
}
~ReplaceAddNFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr A;
VarPtr B;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_

View File

@ -0,0 +1,92 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceBNCastFusion::DefinePattern() const {
VectorRef in_cast = VectorRef({prim::kPrimCast, x_});
VectorRef fbn2 = VectorRef({prim::kPrimFusedBatchNorm, in_cast, scale_, bias_, mean_, var_});
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2, index_});
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
return out_cast;
}
const AnfNodePtr ReplaceBNCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
MS_EXCEPTION_IF_NULL(index_node);
auto value_node = index_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
int item_idx = GetValue<int>(value_node->value());
auto fbn2 = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
auto x_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 0);
auto x_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(x_after), 0);
if (item_idx != 0) {
return nullptr;
}
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 1);
auto bias = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 2);
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 3);
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 4);
MS_EXCEPTION_IF_NULL(fbn2);
MS_EXCEPTION_IF_NULL(x_after);
MS_EXCEPTION_IF_NULL(x_before);
MS_EXCEPTION_IF_NULL(scale);
MS_EXCEPTION_IF_NULL(bias);
MS_EXCEPTION_IF_NULL(mean);
MS_EXCEPTION_IF_NULL(var);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(x_after), utils::cast<CNodePtr>(x_before));
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2, i));
}
outputs_type[0] = kNumberTypeFloat16;
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2.get());
outputs_type.clear();
outputs_shape.clear();
outputs_type.push_back(kNumberTypeFloat16);
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
return tuple;
}
} // namespace opt
} // namespace mindspore
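ReplaceBNCastFusion removes the Cast feeding FusedBatchNorm and the Cast consuming its first output, then re-infers only output 0 (y) as float16; the statistics outputs stay float32, which is exactly the mixed signature the GPU kernel registrations at the top of this commit accept. The dtype effect in miniature (a toy illustration, not MindSpore API):

```cpp
#include <cstdio>
#include <string>
#include <vector>

int main() {
  // FusedBatchNorm outputs: y, running_mean, running_var, save_mean, save_var
  std::vector<std::string> before(5, "float32");
  std::vector<std::string> after = before;
  after[0] = "float16";  // only y flips; the statistics keep full precision
  for (std::size_t i = 0; i < after.size(); ++i) {
    std::printf("output %zu: %s -> %s\n", i, before[i].c_str(), after[i].c_str());
  }
  return 0;
}
```

The two grad-cast passes that follow apply the same surgery to FusedBatchNormGrad, with and without a Cast on dy.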

View File

@ -0,0 +1,58 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceBNCastFusion : public PatternProcessPass {
public:
explicit ReplaceBNCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_cast", multigraph) {
x_ = std::make_shared<Var>();
scale_ = std::make_shared<Var>();
bias_ = std::make_shared<Var>();
mean_ = std::make_shared<Var>();
var_ = std::make_shared<Var>();
y_ = std::make_shared<Var>();
running_mean_ = std::make_shared<Var>();
running_var_ = std::make_shared<Var>();
save_mean_ = std::make_shared<Var>();
save_var_ = std::make_shared<Var>();
index_ = std::make_shared<Var>();
}
~ReplaceBNCastFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr x_;
VarPtr scale_;
VarPtr bias_;
VarPtr mean_;
VarPtr var_;
VarPtr y_;
VarPtr running_mean_;
VarPtr running_var_;
VarPtr save_mean_;
VarPtr save_var_;
VarPtr index_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_

View File

@ -0,0 +1,88 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceBNGradCast2Fusion::DefinePattern() const {
VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_, x_, scale_, mean_, var_});
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_});
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
return out_cast;
}
const AnfNodePtr ReplaceBNGradCast2Fusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
MS_EXCEPTION_IF_NULL(index_node);
auto value_node = index_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
int item_idx = GetValue<int>(value_node->value());
if (item_idx != 0) {
return nullptr;
}
auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
auto dy_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0);
auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2);
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3);
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4);
MS_EXCEPTION_IF_NULL(fbn2g);
MS_EXCEPTION_IF_NULL(dy_);
MS_EXCEPTION_IF_NULL(scale);
MS_EXCEPTION_IF_NULL(x_);
MS_EXCEPTION_IF_NULL(mean);
MS_EXCEPTION_IF_NULL(var);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
}
outputs_type[0] = AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0);
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
outputs_type.clear();
outputs_shape.clear();
outputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
return tuple;
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceBNGradCast2Fusion : public PatternProcessPass {
public:
explicit ReplaceBNGradCast2Fusion(bool multigraph = true) : PatternProcessPass("replace_grad_cast2", multigraph) {
dy_ = std::make_shared<Var>();
x_ = std::make_shared<Var>();
scale_ = std::make_shared<Var>();
mean_ = std::make_shared<Var>();
var_ = std::make_shared<Var>();
dx_ = std::make_shared<Var>();
bn_scale_ = std::make_shared<Var>();
bn_bias_ = std::make_shared<Var>();
index_ = std::make_shared<Var>();
}
~ReplaceBNGradCast2Fusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr dy_;
VarPtr x_;
VarPtr scale_;
VarPtr mean_;
VarPtr var_;
VarPtr dx_;
VarPtr bn_scale_;
VarPtr bn_bias_;
VarPtr index_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_

View File

@ -0,0 +1,91 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceBNGradCastFusion::DefinePattern() const {
VectorRef dy_cast = VectorRef({prim::kPrimCast, dy_});
VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_cast, x_, scale_, mean_, var_});
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_});
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
return out_cast;
}
const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
MS_EXCEPTION_IF_NULL(index_node);
auto value_node = index_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
int item_idx = GetValue<int>(value_node->value());
if (item_idx != 0) {
return nullptr;
}
auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
auto dy_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0);
auto dy_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(dy_after), 0);
auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2);
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3);
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4);
MS_EXCEPTION_IF_NULL(fbn2g);
MS_EXCEPTION_IF_NULL(dy_after);
MS_EXCEPTION_IF_NULL(dy_before);
MS_EXCEPTION_IF_NULL(scale);
MS_EXCEPTION_IF_NULL(x_);
MS_EXCEPTION_IF_NULL(mean);
MS_EXCEPTION_IF_NULL(var);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(dy_after), utils::cast<CNodePtr>(dy_before));
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
}
outputs_type[0] = kNumberTypeFloat16;
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
outputs_type.clear();
outputs_shape.clear();
outputs_type.push_back(kNumberTypeFloat16);
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
return tuple;
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceBNGradCastFusion : public PatternProcessPass {
public:
explicit ReplaceBNGradCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_grad_cast", multigraph) {
dy_ = std::make_shared<Var>();
x_ = std::make_shared<Var>();
scale_ = std::make_shared<Var>();
mean_ = std::make_shared<Var>();
var_ = std::make_shared<Var>();
dx_ = std::make_shared<Var>();
bn_scale_ = std::make_shared<Var>();
bn_bias_ = std::make_shared<Var>();
index_ = std::make_shared<Var>();
}
~ReplaceBNGradCastFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr dy_;
VarPtr x_;
VarPtr scale_;
VarPtr mean_;
VarPtr var_;
VarPtr dx_;
VarPtr bn_scale_;
VarPtr bn_bias_;
VarPtr index_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_

View File

@ -0,0 +1,63 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceMomentumCastFusion::DefinePattern() const {
VectorRef grad_cast = VectorRef({prim::kPrimCast, grad_});
VectorRef momentum = VectorRef({prim::kPrimApplyMomentum, var_, acc_, lr_, grad_cast, mom_});
return momentum;
}
const AnfNodePtr ReplaceMomentumCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto grad_cast = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 3);
auto grad = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(grad_cast), 0);
MS_EXCEPTION_IF_NULL(grad_cast);
MS_EXCEPTION_IF_NULL(grad);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(grad_cast), utils::cast<CNodePtr>(grad));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(node);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(node, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(node, i));
}
outputs_type[3] = AnfAlgo::GetPrevNodeOutputInferDataType(grad_cast, 0);
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, node.get());
return node;
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceMomentumCastFusion : public PatternProcessPass {
public:
explicit ReplaceMomentumCastFusion(bool multigraph = true) : PatternProcessPass("replace_momentum_cast", multigraph) {
var_ = std::make_shared<Var>();
acc_ = std::make_shared<Var>();
lr_ = std::make_shared<Var>();
grad_ = std::make_shared<Var>();
mom_ = std::make_shared<Var>();
}
~ReplaceMomentumCastFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr var_;
VarPtr acc_;
VarPtr lr_;
VarPtr grad_;
VarPtr mom_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_

View File

@ -25,7 +25,8 @@
namespace mindspore { namespace mindspore {
namespace memreuse { namespace memreuse {
enum RefCountType { kDynamicRefCount, kStaticRefCount }; enum RefCountType { kDynamicRefCount, kStaticRefCount };
enum NodeType { NORMAL, SPECIAL }; enum NodeType { COMMON_NODE, COMMUNICATION_NODE };
enum KernelRefType { COMMON, REFNODE_OUTPUT, COMM_NOTREUSE, COMM_REUSE, SUMMARY };
static constexpr int kInitIndex = -1; static constexpr int kInitIndex = -1;
class KernelRefCount { class KernelRefCount {
public: public:
@ -36,6 +37,7 @@ class KernelRefCount {
size_t offset_; size_t offset_;
size_t size_; size_t size_;
int index_; int index_;
KernelRefType type_;
// remember to reset offset // remember to reset offset
KernelRefCount() KernelRefCount()
: stream_id_(0), : stream_id_(0),
@ -44,6 +46,7 @@ class KernelRefCount {
offset_(0), offset_(0),
size_(0), size_(0),
index_(kInitIndex), index_(kInitIndex),
type_(COMMON),
reftype_(kStaticRefCount) {} reftype_(kStaticRefCount) {}
~KernelRefCount() = default; ~KernelRefCount() = default;
void SetKernelRefCountInfo(int index, size_t size, RefCountType reftype); void SetKernelRefCountInfo(int index, size_t size, RefCountType reftype);
@ -65,7 +68,7 @@ class KernelDef {
KernelMap inputs_; KernelMap inputs_;
KernelMap outputs_; KernelMap outputs_;
KernelMap wk_space_; KernelMap wk_space_;
NodeType dirty = NORMAL; NodeType type_ = COMMON_NODE;
KernelDef() = default; KernelDef() = default;
~KernelDef() = default; ~KernelDef() = default;
void set_input_refs(const KernelRefCountPtrList &kernelRefPtrList) { input_refs_ = kernelRefPtrList; } void set_input_refs(const KernelRefCountPtrList &kernelRefPtrList) { input_refs_ = kernelRefPtrList; }

View File

@ -46,6 +46,8 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() {
if (iter == kernel_output_refs_.end()) { if (iter == kernel_output_refs_.end()) {
auto output_sizes = kernel_mod->GetOutputSizeList(); auto output_sizes = kernel_mod->GetOutputSizeList();
KernelRefCountPtrList kernel_refs; KernelRefCountPtrList kernel_refs;
bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel_cnode);
size_t output_index = 0;
for (auto size : output_sizes) { for (auto size : output_sizes) {
total_dy_size_ += size; total_dy_size_ += size;
// do not MallocDynamicMem just record this // do not MallocDynamicMem just record this
@ -54,9 +56,20 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() {
auto curr_stream_id = AnfAlgo::GetStreamId(kernel_cnode); auto curr_stream_id = AnfAlgo::GetStreamId(kernel_cnode);
kernel_ref->stream_id_ = curr_stream_id; kernel_ref->stream_id_ = curr_stream_id;
kernel_ref->SetKernelRefCountInfo(index, size, kDynamicRefCount); kernel_ref->SetKernelRefCountInfo(index, size, kDynamicRefCount);
if (is_comm_op) {
kernel_ref->type_ = COMM_REUSE;
} else {
session::AnfWithOutIndex out_pair(kernel_cnode, output_index);
if (graph_->IsInRefOutputMap(out_pair)) {
kernel_ref->type_ = REFNODE_OUTPUT;
} else {
kernel_ref->type_ = COMMON;
}
}
kernel_refs.push_back(kernel_ref); kernel_refs.push_back(kernel_ref);
kernel_out_ref_num++; kernel_out_ref_num++;
total_refs_list_.push_back(kernel_ref); total_refs_list_.push_back(kernel_ref);
output_index++;
} }
if (!kernel_refs.empty()) { if (!kernel_refs.empty()) {
kernel_output_refs_[key] = kernel_refs; kernel_output_refs_[key] = kernel_refs;
@ -155,9 +168,19 @@ void MemReuseUtil::SetInputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr
MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(kernel);
MS_EXCEPTION_IF_NULL(kernel_def_ptr); MS_EXCEPTION_IF_NULL(kernel_def_ptr);
auto key = kernel.get(); auto key = kernel.get();
for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel);
size_t input_tensor_num = AnfAlgo::GetInputTensorNum(kernel);
for (size_t i = 0; i < input_tensor_num; ++i) {
auto ref_ptr = GetKernelInputRef(kernel, i); auto ref_ptr = GetKernelInputRef(kernel, i);
if (ref_ptr != nullptr) { if (ref_ptr != nullptr) {
if (is_comm_op) {
if (input_tensor_num == 1) {
ref_ptr->type_ = COMM_REUSE;
} else {
ref_ptr->type_ = COMM_NOTREUSE;
}
}
if (ref_ptr->reftype() == kStaticRefCount) { if (ref_ptr->reftype() == kStaticRefCount) {
continue; continue;
} else if (ref_ptr->reftype() == kDynamicRefCount) { } else if (ref_ptr->reftype() == kDynamicRefCount) {
@ -258,6 +281,11 @@ void MemReuseUtil::SetKernelDefMap() {
auto key = kernel.get(); auto key = kernel.get();
kernel_def_ptr->set_input_refs(kernel_def_ptr->inputs_[key]); kernel_def_ptr->set_input_refs(kernel_def_ptr->inputs_[key]);
kernel_def_ptr->set_output_refs(kernel_def_ptr->outputs_[key]); kernel_def_ptr->set_output_refs(kernel_def_ptr->outputs_[key]);
if (AnfAlgo::IsCommunicationOp(kernel)) {
kernel_def_ptr->type_ = COMMUNICATION_NODE;
} else {
kernel_def_ptr->type_ = COMMON_NODE;
}
kernel_def_ptr_list_.push_back(kernel_def_ptr); kernel_def_ptr_list_.push_back(kernel_def_ptr);
kernel_map_[key] = kernel_def_ptr; kernel_map_[key] = kernel_def_ptr;
} }
@ -337,6 +365,7 @@ void MemReuseUtil::SetSummaryNodesRefCount() {
KernelRefCountPtr kernel_ref = kernel_output_refs_[node.get()][index]; KernelRefCountPtr kernel_ref = kernel_output_refs_[node.get()][index];
kernel_ref->ref_count_ = kMaxRefCount; kernel_ref->ref_count_ = kMaxRefCount;
kernel_ref->ref_count_dynamic_use_ = kMaxRefCount; kernel_ref->ref_count_dynamic_use_ = kMaxRefCount;
kernel_ref->type_ = SUMMARY;
total_summary_size += kernel_ref->size_; total_summary_size += kernel_ref->size_;
MS_LOG(INFO) << "Set summary node's ref count, node: " << node->fullname_with_scope() << " index: " << index; MS_LOG(INFO) << "Set summary node's ref count, node: " << node->fullname_with_scope() << " index: " << index;
} else { } else {
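Taken together, these mem_reuse hunks classify every tensor before reuse planning runs: a communication op's outputs are tagged COMM_REUSE; its input is COMM_REUSE only when it is the op's sole input (the op needs one contiguous buffer anyway) and COMM_NOTREUSE otherwise; ref-node outputs alias an input; summary outputs are pinned. A distilled sketch of the two rules (the boolean arguments stand in for the AnfAlgo and ref-output-map queries above; header path as in this tree):

```cpp
#include <cstddef>
#include "backend/optimizer/mem_reuse/kernel_refcount.h"

namespace mindspore {
namespace memreuse {
// Output-side rule from InitDynamicOutputKernelRef.
KernelRefType ClassifyOutput(bool is_comm_op, bool is_ref_output) {
  if (is_comm_op) return COMM_REUSE;         // gets alignment borders later
  if (is_ref_output) return REFNODE_OUTPUT;  // shares memory with an input
  return COMMON;
}

// Input-side rule from SetInputMap.
KernelRefType ClassifyCommInput(std::size_t input_tensor_num) {
  return input_tensor_num == 1 ? COMM_REUSE : COMM_NOTREUSE;
}
}  // namespace memreuse
}  // namespace mindspore
```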

View File

@ -83,6 +83,7 @@ class MemReuseUtil {
void set_mem_base(uint8_t *mem_base) { mem_base_ = mem_base; } void set_mem_base(uint8_t *mem_base) { mem_base_ = mem_base; }
uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const; uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const; uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;
bool is_all_nop_node() const { return is_all_nop_node_; }
private: private:
int util_index_; int util_index_;

View File

@ -33,11 +33,11 @@ void BestFitMemReuse::InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr) {
set_op_ptr_list(mem_reuse_util_ptr->kernel_def_ptr_list()); set_op_ptr_list(mem_reuse_util_ptr->kernel_def_ptr_list());
// check info Correctness // check info Correctness
for (auto &tensor : tensor_ptr_list_) { for (auto &tensor : tensor_ptr_list_) {
tensor->size_ = AlignMemorySize(tensor->size_); tensor->size_ = AlignCommonMemorySize(tensor->size_);
} }
// align wk size to 512 && refcount == 1 // align wk size to 512 && refcount == 1
for (auto &wk : wk_tensor_list_) { for (auto &wk : wk_tensor_list_) {
wk->size_ = AlignMemorySize(wk->size_); wk->size_ = AlignCommonMemorySize(wk->size_);
wk->ref_count_ = 1; wk->ref_count_ = 1;
} }
#ifdef ENABLE_D #ifdef ENABLE_D
@ -135,11 +135,23 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
return false; return false;
} }
void BestFitMemReuse::AssignNodeOutputOffset() { void BestFitMemReuse::AssignCommonNodeOutputOffset() {
MS_EXCEPTION_IF_NULL(current_kernel_);
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx); size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index]; auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc); MS_EXCEPTION_IF_NULL(tensor_desc);
if (tensor_desc->type_ == REFNODE_OUTPUT) {
total_refoutput_size += tensor_desc->size_;
continue;
} else if (tensor_desc->type_ == COMM_NOTREUSE) {
total_comm_not_reuse_size += tensor_desc->size_;
} else if (tensor_desc->type_ == COMM_REUSE) {
// get align size for communication op's single input
tensor_desc->size_ = AlignCommunicationMemorySize(tensor_desc->size_);
total_comm_reuse_size += tensor_desc->size_;
}
auto reusable_membuf_map = GetReusableMembufMap(tensor_desc->size_); auto reusable_membuf_map = GetReusableMembufMap(tensor_desc->size_);
if (!reusable_membuf_map.empty()) { if (!reusable_membuf_map.empty()) {
auto membuf_index = reusable_membuf_map.begin()->second; auto membuf_index = reusable_membuf_map.begin()->second;
@ -152,6 +164,93 @@ void BestFitMemReuse::AssignNodeOutputOffset() {
MemReuseChecker::GetInstance().IsAddNewMembuf_ = true; MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif #endif
} }
// skip past the left align border so the communication op's single input points at usable memory
if (tensor_desc->type_ == COMM_REUSE) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
}
}
void BestFitMemReuse::AssignCommunicationNodeOutputOffset() {
size_t total_kernel_output_size = 0;
size_t output_num = 0;
// get all output size
MS_EXCEPTION_IF_NULL(current_kernel_);
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
output_num++;  // count every output; the first/last border logic below relies on this total
if (tensor_desc->type_ == COMM_REUSE) {
total_comm_reuse_size += tensor_desc->size_;
total_comm_output_reuse_size += tensor_desc->size_;
total_kernel_output_size += tensor_desc->size_;
} else {
MS_LOG(ERROR) << "All communication op's outputs should be memory reuse, Kernel:"
<< current_kernel_->scope_full_name();
continue;
}
}
total_kernel_output_size = AlignCommunicationMemorySize(total_kernel_output_size);
// add a left align border to the first output and a right align border to the last one so the border memory is allocated
size_t output_index = 0;
auto output_ref_indexes = current_kernel_->GetOutputRefIndexs();
for (auto &tensor_idx : output_ref_indexes) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (output_index == 0 || output_index == output_num - 1) {
tensor_desc->size_ += kDefaultMemAlignSize;
}
if ((output_index == 0) && (output_ref_indexes.size() == 1)) {
// add right align border for single output
tensor_desc->size_ += kDefaultMemAlignSize;
}
output_index++;
}
auto reusable_membuf_map = GetReusableMembufMap(total_kernel_output_size);
if (!reusable_membuf_map.empty()) {
auto membuf_index = reusable_membuf_map.begin()->second;
output_index = 0;
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
ReuseExistMembuf(tensor_desc.get(), membuf_index + output_index, kDynamicMem);
// skip the left align border so the communication op's first output points at usable memory
if (output_index == 0) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
output_index++;
}
} else {
// no membuf can be reused; append a new membuf to membuf_ptr_list_
output_index = 0;
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
AddNewMembufPtr(tensor_desc.get(), kDynamicMem);
// skip the align border offset so the first output points at usable memory
if (output_index == 0) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
output_index++;
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
}
}
}
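For intuition, a minimal standalone sketch (hypothetical sizes, not part of this commit) of the layout the routine above produces for a fused communication kernel with two 1024-byte outputs:

#include <cstddef>
#include <iostream>

// Hypothetical values: kDefaultMemAlignSize = 512, two outputs of 1024 bytes.
// One membuf holds: [ left border 512 | out0 1024 | out1 1024 | right border 512 ]
int main() {
  const std::size_t kAlign = 512;
  const std::size_t membuf_offset = 0;  // wherever the best-fit search placed the membuf
  const std::size_t out0_size = 1024;
  const std::size_t out1_size = 1024;
  const std::size_t out0_offset = membuf_offset + kAlign;   // first output skips the left border
  const std::size_t out1_offset = out0_offset + out0_size;  // outputs sit back to back
  const std::size_t membuf_size = kAlign + out0_size + out1_size + kAlign;
  std::cout << "out0@" << out0_offset << " out1@" << out1_offset << " membuf=" << membuf_size << "\n";
  return 0;
}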
void BestFitMemReuse::AssignNodeOutputOffset() {
if (current_kernel_->type_ == COMMUNICATION_NODE) {
AssignCommunicationNodeOutputOffset();
} else {
AssignCommonNodeOutputOffset();
} }
} }
@ -319,11 +418,17 @@ void BestFitMemReuse::ReleaseMembuf(size_t tensor_index, int flag) {
} }
} }
size_t BestFitMemReuse::AlignMemorySize(size_t size) const { size_t BestFitMemReuse::AlignCommonMemorySize(size_t size) const {
// memory size 512 align // memory size 512 align
return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize; return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
} }
size_t BestFitMemReuse::AlignCommunicationMemorySize(size_t size) const {
// align the memory size to 512 and add the communication borders: left align border | data | right align border
return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
kDefaultMemAlignSize;
}
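As a sanity check on the two rounding rules, a small self-contained sketch; kDefaultMemAlignSize = 512 matches the comments above, while kAttAlignSize = 31 is an assumed value for illustration (the real constant is defined in the mem_reuse headers):

#include <cassert>
#include <cstddef>

constexpr std::size_t kDefaultMemAlignSize = 512;
constexpr std::size_t kAttAlignSize = 31;  // assumed for illustration

std::size_t AlignCommonMemorySize(std::size_t size) {
  return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}

std::size_t AlignCommunicationMemorySize(std::size_t size) {
  return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
         kDefaultMemAlignSize;
}

int main() {
  assert(AlignCommonMemorySize(1) == 512);             // rounds up to one 512-byte block
  assert(AlignCommonMemorySize(512) == 1024);          // the kAttAlignSize pad spills into a second block
  assert(AlignCommunicationMemorySize(1) == 1536);     // 512 border + 512 data + 512 border
  assert(AlignCommunicationMemorySize(1024) == 2048);  // 512 border + 1024 data + 512 border
  return 0;
}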
size_t BestFitMemReuse::GetAllocatedSize() { size_t BestFitMemReuse::GetAllocatedSize() {
size_t AllocatedSize = kTotalSize; size_t AllocatedSize = kTotalSize;
if (membuf_ptr_list_.empty()) { if (membuf_ptr_list_.empty()) {
@ -412,6 +517,9 @@ void BestFitMemReuse::Reuse(const MemReuseUtil *mem_reuse_util_ptr) {
++op_num; ++op_num;
#endif #endif
} }
MS_LOG(INFO) << "Special Tensor total size: RefOutput: " << total_refoutput_size
<< " CommReuse: " << total_comm_reuse_size << " CommOutputReuse: " << total_comm_output_reuse_size
<< " CommNotReuse: " << total_comm_not_reuse_size;
#ifdef MEM_REUSE_DEBUG #ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().ExportMembufInfoIR(); MemReuseChecker::GetInstance().ExportMembufInfoIR();
MemReuseChecker::GetInstance().ExportAddNewMmebufIR(); MemReuseChecker::GetInstance().ExportAddNewMmebufIR();
@ -74,6 +74,14 @@ class BestFitMemReuse {
* Assign output tensor memory offset of current kernel * Assign output tensor memory offset of current kernel
*/ */
void AssignNodeOutputOffset(); void AssignNodeOutputOffset();
/**
* Assign output tensor memory offsets for a common (non-communication) kernel
*/
void AssignCommonNodeOutputOffset();
/**
* Assign output tensor memory offsets for a communication kernel
*/
void AssignCommunicationNodeOutputOffset();
/** /**
* Update input tensor's status of current kernel, and the status of membuf used by current kernel * Update input tensor's status of current kernel, and the status of membuf used by current kernel
*/ */
@ -110,8 +118,10 @@ class BestFitMemReuse {
void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag); void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag);
// Merge unused membuf // Merge unused membuf
void ReleaseMembuf(size_t tensor_index, int flag); void ReleaseMembuf(size_t tensor_index, int flag);
// Memory address alignment 512 // Memory address alignment for common memory
size_t AlignMemorySize(size_t size) const; size_t AlignCommonMemorySize(size_t size) const;
// Memory address alignment for memory used by communication ops
size_t AlignCommunicationMemorySize(size_t size) const;
int GetRealIndex(size_t index, int flag = kDynamicMem) const; int GetRealIndex(size_t index, int flag = kDynamicMem) const;
size_t GetTensorIndex(int index) const; size_t GetTensorIndex(int index) const;
size_t GetWorkspaceIndex(int index) const; size_t GetWorkspaceIndex(int index) const;
@ -153,6 +163,10 @@ class BestFitMemReuse {
// kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def // kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def
std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_; std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_;
std::vector<std::vector<uint32_t>> stream_groups_; std::vector<std::vector<uint32_t>> stream_groups_;
size_t total_refoutput_size{0};
size_t total_comm_reuse_size{0};
size_t total_comm_output_reuse_size{0};
size_t total_comm_not_reuse_size{0};
}; };
} // namespace memreuse } // namespace memreuse
} // namespace mindspore } // namespace mindspore
@ -170,12 +170,14 @@ void MemReuseChecker::CheckMemReuseIR(const KernelRefCountPtrList &total_refs_li
ofs << "all_tensor_refs:\n"; ofs << "all_tensor_refs:\n";
ofs << "index:" ofs << "index:"
<< "\tsize:" << "\tsize:"
<< "\trefcount:\n"; << "\trefcount:"
<< "\ttype:\n";
for (auto &ref : total_refs_list) { for (auto &ref : total_refs_list) {
ofs << "%" << ref->index_ << "T" ofs << "%" << ref->index_ << "T"
<< "\t" << "\t"
<< "#" << ref->size_ << "S" << "#" << ref->size_ << "S"
<< "\t" << ref->ref_count_ << "C" << "\t" << ref->ref_count_ << "C"
<< "\t" << ref->type_ << "t"
<< "\n"; << "\n";
} }
ofs << "kernel_def exc_order:\n"; ofs << "kernel_def exc_order:\n";
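With the new column, an entry in the mem-reuse IR dump reads as follows (hypothetical values; type_ is written as the enum's integer value):

index:  size:   refcount:  type:
%0T     #1024S  3C         1t
%1T     #512S   1C         0t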
@ -241,7 +243,7 @@ bool MemReuseChecker::CheckGraphOutputAssigned(const session::KernelGraph *graph
void MemReuseChecker::ExportMemOpIr(const KernelDef *def, std::ofstream &ofs, int def_idx) { void MemReuseChecker::ExportMemOpIr(const KernelDef *def, std::ofstream &ofs, int def_idx) {
auto scope_name = def->scope_full_name(); auto scope_name = def->scope_full_name();
std::string split_name = GetSplitName(scope_name); std::string split_name = GetSplitName(scope_name);
ofs << "$" << def_idx << "\t" << split_name << "\t"; ofs << "$" << def_idx << "\t" << split_name << "\t" << static_cast<int>(def->type_) << "\t";
ofs << "inputs["; ofs << "inputs[";
for (auto &in : def->inputs_) { for (auto &in : def->inputs_) {
for (auto &in_ref : in.second) { for (auto &in_ref : in.second) {
@ -100,7 +100,10 @@ bool CommunicationOpFusion::GetSplitSegments(const CommunicationOpInfo &communic
auto parallel_context = parallel::ParallelContext::GetInstance(); auto parallel_context = parallel::ParallelContext::GetInstance();
MS_EXCEPTION_IF_NULL(parallel_context); MS_EXCEPTION_IF_NULL(parallel_context);
const auto &split_indices = parallel_context->GetAllReduceFusionSplitIndices(group); std::vector<uint32_t> split_indices;
if (!parallel_context->enable_parallel_optimizer()) {
split_indices = parallel_context->GetAllReduceFusionSplitIndices(group);
}
size_t segments = 0; size_t segments = 0;
if (split_indices.size() != 0) { if (split_indices.size() != 0) {
@ -71,7 +71,6 @@ bool ReplaceNodeByProxy::Run(const FuncGraphPtr &func_graph) {
AbstractBasePtrList abstract_list; AbstractBasePtrList abstract_list;
AnfAlgo::CopyNodeAttr(kAttrPsKey, cnode, proxy_node); AnfAlgo::CopyNodeAttr(kAttrPsKey, cnode, proxy_node);
AnfAlgo::CopyNodeAttr("reduce_scatter_flag", cnode, proxy_node);
AnfAlgo::CopyNodeAttr("offset", cnode, proxy_node); AnfAlgo::CopyNodeAttr("offset", cnode, proxy_node);
abstract_list.push_back(cnode->abstract()); abstract_list.push_back(cnode->abstract());
auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(abstract_list); auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(abstract_list);
@ -18,9 +18,12 @@
#include <utility> #include <utility>
#include <memory> #include <memory>
#include <algorithm> #include <algorithm>
#include <string>
#include "backend/session/anf_runtime_algorithm.h" #include "backend/session/anf_runtime_algorithm.h"
#include "utils/union_find_set.h" #include "utils/union_find_set.h"
#include "runtime/device/ascend/ascend_label_assign.h" #include "runtime/device/ascend/ascend_label_assign.h"
#include "utils/context/ms_context.h"
#include "debug/anf_ir_dump.h"
static constexpr size_t kCNodePrim = 0; static constexpr size_t kCNodePrim = 0;
static constexpr size_t kCNodeCallArg = 1; static constexpr size_t kCNodeCallArg = 1;
@ -104,7 +107,7 @@ static void ReuseParameter(NotNull<KernelGraphPtr> root_kg,
static CNodePtr GetNextRealKernel(const std::vector<CNodePtr> &list, size_t start) { static CNodePtr GetNextRealKernel(const std::vector<CNodePtr> &list, size_t start) {
for (size_t i = start; i < list.size() - 1; ++i) { for (size_t i = start; i < list.size() - 1; ++i) {
if (!IsPrimitiveCNode(list[i], prim::kPrimPartial) && AnfAlgo::IsRealKernel(list[i])) { if (AnfAlgo::IsRealKernel(list[i])) {
return list[i]; return list[i];
} }
} }
@ -168,18 +171,43 @@ static void EraseNodeFromExecOrder(const AnfNodePtr &node, const NotNull<std::ve
exec_order->erase(exec_iter); exec_order->erase(exec_iter);
} }
void AscendControlParser::AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph,
const NotNull<std::set<KernelGraphPtr> *> memo) {
if (memo->find(graph) != memo->end()) {
return;
}
memo->insert(graph.get());
const std::vector<std::shared_ptr<KernelGraph>> &child_graph_order = graph->child_graph_order();
if (child_graph_order.empty()) {
return;
}
std::vector<AnfNodePtr> depend_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimPartial->name()))};
for (auto &cg : child_graph_order) {
MS_EXCEPTION_IF_NULL(cg);
auto fg = cg->cast<FuncGraphPtr>();
MS_EXCEPTION_IF_NULL(fg);
depend_inputs.emplace_back(NewValueNode(fg));
AttachChildGraphToReturnNode(NOT_NULL(cg), memo);
}
auto child_graphs = graph->NewCNode(depend_inputs);
InsertDependToGraph(graph, NOT_NULL(child_graphs));
}
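The traversal above is a memoized depth-first walk over the child-graph tree; a stripped-down standalone analogue (plain structs standing in for KernelGraph and the Partial CNode):

#include <memory>
#include <set>
#include <vector>

struct Graph {
  std::vector<std::shared_ptr<Graph>> children;
};

// Visit every graph exactly once, even when it is a child of several parents.
void Attach(const std::shared_ptr<Graph> &graph, std::set<const Graph *> *memo) {
  if (!memo->insert(graph.get()).second) {
    return;  // already attached
  }
  for (const auto &child : graph->children) {
    Attach(child, memo);
  }
  // At this point the real code wraps the child graphs in a Partial CNode and
  // inserts a Depend, keeping the child graphs reachable from the return node.
}

int main() {
  auto shared = std::make_shared<Graph>();
  auto root = std::make_shared<Graph>();
  root->children = {shared, shared};  // diamond: the shared child is referenced twice
  std::set<const Graph *> memo;
  Attach(root, &memo);
  return memo.size() == 2 ? 0 : 1;  // root + shared, each visited once
}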
void AscendControlParser::LinkGraph(NotNull<KernelGraphPtr> kg) { void AscendControlParser::LinkGraph(NotNull<KernelGraphPtr> kg) {
std::set<KernelGraphPtr> memo; std::set<KernelGraphPtr> memo;
std::vector<std::pair<AnfNodePtr, AnfNodePtr>> link_list; std::vector<std::pair<AnfNodePtr, AnfNodePtr>> link_list;
// Insert Assign // Insert Assign
ChildGraphDataAssign(kg, NOT_NULL(&link_list), NOT_NULL(&memo)); ChildGraphDataAssign(kg, NOT_NULL(&link_list), NOT_NULL(&memo));
memo.clear();
// Reuse Parameter // Reuse Parameter
ReuseParameter(kg, link_list); ReuseParameter(kg, link_list);
// replace call by label goto / label switch // replace call by label goto / label switch
memo.clear();
(void)ProcessKernelGraph(kg, nullptr, nullptr, NOT_NULL(&memo)); (void)ProcessKernelGraph(kg, nullptr, nullptr, NOT_NULL(&memo));
memo.clear();
// assign label resource // assign label resource
device::ascend::AscendLabelAssign::GetInstance().AssignLabel(kg); device::ascend::AscendLabelAssign::GetInstance().AssignLabel(kg);
AttachChildGraphToReturnNode(kg, NOT_NULL(&memo));
} }
void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph, void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
@ -248,10 +276,14 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
} }
MS_LOG(INFO) << "Erase " << assign_node->DebugString(5); MS_LOG(INFO) << "Erase " << assign_node->DebugString(5);
EraseNodeFromExecOrder(assign_node, NOT_NULL(&exec_order)); EraseNodeFromExecOrder(assign_node, NOT_NULL(&exec_order));
auto source = assign_node->input(kCNodeAssignSource);
auto source = AnfAlgo::VisitKernelWithReturnType(assign_node->input(kCNodeAssignSource), 0).first; MS_EXCEPTION_IF_NULL(source);
parameter_count.AddReadCount(source, -1); auto visit_source = AnfAlgo::VisitKernelWithReturnType(source, 0).first;
parameter_count.AddWriteCount(para, -1); parameter_count.AddWriteCount(para, -1);
parameter_count.AddReadCount(para, -1);
if (visit_source->isa<Parameter>()) {
parameter_count.AddReadCount(visit_source, read - 1);
}
for (auto &node : all_nodes) { for (auto &node : all_nodes) {
for (size_t i = 0; i < node->size(); ++i) { for (size_t i = 0; i < node->size(); ++i) {
if (node->input(i) == para) { if (node->input(i) == para) {
@ -260,8 +292,6 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
} }
} }
} }
parameter_count.AddReadCount(source, 1);
parameter_count.AddReadCount(para, -1);
} }
root_graph->set_execution_order(exec_order); root_graph->set_execution_order(exec_order);
} }
@ -318,6 +348,17 @@ void AscendControlParser::ExecutorValidate(NotNull<KernelGraphPtr> root_graph) {
(void)RecurseGraph(root_graph, NOT_NULL(&memo)); (void)RecurseGraph(root_graph, NOT_NULL(&memo));
EraseParameter(root_graph, memo); EraseParameter(root_graph, memo);
EraseLabel(root_graph); EraseLabel(root_graph);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto save_graphs_path = context_ptr->save_graphs_path();
if (save_graphs_path.empty()) {
save_graphs_path = ".";
}
if (context_ptr->save_graphs_flag()) {
std::string file_path = save_graphs_path + "/after_erase_label_and_parameter.ir";
DumpIR(file_path, root_graph.get());
}
} }
std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> AscendControlParser::ParseCallNode( std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> AscendControlParser::ParseCallNode(
@ -66,7 +66,8 @@ class AscendControlParser {
static AnfNodePtr InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to); static AnfNodePtr InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to);
static std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> ParseCallNode(NotNull<CNodePtr> call_node); static std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> ParseCallNode(NotNull<CNodePtr> call_node);
static std::tuple<KernelGraphPtr, std::vector<AnfNodePtr>> ParsePartial(NotNull<AnfNodePtr> node); static std::tuple<KernelGraphPtr, std::vector<AnfNodePtr>> ParsePartial(NotNull<AnfNodePtr> node);
static void AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph,
const NotNull<std::set<KernelGraphPtr> *> memo);
// root graph order // root graph order
static bool CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cnode, static bool CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cnode,
NotNull<KernelGraphPtr> graph); NotNull<KernelGraphPtr> graph);
@ -353,6 +353,10 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) {
RootGraphExecutorValidate(NOT_NULL(root_graph)); RootGraphExecutorValidate(NOT_NULL(root_graph));
// adjust kernel // adjust kernel
AdjustKernel(root_graph); AdjustKernel(root_graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Assign parameter keys.
AssignParamKey(root_graph);
#endif
// assign stream // assign stream
AssignStream(NOT_NULL(root_graph)); AssignStream(NOT_NULL(root_graph));
// insert profiling point // insert profiling point
@ -511,6 +515,12 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
} }
// load input data from user input // load input data from user input
LoadInputData(kernel_graph, inputs); LoadInputData(kernel_graph, inputs);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
#endif
// convert inputs to model // convert inputs to model
predictmodel::StepConvertWeight(inputs); predictmodel::StepConvertWeight(inputs);
{ {
@ -16,6 +16,7 @@
#include "backend/session/cpu_session.h" #include "backend/session/cpu_session.h"
#include <algorithm> #include <algorithm>
#include <sstream>
#include "ir/tensor.h" #include "ir/tensor.h"
#include "ir/anf.h" #include "ir/anf.h"
#include "backend/kernel_compiler/kernel.h" #include "backend/kernel_compiler/kernel.h"
@ -25,9 +26,15 @@
#include "predict/predict.h" #include "predict/predict.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "runtime/device/cpu/kernel_select_cpu.h" #include "runtime/device/cpu/kernel_select_cpu.h"
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"
#include "backend/optimizer/pass/replace_node_by_proxy.h"
#ifdef ENABLE_DEBUGGER #ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h" #include "debug/debugger/debugger.h"
#endif #endif
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
#include "frontend/parallel/ps/util.h"
#endif
namespace mindspore { namespace mindspore {
namespace session { namespace session {
@ -49,12 +56,29 @@ ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf,
return new_parameter; return new_parameter;
} }
void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
std::string pass_name = "replace_node_by_proxy";
pass_name.append(std::to_string(graph_sum_));
pm->AddPass(std::make_shared<opt::ReplaceNodeByProxy>(pass_name));
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
}
GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
auto graph_id = graph_sum_; auto graph_id = graph_sum_;
auto graph = ConstructKernelGraph(lst, outputs); auto graph = ConstructKernelGraph(lst, outputs);
MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Set kernel info"; MS_LOG(INFO) << "Set kernel info";
SetKernelInfo(graph.get()); SetKernelInfo(graph.get());
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
AssignParamKey(graph);
if (parallel::ps::Util::IsRoleOfWorker()) {
Optimize(graph);
}
#endif
predictmodel::StepConvertGraph(graph); predictmodel::StepConvertGraph(graph);
MS_LOG(INFO) << "Build kernel"; MS_LOG(INFO) << "Build kernel";
BuildKernel(graph.get()); BuildKernel(graph.get());
@ -66,6 +90,12 @@ GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) { void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
auto &kernel_graph = graphs_[graph_id]; auto &kernel_graph = graphs_[graph_id];
MS_EXCEPTION_IF_NULL(kernel_graph); MS_EXCEPTION_IF_NULL(kernel_graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
#endif
MS_LOG(INFO) << "Bind input output address"; MS_LOG(INFO) << "Bind input output address";
std::vector<tensor::TensorPtr> need_sync_outputs; std::vector<tensor::TensorPtr> need_sync_outputs;
runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs); runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs);
@ -119,6 +149,48 @@ void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) {
} }
} }
namespace {
void KernelNotSupportException(const AnfNodePtr &kernel_node) {
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
std::stringstream operator_info;
operator_info << "Operator[" << kernel_name << "] ";
auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_node->kernel_info());
if (kernel_info == nullptr) {
operator_info << "is not supported.";
MS_LOG(EXCEPTION) << operator_info.str();
}
auto kernel_build_info = kernel_info->select_kernel_build_info();
if (kernel_build_info == nullptr) {
operator_info << "is not supported.";
MS_LOG(EXCEPTION) << operator_info.str();
}
size_t input_num = kernel_build_info->GetInputNum();
if (input_num > 0) {
operator_info << "input(";
for (size_t i = 0; i < input_num; ++i) {
operator_info << TypeIdLabel(kernel_build_info->GetInputDeviceType(i));
if (i != input_num - 1) {
operator_info << ",";
}
}
operator_info << ") ";
}
size_t output_num = kernel_build_info->GetOutputNum();
if (output_num > 0) {
operator_info << "output(";
for (size_t i = 0; i < output_num; ++i) {
operator_info << TypeIdLabel(kernel_build_info->GetOutputDeviceType(i));
if (i != output_num - 1) {
operator_info << ",";
}
}
operator_info << ") ";
}
operator_info << "is not supported.";
MS_LOG(EXCEPTION) << operator_info.str();
}
} // namespace
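For reference, the message assembled above comes out roughly like this for a hypothetical unsupported kernel (the exact TypeIdLabel strings depend on the type system):

Operator[Conv2D] input(kNumberTypeFloat16,kNumberTypeFloat16) output(kNumberTypeFloat16) is not supported.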
void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
MS_EXCEPTION_IF_NULL(kernel_graph); MS_EXCEPTION_IF_NULL(kernel_graph);
auto &kernel_nodes = kernel_graph->execution_order(); auto &kernel_nodes = kernel_graph->execution_order();
@ -129,7 +201,7 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
std::shared_ptr<kernel::CPUKernel> cpu_kernel = std::shared_ptr<kernel::CPUKernel> cpu_kernel =
kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node); kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node);
if (cpu_kernel == nullptr) { if (cpu_kernel == nullptr) {
MS_LOG(EXCEPTION) << "Operator[" << kernel_name << "] is not support."; KernelNotSupportException(kernel_node);
} }
cpu_kernel->Init(kernel_node); cpu_kernel->Init(kernel_node);
AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get()); AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get());
@ -37,6 +37,7 @@ class CPUSession : public SessionBasic {
protected: protected:
ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) override; ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) override;
void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph);
private: private:
void SetKernelInfo(const KernelGraph *kernel_graph); void SetKernelInfo(const KernelGraph *kernel_graph);
@ -25,6 +25,11 @@
#include "backend/optimizer/pass/getitem_tuple.h" #include "backend/optimizer/pass/getitem_tuple.h"
#include "backend/optimizer/gpu/adam_weight_decay_fusion.h" #include "backend/optimizer/gpu/adam_weight_decay_fusion.h"
#include "backend/optimizer/gpu/adam_fusion.h" #include "backend/optimizer/gpu/adam_fusion.h"
#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
#include "backend/optimizer/gpu/replace_addn_fusion.h"
#include "runtime/device/kernel_runtime_manager.h" #include "runtime/device/kernel_runtime_manager.h"
#include "predict/predict.h" #include "predict/predict.h"
#include "common/utils.h" #include "common/utils.h"
@ -59,6 +64,11 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
auto pm = std::make_shared<opt::PassManager>(); auto pm = std::make_shared<opt::PassManager>();
pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>()); pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>());
pm->AddPass(std::make_shared<opt::AdamFusion>()); pm->AddPass(std::make_shared<opt::AdamFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNGradCast2Fusion>());
pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
optimizer->AddPassManager(pm); optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph); (void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault(); kernel_graph->SetExecOrderByDefault();
@ -167,6 +177,10 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
Optimize(graph); Optimize(graph);
// Select kernel build info // Select kernel build info
SelectKernel(graph); SelectKernel(graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Assign parameter keys.
AssignParamKey(graph);
#endif
// Convert kernel Graph to model // Convert kernel Graph to model
predictmodel::StepConvertGraph(graph); predictmodel::StepConvertGraph(graph);
// Start gpu kernel runtime // Start gpu kernel runtime
@ -204,6 +218,10 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
auto &kernel_graph = graphs_[graph_id]; auto &kernel_graph = graphs_[graph_id];
// Load input data from user input // Load input data from user input
LoadInputData(kernel_graph, inputs); LoadInputData(kernel_graph, inputs);
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
MS_EXCEPTION_IF_NULL(kernel_graph); MS_EXCEPTION_IF_NULL(kernel_graph);
// Convert inputs to model // Convert inputs to model
predictmodel::StepConvertWeight(inputs); predictmodel::StepConvertWeight(inputs);
@ -307,7 +307,7 @@ CNodePtr KernelGraph::NewCNode(const std::vector<AnfNodePtr> &inputs) {
if (inputs.size() == 1 || !feature_map_input_indexs.empty()) { if (inputs.size() == 1 || !feature_map_input_indexs.empty()) {
kernel_info->SetFeatureMapFlag(true); kernel_info->SetFeatureMapFlag(true);
} }
if (AnfAlgo::IsRealCNodeKernel(cnode)) { if (AnfAlgo::IsRealKernel(cnode)) {
AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode); AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode);
AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode); AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode);
} }
@ -929,10 +929,15 @@ void KernelGraph::AddInternalOutput(const AnfNodePtr &front_node, const AnfNodeP
} }
MS_LOG(INFO) << "Add internal node " << node->DebugString() << " with front node " << front_node->DebugString(); MS_LOG(INFO) << "Add internal node " << node->DebugString() << " with front node " << front_node->DebugString();
front_to_internal_outputs_map_[front_node] = node; front_to_internal_outputs_map_[front_node] = node;
internal_outputs_to_front_map_[node] = front_node; int output_idx = 0;
if (AnfAlgo::CheckPrimitiveType(front_node, prim::kPrimTupleGetItem)) {
output_idx = AnfAlgo::GetTupleGetItemOutIndex(front_node->cast<CNodePtr>());
}
internal_outputs_to_front_map_[node][output_idx] = front_node;
} }
void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node) { void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx,
int dst_output_idx) {
if (new_node == nullptr || node == nullptr) { if (new_node == nullptr || node == nullptr) {
MS_LOG(INFO) << "New node or node is nullptr"; MS_LOG(INFO) << "New node or node is nullptr";
return; return;
@ -947,9 +952,30 @@ void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr
return; return;
} }
MS_LOG(INFO) << "Replace internal node " << node->DebugString() << " To " << new_node->DebugString(); MS_LOG(INFO) << "Replace internal node " << node->DebugString() << " To " << new_node->DebugString();
internal_outputs_to_front_map_[new_node] = iter->second; auto &front_nodes = iter->second;
front_to_internal_outputs_map_[iter->second] = new_node; // Move all front nodes to new node mapping
internal_outputs_to_front_map_.erase(iter); if (src_output_idx == -1) {
internal_outputs_to_front_map_[new_node] = front_nodes;
for (const auto &front_node_iter : front_nodes) {
front_to_internal_outputs_map_[front_node_iter.second] = new_node;
}
internal_outputs_to_front_map_.erase(node);  // erase by key: the insert above may have rehashed and invalidated iter
return;
}
// Move specified front node to new node mapping
int index = SizeToInt(src_output_idx);
auto front_node_iter = front_nodes.find(index);
if (front_node_iter == front_nodes.end()) {
MS_LOG(INFO) << "The output " << src_output_idx << " of node " << node->DebugString() << " is not an internal node";
return;
}
auto front_node = front_node_iter->second;
internal_outputs_to_front_map_[new_node][dst_output_idx] = front_node;
front_to_internal_outputs_map_[front_node] = new_node;
front_nodes.erase(index);
if (front_nodes.empty()) {
internal_outputs_to_front_map_.erase(node);  // erase by key, since the insert above may have invalidated iter
}
} }
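A standalone analogue of the single-index branch above (std::string stands in for AnfNodePtr), showing how one front node migrates between entries of the nested map:

#include <cassert>
#include <string>
#include <unordered_map>

using FrontMap = std::unordered_map<std::string, std::unordered_map<int, std::string>>;

void MoveInternalOutput(FrontMap *map, const std::string &node, const std::string &new_node,
                        int src_output_idx, int dst_output_idx) {
  auto iter = map->find(node);
  if (iter == map->end()) {
    return;
  }
  auto &front_nodes = iter->second;
  auto front_iter = front_nodes.find(src_output_idx);
  if (front_iter == front_nodes.end()) {
    return;  // that output index is not an internal output
  }
  (*map)[new_node][dst_output_idx] = front_iter->second;
  front_nodes.erase(front_iter);
  if (front_nodes.empty()) {
    map->erase(node);  // the old node is no longer an internal output
  }
}

int main() {
  FrontMap map;
  map["fused_bn"] = {{0, "front_out"}, {1, "front_var"}};
  MoveInternalOutput(&map, "fused_bn", "getitem0", 0, 0);
  assert(map.at("getitem0").at(0) == "front_out");
  assert(map.at("fused_bn").count(0) == 0);  // index 1 stays behind
  return 0;
}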
AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const { AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const {
@ -967,14 +993,6 @@ bool KernelGraph::IsInternalOutput(const AnfNodePtr &node) const {
return false; return false;
} }
AnfNodePtr KernelGraph::GetFrontNodeByInternalOutput(const AnfNodePtr &node) const {
auto iter = internal_outputs_to_front_map_.find(node);
if (iter != internal_outputs_to_front_map_.end()) {
return iter->second;
}
return nullptr;
}
void KernelGraph::AddFinalOutputKernel(const AnfNodePtr &node) { void KernelGraph::AddFinalOutputKernel(const AnfNodePtr &node) {
if (node == nullptr) { if (node == nullptr) {
return; return;
@ -148,10 +148,10 @@ class KernelGraph : public FuncGraph {
const std::map<std::string, std::pair<AnfNodePtr, int>> &summary_nodes() const { return summary_nodes_; } const std::map<std::string, std::pair<AnfNodePtr, int>> &summary_nodes() const { return summary_nodes_; }
void set_summary_nodes(const std::map<std::string, std::pair<AnfNodePtr, int>> &nodes) { summary_nodes_ = nodes; } void set_summary_nodes(const std::map<std::string, std::pair<AnfNodePtr, int>> &nodes) { summary_nodes_ = nodes; }
void AddInternalOutput(const AnfNodePtr &front_node, const AnfNodePtr &node); void AddInternalOutput(const AnfNodePtr &front_node, const AnfNodePtr &node);
void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node); void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx = -1,
int dst_output_idx = -1);
AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const; AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const;
bool IsInternalOutput(const AnfNodePtr &node) const; bool IsInternalOutput(const AnfNodePtr &node) const;
AnfNodePtr GetFrontNodeByInternalOutput(const AnfNodePtr &node) const;
void AddFinalOutputKernel(const AnfNodePtr &node); void AddFinalOutputKernel(const AnfNodePtr &node);
bool IsFinalOutputKernel(const AnfNodePtr &node) const; bool IsFinalOutputKernel(const AnfNodePtr &node) const;
uint32_t current_epoch() const { return current_epoch_; } uint32_t current_epoch() const { return current_epoch_; }
@ -223,7 +223,7 @@ class KernelGraph : public FuncGraph {
CNodePtr end_goto_; CNodePtr end_goto_;
bool null_output_; bool null_output_;
std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_; std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_;
std::unordered_map<AnfNodePtr, AnfNodePtr> internal_outputs_to_front_map_; std::unordered_map<AnfNodePtr, std::unordered_map<int, AnfNodePtr>> internal_outputs_to_front_map_;
std::set<AnfNodePtr> final_output_kernels_; std::set<AnfNodePtr> final_output_kernels_;
uint32_t current_epoch_; uint32_t current_epoch_;
}; };
@ -35,6 +35,11 @@
#include "ir/dtype.h" #include "ir/dtype.h"
#include "ir/anf.h" #include "ir/anf.h"
#include "ir/func_graph_cloner.h" #include "ir/func_graph_cloner.h"
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
#include "frontend/parallel/ps/worker.h"
#include "frontend/parallel/ps/common.h"
#include "frontend/parallel/ps/util.h"
#endif
namespace mindspore { namespace mindspore {
namespace session { namespace session {
@ -295,7 +300,11 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const
MS_LOG(INFO) << "No corresponding internal output for output node"; MS_LOG(INFO) << "No corresponding internal output for output node";
return; return;
} }
auto real_kernel = AnfAlgo::VisitKernel(ref_node, 0); size_t output_idx = 0;
if (AnfAlgo::CheckPrimitiveType(out_node, prim::kPrimTupleGetItem)) {
output_idx = AnfAlgo::GetTupleGetItemOutIndex(out_node->cast<CNodePtr>());
}
auto real_kernel = AnfAlgo::VisitKernel(ref_node, output_idx);
auto ref_real_node = real_kernel.first; auto ref_real_node = real_kernel.first;
auto ref_real_node_index = real_kernel.second; auto ref_real_node_index = real_kernel.second;
if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node) && if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node) &&
@ -320,6 +329,7 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const
builder.SetOutputsFormat({format}); builder.SetOutputsFormat({format});
d_kernel_info->set_select_kernel_build_info(builder.Build()); d_kernel_info->set_select_kernel_build_info(builder.Build());
AnfAlgo::SetOutputAddr(address, 0, parameter.get()); AnfAlgo::SetOutputAddr(address, 0, parameter.get());
AnfAlgo::SetOutputInferTypeAndShape({type}, {AnfAlgo::GetOutputInferShape(parameter, 0)}, parameter.get());
} }
} }
@ -973,6 +983,16 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std:
bool internal_output = true; bool internal_output = true;
std::string kernel_target = GetCNodeTarget(front_real_kernel.first); std::string kernel_target = GetCNodeTarget(front_real_kernel.first);
for (auto user : users) { for (auto user : users) {
auto cnode = user.first->cast<CNodePtr>();
if (cnode == nullptr) {
internal_output = false;
break;
}
auto prim = cnode->input(kAnfPrimitiveIndex);
if (prim == nullptr || !prim->isa<ValueNode>()) {
internal_output = false;
break;
}
if (!AnfAlgo::IsRealKernel(user.first) || kernel_target != GetCNodeTarget(user.first)) { if (!AnfAlgo::IsRealKernel(user.first) || kernel_target != GetCNodeTarget(user.first)) {
internal_output = false; internal_output = false;
break; break;
@ -1097,5 +1117,92 @@ KernelGraphPtr SessionBasic::NewKernelGraph() {
graphs_[graph_sum_++] = graph; graphs_[graph_sum_++] = graph;
return graph; return graph;
} }
AnfNodePtr SessionBasic::FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list) {
MS_EXCEPTION_IF_NULL(push_node);
for (auto &node : node_list) {
if (node != nullptr && node->isa<CNode>()) {
for (auto input : node->cast<CNodePtr>()->inputs()) {
if (push_node == AnfAlgo::VisitKernel(input, 0).first) {
if (AnfAlgo::GetCNodeName(node) != kPullOpName) {
MS_LOG(EXCEPTION) << "The edge between Push and Pull node is invalid.";
}
return node;
}
}
}
}
return nullptr;
}
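A standalone analogue of the lookup above (strings stand in for AnfNodePtr, and VisitKernel's unwrapping is reduced to pointer identity): scan the node list for the consumer of the Push node, which must be a Pull:

#include <string>
#include <vector>

struct Node {
  std::string name;  // e.g. "Push", "Pull"
  std::vector<const Node *> inputs;
};

// Returns the consumer of `push` if it is a Pull node, nullptr otherwise
// (the real code raises an exception on a non-Pull consumer).
const Node *FindPull(const Node *push, const std::vector<Node> &nodes) {
  for (const auto &node : nodes) {
    for (const Node *input : node.inputs) {
      if (input == push) {
        return node.name == "Pull" ? &node : nullptr;
      }
    }
  }
  return nullptr;
}

int main() {
  std::vector<Node> nodes(2);
  nodes[0].name = "Push";
  nodes[1].name = "Pull";
  nodes[1].inputs = {&nodes[0]};
  return FindPull(&nodes[0], nodes) ? 0 : 1;
}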
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
void SessionBasic::AssignParamKey(const KernelGraphPtr &kernel_graph) {
if (!parallel::ps::Util::IsRoleOfWorker()) {
MS_LOG(INFO) << "Not parameter server mode.";
return;
}
MS_EXCEPTION_IF_NULL(kernel_graph);
std::vector<AnfNodePtr> node_list = TopoSort(kernel_graph->get_return());
for (auto &node : node_list) {
if (node != nullptr && node->isa<CNode>()) {
// Assign key for forward kernel EmbeddingLookup.
// The key will be assigned to the embedding table and the Push kernel as well.
if (AnfAlgo::GetCNodeName(node) == kEmbeddingLookupOpName) {
size_t embedding_table_idx = 0;
auto embedding_table = AnfAlgo::GetInputNode(node->cast<CNodePtr>(), embedding_table_idx);
size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(embedding_table->fullname_with_scope());
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node);
} else if (AnfAlgo::GetCNodeName(node) == kPushOpName) {
auto pull_node = FindPullNode(node, node_list);
if (!pull_node) {
MS_LOG(EXCEPTION) << "Assigning parameter key failed: can't find Pull node of the Push node.";
}
// Second input of Pull node is the trainable parameter.
size_t parameter_index = 1;
auto parameter_node = AnfAlgo::GetInputNode(pull_node->cast<CNodePtr>(), parameter_index);
size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(parameter_node->fullname_with_scope());
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node);
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), pull_node);
std::string optimizer_name = AnfAlgo::GetNodeAttr<std::string>(node, kAttrOptimizerType);
parallel::ps::Worker<float>::GetInstance().SetKeyOptimId(key, optimizer_name);
}
}
}
}
void SessionBasic::InitPSParamAndOptim(const KernelGraphPtr &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs_const) {
if (!parallel::ps::Util::IsRoleOfWorker()) {
return;
}
std::vector<tensor::TensorPtr> inputs(inputs_const);
size_t input_ctrl_size = 1;
MS_EXCEPTION_IF_NULL(kernel_graph);
if (kernel_graph->input_ctrl_tensors()) {
input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs);
}
auto input_nodes = kernel_graph->inputs();
if ((inputs.size() + input_ctrl_size) - 1 != input_nodes.size()) {
MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " does not equal graph inputs:" << input_nodes.size()
<< ", input_ctrl_size:" << input_ctrl_size;
}
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
for (size_t i = 0; i < inputs.size(); ++i) {
auto tensor = inputs[i];
MS_EXCEPTION_IF_NULL(tensor);
auto input_node = input_nodes[i];
MS_EXCEPTION_IF_NULL(input_node);
if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
auto pk_node = input_node->cast<ParameterPtr>();
mindspore::parallel::ps::Worker<float>::GetInstance().InitPSParamAndOptim(
pk_node->fullname_with_scope(), tensor->data_c(), LongToSize(tensor->data().nbytes()));
}
}
ps_init_ = true;
}
#endif
} // namespace session } // namespace session
} // namespace mindspore } // namespace mindspore
@ -51,7 +51,7 @@ using OpRunInfoPtr = std::shared_ptr<OpRunInfo>;
class SessionBasic { class SessionBasic {
public: public:
SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0) { SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0), ps_init_(false) {
#ifdef ENABLE_DEBUGGER #ifdef ENABLE_DEBUGGER
debugger_ = nullptr; debugger_ = nullptr;
#endif #endif
@ -104,6 +104,8 @@ class SessionBasic {
virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; } virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; }
virtual void SetActive(GraphId, GraphId) {} virtual void SetActive(GraphId, GraphId) {}
virtual void GetSummaryNodes(KernelGraph *graph); virtual void GetSummaryNodes(KernelGraph *graph);
void AssignParamKey(const KernelGraphPtr &kernel_graph);
void InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, const std::vector<tensor::TensorPtr> &inputs_const);
#ifdef ENABLE_DEBUGGER #ifdef ENABLE_DEBUGGER
// set debugger // set debugger
@ -140,6 +142,7 @@ class SessionBasic {
AnfNodePtr CreateNewParameterFromCNode(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph); AnfNodePtr CreateNewParameterFromCNode(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph);
void AddParameterToGraphInputs(const std::vector<AnfNodePtr> &parameters, KernelGraph *graph); void AddParameterToGraphInputs(const std::vector<AnfNodePtr> &parameters, KernelGraph *graph);
void InitInternalOutputParameter(const AnfNodePtr &out_node, const AnfNodePtr &parameter); void InitInternalOutputParameter(const AnfNodePtr &out_node, const AnfNodePtr &parameter);
AnfNodePtr FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list);
std::unordered_map<GraphId, std::shared_ptr<KernelGraph>> graphs_; std::unordered_map<GraphId, std::shared_ptr<KernelGraph>> graphs_;
std::unordered_map<GraphInfo, std::shared_ptr<KernelGraph>> run_op_graphs_; std::unordered_map<GraphInfo, std::shared_ptr<KernelGraph>> run_op_graphs_;
@ -148,6 +151,7 @@ class SessionBasic {
CallBackFunc summary_callback_; CallBackFunc summary_callback_;
static GraphId graph_sum_; static GraphId graph_sum_;
uint32_t device_id_; uint32_t device_id_;
bool ps_init_;
#ifdef ENABLE_DEBUGGER #ifdef ENABLE_DEBUGGER
std::shared_ptr<Debugger> debugger_; std::shared_ptr<Debugger> debugger_;
#endif #endif
@ -23,9 +23,7 @@ if (ENABLE_D)
list(APPEND _DEBUG_SRC_LIST list(APPEND _DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc" "${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
) )
if (ENABLE_DATA_DUMP) list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc")
list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc")
endif(ENABLE_DATA_DUMP)
endif() endif()
if (ENABLE_DUMP_E2E) if (ENABLE_DUMP_E2E)
@ -120,6 +120,10 @@ std::optional<std::string> Common::GetConfigFile(const std::string &env) {
MS_LOG(ERROR) << dump_config_file << " not exist."; MS_LOG(ERROR) << dump_config_file << " not exist.";
return {}; return {};
} }
auto suffix = dump_config_file.substr(dump_config_file.find_last_of('.') + 1);
if (suffix != "json") {
MS_LOG(EXCEPTION) << "[DataDump] dump config file suffix only supports json, but got: " << suffix;
}
return dump_config_file; return dump_config_file;
} }
} // namespace mindspore } // namespace mindspore
@ -29,13 +29,13 @@ void DataDumpParser::ResetParam() {
net_name_.clear(); net_name_.clear();
dump_mode_ = 0; dump_mode_ = 0;
dump_step_ = 0; dump_step_ = 0;
kernel_set_.clear(); kernel_map_.clear();
} }
bool DataDumpParser::DumpEnabled() const { bool DataDumpParser::DumpEnabled() const {
auto enable_dump = std::getenv(kEnableDataDump); auto enable_dump = std::getenv(kEnableDataDump);
if (!enable_dump) { if (!enable_dump) {
MS_LOG(WARNING) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP"; MS_LOG(INFO) << "[DataDump] ENABLE_DATA_DUMP is not set, data dump is disabled. Please export ENABLE_DATA_DUMP";
return false; return false;
} }
@ -60,9 +60,18 @@ std::optional<std::string> DataDumpParser::GetDumpPath() const {
return {}; return {};
} }
std::string dump_path_str(dump_path); std::string dump_path_str(dump_path);
if (!std::all_of(dump_path_str.begin(), dump_path_str.end(), ::isalpha)) {
MS_LOG(EXCEPTION) << "[DataDump] dump path only supports alphabetic characters, but got: " << dump_path_str;
}
return dump_path_str; return dump_path_str;
} }
std::string GetIfstreamString(const std::ifstream &ifstream) {
std::stringstream buffer;
buffer << ifstream.rdbuf();
return buffer.str();
}
void DataDumpParser::ParseDumpConfig() { void DataDumpParser::ParseDumpConfig() {
std::lock_guard<std::mutex> guard(lock_); std::lock_guard<std::mutex> guard(lock_);
MS_LOG(INFO) << "[DataDump] parse start"; MS_LOG(INFO) << "[DataDump] parse start";
@ -84,7 +93,12 @@ void DataDumpParser::ParseDumpConfig() {
} }
nlohmann::json j; nlohmann::json j;
json_file >> j; try {
json_file >> j;
} catch (nlohmann::json::parse_error &e) {
MS_LOG(ERROR) << "[DataDump] json contents:" << GetIfstreamString(json_file);
MS_LOG(EXCEPTION) << "[DataDump] parse json failed, error:" << e.what();
}
if (j.find("DumpSettings") == j.end()) { if (j.find("DumpSettings") == j.end()) {
MS_LOG(EXCEPTION) << "[DataDump] DumpSettings does not exist."; MS_LOG(EXCEPTION) << "[DataDump] DumpSettings does not exist.";
} }
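A minimal config of the shape this parser expects might look as follows; "DumpSettings" appears in the code above, while the inner field names ("net_name", "mode", "iteration", "kernels") are assumptions inferred from the parsed members and are not shown verbatim in this diff:

{
  "DumpSettings": {
    "net_name": "ResNet50",
    "mode": 0,
    "iteration": 0,
    "kernels": ["Default/network/Conv2D-op12"]
  }
}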
@ -111,8 +125,8 @@ bool DataDumpParser::NeedDump(const std::string &op_full_name) const {
if (dump_mode_ == 0) { if (dump_mode_ == 0) {
return true; return true;
} }
auto iter = kernel_set_.find(op_full_name); auto iter = kernel_map_.find(op_full_name);
return iter != kernel_set_.end(); return iter != kernel_map_.end();
} }
bool DataDumpParser::IsConfigExist(const nlohmann::json &dump_settings) const { bool DataDumpParser::IsConfigExist(const nlohmann::json &dump_settings) const {
@ -145,8 +159,25 @@ bool DataDumpParser::ParseDumpSetting(const nlohmann::json &dump_settings) {
auto kernel_str = kernel.dump(); auto kernel_str = kernel.dump();
kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end()); kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end());
MS_LOG(INFO) << "[DataDump] Need dump kernel:" << kernel_str; MS_LOG(INFO) << "[DataDump] Need dump kernel:" << kernel_str;
kernel_set_.insert(kernel_str); kernel_map_.insert({kernel_str, 0});
} }
return true; return true;
} }
void DataDumpParser::MatchKernel(const std::string &kernel_name) {
auto iter = kernel_map_.find(kernel_name);
if (iter == kernel_map_.end()) {
return;
}
++iter->second;
MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second;
}
void DataDumpParser::PrintUnusedKernel() {
for (const auto &iter : kernel_map_) {
if (iter.second == 0) {
MS_LOG(WARNING) << "[DataDump] Unused kernel in json:" << iter.first;
}
}
}
} // namespace mindspore } // namespace mindspore
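A standalone analogue of the match-and-report flow (the real call sites of MatchKernel and PrintUnusedKernel are outside this hunk, so the driver loop here is an assumption):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>

int main() {
  // kernel_map_ after parsing a config that lists two kernels
  std::map<std::string, uint32_t> kernel_map = {{"Default/Conv2D-op12", 0}, {"Default/ReLU-op7", 0}};
  // the runtime reports each kernel it actually dumped
  for (const std::string &executed : {"Default/Conv2D-op12", "Default/Conv2D-op12"}) {
    auto iter = kernel_map.find(executed);
    if (iter != kernel_map.end()) {
      ++iter->second;
    }
  }
  // at the end of the run, warn about config entries that never matched
  for (const auto &iter : kernel_map) {
    if (iter.second == 0) {
      std::cout << "[DataDump] Unused kernel in json:" << iter.first << "\n";
    }
  }
  return 0;
}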
@ -18,7 +18,7 @@
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_ #define MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_
#include <string> #include <string>
#include <set> #include <map>
#include <mutex> #include <mutex>
#include <optional> #include <optional>
#include "nlohmann/json.hpp" #include "nlohmann/json.hpp"
@ -39,7 +39,8 @@ class DataDumpParser {
const std::string &net_name() const { return net_name_; } const std::string &net_name() const { return net_name_; }
uint32_t dump_mode() const { return dump_mode_; } uint32_t dump_mode() const { return dump_mode_; }
uint32_t dump_step() const { return dump_step_; } uint32_t dump_step() const { return dump_step_; }
const std::set<std::string> &kernel_set() const { return kernel_set_; } void MatchKernel(const std::string &kernel_name);
void PrintUnusedKernel();
private: private:
DataDumpParser() = default; DataDumpParser() = default;
@ -55,7 +56,7 @@ class DataDumpParser {
std::string net_name_; std::string net_name_;
uint32_t dump_mode_{0}; uint32_t dump_mode_{0};
uint32_t dump_step_{0}; uint32_t dump_step_{0};
std::set<std::string> kernel_set_; std::map<std::string, uint32_t> kernel_map_;
}; };
} // namespace mindspore } // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_ #endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_
@ -124,6 +124,8 @@ void ProtoExporter::SetNodeOutputType(const TypePtr &type, const BaseShapePtr &s
// Do Nothing // Do Nothing
} else if (type->isa<UndeterminedType>()) { } else if (type->isa<UndeterminedType>()) {
// Do Nothing // Do Nothing
} else if (type->isa<SparseTensorType>()) {
// Do Nothing
} else if (type->isa<Tuple>()) { } else if (type->isa<Tuple>()) {
TuplePtr tuple_type = dyn_cast<Tuple>(type); TuplePtr tuple_type = dyn_cast<Tuple>(type);
type_proto->set_data_type(irpb::DT_TUPLE); type_proto->set_data_type(irpb::DT_TUPLE);