Synchronize with the latest Ascend software suite (18 Jul 2020) and merge branches

yanghaoran 2020-07-18 16:10:19 +08:00
commit 859acc6d2a
414 changed files with 10415 additions and 2035 deletions

.gitmodules (vendored)

@ -15,4 +15,4 @@
url = https://gitee.com/mindspore/akg.git
[submodule "graphengine"]
path = graphengine
url = https://gitee.com/ms-incubator/graphengine.git
url = https://gitee.com/mindspore/graphengine.git


@ -202,10 +202,10 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm
### Communication
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/enQtOTcwMTIxMDI3NjM0LTNkMWM2MzI5NjIyZWU5ZWQ5M2EwMTQ5MWNiYzMxOGM4OWFhZjI4M2E5OGI2YTg3ODU1ODE2Njg1MThiNWI3YmQ) - Communication platform for developers.
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/zt-dgk65rli-3ex4xvS4wHX7UDmsQmfu8w) - Communication platform for developers.
- IRC channel at `#mindspore` (only for meeting minutes logging purpose)
- Video Conferencing: https://meet.jit.si
- Mailing-list: https://mailweb.mindspore.cn/postorius/lists
- Video Conferencing: TBD
- Mailing-list: <https://mailweb.mindspore.cn/postorius/lists>
## Contributing

akg

@ -1 +1 @@
Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625


@ -24,7 +24,7 @@ usage()
{
echo "Usage:"
echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
echo " [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
echo ""
echo "Options:"
@ -48,7 +48,6 @@ usage()
echo " -P Enable dump anf graph to file in ProtoBuffer format, default on"
echo " -Q Enable dump memory, default off"
echo " -D Enable dumping of function graph ir, default on"
echo " -S Enable async data dump, default off"
echo " -z Compile dataset & mindrecord, default on"
echo " -M Enable MPI and NCCL for GPU training, gpu default on"
echo " -V Specify the minimum required cuda version, default CUDA 10.1"
@ -89,7 +88,6 @@ checkopts()
ENABLE_TIMELINE="off"
ENABLE_DUMP2PROTO="on"
ENABLE_DUMPE2E="off"
ENABLE_DATA_DUMP="off"
ENABLE_DUMP_IR="on"
COMPILE_MINDDATA="on"
ENABLE_MPI="off"
@ -104,7 +102,7 @@ checkopts()
ENABLE_PYTHON="on"
# Process the options
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt
while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt
do
OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
case "${opt}" in
@ -186,6 +184,7 @@ checkopts()
elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == "Xascend" ]]; then
ENABLE_D="on"
ENABLE_CPU="on"
ENABLE_SERVING="on"
elif [[ "X$OPTARG" == "Xcpu" ]]; then
ENABLE_CPU="on"
else
@ -220,11 +219,6 @@ checkopts()
ENABLE_DUMPE2E="$OPTARG"
echo "enable dump end to end"
;;
S)
check_on_off $OPTARG S
ENABLE_DATA_DUMP="$OPTARG"
echo "enable data dump"
;;
D)
check_on_off $OPTARG D
ENABLE_DUMP_IR="$OPTARG"
@ -328,9 +322,6 @@ build_mindspore()
if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
fi
if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON"
fi
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
if [[ "X$ENABLE_MPI" = "Xon" ]]; then


@ -1,4 +1,4 @@
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS}")
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
mindspore_add_pkg(glog
VER 0.4.0


@ -116,10 +116,10 @@ if(ENABLE_DUMP_E2E)
add_compile_definitions(ENABLE_DUMP_E2E)
endif()
if(ENABLE_DATA_DUMP)
add_compile_definitions(ENABLE_DATA_DUMP)
endif()
if(ENABLE_DEBUGGER)
add_compile_definitions(ENABLE_DEBUGGER)
endif()
if(ENABLE_TESTCASES)
add_compile_definitions(ENABLE_TESTCASES)
endif()


@ -1,13 +1,16 @@
# find exec
find_package(Python3 3.7 COMPONENTS Interpreter Development)
if (NOT Python3_FOUND)
message("No python3 found.")
return ()
message(FATAL_ERROR "No python3 found.")
endif ()
set(PYTHON ${Python3_EXECUTABLE})
set(PYTHON_VERSION ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR})
if (NOT PYTHON_VERSION MATCHES "3.7")
message(FATAL_ERROR "FIND PYTHON VERSION ${PYTHON_VERSION} BUT CAN NOT MATCH PYTHON VERSION 3.7")
endif ()
find_package(Git)
if (NOT GIT_FOUND)
message("No git found.")

@ -1 +1 @@
Subproject commit eee707935c066c16e9b9cd207f8125871b6b97cf
Subproject commit 103f2d1019dc50d781d7a964551d9f1f50b3b009

hub/docs/.gitkeep (new file)

hub/images/.gitkeep (new file)

hub/scripts/.gitkeep (new file)


@ -17,7 +17,7 @@
"""Resources for ast tree parse."""
import ast
import math
from mindspore import IndexedSlices
from mindspore import IndexedSlices, SparseTensor
from mindspore.ops.composite import multitype_ops
from mindspore.ops import functional as F, composite as C
from . import standard_method as M
@ -140,4 +140,5 @@ convert_object_map = {
# user defined
IndexedSlices: F.make_indexed_slices,
SparseTensor: F.make_sparse_tensor,
}


@ -44,7 +44,7 @@ if(ENABLE_GPU)
"backend/kernel_compiler/akg/akg_kernel_attrs_process.cc"
)
list(APPEND CUDA_NVCC_FLAGS -arch=sm_53)
list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/blocking_queue.cc" "runtime/device/gpu/gpu_buffer_mgr.cc")
list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/mpi/mpi_initializer.cc"
"runtime/device/gpu/distribution/collective_wrapper.cc"


@ -26,14 +26,6 @@ if (ENABLE_CPU)
"cpu/*.cc"
)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc"
"cpu/ps/pull_kernel.cc"
"cpu/ps/embedding_look_up_ps_kernel.cc"
"cpu/ps/embedding_look_up_proxy_kernel.cc"
"cpu/ps/apply_momentum_ps_kernel.cc"
"cpu/ps/sparse_apply_adam_ps_kernel.cc"
"cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
if (NOT ENABLE_MPI)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc")
@ -41,6 +33,17 @@ if (ENABLE_CPU)
endif ()
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ENABLE_GE)
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/apply_momentum_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_proxy_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pserver_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pull_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_adam_ps_kernel.cc")
list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
endif()
if (ENABLE_GPU)
file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
"gpu/*.cu"


@ -18,6 +18,7 @@
#include <algorithm>
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/optimizer/common/helper.h"
#include "backend/kernel_compiler/common_utils.h"
namespace mindspore {
namespace kernel {
@ -75,15 +76,7 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) {
std::string dst_type;
TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0);
if (output_type == kFloat32->type_id()) {
dst_type = "float32";
} else if (output_type == kFloat16->type_id()) {
dst_type = "float16";
} else if (output_type == kInt32->type_id()) {
dst_type = "int32";
} else {
MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString();
}
dst_type = TypeId2String(output_type);
AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node);
}


@ -21,9 +21,7 @@
#include <memory>
#include "framework/ge_runtime/task_info.h"
#include "backend/kernel_compiler/kernel.h"
#ifdef ENABLE_DATA_DUMP
#include "debug/data_dump_parser.h"
#endif
using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;
namespace mindspore {
@ -34,13 +32,7 @@ class AscendKernelMod : public KernelMod {
const std::vector<AddressPtr> &, uint32_t) = 0;
uint32_t block_dim() { return block_dim_; }
uint32_t stream_id() { return stream_id_; }
virtual bool NeedDump() {
#ifdef ENABLE_DATA_DUMP
return DataDumpParser::GetInstance().NeedDump(kernel_name_);
#else
return false;
#endif
}
virtual bool NeedDump() { return DataDumpParser::GetInstance().NeedDump(kernel_name_); }
protected:
uint32_t block_dim_{1};


@ -20,6 +20,7 @@
#include <iostream>
#include <utility>
#include <fstream>
#include <algorithm>
#include <thread>
#include "nlohmann/json.hpp"
#include "backend/session/anf_runtime_algorithm.h"
@ -499,235 +500,329 @@ int Sign(float x) {
return 0;
}
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim) {
MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
MS_EXCEPTION_IF_NULL(unique_grad);
MS_EXCEPTION_IF_NULL(unique_grad->value_);
MS_EXCEPTION_IF_NULL(unique_grad->indices_);
namespace {
struct BucketSparseGradient {
float *value_;
int *indices_;
int *global_indices_;
size_t indices_size_;
};
struct MultiThreadReduceSparseGradientParam {
SparseGradient *input_grad_{nullptr};
SparseGradient *workspace_grad_{nullptr};
SparseGradient *output_grad_{nullptr};
size_t max_index_{0};
size_t value_stride_{0};
size_t thread_num_{0};
bool use_sort_reduce_{false};
};
void CalculateEachBucketSize(const std::shared_ptr<SparseGradient> &sparse_grad, size_t max_index,
std::vector<size_t> *each_bucket_size) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(sparse_grad);
MS_EXCEPTION_IF_NULL(sparse_grad->indices_);
MS_EXCEPTION_IF_NULL(each_bucket_size);
size_t bucket_num = each_bucket_size->size();
for (size_t i = 0; i < sparse_grad->indices_size_; ++i) {
int index = sparse_grad->indices_[i];
if (index >= 0 && IntToSize(index) < max_index) {
auto bucket_id = index % bucket_num;
each_bucket_size->at(bucket_id)++;
}
}
MS_LOG(DEBUG) << "End";
}
void SplitAndCalculateSegmentBucketSize(const MultiThreadReduceSparseGradientParam &param,
std::vector<std::shared_ptr<SparseGradient>> *segments_ptr,
std::vector<std::shared_ptr<std::vector<size_t>>> *segment_bucket_sizes_ptr) {
MS_EXCEPTION_IF_NULL(param.input_grad_);
MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
MS_EXCEPTION_IF_NULL(segments_ptr);
auto &segments = *segments_ptr;
auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
auto input_grad = param.input_grad_;
if (param.thread_num_ < 1) {
MS_EXCEPTION(ArgumentError) << "Input param thread num must be > 0!";
}
size_t thread_indices_size = input_grad->indices_size_ / param.thread_num_;
size_t left_indices_size = input_grad->indices_size_ % param.thread_num_;
std::vector<std::thread> threads;
threads.reserve(param.thread_num_);
segments.reserve(param.thread_num_);
size_t current_indices_offset = 0;
for (size_t i = 0; i < param.thread_num_; ++i) {
segment_bucket_sizes.emplace_back(std::make_shared<std::vector<size_t>>(param.thread_num_, 0));
size_t indices_size = thread_indices_size;
if (i < left_indices_size) {
indices_size += 1;
}
segments.emplace_back(std::make_shared<SparseGradient>());
segments[i]->value_ = input_grad->value_ + current_indices_offset * param.value_stride_;
segments[i]->indices_ = input_grad->indices_ + current_indices_offset;
segments[i]->indices_size_ = indices_size;
threads.emplace_back(
std::thread(CalculateEachBucketSize, segments[i], param.max_index_, segment_bucket_sizes[i].get()));
current_indices_offset += indices_size;
}
for (size_t i = 0; i < param.thread_num_; ++i) {
threads[i].join();
}
}
void CopySegmentIndicesToBucket(const MultiThreadReduceSparseGradientParam &param,
const std::shared_ptr<SparseGradient> &segment, size_t bucket_offset,
const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(segment);
MS_EXCEPTION_IF_NULL(segment->indices_);
std::vector<size_t> bucket_data_num(param.thread_num_, 0);
for (size_t i = 0; i < segment->indices_size_; ++i) {
int index = segment->indices_[i];
if (index >= 0 && IntToSize(index) < param.max_index_) {
auto bucket_id = index % param.thread_num_;
auto bucket_index = bucket_data_num[bucket_id];
buckets[bucket_id]->indices_[bucket_index] = index;
buckets[bucket_id]->global_indices_[bucket_index] = bucket_offset + i;
bucket_data_num[bucket_id]++;
}
}
MS_LOG(DEBUG) << "End";
}
void GatherSegmentIndicesToOutputBucket(const MultiThreadReduceSparseGradientParam &param,
const std::vector<std::shared_ptr<SparseGradient>> &segments,
const std::vector<std::shared_ptr<std::vector<size_t>>> &segment_bucket_sizes,
std::vector<std::shared_ptr<BucketSparseGradient>> *buckets_ptr) {
MS_EXCEPTION_IF_NULL(param.output_grad_);
MS_EXCEPTION_IF_NULL(param.output_grad_->value_);
MS_EXCEPTION_IF_NULL(param.output_grad_->indices_);
MS_EXCEPTION_IF_NULL(buckets_ptr);
auto &buckets = *buckets_ptr;
size_t thread_num = param.thread_num_;
if (thread_num != segment_bucket_sizes.size()) {
MS_EXCEPTION(ArgumentError) << "Input param thread num not equal to segment size!";
}
std::vector<size_t> bucket_data_size(thread_num, 0);
for (size_t i = 0; i < thread_num; ++i) {
for (size_t j = 0; j < thread_num; ++j) {
bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
}
}
size_t current_indices_offset = 0;
for (size_t i = 0; i < thread_num; ++i) {
buckets.emplace_back(std::make_shared<BucketSparseGradient>());
buckets[i]->value_ = param.output_grad_->value_ + current_indices_offset * param.value_stride_;
buckets[i]->indices_ = param.output_grad_->indices_ + current_indices_offset;
buckets[i]->global_indices_ = param.workspace_grad_->indices_ + current_indices_offset;
buckets[i]->indices_size_ = bucket_data_size[i];
current_indices_offset += bucket_data_size[i];
}
std::vector<size_t> tmp_bucket_data_size(thread_num, 0);
std::vector<std::vector<std::shared_ptr<BucketSparseGradient>>> each_thread_buckets;
for (size_t i = 0; i < thread_num; ++i) {
std::vector<std::shared_ptr<BucketSparseGradient>> thread_buckets;
for (size_t j = 0; j < thread_num; ++j) {
thread_buckets.emplace_back(std::make_shared<BucketSparseGradient>());
thread_buckets[j]->indices_ = buckets[j]->indices_ + tmp_bucket_data_size[j];
thread_buckets[j]->global_indices_ = buckets[j]->global_indices_ + tmp_bucket_data_size[j];
thread_buckets[j]->value_ = buckets[j]->value_ + tmp_bucket_data_size[j] * param.value_stride_;
thread_buckets[j]->indices_size_ = segment_bucket_sizes[i]->at(j);
tmp_bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
}
each_thread_buckets.emplace_back(thread_buckets);
}
std::vector<std::thread> threads;
threads.reserve(thread_num);
current_indices_offset = 0;
for (size_t i = 0; i < thread_num; ++i) {
threads.emplace_back(
std::thread(CopySegmentIndicesToBucket, param, segments[i], current_indices_offset, each_thread_buckets[i]));
current_indices_offset += segments[i]->indices_size_;
}
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
}
}
void SortAndReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
const std::shared_ptr<BucketSparseGradient> &bucket,
const std::shared_ptr<SparseGradient> &reduced_bucket) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(bucket);
MS_EXCEPTION_IF_NULL(bucket->value_);
MS_EXCEPTION_IF_NULL(bucket->indices_);
MS_EXCEPTION_IF_NULL(reduced_bucket);
MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
std::vector<std::pair<int, int>> sorted_indices;
sorted_indices.reserve(bucket->indices_size_);
for (size_t i = 0; i < bucket->indices_size_; ++i) {
int index = bucket->indices_[i];
int global_index = bucket->global_indices_[i];
sorted_indices.emplace_back(std::pair<int, int>(index, global_index));
}
std::sort(sorted_indices.begin(), sorted_indices.end());
float *global_value = param.input_grad_->value_;
size_t unique_indices_size = 0;
size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
int last_index{0};
size_t value_offset{0};
for (size_t i = 0; i < sorted_indices.size(); ++i) {
int index = sorted_indices[i].first;
int global_index = sorted_indices[i].second;
int global_value_offset = global_index * param.value_stride_;
if (i == 0 || index != last_index) {
if (i != 0) {
unique_indices_size++;
}
reduced_bucket->indices_[unique_indices_size] = index;
value_offset = unique_indices_size * param.value_stride_;
auto ret_code = memcpy_s(reduced_bucket->value_ + value_offset, (max_length - value_offset) * sizeof(float),
global_value + global_value_offset, param.value_stride_ * sizeof(float));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
} else {
for (size_t j = 0; j < param.value_stride_; ++j) {
reduced_bucket->value_[value_offset + j] += global_value[global_value_offset + j];
}
}
last_index = index;
}
reduced_bucket->indices_size_ = unique_indices_size;
MS_LOG(DEBUG) << "End";
}
void ReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
const std::shared_ptr<BucketSparseGradient> &bucket,
const std::shared_ptr<SparseGradient> &reduced_bucket) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(bucket);
MS_EXCEPTION_IF_NULL(bucket->value_);
MS_EXCEPTION_IF_NULL(bucket->indices_);
MS_EXCEPTION_IF_NULL(reduced_bucket);
MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
float *global_value = param.input_grad_->value_;
std::unordered_map<int, size_t> index_map;
size_t unique_indices_size = 0;
for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
int index = origin_sparse_grad.indices_[i];
if (index < 0 || IntToSize(index) >= first_dim) {
continue;
}
size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
for (size_t i = 0; i < bucket->indices_size_; ++i) {
int index = bucket->indices_[i];
int global_index = bucket->global_indices_[i];
auto iter = index_map.find(index);
if (iter == index_map.end()) {
index_map[index] = unique_indices_size;
unique_grad->indices_[unique_indices_size] = index;
size_t start_index = unique_indices_size * outer_dim;
size_t end_index = start_index + outer_dim;
for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
unique_grad->value_[j] = origin_sparse_grad.value_[k];
reduced_bucket->indices_[unique_indices_size] = index;
size_t start_index = unique_indices_size * param.value_stride_;
index_map[index] = start_index;
auto ret_code = memcpy_s(reduced_bucket->value_ + start_index, (max_length - start_index) * sizeof(float),
global_value + global_index * param.value_stride_, param.value_stride_ * sizeof(float));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
unique_indices_size++;
} else {
size_t first_index = iter->second;
size_t start_index = first_index * outer_dim;
size_t end_index = start_index + outer_dim;
for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
unique_grad->value_[j] += origin_sparse_grad.value_[k];
size_t start_index = iter->second;
size_t end_index = start_index + param.value_stride_;
for (size_t j = start_index, k = global_index * param.value_stride_; j < end_index; ++j, ++k) {
reduced_bucket->value_[j] += global_value[k];
}
}
}
unique_grad->indices_size_ = unique_indices_size;
}
struct WorkerParamsForReduceSparseGradient {
size_t slice_start_{0};
size_t slice_end_{0};
size_t max_length_{0};
size_t outer_dim_{0};
std::vector<std::pair<int, size_t>> *sorted_indices_{nullptr};
std::vector<size_t> *slice_positions_{nullptr};
float *src_value_{nullptr};
SparseGradient *unique_grad_{nullptr};
};
void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) {
MS_EXCEPTION_IF_NULL(param.sorted_indices_);
MS_EXCEPTION_IF_NULL(param.slice_positions_);
MS_EXCEPTION_IF_NULL(param.src_value_);
MS_EXCEPTION_IF_NULL(param.unique_grad_);
auto outer_dim = param.outer_dim_;
auto &sorted_indices = *(param.sorted_indices_);
auto &slice_positions = *(param.slice_positions_);
auto unique_grad = param.unique_grad_;
for (size_t slice_id = param.slice_start_; slice_id < param.slice_end_; ++slice_id) {
size_t cur_pos = slice_positions[slice_id];
int index = sorted_indices[cur_pos].first;
unique_grad->indices_[slice_id] = index;
size_t start_index = slice_id * outer_dim;
auto ret_code = memcpy_s(unique_grad->value_ + start_index, (param.max_length_ - start_index) * sizeof(float),
param.src_value_ + sorted_indices[cur_pos].second, outer_dim * sizeof(float));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
cur_pos++;
size_t end_pos;
if (slice_id + 1 < slice_positions.size()) {
end_pos = slice_positions[slice_id + 1];
} else {
end_pos = sorted_indices.size();
}
while (cur_pos < end_pos) {
for (size_t i = 0; i < outer_dim; ++i) {
unique_grad->value_[start_index + i] += param.src_value_[sorted_indices[cur_pos].second + i];
}
cur_pos++;
}
}
}
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
std::vector<size_t> *slice_positions) {
MS_LOG(DEBUG) << "Start";
size_t thread_num = 24;
if (slice_positions->size() < thread_num) {
thread_num = slice_positions->size();
}
size_t stride = (slice_positions->size() + thread_num - 1) / thread_num;
thread_num = (slice_positions->size() + stride - 1) / stride;
std::vector<std::thread> threads;
size_t max_length = sorted_indices->size() * outer_dim;
for (size_t i = 0; i < thread_num; ++i) {
size_t slice_start = i * stride;
size_t slice_end = 0;
if (i == thread_num - 1) {
slice_end = slice_positions->size();
} else {
slice_end = slice_start + stride;
}
WorkerParamsForReduceSparseGradient params{
slice_start, slice_end, max_length, outer_dim, sorted_indices, slice_positions, origin_sparse_grad.value_,
unique_grad};
threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params));
}
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
}
reduced_bucket->indices_size_ = unique_indices_size;
MS_LOG(DEBUG) << "End";
}
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim, bool use_multi_threads) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
MS_EXCEPTION_IF_NULL(unique_grad);
MS_EXCEPTION_IF_NULL(unique_grad->value_);
MS_EXCEPTION_IF_NULL(unique_grad->indices_);
std::vector<std::pair<int, size_t>> sorted_indices;
sorted_indices.reserve(origin_sparse_grad.indices_size_);
for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
int index = origin_sparse_grad.indices_[i];
if (index >= 0 && IntToSize(index) < first_dim) {
sorted_indices.emplace_back(std::pair<int, size_t>(index, i * outer_dim));
}
}
std::sort(
sorted_indices.begin(), sorted_indices.end(),
[](const std::pair<int, size_t> &left, const std::pair<int, size_t> &right) { return left.first < right.first; });
int last_index = 0;
std::vector<size_t> slice_positions;
slice_positions.reserve(sorted_indices.size());
for (size_t i = 0; i < sorted_indices.size(); ++i) {
if (i == 0 || last_index != sorted_indices[i].first) {
slice_positions.emplace_back(i);
}
last_index = sorted_indices[i].first;
}
if (use_multi_threads) {
RunMultiThreadReduceSparseGradient(origin_sparse_grad, unique_grad, outer_dim, &sorted_indices, &slice_positions);
} else {
size_t max_length = sorted_indices.size() * outer_dim;
WorkerParamsForReduceSparseGradient params{0,
slice_positions.size(),
max_length,
outer_dim,
&sorted_indices,
&slice_positions,
origin_sparse_grad.value_,
unique_grad};
WorkerForReduceSparseGradient(params);
}
unique_grad->indices_size_ = slice_positions.size();
MS_LOG(DEBUG) << "End";
}
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim) {
MS_LOG(DEBUG) << "Start";
if (unique_slice_grads.empty()) {
return;
}
size_t index_data_size = outer_dim * sizeof(float);
size_t unique_indices_size = 0;
for (size_t i = 0; i < unique_slice_grads.size(); ++i) {
auto &slice_grad = unique_slice_grads[i];
auto ret_code = memcpy_s(tmp_grad->value_ + unique_indices_size * outer_dim,
(tmp_grad->indices_size_ - unique_indices_size) * index_data_size, slice_grad->value_,
slice_grad->indices_size_ * index_data_size);
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
ret_code =
memcpy_s(tmp_grad->indices_ + unique_indices_size, (tmp_grad->indices_size_ - unique_indices_size) * sizeof(int),
slice_grad->indices_, slice_grad->indices_size_ * sizeof(int));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
unique_indices_size += slice_grad->indices_size_;
}
tmp_grad->indices_size_ = unique_indices_size;
ReduceSparseGradient(*tmp_grad, unique_grad, first_dim, outer_dim);
MS_LOG(DEBUG) << "End";
}
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
SparseGradient *unique_grad, size_t first_dim, size_t outer_dim) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
MS_EXCEPTION_IF_NULL(unique_grad);
MS_EXCEPTION_IF_NULL(unique_grad->value_);
MS_EXCEPTION_IF_NULL(unique_grad->indices_);
MS_EXCEPTION_IF_NULL(tmp_grad);
MS_EXCEPTION_IF_NULL(tmp_grad->value_);
MS_EXCEPTION_IF_NULL(tmp_grad->indices_);
size_t thread_num = 24;
if (origin_sparse_grad.indices_size_ < thread_num) {
thread_num = origin_sparse_grad.indices_size_;
}
size_t thread_indices_size = origin_sparse_grad.indices_size_ / thread_num;
size_t left_indices_size = origin_sparse_grad.indices_size_ % thread_num;
void ReduceBucketSparseGradientToWorkspace(const MultiThreadReduceSparseGradientParam &param,
const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets,
std::vector<std::shared_ptr<SparseGradient>> *reduced_buckets_ptr) {
MS_EXCEPTION_IF_NULL(param.workspace_grad_);
MS_EXCEPTION_IF_NULL(param.workspace_grad_->value_);
MS_EXCEPTION_IF_NULL(param.workspace_grad_->indices_);
MS_EXCEPTION_IF_NULL(reduced_buckets_ptr);
auto &reduced_buckets = *reduced_buckets_ptr;
size_t thread_num = buckets.size();
std::vector<std::thread> threads;
threads.reserve(thread_num);
std::vector<std::shared_ptr<SparseGradient>> unique_slice_grads;
size_t current_indices_offset = 0;
for (size_t i = 0; i < thread_num; ++i) {
size_t indices_size = thread_indices_size;
if (i == thread_num - 1) {
indices_size = thread_indices_size + left_indices_size;
reduced_buckets.emplace_back(std::make_shared<SparseGradient>());
reduced_buckets[i]->value_ = param.workspace_grad_->value_ + current_indices_offset * param.value_stride_;
reduced_buckets[i]->indices_ = param.workspace_grad_->indices_ + current_indices_offset;
reduced_buckets[i]->indices_size_ = buckets[i]->indices_size_;
if (param.use_sort_reduce_) {
threads.emplace_back(std::thread(SortAndReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
} else {
threads.emplace_back(std::thread(ReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
}
size_t value_offset = i * thread_indices_size * outer_dim;
size_t indices_offset = i * thread_indices_size;
auto slice_grad = SparseGradient(
{origin_sparse_grad.value_ + value_offset, origin_sparse_grad.indices_ + indices_offset, indices_size});
unique_slice_grads.emplace_back(std::make_shared<SparseGradient>());
unique_slice_grads[i]->value_ = unique_grad->value_ + value_offset;
unique_slice_grads[i]->indices_ = unique_grad->indices_ + indices_offset;
unique_slice_grads[i]->indices_size_ = indices_size;
threads.emplace_back(
std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim, false));
current_indices_offset += buckets[i]->indices_size_;
}
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
}
ReduceMultiSparseGradient(unique_slice_grads, tmp_grad, unique_grad, first_dim, outer_dim);
}
void MergeReduceSparseGradient(const MultiThreadReduceSparseGradientParam &param,
const std::vector<std::shared_ptr<SparseGradient>> &reduced_buckets) {
MS_EXCEPTION_IF_NULL(param.output_grad_);
auto output_grad = param.output_grad_;
MS_EXCEPTION_IF_NULL(output_grad->value_);
MS_EXCEPTION_IF_NULL(output_grad->indices_);
size_t stride_data_size = param.value_stride_ * sizeof(float);
size_t unique_indices_size = 0;
for (size_t i = 0; i < reduced_buckets.size(); ++i) {
auto &bucket = reduced_buckets[i];
MS_EXCEPTION_IF_NULL(bucket);
if (bucket->indices_size_ == 0) {
continue;
}
auto ret_code = memcpy_s(output_grad->value_ + unique_indices_size * param.value_stride_,
(output_grad->indices_size_ - unique_indices_size) * stride_data_size, bucket->value_,
bucket->indices_size_ * stride_data_size);
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
ret_code = memcpy_s(output_grad->indices_ + unique_indices_size,
(output_grad->indices_size_ - unique_indices_size) * sizeof(int), bucket->indices_,
bucket->indices_size_ * sizeof(int));
if (ret_code != EOK) {
MS_LOG(EXCEPTION) << "Failed to copy data!";
}
unique_indices_size += bucket->indices_size_;
}
output_grad->indices_size_ = unique_indices_size;
}
} // namespace
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param) {
MS_LOG(DEBUG) << "Start";
MS_EXCEPTION_IF_NULL(param.input_grad_);
size_t thread_num = 23;
if (param.input_grad_->indices_size_ < thread_num) {
thread_num = param.input_grad_->indices_size_;
}
MultiThreadReduceSparseGradientParam multi_thread_param({param.input_grad_, param.workspace_grad_, param.output_grad_,
param.max_index_, param.value_stride_, thread_num,
param.use_sort_reduce_});
std::vector<std::shared_ptr<SparseGradient>> segments;
std::vector<std::shared_ptr<std::vector<size_t>>> segment_bucket_sizes;
SplitAndCalculateSegmentBucketSize(multi_thread_param, &segments, &segment_bucket_sizes);
std::vector<std::shared_ptr<BucketSparseGradient>> buckets;
GatherSegmentIndicesToOutputBucket(multi_thread_param, segments, segment_bucket_sizes, &buckets);
std::vector<std::shared_ptr<SparseGradient>> reduced_buckets;
ReduceBucketSparseGradientToWorkspace(multi_thread_param, buckets, &reduced_buckets);
MergeReduceSparseGradient(multi_thread_param, reduced_buckets);
MS_LOG(DEBUG) << "End";
}
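
BucketReduceSparseGradient above runs in four stages: split the input into per-thread segments, hash each index into a bucket with index % thread_num, reduce every bucket in parallel (by sorting or by a hash map), and merge the reduced buckets into the output. The following is a minimal single-threaded C++ sketch of the same reduction idea, for illustration only; SimpleSparseGrad and BucketReduce are hypothetical names, not part of the codebase.

#include <cstddef>
#include <unordered_map>
#include <vector>

struct SimpleSparseGrad {
  std::vector<float> value;   // indices.size() * value_stride floats, row-major
  std::vector<int> indices;
};

// Groups rows by (index % bucket_num) -- one bucket per worker thread in the
// real code -- then sums rows that share an index inside each bucket.
SimpleSparseGrad BucketReduce(const SimpleSparseGrad &in, size_t value_stride,
                              size_t max_index, size_t bucket_num) {
  std::vector<std::vector<size_t>> buckets(bucket_num);
  for (size_t i = 0; i < in.indices.size(); ++i) {
    int idx = in.indices[i];
    // Same filter as CalculateEachBucketSize: drop negative or out-of-range indices.
    if (idx < 0 || static_cast<size_t>(idx) >= max_index) continue;
    buckets[static_cast<size_t>(idx) % bucket_num].push_back(i);
  }
  SimpleSparseGrad out;
  for (const auto &bucket : buckets) {              // buckets are independent, hence parallelizable
    std::unordered_map<int, size_t> row_of_index;   // index -> row in the output
    for (size_t i : bucket) {
      int idx = in.indices[i];
      auto it = row_of_index.find(idx);
      if (it == row_of_index.end()) {               // first occurrence: copy the value row
        row_of_index[idx] = out.indices.size();
        out.indices.push_back(idx);
        out.value.insert(out.value.end(), in.value.begin() + i * value_stride,
                         in.value.begin() + (i + 1) * value_stride);
      } else {                                      // duplicate index: accumulate into the existing row
        for (size_t j = 0; j < value_stride; ++j) {
          out.value[it->second * value_stride + j] += in.value[i * value_stride + j];
        }
      }
    }
  }
  return out;
}

The production version reduces each bucket into a private region of the workspace gradient (ReduceBucketSparseGradientToWorkspace) before MergeReduceSparseGradient concatenates the results, which keeps the worker threads from contending on the output buffers.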


@ -73,9 +73,18 @@ class KernelMeta {
};
struct SparseGradient {
float *value_;
int *indices_;
size_t indices_size_;
float *value_{nullptr};
int *indices_{nullptr};
size_t indices_size_{0};
};
struct ReduceSparseGradientParam {
SparseGradient *input_grad_{nullptr};
SparseGradient *workspace_grad_{nullptr};
SparseGradient *output_grad_{nullptr};
size_t max_index_{0};
size_t value_stride_{0};
bool use_sort_reduce_{false};
};
struct MultiThreadComputeParams {
@ -112,10 +121,6 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info);
std::string GetProcessor(const AnfNodePtr &anf_node);
bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b);
int Sign(float x);
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim);
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim, bool use_multi_threads = true);
std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list,
const std::vector<AnfNodePtr> &input_list);
@ -130,14 +135,7 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<An
bool IsWeightBoundary(const AnfNodePtr &node);
void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params,
size_t total_compute_size);
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
std::vector<size_t> *slice_positions);
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
size_t outer_dim);
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
SparseGradient *unique_grad, size_t first_dim, size_t outer_dim);
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param);
std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode);
} // namespace kernel
} // namespace mindspore


@ -46,7 +46,7 @@ class EmbeddingLookUpCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
private:
protected:
void LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2,
float **output_addr);
void CheckParam(const CNodePtr &kernel_node);


@ -53,15 +53,15 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &i
size_t output_size = outputs[0]->size;
size_t size = input_size / sizeof(float);
::ps::SArray<float> lookup_ids(size, 0);
::ps::SArray<int> lookup_ids(size, 0);
::ps::SArray<int> lengths{size};
::ps::SArray<float> lookup_result;
::ps::SArray<float> lookup_result(output_size / sizeof(float), 0);
auto ret = memcpy_s(lookup_ids.data(), input_size, indices_addr, input_size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "Lookup id memcpy failed.";
}
parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, lookup_result,
parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, &lookup_result,
parallel::ps::kEmbeddingLookupCmd);
auto ret2 = memcpy_s(output_addr, output_size, lookup_result.data(), output_size);


@ -50,7 +50,7 @@ void EmbeddingLookUpPSKernel::InitKernel(
split_num_ = pserver_num_;
// input shape should be sharded after computing offset_;
Shard(input_shape_, axis_);
Shard(&input_shape_, axis_);
size_t output_size =
std::accumulate(output_shape_.begin(), output_shape_.end(), sizeof(float), std::multiplies<size_t>());


@ -34,5 +34,13 @@ MS_REG_CPU_KERNEL_T(Push,
MS_REG_CPU_KERNEL_T(
Push, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt64),
PushKernel, float);
MS_REG_CPU_KERNEL_T(Push,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeUInt64),
PushKernel, float);
} // namespace kernel
} // namespace mindspore


@ -43,7 +43,7 @@ class PushKernel : public CPUKernel {
sizes.push_back(SizeToInt(input->size) / sizeof(T));
}
parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes);
memcpy(outputs[0]->addr, &key_, sizeof(size_t));
memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
return true;
}
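
memcpy_s, unlike memcpy, returns a status code; elsewhere in this commit (for example in the embedding-lookup proxy kernel above) that status is checked against EOK. A checked variant of the call above could look like the following fragment, shown in the kernel's own context; the error message text is illustrative.

  auto ret = memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
  if (ret != EOK) {
    MS_LOG(EXCEPTION) << "Push key memcpy failed.";  // illustrative message, not from the codebase
  }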


@ -75,7 +75,7 @@ void SparseApplyAdamPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
const auto &indices_addr = inputs[10];
indices_size_ = indices_addr->size;
indices_size_ = indices_addr->size / sizeof(int);
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
workspace_size_list_[1] = indices_size_ * sizeof(int);
}
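
As a worked example of the fix above: with 100 int32 indices, indices_addr->size is 400 bytes, so the old assignment set indices_size_ to 400 while the corrected line sets it to 400 / sizeof(int) = 100, the element count that the workspace sizing in this hunk multiplies by var_outer_dim_size_ and sizeof(float). The same byte-count-to-element-count correction appears in SparseApplyFtrlPSKernel::ReInit below.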


@ -64,7 +64,7 @@ void SparseApplyFtrlPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
const auto &indices_addr = inputs[4];
indices_size_ = indices_addr->size;
indices_size_ = indices_addr->size / sizeof(int);
workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
workspace_size_list_[1] = indices_size_ * sizeof(int);
}


@ -81,6 +81,8 @@ void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node)
MS_EXCEPTION_IF_NULL(kernel_node);
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(var_first_dim_size_ * var_outer_dim_size_ * sizeof(float));
}
@ -142,11 +144,21 @@ bool SparseApplyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
auto indices = reinterpret_cast<int *>(inputs[10]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto m_t = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
auto m_t = reinterpret_cast<float *>(workspace[4]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
size_t total_dim_size = var_first_dim_size_ * var_outer_dim_size_;
lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);


@ -132,12 +132,19 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
auto indices = reinterpret_cast<int *>(inputs[4]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
var_first_dim_size_, var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
MultiThreadComputeParams input_params;
input_params.var_ = var;


@ -123,13 +123,19 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr>
auto indices = reinterpret_cast<int *>(inputs[10]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
var_first_dim_size_, var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
MultiThreadComputeParams input_params;


@ -61,6 +61,8 @@ void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &ke
MS_EXCEPTION_IF_NULL(kernel_node);
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
}
void SparseApplyProximalAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
@ -119,9 +121,19 @@ bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector<kernel::Addre
auto indices = reinterpret_cast<int *>(inputs[6]->addr);
auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
var_outer_dim_size_);
SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
SparseGradient input_sparse_grad({grad, indices, indices_size_});
ReduceSparseGradientParam param;
param.input_grad_ = &input_sparse_grad;
param.workspace_grad_ = &workspace_sparse_grad;
param.output_grad_ = &unique_sparse_grad;
param.max_index_ = var_first_dim_size_;
param.value_stride_ = var_outer_dim_size_;
BucketReduceSparseGradient(param);
MultiThreadComputeParams input_params;
input_params.var_ = var;


@ -0,0 +1,26 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
BroadcastToGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BroadcastToGpuKernel, half)
} // namespace kernel
} // namespace mindspore


@ -0,0 +1,83 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class BroadcastToGpuKernel : public GpuKernel {
public:
BroadcastToGpuKernel() {}
~BroadcastToGpuKernel() = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], output_shape_[0], output_shape_[1],
output_shape_[2], output_shape_[3], input_addr, output_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
if (input_shapes.size() > 4 || output_shapes.size() > 4) {
MS_LOG(EXCEPTION) << "BroadcastTo operation not support dim greater than 4";
}
for (int i = input_shapes.size() - 1; i >= 0; i--) {
input_shape_[i] = input_shapes[i];
}
for (int j = output_shapes.size() - 1; j >= 0; j--) {
output_shape_[j] = output_shapes[j];
}
InitSizeLists();
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T));
output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T));
}
private:
int input_shape_[4] = {1, 1, 1, 1};
int output_shape_[4] = {1, 1, 1, 1};
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
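
A worked example of the shape handling in Init above, using assumed shapes: for a float input of shape (1, 4) broadcast to (3, 4), input_shape_ becomes {1, 4, 1, 1} and output_shape_ becomes {3, 4, 1, 1} (unset trailing dimensions keep their default of 1), so InitSizeLists registers 4 * sizeof(float) bytes for the input and 12 * sizeof(float) bytes for the output.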


@ -18,6 +18,7 @@
#define MINDSPORE_CCSRC_KERNEL_GPU_CONCATV2_GPU_KERNEL_H
#include <vector>
#include <memory>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
@ -27,40 +28,35 @@ namespace kernel {
template <typename T>
class ConcatV2GpuFwdKernel : public GpuKernel {
public:
ConcatV2GpuFwdKernel() : axis_(0), output_size_(0) {}
ConcatV2GpuFwdKernel()
: axis_(0),
input_num_(1),
output_size_(0),
all_size_before_axis_(1),
all_size_axis_(1),
inputs_host_(nullptr),
len_axis_(nullptr) {}
~ConcatV2GpuFwdKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
if (inputs.size() == 2) {
T *input_0 = GetDeviceAddress<T>(inputs, 0);
T *input_1 = GetDeviceAddress<T>(inputs, 1);
T *output = GetDeviceAddress<T>(outputs, 0);
ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], input_0, input_1, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
if (inputs.size() == 3) {
T *input_0 = GetDeviceAddress<T>(inputs, 0);
T *input_1 = GetDeviceAddress<T>(inputs, 1);
T *input_2 = GetDeviceAddress<T>(inputs, 2);
T *output = GetDeviceAddress<T>(outputs, 0);
ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], input_0, input_1, input_2, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
if (inputs.size() == 4) {
T *input_0 = GetDeviceAddress<T>(inputs, 0);
T *input_1 = GetDeviceAddress<T>(inputs, 1);
T *input_2 = GetDeviceAddress<T>(inputs, 2);
T *input_3 = GetDeviceAddress<T>(inputs, 3);
T *output = GetDeviceAddress<T>(outputs, 0);
ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], w_[3], input_0, input_1, input_2, input_3, output,
reinterpret_cast<cudaStream_t>(stream_ptr));
T **inputs_device = GetDeviceAddress<T *>(workspace, 0);
int *len_axis_device = GetDeviceAddress<int>(workspace, 1);
for (size_t i = 0; i < inputs.size(); i++) {
inputs_host_[i] = GetDeviceAddress<T>(inputs, i);
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"ConcatV2 opt cudaMemcpyAsync inputs failed");
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"ConcatV2 opt cudaMemcpyAsync length on axis failed");
ConcatKernel(output_size_, input_num_, all_size_before_axis_, all_size_axis_, len_axis_device, inputs_device,
output, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
@ -74,25 +70,34 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
axis_ += SizeToInt(input_shape.size());
}
auto input_num = AnfAlgo::GetInputTensorNum(kernel_node);
for (size_t i = 0; i < input_num; i++) {
auto input_size = sizeof(T);
input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node));
inputs_host_ = std::make_unique<T *[]>(input_num_);
len_axis_ = std::make_unique<int[]>(input_num_);
for (int i = 0; i < input_num_; i++) {
int input_size = 1;
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
for (size_t j = 0; j < input_shape.size(); j++) {
input_size *= SizeToInt(input_shape[j]);
if (j >= IntToSize(axis_)) {
w_[i] *= SizeToInt(input_shape[j]);
}
input_size_list_.push_back(input_size);
}
input_size_list_.push_back(IntToSize(input_size * sizeof(T)));
len_axis_[i] = SizeToInt(input_shape[axis_]);
}
workspace_size_list_.push_back(sizeof(T *) * input_num_);
workspace_size_list_.push_back(sizeof(int) * input_num_);
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
output_size_ = sizeof(T);
for (size_t i = 0; i < output_shape.size(); i++) {
output_size_ = 1;
for (int i = 0; i < SizeToInt(output_shape.size()); i++) {
output_size_ *= output_shape[i];
if (i > axis_) {
all_size_before_axis_ *= output_shape[i];
all_size_axis_ *= output_shape[i];
}
output_size_list_.push_back(output_size_);
if (i == axis_) {
all_size_before_axis_ *= output_shape[i];
}
}
output_size_list_.push_back(IntToSize(output_size_ * sizeof(T)));
InitSizeLists();
return true;
@ -103,11 +108,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
private:
bool CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num < 2 || input_num > 4) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but ConcatV2GpuFwdKernel needs inputs between 2 and 4.";
return false;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(ERROR) << "Output number is " << output_num << ", but ConcatV2GpuFwdKernel needs 1 output.";
@ -115,9 +115,13 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
}
return true;
}
int w_[4] = {1, 1, 1, 1};
int axis_;
size_t output_size_;
int input_num_;
int output_size_;
int all_size_before_axis_;
int all_size_axis_;
std::unique_ptr<T *[]> inputs_host_;
std::unique_ptr<int[]> len_axis_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
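
The refactor above replaces the fixed two-, three-, and four-input code paths with a single path that passes an array of per-input device pointers through workspace memory. A minimal host-side sketch of that pattern follows, for illustration only; UploadPointerTable is a hypothetical name and error handling is reduced to a bool.

#include <cuda_runtime.h>
#include <vector>

// Copies a host-side table of device pointers into device memory so that one
// kernel launch can index any number of input buffers.
bool UploadPointerTable(const std::vector<float *> &device_buffers,
                        float **device_pointer_table,  // device allocation holding
                                                       // device_buffers.size() pointers
                        cudaStream_t stream) {
  cudaError_t err = cudaMemcpyAsync(device_pointer_table, device_buffers.data(),
                                    sizeof(float *) * device_buffers.size(),
                                    cudaMemcpyHostToDevice, stream);
  return err == cudaSuccess;
}

Because the copy is asynchronous, the host-side pointer array must stay alive until the stream has consumed it; the kernel above keeps it as the inputs_host_ member for exactly that reason.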


@ -0,0 +1,31 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(
Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
SplitGpuFwdKernel, float)
MS_REG_GPU_KERNEL_ONE(Split,
KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
SplitGpuFwdKernel, int)
MS_REG_GPU_KERNEL_ONE(
Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
SplitGpuFwdKernel, half)
} // namespace kernel
} // namespace mindspore


@ -0,0 +1,153 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
#define MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H
#include <vector>
#include <memory>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class SplitGpuFwdKernel : public GpuKernel {
public:
SplitGpuFwdKernel()
: axis_(0),
output_num_(1),
input_size_(1),
axis_step_(1),
all_size_before_axis_(1),
all_size_axis_(1),
outputs_host_(nullptr) {}
~SplitGpuFwdKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input = GetDeviceAddress<T>(inputs, 0);
T **outputs_device = GetDeviceAddress<T *>(workspace, 0);
for (size_t i = 0; i < outputs.size(); i++) {
outputs_host_[i] = GetDeviceAddress<T>(outputs, i);
}
CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_,
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
"Split opt cudaMemcpyAsync outputs failed");
SplitKernel(input_size_, axis_step_, all_size_before_axis_, all_size_axis_, input, outputs_device,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
axis_ = GetAttr<int>(kernel_node, "axis");
if (axis_ < 0) {
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
axis_ += SizeToInt(input_shape.size());
}
output_num_ = GetAttr<int>(kernel_node, "output_num");
if (!CheckParam(kernel_node)) {
return false;
}
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
input_size_ = 1;
all_size_before_axis_ = 1;
all_size_axis_ = 1;
for (int i = 0; i < SizeToInt(input_shape.size()); i++) {
input_size_ *= input_shape[i];
if (i > axis_) {
all_size_before_axis_ *= input_shape[i];
all_size_axis_ *= input_shape[i];
}
if (i == axis_) {
all_size_before_axis_ *= input_shape[i];
}
}
input_size_list_.push_back(IntToSize(input_size_ * sizeof(T)));
axis_step_ = input_shape[axis_] / output_num_;
for (int i = 0; i < output_num_; i++) {
size_t output_size = 1;
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i);
for (size_t j = 0; j < output_shape.size(); j++) {
output_size *= output_shape[j];
}
output_size_list_.push_back(output_size * sizeof(T));
}
workspace_size_list_.push_back(sizeof(T *) * output_num_);
InitSizeLists();
outputs_host_ = std::make_unique<T *[]>(output_num_);
return true;
}
protected:
void InitSizeLists() override {}
private:
bool CheckParam(const CNodePtr &kernel_node) {
auto input_num = AnfAlgo::GetInputTensorNum(kernel_node);
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
int dims = SizeToInt(input_shape.size());
int output_num = SizeToInt(AnfAlgo::GetOutputTensorNum(kernel_node));
if (input_num != 1) {
MS_LOG(ERROR) << "Input number is " << input_num << ", but Split needs 1 input.";
return false;
}
if (dims == 0) {
MS_LOG(ERROR) << "Input dims is " << dims << ", scalar is not supported.";
return false;
}
if (axis_ < -dims || axis_ >= dims) {
MS_LOG(ERROR) << "Attr axis " << axis_ << " must be in " << -dims << "~" << dims;
return false;
}
if (output_num_ > SizeToInt(input_shape[axis_])) {
MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must less than" << input_shape[axis_];
return false;
}
if (input_shape[axis_] % output_num_ != 0) {
MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must be divided by" << input_shape[axis_];
return false;
}
if (output_num_ != output_num) {
MS_LOG(ERROR) << "Output num is " << output_num << ", but need " << output_num_;
return false;
}
return true;
}
int axis_;
int output_num_;
int input_size_;
int axis_step_;
int all_size_before_axis_;
int all_size_axis_;
std::unique_ptr<T *[]> outputs_host_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H

View File

@ -0,0 +1,29 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(TopK,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeInt32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeInt32),
TopKGpuKernel, float, int)
}
} // namespace mindspore

View File

@ -0,0 +1,110 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>
class TopKGpuKernel : public GpuKernel {
public:
TopKGpuKernel() : sorted_(false), outer_size_(1), inner_size_(1), k_(1), use_share_mem_(true), ceil_power2_(0) {}
~TopKGpuKernel() override = default;
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
T *input_addr = GetDeviceAddress<T>(inputs, 0);
S *k = GetDeviceAddress<S>(inputs, 1);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
S *indices = GetDeviceAddress<S>(outputs, 1);
T *data_buff = nullptr;
S *index_buff = nullptr;
if (use_share_mem_ == false) {
data_buff = GetDeviceAddress<T>(workspaces, 0);
index_buff = GetDeviceAddress<S>(workspaces, 1);
}
TopK(outer_size_, inner_size_, input_addr, k, output_addr, indices, data_buff, index_buff,
reinterpret_cast<cudaStream_t>(stream_ptr));
if (sorted_ == false) {
BitonicSortByKey(outer_size_, k_, output_addr, indices, data_buff, index_buff,
reinterpret_cast<cudaStream_t>(stream_ptr));
}
return true;
}
bool Init(const CNodePtr &kernel_node) override {
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
for (size_t i = 0; i < input_shapes.size() - 1; i++) {
outer_size_ *= input_shapes[i];
}
inner_size_ = input_shapes[input_shapes.size() - 1];
k_ = output_shapes[output_shapes.size() - 1];
sorted_ = GetAttr<bool>(kernel_node, "sorted");
ceil_power2_ = RoundUpPower2(inner_size_);
size_t buffer_size = ceil_power2_ * (sizeof(T) + sizeof(S));
if (buffer_size > SHARED_MEM_PER_BLOCK) {
use_share_mem_ = false;
MS_LOG(WARNING) << "CUDA share memory not enough, sort with RAM";
}
InitSizeLists();
return true;
}
protected:
void InitSizeLists() override {
input_size_list_.push_back(outer_size_ * inner_size_ * sizeof(T));
input_size_list_.push_back(sizeof(S));
output_size_list_.push_back(outer_size_ * k_ * sizeof(T));
output_size_list_.push_back(outer_size_ * k_ * sizeof(S));
if (use_share_mem_ == false) {
workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(T));
workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(S));
}
}
private:
bool sorted_;
int outer_size_;
int inner_size_;
int k_;
bool use_share_mem_;
int ceil_power2_;
std::vector<size_t> input_size_list_;
std::vector<size_t> output_size_list_;
std::vector<size_t> workspace_size_list_;
};
} // namespace kernel
} // namespace mindspore
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_

View File

@ -176,6 +176,28 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons
NoBroadcastKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(nums, op, input0, input1, output);
}
template <typename T>
__global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0,
const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) {
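// Each output coordinate (i, j, k, l) is mapped back to an input element via
// Index(), which collapses input dimensions of size 1 so the same input value
// is reused along the broadcast axes.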
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) {
int i = pos / (o1 * o2 * o3) % o0;
int j = pos / (o2 * o3) % o1;
int k = pos / o3 % o2;
int l = pos % o3;
int input_idx = Index(i, i0) * i1 * i2 * i3 + Index(j, i1) * i2 * i3 + Index(k, i2) * i3 + Index(l, i3);
output_addr[pos] = input_addr[input_idx];
}
}
template <typename T>
void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) {
int nums = o0 * o1 * o2 * o3;
BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr,
output_addr);
}
template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1,
const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3,
enum BroadcastOpType op, const float *input0, const float *input1, bool *output,
@ -204,5 +226,11 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *
bool *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1,
half *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1,
int *output, cudaStream_t stream);
template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output,
cudaStream_t stream);
template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
const int &o2, const int &o3, const float *input_addr, float *output_addr,
cudaStream_t stream);
template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
const int &o2, const int &o3, const half *input_addr, half *output_addr, cudaStream_t stream);

View File

@ -41,4 +41,8 @@ template <typename T, typename S>
void NoBroadcast(const int &size, enum BroadcastOpType op, const T *input0, const T *input1, S *output,
cudaStream_t stream);
template <typename T>
void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1,
const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_

View File

@ -19,90 +19,51 @@
#include <cuda_runtime.h>
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
template <typename T>
__global__ void Concat(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
int n = pos / (w1 + w2);
int m = pos % (w1 + w2);
output[pos] = m >= w1 ? input_2[n * w2 + m - w1] : input_1[n * w1 + m];
__global__ void Concat(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, T** inputs, T* output) {
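// For each output element: num is its coordinate along the concat axis, block is
// the index of the input whose axis range covers num, and block_pos is the
// corresponding linear offset inside that input.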
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
int num = pos % all_size_before_axis / all_size_axis;
int block = -1;
int axis_inc = 0;
int block_len = 0;
for (int i = 0; i < input_num; i++) {
if (axis_inc <= num) {
block++;
axis_inc += len_axis[i];
} else {
break;
}
}
block_len = len_axis[block];
axis_inc -= len_axis[block];
int block_pos = pos / all_size_before_axis * block_len * all_size_axis +
(num - axis_inc) * all_size_axis + pos % all_size_axis;
output[pos] = inputs[block][block_pos];
}
return;
}
template <typename T>
__global__ void Concat(const size_t size, const int w1, const int w2, const int w3,
const T* input_1, const T* input_2, const T* input_3, T* output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
int n = pos / (w1 + w2 + w3);
int m = pos % (w1 + w2 + w3);
output[pos] = m < w1 ? input_1[n * w1 + m] :
m < w1 + w2 ? input_2[n * w2 + m - w1] :
input_3[n * w3 + m - w1 - w2];
}
return;
}
template <typename T>
__global__ void Concat(const size_t size, const int w1, const int w2, const int w3, const int w4,
const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
int n = pos / (w1 + w2 + w3 + w4);
int m = pos % (w1 + w2 + w3 + w4);
output[pos] = m < w1 ? input_1[n * w1 + m] :
m < w1 + w2 ? input_2[n * w2 + m - w1]:
m < w1 + w2 + w3 ? input_3[n * w3 + m - w1 - w2]:
input_4[n * w4 + m - w1 - w2 - w3];
}
return;
}
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output,
void ConcatKernel(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, T** inputs, T* output,
cudaStream_t cuda_stream) {
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, input_1, input_2, output);
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num,
all_size_before_axis, all_size_axis,
len_axis, inputs, output);
return;
}
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
const T* input_1, const T* input_2, const T* input_3, T* output,
cudaStream_t cuda_stream) {
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, input_1, input_2, input_3, output);
return;
}
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output,
cudaStream_t cuda_stream) {
Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, w4, input_1,
input_2, input_3, input_4, output);
return;
}
template void ConcatKernel(const size_t size, const int w1, const int w2, const float* input_1, const float* input_2,
float* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int* input_1, const int* input_2,
int* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const half* input_1, const half* input_2,
half* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
const float* input_1, const float* input_2, const float* input_3,
float* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
const int* input_1, const int* input_2, const int* input_3,
int* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
const half* input_1, const half* input_2, const half* input_3,
half* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
const float* input_1, const float* input_2, const float* input_3, const float* input_4,
float* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
const int* input_1, const int* input_2, const int* input_3, const int* input_4,
int* output, cudaStream_t cuda_stream);
template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
const half* input_1, const half* input_2, const half* input_3, const half* input_4,
half* output, cudaStream_t cuda_stream);
template void ConcatKernel(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, float** inputs, float* output,
cudaStream_t cuda_stream);
template void ConcatKernel(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, int** inputs, int* output,
cudaStream_t cuda_stream);
template void ConcatKernel(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, half** inputs, half* output,
cudaStream_t cuda_stream);

View File

@ -19,13 +19,8 @@
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output,
cudaStream_t cuda_stream);
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3,
const T* input_1, const T* input_2, const T* input_3, T* output, cudaStream_t cuda_stream);
template <typename T>
void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4,
const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output,
void ConcatKernel(const int size, const int input_num,
const int all_size_before_axis, const int all_size_axis,
int* len_axis, T** inputs, T* output,
cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CONCATV2IMPL_H_

View File

@ -15,9 +15,9 @@
*/
#include "momentum_impl.cuh"
template <typename T, typename S>
template <typename T, typename S, typename G>
__global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate,
const T *gradient, const S *momentum) {
const G *gradient, const S *momentum) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
accumulation[i] = momentum[0] * accumulation[i] + gradient[i];
variable[i] -= learning_rate[0] * accumulation[i];
@ -34,19 +34,32 @@ __global__ void MomentumUpdateVariableKernel(const size_t size, half *variable,
}
return;
}
template <typename T, typename S>
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient,
template <>
__global__ void MomentumUpdateVariableKernel(const size_t size, float *variable, float *accumulation,
const float *learning_rate, const half *gradient,
const float *momentum) {
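// Mixed-precision path: gradients arrive as half while the variable and the
// accumulation stay in float, so every gradient element is widened with
// __half2float before the update.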
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
accumulation[i] = momentum[0] * accumulation[i] + __half2float(gradient[i]);
variable[i] -= learning_rate[0] * accumulation[i];
}
return;
}
template <typename T, typename S, typename G>
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
const S *momentum, cudaStream_t cuda_stream) {
MomentumUpdateVariableKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, variable, accumulation,
learning_rate, gradient, momentum);
return;
}
template void MomentumUpdateVariable<float, float>(const size_t size, float *variable, float *accumulation,
template void MomentumUpdateVariable<float, float, float>(const size_t size, float *variable, float *accumulation,
const float *learning_rate, const float *gradient,
const float *momentum, cudaStream_t cuda_stream);
template void MomentumUpdateVariable<half, half>(const size_t size, half *variable, half *accumulation,
template void MomentumUpdateVariable<half, half, half>(const size_t size, half *variable, half *accumulation,
const half *learning_rate, const half *gradient,
const half *momentum, cudaStream_t cuda_stream);
template void MomentumUpdateVariable<half, float>(const size_t size, half *variable, half *accumulation,
template void MomentumUpdateVariable<half, float, half>(const size_t size, half *variable, half *accumulation,
const float *learning_rate, const half *gradient,
const float *momentum, cudaStream_t cuda_stream);
template void MomentumUpdateVariable<float, float, half>(const size_t size, float *variable, float *accumulation,
const float *learning_rate, const half *gradient,
const float *momentum, cudaStream_t cuda_stream);

View File

@ -18,8 +18,8 @@
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient,
template <typename T, typename S, typename G>
void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
const S *momentum, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_

View File

@ -0,0 +1,50 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh"
template <typename T>
__global__ void Split(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const T* input, T** outputs) {
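// num is the element's coordinate along the split axis; block selects the output
// that receives it (axis_step slices of the axis per output) and block_pos is the
// element's linear offset inside that output.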
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
int num = pos % all_size_before_axis / all_size_axis;
int block = num / axis_step;
int block_pos = pos / all_size_before_axis * axis_step * all_size_axis +
num % axis_step * all_size_axis + pos % all_size_axis;
outputs[block][block_pos] = input[pos];
}
return;
}
template <typename T>
void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) {
Split<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, axis_step, all_size_before_axis,
all_size_axis, input, outputs);
return;
}
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const float* input, float** outputs,
cudaStream_t cuda_stream);
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const int* input, int** outputs,
cudaStream_t cuda_stream);
template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const half* input, half** outputs,
cudaStream_t cuda_stream);

View File

@ -0,0 +1,24 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_

View File

@ -0,0 +1,162 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"
#include <limits>
#include <algorithm>
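// Rounds v up to the next power of two (returning v unchanged if it already is
// one): decrement, smear the highest set bit into all lower bits, then add one.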
int RoundUpPower2(int v) {
v--;
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
v++;
return v;
}
template <typename T>
__inline__ __device__ void Swap(T *lhs, T *rhs) {
T tmp = lhs[0];
lhs[0] = rhs[0];
rhs[0] = tmp;
}
template <typename T, typename S>
__global__ void TopkKernel(const int outer, const int inner, const int ceil_power2, const T *input, const S *k,
T *output, S *indices, T *data_buff, S *index_buff) {
// default: sort in shared memory
extern __shared__ T share_mem[];
T *data_arr = share_mem;
S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2);
// fall back to global-memory buffers when shared memory is too small
if (data_buff != nullptr && index_buff != nullptr) {
data_arr = data_buff + blockIdx.x * ceil_power2;
index_arr = index_buff + blockIdx.x * ceil_power2;
}
for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) {
data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max();
index_arr[i] = i;
}
__syncthreads();
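// Bitonic sort over the power-of-two padded buffer: the outer loop doubles the
// length of the sorted runs, the inner loop compares elements whose indices
// differ by j, and the (tid & i) test alternates the sort direction so the
// merge stays bitonic.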
for (size_t i = 2; i <= ceil_power2; i <<= 1) {
for (size_t j = (i >> 1); j > 0; j >>= 1) {
for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) {
size_t tid_comp = tid ^ j;
if (tid_comp > tid) {
if ((tid & i) == 0) {
if (data_arr[tid] > data_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
} else {
if (data_arr[tid] < data_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
}
}
}
__syncthreads();
}
}
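// The buffer is now sorted ascending with the max-value padding at the tail,
// so the k largest inputs are read backwards starting from index inner - 1.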
for (size_t tid = threadIdx.x; tid < k[0]; tid += blockDim.x) {
output[blockIdx.x * k[0] + tid] = data_arr[inner - tid - 1];
indices[blockIdx.x * k[0] + tid] = index_arr[inner - tid - 1];
}
}
template <typename T, typename S>
void TopK(const int &outer, const int &inner, const T *input, const S *k, T *output, S *indices, T *data_buff,
S *index_buff, cudaStream_t stream) {
int ceil_power2 = RoundUpPower2(inner);
int share_mem = (data_buff == nullptr) ? ceil_power2 * (sizeof(T) + sizeof(S)) : 0;
int thread = std::min(ceil_power2, GET_THREADS);
TopkKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, k, output, indices, data_buff,
index_buff);
}
template <typename T, typename S>
__global__ void BitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input,
S *indices, T *data_buff, S *index_buff) {
// default: sort in shared memory
extern __shared__ T share_mem[];
T *data_arr = share_mem;
S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2);
// fall back to global-memory buffers when shared memory is too small
if (data_buff != nullptr && index_buff != nullptr) {
data_arr = data_buff + blockIdx.x * ceil_power2;
index_arr = index_buff + blockIdx.x * ceil_power2;
}
for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) {
data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max();
index_arr[i] = (i < inner) ? indices[blockIdx.x * inner + i] : std::numeric_limits<S>::max();
}
__syncthreads();
for (size_t i = 2; i <= ceil_power2; i <<= 1) {
for (size_t j = (i >> 1); j > 0; j >>= 1) {
for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) {
size_t tid_comp = tid ^ j;
if (tid_comp > tid) {
if ((tid & i) == 0) {
if (index_arr[tid] > index_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
} else {
if (index_arr[tid] < index_arr[tid_comp]) {
Swap(&data_arr[tid], &data_arr[tid_comp]);
Swap(&index_arr[tid], &index_arr[tid_comp]);
}
}
}
}
__syncthreads();
}
}
for (size_t tid = threadIdx.x; tid < inner; tid += blockDim.x) {
input[blockIdx.x * inner + tid] = data_arr[tid];
indices[blockIdx.x * inner + tid] = index_arr[tid];
}
}
template <typename T, typename S>
void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff,
cudaStream_t stream) {
int ceil_power2 = RoundUpPower2(inner);
size_t share_mem = ceil_power2 * (sizeof(T) + sizeof(S));
if (share_mem > SHARED_MEM_PER_BLOCK) {
share_mem = 0;
} else {
data_buff = nullptr;
index_buff = nullptr;
}
int thread = std::min(ceil_power2, GET_THREADS);
BitonicSortByKeyKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, indices, data_buff,
index_buff);
}
template void TopK(const int &outer, const int &inner, const float *input_addr, const int *k, float *output,
int *indices, float *data_buff, int *index_buff, cudaStream_t stream);
template void BitonicSortByKey(const int &outer, const int &inner, float *input, int *indices, float *data_buff,
int *index_buff, cudaStream_t stream);

View File

@ -0,0 +1,32 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_
#include <cuda_runtime.h>
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void TopK(const int &outer, const int &inner, const T *input_addr, const S *k, T *output, S *indices, T *data_buff,
S *index_buff, cudaStream_t stream);
template <typename T, typename S>
void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff,
cudaStream_t stream);
int RoundUpPower2(int v);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_

View File

@ -103,6 +103,35 @@ __global__ void ZeroslikeKernel(T *output, size_t count) {
return;
}
template <typename T>
__global__ void AbsKernel(T *input, T *output, size_t count) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = abs(input[i]);
}
return;
}
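// half specialization: compare against zero and negate explicitly instead of
// relying on an abs() overload for half.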
template <>
__global__ void AbsKernel(half *input, half *output, size_t count) {
half zero = 0.0;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = input[i] < zero ? -input[i] : input[i];
}
return;
}
template <typename T>
__global__ void FloorKernel(T *input, T *output, size_t count) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = floor(input[i]);
}
return;
}
template <>
__global__ void FloorKernel(half *input, half *output, size_t count) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
output[i] = hfloor(input[i]);
}
return;
}
template <typename T>
void Exponential(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
return;
@ -147,6 +176,16 @@ void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream) {
ZeroslikeKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(output, count);
return;
}
template <typename T>
void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
AbsKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
return;
}
template <typename T>
void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
FloorKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count);
return;
}
template void Exponential<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Logarithm<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
@ -156,6 +195,8 @@ template void Square<float>(float *input, float *output, size_t count, cudaStrea
template void Sqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Rsqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Zeroslike<float>(float *output, size_t count, cudaStream_t cuda_stream);
template void Abs<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Floor<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream);
template void Exponential<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Logarithm<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Negative<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
@ -164,3 +205,5 @@ template void Square<half>(half *input, half *output, size_t count, cudaStream_t
template void Sqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Rsqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Zeroslike<half>(half *output, size_t count, cudaStream_t cuda_stream);
template void Abs<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);
template void Floor<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream);

View File

@ -34,5 +34,9 @@ template <typename T>
void Rsqrt(T *input, T *output, size_t count, cudaStream_t cuda_stream);
template <typename T>
void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream);
template <typename T>
void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream);
template <typename T>
void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_

View File

@ -88,6 +88,12 @@ class GpuKernelRegister {
static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \
static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR, \
[]() { return new OPCLASS<T, S>(); });
// Registration macro for mixed-precision kernels whose template takes three type parameters.
#define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \
static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \
static const GpuKernelRegister g_##OPNAME##_##T##_##S##_##G##_gpu_kernel_reg( \
#OPNAME, ATTR, []() { return new OPCLASS<T, S, G>(); });
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_GPU_GPUKERNELFACTORY_H_

View File

@ -46,5 +46,13 @@ MS_REG_GPU_KERNEL_ONE(Sqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOut
UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Rsqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
UnaryOpGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
UnaryOpGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
UnaryOpGpuKernel, half)
} // namespace kernel
} // namespace mindspore

View File

@ -36,6 +36,8 @@ enum UnaryOptype {
UNARY_OP_SQUARE,
UNARY_OP_SQRT,
UNARY_OP_RSQRT,
UNARY_OP_ABS,
UNARY_OP_FLOOR,
UNARY_OP_INVALID_TYPE = 255
};
static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP},
@ -45,7 +47,9 @@ static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY
{"ZerosLike", UNARY_OP_ZEROSLIKE},
{"Square", UNARY_OP_SQUARE},
{"Sqrt", UNARY_OP_SQRT},
{"Rsqrt", UNARY_OP_RSQRT}};
{"Rsqrt", UNARY_OP_RSQRT},
{"Abs", UNARY_OP_ABS},
{"Floor", UNARY_OP_FLOOR}};
template <typename T>
class UnaryOpGpuKernel : public GpuKernel {
public:
@ -100,6 +104,14 @@ class UnaryOpGpuKernel : public GpuKernel {
Zeroslike(output_addr, output_size_ / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
case UNARY_OP_ABS: {
Abs(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
break;
}
case UNARY_OP_FLOOR: {
Floor(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
break;
}
default: {
MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported.";
}

View File

@ -34,15 +34,15 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNorm,
MS_REG_GPU_KERNEL_ONE(FusedBatchNorm,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
FusedBatchNormGpuKernel, half)
MS_REG_GPU_KERNEL_ONE(BatchNorm,
KernelAttr()
@ -60,15 +60,15 @@ MS_REG_GPU_KERNEL_ONE(BatchNorm,
MS_REG_GPU_KERNEL_ONE(BatchNorm,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
FusedBatchNormGpuKernel, half)
} // namespace kernel
} // namespace mindspore

View File

@ -56,17 +56,17 @@ class FusedBatchNormGpuKernel : public GpuKernel {
return true;
}
auto x = GetDeviceAddress<T>(inputs, 0);
auto scale = GetDeviceAddress<T>(inputs, 1);
auto bias = GetDeviceAddress<T>(inputs, 2);
auto runing_mean = GetDeviceAddress<T>(inputs, 3);
auto runnig_variance = GetDeviceAddress<T>(inputs, 4);
auto scale = GetDeviceAddress<float>(inputs, 1);
auto bias = GetDeviceAddress<float>(inputs, 2);
auto runing_mean = GetDeviceAddress<float>(inputs, 3);
auto runnig_variance = GetDeviceAddress<float>(inputs, 4);
auto y = GetDeviceAddress<T>(outputs, 0);
const float alpha = 1;
const float beta = 0;
if (is_train_) {
auto save_mean = GetDeviceAddress<T>(outputs, 3);
auto save_variance = GetDeviceAddress<T>(outputs, 4);
auto save_mean = GetDeviceAddress<float>(outputs, 3);
auto save_variance = GetDeviceAddress<float>(outputs, 4);
CHECK_CUDNN_RET_WITH_EXCEPT(
cudnnBatchNormalizationForwardTraining(handle_, mode_, &alpha, &beta, x_desc_, x, y_desc_, y,
scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean,

View File

@ -33,12 +33,12 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNormGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
FusedBatchNormGradGpuKernel, half)
} // namespace kernel
} // namespace mindspore

View File

@ -55,12 +55,12 @@ class FusedBatchNormGradGpuKernel : public GpuKernel {
}
auto dy = GetDeviceAddress<T>(inputs, 0);
auto x = GetDeviceAddress<T>(inputs, 1);
auto scale = GetDeviceAddress<T>(inputs, 2);
auto save_mean = GetDeviceAddress<T>(inputs, 3);
auto save_variance = GetDeviceAddress<T>(inputs, 4);
auto scale = GetDeviceAddress<float>(inputs, 2);
auto save_mean = GetDeviceAddress<float>(inputs, 3);
auto save_variance = GetDeviceAddress<float>(inputs, 4);
auto dx = GetDeviceAddress<T>(outputs, 0);
auto bn_scale = GetDeviceAddress<T>(outputs, 1);
auto bn_bias = GetDeviceAddress<T>(outputs, 2);
auto bn_scale = GetDeviceAddress<float>(outputs, 1);
auto bn_bias = GetDeviceAddress<float>(outputs, 2);
const float alpha_data_diff = 1;
const float beta_data_diff = 0;

View File

@ -18,7 +18,7 @@
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
@ -26,8 +26,8 @@ MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
MomentumGpuKernel, float, float)
MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
MomentumGpuKernel, float, float, float)
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
@ -35,8 +35,8 @@ MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
.AddOutputAttr(kNumberTypeFloat16),
MomentumGpuKernel, half, half)
MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
MomentumGpuKernel, half, half, half)
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat16)
@ -44,6 +44,15 @@ MS_REG_GPU_KERNEL_TWO(ApplyMomentum,
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat16),
MomentumGpuKernel, half, float)
MomentumGpuKernel, half, float, half)
MS_REG_GPU_KERNEL_THREE(ApplyMomentum,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat16)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
MomentumGpuKernel, float, float, half)
} // namespace kernel
} // namespace mindspore

View File

@ -23,7 +23,7 @@
#include "backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>
template <typename T, typename S, typename G>
class MomentumGpuKernel : public GpuKernel {
public:
MomentumGpuKernel()
@ -38,7 +38,7 @@ class MomentumGpuKernel : public GpuKernel {
T *variable = GetDeviceAddress<T>(inputs, 0);
T *accumulation = GetDeviceAddress<T>(inputs, 1);
S *learning_rate = GetDeviceAddress<S>(inputs, 2);
T *gradient = GetDeviceAddress<T>(inputs, 3);
G *gradient = GetDeviceAddress<G>(inputs, 3);
S *momentum = GetDeviceAddress<S>(inputs, 4);
MomentumUpdateVariable(inputs[0]->size / sizeof(T), variable, accumulation, learning_rate, gradient, momentum,
reinterpret_cast<cudaStream_t>(stream_ptr));
@ -54,7 +54,7 @@ class MomentumGpuKernel : public GpuKernel {
variable_size_ = sizeof(T);
accumulation_size_ = sizeof(T);
learning_rate_size_ = sizeof(S);
gradient_size_ = sizeof(T);
gradient_size_ = sizeof(G);
momentum_size_ = sizeof(S);
auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);

View File

@ -81,6 +81,7 @@ static std::map<string, string> tbe_func_adapter_map = {
{"sparse_apply_proximal_adagrad", "sparse_apply_proximal_adagrad_d"},
{"apply_add_sign", "apply_add_sign_d"},
{"apply_power_sign", "apply_power_sign_d"},
{"apply_centered_rms_prop", "apply_centered_rms_prop_d"},
{"transpose", "transpose_d"},
{"fill", "fill_d"},
{"unsorted_segment_sum", "unsorted_segment_sum_d"},

View File

@ -43,6 +43,7 @@ constexpr auto kJInputs = "inputs";
constexpr auto kJOutputs = "outputs";
constexpr auto kJAttrs = "attrs";
constexpr auto kJKernelName = "kernel_name";
constexpr auto kJFullName = "full_name";
constexpr auto kJOpInfo = "op_info";
constexpr auto kJDtype = "dtype";
constexpr auto kJtype = "type";
@ -125,6 +126,7 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr<mindspor
op_info_json[kJKernelName] = json_name_;
}
(*kernel_json)[kJOpInfo] = op_info_json;
(*kernel_json)[kJFullName] = anf_node->fullname_with_scope();
if (creater_type_ == SINGLE_BUILD) {
TbeUtils::SaveJsonInfo(json_name_, json_info_);
}

View File

@ -97,6 +97,7 @@
#include "backend/optimizer/ascend/format_type/modify_ops_attrs.h"
#include "backend/optimizer/ascend/format_type/remove_no_use_reshape_op.h"
#include "backend/optimizer/ascend/ir_fusion/add_input_to_output.h"
#include "backend/optimizer/ascend/format_type/remove_internal_output.h"
#include "utils/context/ms_context.h"
#include "utils/config_manager.h"
#include "debug/anf_ir_dump.h"
@ -201,6 +202,7 @@ void AscendDataLayout(const std::shared_ptr<session::KernelGraph> &kernel_graph)
data_layout_pm->AddPass(std::make_shared<OptimizeDependence>());
data_layout_pm->AddPass(std::make_shared<TransDataSplit>());
data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>());
data_layout_pm->AddPass(std::make_shared<RemoveInternalOutputTransOp>());
optimizer->AddPassManager(data_layout_pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
@ -222,6 +224,7 @@ void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_grap
mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>());
mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>());
mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>());
optimizer->AddPassManager(mixed_precision_pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();

View File

@ -142,6 +142,7 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const
MS_EXCEPTION_IF_NULL(node);
std::vector<AnfNodePtr> make_tuple_inputs;
make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple));
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(node); ++output_idx) {
std::string output_format = AnfAlgo::GetOutputFormat(node, output_idx);
if (output_format == kOpFormat_NC1KHKWHWC0) {
@ -151,7 +152,11 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const
auto tuple_getitem = CreatTupleGetItemNode(func_graph, node, output_idx);
std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx);
if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) {
make_tuple_inputs.emplace_back(AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false));
auto trans_op = AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
kernel_graph->ReplaceInternalOutput(node, trans_op, output_idx, 0);
}
make_tuple_inputs.emplace_back(trans_op);
} else {
// No need insert trans op.
make_tuple_inputs.push_back(tuple_getitem);
@ -249,9 +254,14 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP
if (outputs_num == 0) {
return node;
}
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
// Single output
if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) {
return InsertTransOpForSingleOutput(func_graph, node, kernel_select);
auto new_node = InsertTransOpForSingleOutput(func_graph, node, kernel_select);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
kernel_graph->ReplaceInternalOutput(node, new_node);
}
return new_node;
}
// Multiple output
return InsertTransOpForMultipleOutput(func_graph, node, kernel_select);

View File

@ -40,6 +40,7 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
std::vector<AnfNodePtr> make_tuple_inputs;
AbstractBasePtrList abstract_list;
make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple));
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(cnode); ++output_idx) {
AnfNodePtr replace_node = nullptr;
const auto origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx);
@ -64,6 +65,9 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo
MS_EXCEPTION_IF_NULL(replace_node);
replace_node->set_scope(cnode->scope());
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) {
kernel_graph->ReplaceInternalOutput(cnode, replace_node, output_idx, 0);
}
} else {
replace_node = getitem;
}
@ -87,6 +91,7 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
return cnode;
}
MS_EXCEPTION_IF_NULL(cnode->Type());
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
// Single output
if (!cnode->Type()->isa<Tuple>()) {
if (!need_insert_cast[0]) {
@ -109,6 +114,9 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c
MS_EXCEPTION_IF_NULL(replace_node);
replace_node->set_scope(cnode->scope());
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node);
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) {
kernel_graph->ReplaceInternalOutput(cnode, replace_node);
}
}
return replace_node;
}
@ -188,6 +196,10 @@ const AnfNodePtr InsertCast::Process(const FuncGraphPtr &func_graph, const AnfNo
CNodePtr cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
auto new_node = InsertCastForInput(func_graph, cnode);
auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>();
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
kernel_graph->ReplaceInternalOutput(node, new_node);
}
// process output
return InsertCastForOutput(func_graph, new_node, std::vector<bool>(AnfAlgo::GetOutputTensorNum(new_node), true));
}

View File

@ -46,14 +46,13 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An
if (node == nullptr || !AnfAlgo::IsRealKernel(node)) {
return nullptr;
}
AnfNodePtr front_node;
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node);
MS_LOG(DEBUG) << "process op: " << node->DebugString();
AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_);
auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>();
if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) {
front_node = kernel_graph->GetFrontNodeByInternalOutput(node);
kernel_graph->ReplaceInternalOutput(node, new_node);
}
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node);
MS_LOG(DEBUG) << "====process op: " << node->DebugString();
AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (ms_context->execution_mode() == kPynativeMode && !ms_context->enable_pynative_hook()) {
@ -61,12 +60,7 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An
return new_node;
}
}
auto final_node = InsertTransOpForOutput(func_graph, new_node, kernel_select_);
if (kernel_graph != nullptr && front_node != nullptr) {
auto old_node = kernel_graph->GetInternalOutputByFrontNode(front_node);
kernel_graph->ReplaceInternalOutput(old_node, final_node);
}
return final_node;
return InsertTransOpForOutput(func_graph, new_node, kernel_select_);
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,83 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/ascend/format_type/remove_internal_output.h"
#include <memory>
#include "backend/session/anf_runtime_algorithm.h"
namespace mindspore {
namespace opt {
namespace {
bool UsedForOutputOnly(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
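// True only when every user of the node is a MakeTuple, i.e. the node is
// consumed solely while assembling the graph output and by nothing else.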
MS_EXCEPTION_IF_NULL(func_graph);
auto manager = func_graph->manager();
MS_EXCEPTION_IF_NULL(manager);
auto &node_users = manager->node_users();
auto iter = node_users.find(node);
if (iter == node_users.end()) {
return false;
}
const auto &node_set = iter->second;
for (const auto &node_index : node_set) {
if (!AnfAlgo::CheckPrimitiveType(node_index.first, prim::kPrimMakeTuple)) {
return false;
}
}
return true;
}
} // namespace
const BaseRef RemoveInternalOutputTransOp::DefinePattern() const {
VarPtr X = std::make_shared<Var>();
auto prim = std::make_shared<Primitive>(kTransDataOpName);
return VectorRef({prim, X});
}
const BaseRef RemoveInternalOutputCast::DefinePattern() const {
VarPtr X = std::make_shared<Var>();
return VectorRef({prim::kPrimCast, X});
}
const AnfNodePtr RemoveInternalOutput::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
MS_EXCEPTION_IF_NULL(func_graph);
MS_EXCEPTION_IF_NULL(node);
auto kernel_graph = func_graph->cast<KernelGraphPtr>();
if (kernel_graph == nullptr) {
return nullptr;
}
if (!kernel_graph->IsInternalOutput(node)) {
return nullptr;
}
if (!UsedForOutputOnly(func_graph, node)) {
return nullptr;
}
auto cnode = node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(cnode);
CheckCNodeInputSize(cnode, kTransOpInputNum);
auto input_node = cnode->input(1);
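// If the op reads directly from a kernel, remap the internal-output record to
// that kernel; if it reads through a TupleGetItem, record the real producer
// together with the tuple output index.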
if (!AnfAlgo::CheckPrimitiveType(input_node, prim::kPrimTupleGetItem)) {
kernel_graph->ReplaceInternalOutput(node, input_node);
} else {
auto tuple_getitem = input_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(tuple_getitem);
int idx = AnfAlgo::GetTupleGetItemOutIndex(tuple_getitem);
AnfNodePtr real_input_node = AnfAlgo::GetTupleGetItemRealInput(tuple_getitem);
kernel_graph->ReplaceInternalOutput(node, real_input_node, 0, idx);
}
return input_node;
}
} // namespace opt
} // namespace mindspore

View File

@ -0,0 +1,51 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_
#include <string>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class RemoveInternalOutput : public PatternProcessPass {
public:
explicit RemoveInternalOutput(const std::string &name, bool multigraph = true)
: PatternProcessPass(name, multigraph) {}
~RemoveInternalOutput() override = default;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
};
class RemoveInternalOutputTransOp : public RemoveInternalOutput {
public:
explicit RemoveInternalOutputTransOp(bool multigraph = true)
: RemoveInternalOutput("remove_internal_output_trans_op", multigraph) {}
~RemoveInternalOutputTransOp() override = default;
const BaseRef DefinePattern() const override;
};
class RemoveInternalOutputCast : public RemoveInternalOutput {
public:
explicit RemoveInternalOutputCast(bool multigraph = true)
: RemoveInternalOutput("remove_internal_output_cast", multigraph) {}
~RemoveInternalOutputCast() override = default;
const BaseRef DefinePattern() const override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_

View File

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
@ -53,4 +53,4 @@ class AdamFusion : public PatternProcessPass {
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_

View File

@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
@ -55,4 +55,4 @@ class AdamWeightDecayFusion : public PatternProcessPass {
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_

View File

@ -0,0 +1,65 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_addn_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceAddNFusion::DefinePattern() const {
VectorRef addn = VectorRef({prim::kPrimAddN, A, B});
return addn;
}
const AnfNodePtr ReplaceAddNFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto A = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto B = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 1);
MS_EXCEPTION_IF_NULL(A);
MS_EXCEPTION_IF_NULL(B);
int num_input = AnfAlgo::GetNodeAttr<int>(node, "n");
if (num_input == 2) {
auto prim = std::make_shared<Primitive>(prim::kPrimTensorAdd->name());
MS_EXCEPTION_IF_NULL(prim);
std::vector<AnfNodePtr> inputs = {NewValueNode(prim), A, B};
auto add_new = graph->NewCNode(inputs);
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(A, 0));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(A, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, add_new.get());
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(add_new));
return add_new;
} else {
return nullptr;
}
}
} // namespace opt
} // namespace mindspore
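A minimal sketch of the rewrite (illustrative IR, not part of the commit): the pass only fires when the AddN node carries the attribute n == 2.

// Before: %v = AddN(%A, %B)        // node attribute n == 2
// After:  %v = TensorAdd(%A, %B)   // inferred type/shape copied from %A
// AddN nodes whose "n" attribute is not 2 are left untouched (Process returns nullptr);
// gpu_session.cc below registers the pass via pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>()).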

View File

@ -0,0 +1,40 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceAddNFusion : public PatternProcessPass {
public:
explicit ReplaceAddNFusion(bool multigraph = true) : PatternProcessPass("replace_addn", multigraph) {
A = std::make_shared<Var>();
B = std::make_shared<Var>();
}
~ReplaceAddNFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr A;
VarPtr B;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_

View File

@ -0,0 +1,92 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceBNCastFusion::DefinePattern() const {
VectorRef in_cast = VectorRef({prim::kPrimCast, x_});
VectorRef fbn2 = VectorRef({prim::kPrimFusedBatchNorm, in_cast, scale_, bias_, mean_, var_});
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2, index_});
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
return out_cast;
}
const AnfNodePtr ReplaceBNCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
MS_EXCEPTION_IF_NULL(index_node);
auto value_node = index_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
int item_idx = GetValue<int>(value_node->value());
auto fbn2 = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
auto x_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 0);
auto x_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(x_after), 0);
if (item_idx != 0) {
return nullptr;
}
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 1);
auto bias = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 2);
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 3);
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 4);
MS_EXCEPTION_IF_NULL(fbn2);
MS_EXCEPTION_IF_NULL(x_after);
MS_EXCEPTION_IF_NULL(x_before);
MS_EXCEPTION_IF_NULL(scale);
MS_EXCEPTION_IF_NULL(bias);
MS_EXCEPTION_IF_NULL(mean);
MS_EXCEPTION_IF_NULL(var);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(x_after), utils::cast<CNodePtr>(x_before));
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2, i));
}
outputs_type[0] = kNumberTypeFloat16;
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2.get());
outputs_type.clear();
outputs_shape.clear();
outputs_type.push_back(kNumberTypeFloat16);
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
return tuple;
}
} // namespace opt
} // namespace mindspore
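A sketch of the rewrite (illustrative IR): both casts around the first FusedBatchNorm output are removed so the data path stays in float16.

// Before: y = Cast_to_fp16(TupleGetItem(FusedBatchNorm(Cast_to_fp32(x_fp16), scale, bias, mean, var), 0))
// After:  y =              TupleGetItem(FusedBatchNorm(             x_fp16,  scale, bias, mean, var), 0)
// Output 0 of FusedBatchNorm is re-inferred as float16, the other outputs keep their dtypes,
// and the pass only fires when the TupleGetItem index is 0.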

View File

@ -0,0 +1,58 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceBNCastFusion : public PatternProcessPass {
public:
explicit ReplaceBNCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_cast", multigraph) {
x_ = std::make_shared<Var>();
scale_ = std::make_shared<Var>();
bias_ = std::make_shared<Var>();
mean_ = std::make_shared<Var>();
var_ = std::make_shared<Var>();
y_ = std::make_shared<Var>();
running_mean_ = std::make_shared<Var>();
running_var_ = std::make_shared<Var>();
save_mean_ = std::make_shared<Var>();
save_var_ = std::make_shared<Var>();
index_ = std::make_shared<Var>();
}
~ReplaceBNCastFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr x_;
VarPtr scale_;
VarPtr bias_;
VarPtr mean_;
VarPtr var_;
VarPtr y_;
VarPtr running_mean_;
VarPtr running_var_;
VarPtr save_mean_;
VarPtr save_var_;
VarPtr index_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_

View File

@ -0,0 +1,88 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceBNGradCast2Fusion::DefinePattern() const {
VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_, x_, scale_, mean_, var_});
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_});
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
return out_cast;
}
const AnfNodePtr ReplaceBNGradCast2Fusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
MS_EXCEPTION_IF_NULL(index_node);
auto value_node = index_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
int item_idx = GetValue<int>(value_node->value());
if (item_idx != 0) {
return nullptr;
}
auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
auto dy_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0);
auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2);
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3);
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4);
MS_EXCEPTION_IF_NULL(fbn2g);
MS_EXCEPTION_IF_NULL(dy_);
MS_EXCEPTION_IF_NULL(scale);
MS_EXCEPTION_IF_NULL(x_);
MS_EXCEPTION_IF_NULL(mean);
MS_EXCEPTION_IF_NULL(var);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
}
outputs_type[0] = AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0);
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
outputs_type.clear();
outputs_shape.clear();
outputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
return tuple;
}
} // namespace opt
} // namespace mindspore
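The same idea for the gradient path when only the output is cast (illustrative IR):

// Before: dx = Cast(TupleGetItem(FusedBatchNormGrad(dy, x, scale, mean, var), 0))
// After:  dx =      TupleGetItem(FusedBatchNormGrad(dy, x, scale, mean, var), 0)
// Output 0 is re-inferred with dy's dtype (GetPrevNodeOutputInferDataType(fbn2g, 0)), which makes
// the trailing cast redundant; only TupleGetItem index 0 triggers the pass.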

View File

@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceBNGradCast2Fusion : public PatternProcessPass {
public:
explicit ReplaceBNGradCast2Fusion(bool multigraph = true) : PatternProcessPass("replace_grad_cast2", multigraph) {
dy_ = std::make_shared<Var>();
x_ = std::make_shared<Var>();
scale_ = std::make_shared<Var>();
mean_ = std::make_shared<Var>();
var_ = std::make_shared<Var>();
dx_ = std::make_shared<Var>();
bn_scale_ = std::make_shared<Var>();
bn_bias_ = std::make_shared<Var>();
index_ = std::make_shared<Var>();
}
~ReplaceBNGradCast2Fusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr dy_;
VarPtr x_;
VarPtr scale_;
VarPtr mean_;
VarPtr var_;
VarPtr dx_;
VarPtr bn_scale_;
VarPtr bn_bias_;
VarPtr index_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_

View File

@ -0,0 +1,91 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceBNGradCastFusion::DefinePattern() const {
VectorRef dy_cast = VectorRef({prim::kPrimCast, dy_});
VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_cast, x_, scale_, mean_, var_});
VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_});
VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
return out_cast;
}
const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
MS_EXCEPTION_IF_NULL(index_node);
auto value_node = index_node->cast<ValueNodePtr>();
MS_EXCEPTION_IF_NULL(value_node);
int item_idx = GetValue<int>(value_node->value());
if (item_idx != 0) {
return nullptr;
}
auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
auto dy_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0);
auto dy_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(dy_after), 0);
auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2);
auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3);
auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4);
MS_EXCEPTION_IF_NULL(fbn2g);
MS_EXCEPTION_IF_NULL(dy_after);
MS_EXCEPTION_IF_NULL(dy_before);
MS_EXCEPTION_IF_NULL(scale);
MS_EXCEPTION_IF_NULL(x_);
MS_EXCEPTION_IF_NULL(mean);
MS_EXCEPTION_IF_NULL(var);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(dy_after), utils::cast<CNodePtr>(dy_before));
manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
}
outputs_type[0] = kNumberTypeFloat16;
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
outputs_type.clear();
outputs_shape.clear();
outputs_type.push_back(kNumberTypeFloat16);
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
return tuple;
}
} // namespace opt
} // namespace mindspore
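A sketch of this variant (illustrative IR): the cast on dy and the cast on the first gradient output are both dropped.

// Before: dx_fp16 = Cast(TupleGetItem(FusedBatchNormGrad(Cast(dy_fp16), x, scale, mean, var), 0))
// After:  dx_fp16 =      TupleGetItem(FusedBatchNormGrad(     dy_fp16,  x, scale, mean, var), 0)
// Output 0 is re-inferred as float16; only TupleGetItem index 0 triggers the pass.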

View File

@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceBNGradCastFusion : public PatternProcessPass {
public:
explicit ReplaceBNGradCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_grad_cast", multigraph) {
dy_ = std::make_shared<Var>();
x_ = std::make_shared<Var>();
scale_ = std::make_shared<Var>();
mean_ = std::make_shared<Var>();
var_ = std::make_shared<Var>();
dx_ = std::make_shared<Var>();
bn_scale_ = std::make_shared<Var>();
bn_bias_ = std::make_shared<Var>();
index_ = std::make_shared<Var>();
}
~ReplaceBNGradCastFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr dy_;
VarPtr x_;
VarPtr scale_;
VarPtr mean_;
VarPtr var_;
VarPtr dx_;
VarPtr bn_scale_;
VarPtr bn_bias_;
VarPtr index_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_

View File

@ -0,0 +1,63 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
const BaseRef ReplaceMomentumCastFusion::DefinePattern() const {
VectorRef grad_cast = VectorRef({prim::kPrimCast, grad_});
VectorRef momentum = VectorRef({prim::kPrimApplyMomentum, var_, acc_, lr_, grad_cast, mom_});
return momentum;
}
const AnfNodePtr ReplaceMomentumCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
MS_EXCEPTION_IF_NULL(equiv);
auto grad_cast = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 3);
auto grad = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(grad_cast), 0);
MS_EXCEPTION_IF_NULL(grad_cast);
MS_EXCEPTION_IF_NULL(grad);
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->Replace(utils::cast<CNodePtr>(grad_cast), utils::cast<CNodePtr>(grad));
std::vector<TypeId> outputs_type;
std::vector<std::vector<size_t>> outputs_shape;
auto output_num = AnfAlgo::GetOutputTensorNum(node);
for (size_t i = 0; i < output_num; i++) {
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(node, i));
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(node, i));
}
outputs_type[3] = AnfAlgo::GetPrevNodeOutputInferDataType(grad_cast, 0);
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, node.get());
return node;
}
} // namespace opt
} // namespace mindspore
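A sketch of the rewrite (illustrative IR): the cast feeding the gradient input of ApplyMomentum is dropped.

// Before: ApplyMomentum(var, acc, lr, Cast(grad_fp16), mom)
// After:  ApplyMomentum(var, acc, lr,      grad_fp16,  mom)
// The inferred dtype of output 3 is reset to the gradient's pre-cast dtype.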

View File

@ -0,0 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class ReplaceMomentumCastFusion : public PatternProcessPass {
public:
explicit ReplaceMomentumCastFusion(bool multigraph = true) : PatternProcessPass("replace_momentum_cast", multigraph) {
var_ = std::make_shared<Var>();
acc_ = std::make_shared<Var>();
lr_ = std::make_shared<Var>();
grad_ = std::make_shared<Var>();
mom_ = std::make_shared<Var>();
}
~ReplaceMomentumCastFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
VarPtr var_;
VarPtr acc_;
VarPtr lr_;
VarPtr grad_;
VarPtr mom_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_

View File

@ -25,7 +25,8 @@
namespace mindspore {
namespace memreuse {
enum RefCountType { kDynamicRefCount, kStaticRefCount };
enum NodeType { NORMAL, SPECIAL };
enum NodeType { COMMON_NODE, COMMUNICATION_NODE };
enum KernelRefType { COMMON, REFNODE_OUTPUT, COMM_NOTREUSE, COMM_REUSE, SUMMARY };
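// Tensor categories as used by the best-fit allocator changes in this commit:
//   COMMON         - ordinary output, reusable with the common 512-byte alignment
//   REFNODE_OUTPUT - aliased by a ref-node output, skipped during offset assignment
//   COMM_NOTREUSE  - input of a multi-input communication op, only tracked in the reuse statistics
//   COMM_REUSE     - communication memory, reused with extra left/right align borders
//   SUMMARY        - pinned for summary dumping (ref count forced to kMaxRefCount)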
static constexpr int kInitIndex = -1;
class KernelRefCount {
public:
@ -36,6 +37,7 @@ class KernelRefCount {
size_t offset_;
size_t size_;
int index_;
KernelRefType type_;
// remember to reset offset
KernelRefCount()
: stream_id_(0),
@ -44,6 +46,7 @@ class KernelRefCount {
offset_(0),
size_(0),
index_(kInitIndex),
type_(COMMON),
reftype_(kStaticRefCount) {}
~KernelRefCount() = default;
void SetKernelRefCountInfo(int index, size_t size, RefCountType reftype);
@ -65,7 +68,7 @@ class KernelDef {
KernelMap inputs_;
KernelMap outputs_;
KernelMap wk_space_;
NodeType dirty = NORMAL;
NodeType type_ = COMMON_NODE;
KernelDef() = default;
~KernelDef() = default;
void set_input_refs(const KernelRefCountPtrList &kernelRefPtrList) { input_refs_ = kernelRefPtrList; }

View File

@ -46,6 +46,8 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() {
if (iter == kernel_output_refs_.end()) {
auto output_sizes = kernel_mod->GetOutputSizeList();
KernelRefCountPtrList kernel_refs;
bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel_cnode);
size_t output_index = 0;
for (auto size : output_sizes) {
total_dy_size_ += size;
      // do not MallocDynamicMem, just record the size here
@ -54,9 +56,20 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() {
auto curr_stream_id = AnfAlgo::GetStreamId(kernel_cnode);
kernel_ref->stream_id_ = curr_stream_id;
kernel_ref->SetKernelRefCountInfo(index, size, kDynamicRefCount);
if (is_comm_op) {
kernel_ref->type_ = COMM_REUSE;
} else {
session::AnfWithOutIndex out_pair(kernel_cnode, output_index);
if (graph_->IsInRefOutputMap(out_pair)) {
kernel_ref->type_ = REFNODE_OUTPUT;
} else {
kernel_ref->type_ = COMMON;
}
}
kernel_refs.push_back(kernel_ref);
kernel_out_ref_num++;
total_refs_list_.push_back(kernel_ref);
output_index++;
}
if (!kernel_refs.empty()) {
kernel_output_refs_[key] = kernel_refs;
@ -155,9 +168,19 @@ void MemReuseUtil::SetInputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr
MS_EXCEPTION_IF_NULL(kernel);
MS_EXCEPTION_IF_NULL(kernel_def_ptr);
auto key = kernel.get();
for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel);
size_t input_tensor_num = AnfAlgo::GetInputTensorNum(kernel);
for (size_t i = 0; i < input_tensor_num; ++i) {
auto ref_ptr = GetKernelInputRef(kernel, i);
if (ref_ptr != nullptr) {
if (is_comm_op) {
if (input_tensor_num == 1) {
ref_ptr->type_ = COMM_REUSE;
} else {
ref_ptr->type_ = COMM_NOTREUSE;
}
}
if (ref_ptr->reftype() == kStaticRefCount) {
continue;
} else if (ref_ptr->reftype() == kDynamicRefCount) {
@ -258,6 +281,11 @@ void MemReuseUtil::SetKernelDefMap() {
auto key = kernel.get();
kernel_def_ptr->set_input_refs(kernel_def_ptr->inputs_[key]);
kernel_def_ptr->set_output_refs(kernel_def_ptr->outputs_[key]);
if (AnfAlgo::IsCommunicationOp(kernel)) {
kernel_def_ptr->type_ = COMMUNICATION_NODE;
} else {
kernel_def_ptr->type_ = COMMON_NODE;
}
kernel_def_ptr_list_.push_back(kernel_def_ptr);
kernel_map_[key] = kernel_def_ptr;
}
@ -337,6 +365,7 @@ void MemReuseUtil::SetSummaryNodesRefCount() {
KernelRefCountPtr kernel_ref = kernel_output_refs_[node.get()][index];
kernel_ref->ref_count_ = kMaxRefCount;
kernel_ref->ref_count_dynamic_use_ = kMaxRefCount;
kernel_ref->type_ = SUMMARY;
total_summary_size += kernel_ref->size_;
MS_LOG(INFO) << "Set summary node's ref count, node: " << node->fullname_with_scope() << " index: " << index;
} else {

View File

@ -83,6 +83,7 @@ class MemReuseUtil {
void set_mem_base(uint8_t *mem_base) { mem_base_ = mem_base; }
uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const;
uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const;
bool is_all_nop_node() const { return is_all_nop_node_; }
private:
int util_index_;

View File

@ -33,11 +33,11 @@ void BestFitMemReuse::InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr) {
set_op_ptr_list(mem_reuse_util_ptr->kernel_def_ptr_list());
// check info Correctness
for (auto &tensor : tensor_ptr_list_) {
tensor->size_ = AlignMemorySize(tensor->size_);
tensor->size_ = AlignCommonMemorySize(tensor->size_);
}
  // align workspace size to 512 and force refcount == 1
for (auto &wk : wk_tensor_list_) {
wk->size_ = AlignMemorySize(wk->size_);
wk->size_ = AlignCommonMemorySize(wk->size_);
wk->ref_count_ = 1;
}
#ifdef ENABLE_D
@ -135,11 +135,23 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr
return false;
}
void BestFitMemReuse::AssignNodeOutputOffset() {
void BestFitMemReuse::AssignCommonNodeOutputOffset() {
MS_EXCEPTION_IF_NULL(current_kernel_);
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (tensor_desc->type_ == REFNODE_OUTPUT) {
total_refoutput_size += tensor_desc->size_;
continue;
} else if (tensor_desc->type_ == COMM_NOTREUSE) {
total_comm_not_reuse_size += tensor_desc->size_;
} else if (tensor_desc->type_ == COMM_REUSE) {
      // get the aligned size (including borders) for a communication op's single input
tensor_desc->size_ = AlignCommunicationMemorySize(tensor_desc->size_);
total_comm_reuse_size += tensor_desc->size_;
}
auto reusable_membuf_map = GetReusableMembufMap(tensor_desc->size_);
if (!reusable_membuf_map.empty()) {
auto membuf_index = reusable_membuf_map.begin()->second;
@ -152,6 +164,93 @@ void BestFitMemReuse::AssignNodeOutputOffset() {
MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
}
    // skip the left align border so a communication op's single input offset points at the data region
if (tensor_desc->type_ == COMM_REUSE) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
}
}
void BestFitMemReuse::AssignCommunicationNodeOutputOffset() {
size_t total_kernel_output_size = 0;
size_t output_num = 0;
  // accumulate the total size of all outputs
MS_EXCEPTION_IF_NULL(current_kernel_);
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
    MS_EXCEPTION_IF_NULL(tensor_desc);
    // count outputs so that the first and last ones can receive align borders below
    ++output_num;
if (tensor_desc->type_ == COMM_REUSE) {
total_comm_reuse_size += tensor_desc->size_;
total_comm_output_reuse_size += tensor_desc->size_;
total_kernel_output_size += tensor_desc->size_;
} else {
MS_LOG(ERROR) << "All communication op's outputs should be memory reuse, Kernel:"
<< current_kernel_->scope_full_name();
continue;
}
}
total_kernel_output_size = AlignCommunicationMemorySize(total_kernel_output_size);
  // add a left align border to the first output and a right align border to the last output so the border memory is allocated
size_t output_index = 0;
auto output_ref_indexes = current_kernel_->GetOutputRefIndexs();
for (auto &tensor_idx : output_ref_indexes) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
if (output_index == 0 || output_index == output_num - 1) {
tensor_desc->size_ += kDefaultMemAlignSize;
}
if ((output_index == 0) && (output_ref_indexes.size() == 1)) {
// add right align border for single output
tensor_desc->size_ += kDefaultMemAlignSize;
}
output_index++;
}
auto reusable_membuf_map = GetReusableMembufMap(total_kernel_output_size);
if (!reusable_membuf_map.empty()) {
auto membuf_index = reusable_membuf_map.begin()->second;
output_index = 0;
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
ReuseExistMembuf(tensor_desc.get(), membuf_index + output_index, kDynamicMem);
      // skip the left align border so the communication op's first output offset points at the data region
if (output_index == 0) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
output_index++;
}
} else {
    // no membuf can be reused, so append new membufs to membuf_ptr_list
output_index = 0;
for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) {
size_t index = GetTensorIndex(tensor_idx);
auto tensor_desc = tensor_ptr_list_[index];
MS_EXCEPTION_IF_NULL(tensor_desc);
AddNewMembufPtr(tensor_desc.get(), kDynamicMem);
      // skip the left align border offset so the first output points at the data region
if (output_index == 0) {
tensor_desc->offset_ += kDefaultMemAlignSize;
}
output_index++;
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().IsAddNewMembuf_ = true;
#endif
}
}
}
void BestFitMemReuse::AssignNodeOutputOffset() {
if (current_kernel_->type_ == COMMUNICATION_NODE) {
AssignCommunicationNodeOutputOffset();
} else {
AssignCommonNodeOutputOffset();
}
}
@ -319,11 +418,17 @@ void BestFitMemReuse::ReleaseMembuf(size_t tensor_index, int flag) {
}
}
size_t BestFitMemReuse::AlignMemorySize(size_t size) const {
size_t BestFitMemReuse::AlignCommonMemorySize(size_t size) const {
  // align the memory size up to a 512-byte boundary (plus kAttAlignSize padding)
return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}
size_t BestFitMemReuse::AlignCommunicationMemorySize(size_t size) const {
  // align the data size up to 512 bytes and add the communication borders: left align border | data | right align border
return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
kDefaultMemAlignSize;
}
size_t BestFitMemReuse::GetAllocatedSize() {
size_t AllocatedSize = kTotalSize;
if (membuf_ptr_list_.empty()) {
@ -412,6 +517,9 @@ void BestFitMemReuse::Reuse(const MemReuseUtil *mem_reuse_util_ptr) {
++op_num;
#endif
}
MS_LOG(INFO) << "Special Tensor total size: RefOutput: " << total_refoutput_size
<< " CommReuse: " << total_comm_reuse_size << " CommOutputReuse: " << total_comm_output_reuse_size
<< " CommNotReuse: " << total_comm_not_reuse_size;
#ifdef MEM_REUSE_DEBUG
MemReuseChecker::GetInstance().ExportMembufInfoIR();
MemReuseChecker::GetInstance().ExportAddNewMmebufIR();

View File

@ -74,6 +74,14 @@ class BestFitMemReuse {
* Assign output tensor memory offset of current kernel
*/
void AssignNodeOutputOffset();
/**
* Assign output tensor memory offset of common kernel
*/
void AssignCommonNodeOutputOffset();
/**
* Assign output tensor memory offset of communication kernel
*/
void AssignCommunicationNodeOutputOffset();
/**
* Update input tensor's status of current kernel, and the status of membuf used by current kernel
*/
@ -110,8 +118,10 @@ class BestFitMemReuse {
void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag);
// Merge unused membuf
void ReleaseMembuf(size_t tensor_index, int flag);
// Memory address alignment 512
size_t AlignMemorySize(size_t size) const;
// Memory address alignment for common memory
size_t AlignCommonMemorySize(size_t size) const;
  // Memory address alignment for memory used by communication ops
size_t AlignCommunicationMemorySize(size_t size) const;
int GetRealIndex(size_t index, int flag = kDynamicMem) const;
size_t GetTensorIndex(int index) const;
size_t GetWorkspaceIndex(int index) const;
@ -153,6 +163,10 @@ class BestFitMemReuse {
// kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def
std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_;
std::vector<std::vector<uint32_t>> stream_groups_;
size_t total_refoutput_size{0};
size_t total_comm_reuse_size{0};
size_t total_comm_output_reuse_size{0};
size_t total_comm_not_reuse_size{0};
};
} // namespace memreuse
} // namespace mindspore
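For reference, a minimal standalone sketch of the two alignment rules declared above; it assumes kDefaultMemAlignSize is 512 (as the in-code comments state), and kAttAlignSizePlaceholder is a hypothetical stand-in for kAttAlignSize, whose real value is defined elsewhere.

#include <cstddef>
#include <iostream>

constexpr size_t kDefaultMemAlignSize = 512;    // assumed per the "512 align" comments
constexpr size_t kAttAlignSizePlaceholder = 0;  // hypothetical stand-in for kAttAlignSize

// Mirrors AlignCommonMemorySize: pad the size, then round down to a 512-byte multiple.
size_t AlignCommon(size_t size) {
  return (size + kDefaultMemAlignSize + kAttAlignSizePlaceholder) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}

// Mirrors AlignCommunicationMemorySize: round the data up to 512 bytes and add a border on each side.
size_t AlignCommunication(size_t size) {
  return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
         kDefaultMemAlignSize;
}

int main() {
  std::cout << AlignCommon(600) << std::endl;         // 1024 with the placeholder padding
  std::cout << AlignCommunication(600) << std::endl;  // 2048 = 512 + 1024 + 512
  return 0;
}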

View File

@ -170,12 +170,14 @@ void MemReuseChecker::CheckMemReuseIR(const KernelRefCountPtrList &total_refs_li
ofs << "all_tensor_refs:\n";
ofs << "index:"
<< "\tsize:"
<< "\trefcount:\n";
<< "\trefcount:"
<< "\ttype:\n";
for (auto &ref : total_refs_list) {
ofs << "%" << ref->index_ << "T"
<< "\t"
<< "#" << ref->size_ << "S"
<< "\t" << ref->ref_count_ << "C"
<< "\t" << ref->type_ << "t"
<< "\n";
}
ofs << "kernel_def exc_order:\n";
@ -241,7 +243,7 @@ bool MemReuseChecker::CheckGraphOutputAssigned(const session::KernelGraph *graph
void MemReuseChecker::ExportMemOpIr(const KernelDef *def, std::ofstream &ofs, int def_idx) {
auto scope_name = def->scope_full_name();
std::string split_name = GetSplitName(scope_name);
ofs << "$" << def_idx << "\t" << split_name << "\t";
ofs << "$" << def_idx << "\t" << split_name << "\t" << static_cast<int>(def->type_) << "\t";
ofs << "inputs[";
for (auto &in : def->inputs_) {
for (auto &in_ref : in.second) {

View File

@ -100,7 +100,10 @@ bool CommunicationOpFusion::GetSplitSegments(const CommunicationOpInfo &communic
auto parallel_context = parallel::ParallelContext::GetInstance();
MS_EXCEPTION_IF_NULL(parallel_context);
const auto &split_indices = parallel_context->GetAllReduceFusionSplitIndices(group);
std::vector<uint32_t> split_indices;
if (!parallel_context->enable_parallel_optimizer()) {
split_indices = parallel_context->GetAllReduceFusionSplitIndices(group);
}
size_t segments = 0;
if (split_indices.size() != 0) {

View File

@ -71,7 +71,6 @@ bool ReplaceNodeByProxy::Run(const FuncGraphPtr &func_graph) {
AbstractBasePtrList abstract_list;
AnfAlgo::CopyNodeAttr(kAttrPsKey, cnode, proxy_node);
AnfAlgo::CopyNodeAttr("reduce_scatter_flag", cnode, proxy_node);
AnfAlgo::CopyNodeAttr("offset", cnode, proxy_node);
abstract_list.push_back(cnode->abstract());
auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(abstract_list);

View File

@ -18,9 +18,12 @@
#include <utility>
#include <memory>
#include <algorithm>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/union_find_set.h"
#include "runtime/device/ascend/ascend_label_assign.h"
#include "utils/context/ms_context.h"
#include "debug/anf_ir_dump.h"
static constexpr size_t kCNodePrim = 0;
static constexpr size_t kCNodeCallArg = 1;
@ -104,7 +107,7 @@ static void ReuseParameter(NotNull<KernelGraphPtr> root_kg,
static CNodePtr GetNextRealKernel(const std::vector<CNodePtr> &list, size_t start) {
for (size_t i = start; i < list.size() - 1; ++i) {
if (!IsPrimitiveCNode(list[i], prim::kPrimPartial) && AnfAlgo::IsRealKernel(list[i])) {
if (AnfAlgo::IsRealKernel(list[i])) {
return list[i];
}
}
@ -168,18 +171,43 @@ static void EraseNodeFromExecOrder(const AnfNodePtr &node, const NotNull<std::ve
exec_order->erase(exec_iter);
}
void AscendControlParser::AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph,
const NotNull<std::set<KernelGraphPtr> *> memo) {
if (memo->find(graph) != memo->end()) {
return;
}
memo->insert(graph.get());
const std::vector<std::shared_ptr<KernelGraph>> &child_graph_order = graph->child_graph_order();
if (child_graph_order.empty()) {
return;
}
std::vector<AnfNodePtr> depend_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimPartial->name()))};
for (auto &cg : child_graph_order) {
MS_EXCEPTION_IF_NULL(cg);
auto fg = cg->cast<FuncGraphPtr>();
MS_EXCEPTION_IF_NULL(fg);
depend_inputs.emplace_back(NewValueNode(fg));
AttachChildGraphToReturnNode(NOT_NULL(cg), memo);
}
auto child_graphs = graph->NewCNode(depend_inputs);
InsertDependToGraph(graph, NOT_NULL(child_graphs));
}
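// In effect, AttachChildGraphToReturnNode builds Partial(ValueNode(child_1), ..., ValueNode(child_n))
// for every graph that has children and attaches it through InsertDependToGraph, presumably so each
// child graph stays referenced from its parent after control-flow processing; the recursion covers
// nested child graphs.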
void AscendControlParser::LinkGraph(NotNull<KernelGraphPtr> kg) {
std::set<KernelGraphPtr> memo;
std::vector<std::pair<AnfNodePtr, AnfNodePtr>> link_list;
// Insert Assign
ChildGraphDataAssign(kg, NOT_NULL(&link_list), NOT_NULL(&memo));
memo.clear();
// Reuse Parameter
ReuseParameter(kg, link_list);
// replace call by label goto / label switch
memo.clear();
(void)ProcessKernelGraph(kg, nullptr, nullptr, NOT_NULL(&memo));
memo.clear();
// assign label resource
device::ascend::AscendLabelAssign::GetInstance().AssignLabel(kg);
AttachChildGraphToReturnNode(kg, NOT_NULL(&memo));
}
void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
@ -248,10 +276,14 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
}
MS_LOG(INFO) << "Erase " << assign_node->DebugString(5);
EraseNodeFromExecOrder(assign_node, NOT_NULL(&exec_order));
auto source = AnfAlgo::VisitKernelWithReturnType(assign_node->input(kCNodeAssignSource), 0).first;
parameter_count.AddReadCount(source, -1);
auto source = assign_node->input(kCNodeAssignSource);
MS_EXCEPTION_IF_NULL(source);
auto visit_source = AnfAlgo::VisitKernelWithReturnType(source, 0).first;
parameter_count.AddWriteCount(para, -1);
parameter_count.AddReadCount(para, -1);
if (visit_source->isa<Parameter>()) {
parameter_count.AddReadCount(visit_source, read - 1);
}
for (auto &node : all_nodes) {
for (size_t i = 0; i < node->size(); ++i) {
if (node->input(i) == para) {
@ -260,8 +292,6 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph,
}
}
}
parameter_count.AddReadCount(source, 1);
parameter_count.AddReadCount(para, -1);
}
root_graph->set_execution_order(exec_order);
}
@ -318,6 +348,17 @@ void AscendControlParser::ExecutorValidate(NotNull<KernelGraphPtr> root_graph) {
(void)RecurseGraph(root_graph, NOT_NULL(&memo));
EraseParameter(root_graph, memo);
EraseLabel(root_graph);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto save_graphs_path = context_ptr->save_graphs_path();
if (save_graphs_path.empty()) {
save_graphs_path = ".";
}
if (context_ptr->save_graphs_flag()) {
std::string file_path = save_graphs_path + "/after_erase_label_and_parameter.ir";
DumpIR(file_path, root_graph.get());
}
}
std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> AscendControlParser::ParseCallNode(

View File

@ -66,7 +66,8 @@ class AscendControlParser {
static AnfNodePtr InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to);
static std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> ParseCallNode(NotNull<CNodePtr> call_node);
static std::tuple<KernelGraphPtr, std::vector<AnfNodePtr>> ParsePartial(NotNull<AnfNodePtr> node);
static void AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph,
const NotNull<std::set<KernelGraphPtr> *> memo);
// root graph order
static bool CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cnode,
NotNull<KernelGraphPtr> graph);

View File

@ -353,6 +353,10 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) {
RootGraphExecutorValidate(NOT_NULL(root_graph));
// adjust kernel
AdjustKernel(root_graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Assign parameter keys.
AssignParamKey(root_graph);
#endif
// assign stream
AssignStream(NOT_NULL(root_graph));
// insert profiling point
@ -511,6 +515,12 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::
}
// load input data from user input
LoadInputData(kernel_graph, inputs);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
#endif
// convert inputs to model
predictmodel::StepConvertWeight(inputs);
{

View File

@ -16,6 +16,7 @@
#include "backend/session/cpu_session.h"
#include <algorithm>
#include <sstream>
#include "ir/tensor.h"
#include "ir/anf.h"
#include "backend/kernel_compiler/kernel.h"
@ -25,9 +26,15 @@
#include "predict/predict.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "runtime/device/cpu/kernel_select_cpu.h"
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"
#include "backend/optimizer/pass/replace_node_by_proxy.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/debugger.h"
#endif
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
#include "frontend/parallel/ps/util.h"
#endif
namespace mindspore {
namespace session {
@ -49,12 +56,29 @@ ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf,
return new_parameter;
}
void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
std::string pass_name = "replace_node_by_proxy";
pass_name.append(std::to_string(graph_sum_));
pm->AddPass(std::make_shared<opt::ReplaceNodeByProxy>(pass_name));
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
}
GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
auto graph_id = graph_sum_;
auto graph = ConstructKernelGraph(lst, outputs);
MS_EXCEPTION_IF_NULL(graph);
MS_LOG(INFO) << "Set kernel info";
SetKernelInfo(graph.get());
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
AssignParamKey(graph);
if (parallel::ps::Util::IsRoleOfWorker()) {
Optimize(graph);
}
#endif
predictmodel::StepConvertGraph(graph);
MS_LOG(INFO) << "Build kernel";
BuildKernel(graph.get());
@ -66,6 +90,12 @@ GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
auto &kernel_graph = graphs_[graph_id];
MS_EXCEPTION_IF_NULL(kernel_graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
#endif
MS_LOG(INFO) << "Bind input output address";
std::vector<tensor::TensorPtr> need_sync_outputs;
runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs);
@ -119,6 +149,48 @@ void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) {
}
}
namespace {
void KernelNotSupportException(const AnfNodePtr &kernel_node) {
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
std::stringstream operator_info;
operator_info << "Operator[" << kernel_name << "] ";
auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_node->kernel_info());
if (kernel_info == nullptr) {
operator_info << "is not support.";
MS_LOG(EXCEPTION) << operator_info.str();
}
auto kernel_build_Info = kernel_info->select_kernel_build_info();
if (kernel_build_Info == nullptr) {
operator_info << "is not support.";
MS_LOG(EXCEPTION) << operator_info.str();
}
size_t input_num = kernel_build_Info->GetInputNum();
if (input_num > 0) {
operator_info << " input(";
for (size_t i = 0; i < input_num; ++i) {
operator_info << TypeIdLabel(kernel_build_Info->GetInputDeviceType(i));
if (i != input_num - 1) {
operator_info << ",";
}
}
operator_info << ") ";
}
size_t output_num = kernel_build_Info->GetOutputNum();
if (output_num > 0) {
operator_info << "output(";
for (size_t i = 0; i < output_num; ++i) {
operator_info << TypeIdLabel(kernel_build_Info->GetOutputDeviceType(i));
if (i != kernel_build_Info->GetOutputNum() - 1) {
operator_info << ",";
}
}
operator_info << ") ";
}
operator_info << "is not support.";
MS_LOG(EXCEPTION) << operator_info.str();
}
} // namespace
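// Shape of the message this helper produces (operator name and dtype labels are illustrative):
//   Operator[Conv2D]  input(Float32,Float32) output(Float32) is not supported.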
void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
MS_EXCEPTION_IF_NULL(kernel_graph);
auto &kernel_nodes = kernel_graph->execution_order();
@ -129,7 +201,7 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) {
std::shared_ptr<kernel::CPUKernel> cpu_kernel =
kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node);
if (cpu_kernel == nullptr) {
MS_LOG(EXCEPTION) << "Operator[" << kernel_name << "] is not support.";
KernelNotSupportException(kernel_node);
}
cpu_kernel->Init(kernel_node);
AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get());

View File

@ -37,6 +37,7 @@ class CPUSession : public SessionBasic {
protected:
ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) override;
void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph);
private:
void SetKernelInfo(const KernelGraph *kernel_graph);

View File

@ -25,6 +25,11 @@
#include "backend/optimizer/pass/getitem_tuple.h"
#include "backend/optimizer/gpu/adam_weight_decay_fusion.h"
#include "backend/optimizer/gpu/adam_fusion.h"
#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
#include "backend/optimizer/gpu/replace_addn_fusion.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "predict/predict.h"
#include "common/utils.h"
@ -59,6 +64,11 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
auto pm = std::make_shared<opt::PassManager>();
pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>());
pm->AddPass(std::make_shared<opt::AdamFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceBNGradCast2Fusion>());
pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();
@ -167,6 +177,10 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList
Optimize(graph);
// Select kernel build info
SelectKernel(graph);
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
// Assign parameter keys.
AssignParamKey(graph);
#endif
// Convert kernel Graph to model
predictmodel::StepConvertGraph(graph);
// Start gpu kernel runtime
@ -204,6 +218,10 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
auto &kernel_graph = graphs_[graph_id];
// Load input data from user input
LoadInputData(kernel_graph, inputs);
// Initialize parameter server
if (!ps_init_) {
InitPSParamAndOptim(kernel_graph, inputs);
}
MS_EXCEPTION_IF_NULL(kernel_graph);
// Convert inputs to model
predictmodel::StepConvertWeight(inputs);

View File

@ -307,7 +307,7 @@ CNodePtr KernelGraph::NewCNode(const std::vector<AnfNodePtr> &inputs) {
if (inputs.size() == 1 || !feature_map_input_indexs.empty()) {
kernel_info->SetFeatureMapFlag(true);
}
if (AnfAlgo::IsRealCNodeKernel(cnode)) {
if (AnfAlgo::IsRealKernel(cnode)) {
AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode);
AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode);
}
@ -929,10 +929,15 @@ void KernelGraph::AddInternalOutput(const AnfNodePtr &front_node, const AnfNodeP
}
MS_LOG(INFO) << "Add internal node " << node->DebugString() << " with front node " << front_node->DebugString();
front_to_internal_outputs_map_[front_node] = node;
internal_outputs_to_front_map_[node] = front_node;
int output_idx = 0;
if (AnfAlgo::CheckPrimitiveType(front_node, prim::kPrimTupleGetItem)) {
output_idx = AnfAlgo::GetTupleGetItemOutIndex(front_node->cast<CNodePtr>());
}
internal_outputs_to_front_map_[node][output_idx] = front_node;
}
void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node) {
void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx,
int dst_output_idx) {
if (new_node == nullptr || node == nullptr) {
MS_LOG(INFO) << "New node or node is nullptr";
return;
@ -947,9 +952,30 @@ void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr
return;
}
MS_LOG(INFO) << "Replace internal node " << node->DebugString() << " To " << new_node->DebugString();
internal_outputs_to_front_map_[new_node] = iter->second;
front_to_internal_outputs_map_[iter->second] = new_node;
auto &front_nodes = iter->second;
// Move all front nodes to new node mapping
if (src_output_idx == -1) {
internal_outputs_to_front_map_[new_node] = front_nodes;
for (const auto &front_node_iter : front_nodes) {
front_to_internal_outputs_map_[front_node_iter.second] = new_node;
}
internal_outputs_to_front_map_.erase(iter);
return;
}
// Move specified front node to new node mapping
int index = SizeToInt(src_output_idx);
auto front_node_iter = front_nodes.find(index);
if (front_node_iter == front_nodes.end()) {
MS_LOG(INFO) << "The output " << src_output_idx << " of node " << node->DebugString() << " is not an internal node";
return;
}
auto front_node = front_node_iter->second;
internal_outputs_to_front_map_[new_node][dst_output_idx] = front_node;
front_to_internal_outputs_map_[front_node] = new_node;
front_nodes.erase(index);
if (front_nodes.empty()) {
internal_outputs_to_front_map_.erase(iter);
}
}
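// Summary of the two modes above: src_output_idx == -1 moves every recorded front node from node
// to new_node; otherwise only the mapping for output src_output_idx is moved and is re-registered
// under dst_output_idx of new_node.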
AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const {
@ -967,14 +993,6 @@ bool KernelGraph::IsInternalOutput(const AnfNodePtr &node) const {
return false;
}
AnfNodePtr KernelGraph::GetFrontNodeByInternalOutput(const AnfNodePtr &node) const {
auto iter = internal_outputs_to_front_map_.find(node);
if (iter != internal_outputs_to_front_map_.end()) {
return iter->second;
}
return nullptr;
}
void KernelGraph::AddFinalOutputKernel(const AnfNodePtr &node) {
if (node == nullptr) {
return;

View File

@ -148,10 +148,10 @@ class KernelGraph : public FuncGraph {
const std::map<std::string, std::pair<AnfNodePtr, int>> &summary_nodes() const { return summary_nodes_; }
void set_summary_nodes(const std::map<std::string, std::pair<AnfNodePtr, int>> &nodes) { summary_nodes_ = nodes; }
void AddInternalOutput(const AnfNodePtr &front_node, const AnfNodePtr &node);
void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node);
void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx = -1,
int dst_output_idx = -1);
AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const;
bool IsInternalOutput(const AnfNodePtr &node) const;
AnfNodePtr GetFrontNodeByInternalOutput(const AnfNodePtr &node) const;
void AddFinalOutputKernel(const AnfNodePtr &node);
bool IsFinalOutputKernel(const AnfNodePtr &node) const;
uint32_t current_epoch() const { return current_epoch_; }
@ -223,7 +223,7 @@ class KernelGraph : public FuncGraph {
CNodePtr end_goto_;
bool null_output_;
std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_;
std::unordered_map<AnfNodePtr, AnfNodePtr> internal_outputs_to_front_map_;
std::unordered_map<AnfNodePtr, std::unordered_map<int, AnfNodePtr>> internal_outputs_to_front_map_;
std::set<AnfNodePtr> final_output_kernels_;
uint32_t current_epoch_;
};

View File

@ -35,6 +35,11 @@
#include "ir/dtype.h"
#include "ir/anf.h"
#include "ir/func_graph_cloner.h"
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
#include "frontend/parallel/ps/worker.h"
#include "frontend/parallel/ps/common.h"
#include "frontend/parallel/ps/util.h"
#endif
namespace mindspore {
namespace session {
@ -295,7 +300,11 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const
MS_LOG(INFO) << "No corresponding internal output for output node";
return;
}
auto real_kernel = AnfAlgo::VisitKernel(ref_node, 0);
size_t output_idx = 0;
if (AnfAlgo::CheckPrimitiveType(out_node, prim::kPrimTupleGetItem)) {
output_idx = AnfAlgo::GetTupleGetItemOutIndex(out_node->cast<CNodePtr>());
}
auto real_kernel = AnfAlgo::VisitKernel(ref_node, output_idx);
auto ref_real_node = real_kernel.first;
auto ref_real_node_index = real_kernel.second;
if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node) &&
@ -320,6 +329,7 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const
builder.SetOutputsFormat({format});
d_kernel_info->set_select_kernel_build_info(builder.Build());
AnfAlgo::SetOutputAddr(address, 0, parameter.get());
AnfAlgo::SetOutputInferTypeAndShape({type}, {AnfAlgo::GetOutputInferShape(parameter, 0)}, parameter.get());
}
}
@ -973,6 +983,16 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std:
bool internal_output = true;
std::string kernel_target = GetCNodeTarget(front_real_kernel.first);
for (auto user : users) {
auto cnode = user.first->cast<CNodePtr>();
if (cnode == nullptr) {
internal_output = false;
break;
}
auto prim = cnode->input(kAnfPrimitiveIndex);
if (prim == nullptr || !prim->isa<ValueNode>()) {
internal_output = false;
break;
}
if (!AnfAlgo::IsRealKernel(user.first) || kernel_target != GetCNodeTarget(user.first)) {
internal_output = false;
break;
@ -1097,5 +1117,92 @@ KernelGraphPtr SessionBasic::NewKernelGraph() {
graphs_[graph_sum_++] = graph;
return graph;
}
AnfNodePtr SessionBasic::FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list) {
MS_EXCEPTION_IF_NULL(push_node);
for (auto &node : node_list) {
if (node != nullptr && node->isa<CNode>()) {
for (auto input : node->cast<CNodePtr>()->inputs()) {
if (push_node == AnfAlgo::VisitKernel(input, 0).first) {
if (AnfAlgo::GetCNodeName(node) != kPullOpName) {
MS_LOG(EXCEPTION) << "The edge between Push and Pull node is invalid.";
}
return node;
}
}
}
}
return nullptr;
}
#if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES)
void SessionBasic::AssignParamKey(const KernelGraphPtr &kernel_graph) {
if (!parallel::ps::Util::IsRoleOfWorker()) {
MS_LOG(INFO) << "Not parameter server mode.";
return;
}
MS_EXCEPTION_IF_NULL(kernel_graph);
std::vector<AnfNodePtr> node_list = TopoSort(kernel_graph->get_return());
for (auto &node : node_list) {
if (node != nullptr && node->isa<CNode>()) {
// Assign key for forward kernel EmbeddingLookup.
      // The key will be assigned to the embedding table and the Push kernel as well.
if (AnfAlgo::GetCNodeName(node) == kEmbeddingLookupOpName) {
size_t embedding_table_idx = 0;
auto embedding_table = AnfAlgo::GetInputNode(node->cast<CNodePtr>(), embedding_table_idx);
size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(embedding_table->fullname_with_scope());
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node);
} else if (AnfAlgo::GetCNodeName(node) == kPushOpName) {
auto pull_node = FindPullNode(node, node_list);
if (!pull_node) {
MS_LOG(EXCEPTION) << "Assigning parameter key failed: can't find Pull node of the Push node.";
}
// Second input of Pull node is the trainable parameter.
size_t parameter_index = 1;
auto parameter_node = AnfAlgo::GetInputNode(pull_node->cast<CNodePtr>(), parameter_index);
size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(parameter_node->fullname_with_scope());
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node);
AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), pull_node);
std::string optimizer_name = AnfAlgo::GetNodeAttr<std::string>(node, kAttrOptimizerType);
parallel::ps::Worker<float>::GetInstance().SetKeyOptimId(key, optimizer_name);
}
}
}
}
void SessionBasic::InitPSParamAndOptim(const KernelGraphPtr &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs_const) {
if (!parallel::ps::Util::IsRoleOfWorker()) {
return;
}
std::vector<tensor::TensorPtr> inputs(inputs_const);
size_t input_ctrl_size = 1;
MS_EXCEPTION_IF_NULL(kernel_graph);
if (kernel_graph->input_ctrl_tensors()) {
input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs);
}
auto input_nodes = kernel_graph->inputs();
if ((inputs.size() + input_ctrl_size) - 1 != input_nodes.size()) {
MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size()
<< ", input_ctrl_size:" << input_ctrl_size;
}
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
for (size_t i = 0; i < inputs.size(); ++i) {
auto tensor = inputs[i];
MS_EXCEPTION_IF_NULL(tensor);
auto input_node = input_nodes[i];
MS_EXCEPTION_IF_NULL(input_node);
if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) {
auto pk_node = input_node->cast<ParameterPtr>();
mindspore::parallel::ps::Worker<float>::GetInstance().InitPSParamAndOptim(
pk_node->fullname_with_scope(), tensor->data_c(), LongToSize(tensor->data().nbytes()));
}
}
ps_init_ = true;
}
#endif
} // namespace session
} // namespace mindspore
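
For reference, a minimal standalone sketch of the consumer lookup that FindPullNode performs, using a hypothetical Node type in place of the MindSpore IR (this is not the real AnfNode API): scan every candidate's inputs for the given producer and expect the consumer to be a Pull op.

#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Hypothetical stand-in for an IR node: a name plus its input edges.
struct Node {
  std::string name;
  std::vector<std::shared_ptr<Node>> inputs;
};
using NodePtr = std::shared_ptr<Node>;

// Return the node that consumes `producer` as one of its inputs, provided it is a Pull op.
NodePtr FindConsumer(const NodePtr &producer, const std::vector<NodePtr> &nodes) {
  for (const auto &node : nodes) {
    if (node == nullptr) {
      continue;
    }
    for (const auto &input : node->inputs) {
      if (input == producer) {
        if (node->name != "Pull") {
          return nullptr;  // the real FindPullNode raises an exception here
        }
        return node;
      }
    }
  }
  return nullptr;
}

int main() {
  auto push = std::make_shared<Node>(Node{"Push", {}});
  auto pull = std::make_shared<Node>(Node{"Pull", {push}});
  auto found = FindConsumer(push, {push, pull});
  std::cout << (found ? found->name : std::string("no Pull consumer")) << std::endl;
  return 0;
}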

View File

@ -51,7 +51,7 @@ using OpRunInfoPtr = std::shared_ptr<OpRunInfo>;
class SessionBasic {
public:
SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0) {
SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0), ps_init_(false) {
#ifdef ENABLE_DEBUGGER
debugger_ = nullptr;
#endif
@ -104,6 +104,8 @@ class SessionBasic {
virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; }
virtual void SetActive(GraphId, GraphId) {}
virtual void GetSummaryNodes(KernelGraph *graph);
void AssignParamKey(const KernelGraphPtr &kernel_graph);
void InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, const std::vector<tensor::TensorPtr> &inputs_const);
#ifdef ENABLE_DEBUGGER
// set debugger
@ -140,6 +142,7 @@ class SessionBasic {
AnfNodePtr CreateNewParameterFromCNode(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph);
void AddParameterToGraphInputs(const std::vector<AnfNodePtr> &parameters, KernelGraph *graph);
void InitInternalOutputParameter(const AnfNodePtr &out_node, const AnfNodePtr &parameter);
AnfNodePtr FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list);
std::unordered_map<GraphId, std::shared_ptr<KernelGraph>> graphs_;
std::unordered_map<GraphInfo, std::shared_ptr<KernelGraph>> run_op_graphs_;
@ -148,6 +151,7 @@ class SessionBasic {
CallBackFunc summary_callback_;
static GraphId graph_sum_;
uint32_t device_id_;
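  // Set once InitPSParamAndOptim has pushed the graph's weights to the parameter server.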
bool ps_init_;
#ifdef ENABLE_DEBUGGER
std::shared_ptr<Debugger> debugger_;
#endif

View File

@ -23,9 +23,7 @@ if (ENABLE_D)
list(APPEND _DEBUG_SRC_LIST
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
)
if (ENABLE_DATA_DUMP)
list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc")
endif(ENABLE_DATA_DUMP)
endif()
if (ENABLE_DUMP_E2E)

View File

@ -120,6 +120,10 @@ std::optional<std::string> Common::GetConfigFile(const std::string &env) {
    MS_LOG(ERROR) << dump_config_file << " does not exist.";
return {};
}
auto suffix = dump_config_file.substr(dump_config_file.find_last_of('.') + 1);
if (suffix != "json") {
MS_LOG(EXCEPTION) << "[DataDump] dump config file suffix only support json! But got:." << suffix;
}
return dump_config_file;
}
} // namespace mindspore
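
A minimal sketch of the suffix check added above, written as a hypothetical standalone helper (HasJsonSuffix is not part of the MindSpore code base). Unlike the inline version, it rejects a path that contains no '.' at all instead of comparing the whole path against "json".

#include <string>

bool HasJsonSuffix(const std::string &path) {
  auto pos = path.find_last_of('.');
  if (pos == std::string::npos) {
    return false;  // no extension at all
  }
  return path.substr(pos + 1) == "json";  // e.g. HasJsonSuffix("data_dump.json") == true
}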

View File

@ -29,13 +29,13 @@ void DataDumpParser::ResetParam() {
net_name_.clear();
dump_mode_ = 0;
dump_step_ = 0;
kernel_set_.clear();
kernel_map_.clear();
}
bool DataDumpParser::DumpEnabled() const {
auto enable_dump = std::getenv(kEnableDataDump);
if (!enable_dump) {
MS_LOG(WARNING) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP";
MS_LOG(INFO) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP";
return false;
}
@ -60,9 +60,18 @@ std::optional<std::string> DataDumpParser::GetDumpPath() const {
return {};
}
std::string dump_path_str(dump_path);
if (!std::all_of(dump_path_str.begin(), dump_path_str.end(), ::isalpha)) {
MS_LOG(EXCEPTION) << "[DataDump] dump path only support alphas, but got:" << dump_path_str;
}
return dump_path_str;
}
std::string GetIfstreamString(const std::ifstream &ifstream) {
std::stringstream buffer;
buffer << ifstream.rdbuf();
return buffer.str();
}
void DataDumpParser::ParseDumpConfig() {
std::lock_guard<std::mutex> guard(lock_);
MS_LOG(INFO) << "[DataDump] parse start";
@ -84,7 +93,12 @@ void DataDumpParser::ParseDumpConfig() {
}
nlohmann::json j;
try {
json_file >> j;
} catch (nlohmann::json::parse_error &e) {
MS_LOG(ERROR) << "[DataDump] json contents:" << GetIfstreamString(json_file);
MS_LOG(EXCEPTION) << "[DataDump] parse json failed, error:" << e.what();
}
if (j.find("DumpSettings") == j.end()) {
MS_LOG(EXCEPTION) << "[DataDump] DumpSettings is not exist.";
}
@ -111,8 +125,8 @@ bool DataDumpParser::NeedDump(const std::string &op_full_name) const {
if (dump_mode_ == 0) {
return true;
}
auto iter = kernel_set_.find(op_full_name);
return iter != kernel_set_.end();
auto iter = kernel_map_.find(op_full_name);
return iter != kernel_map_.end();
}
bool DataDumpParser::IsConfigExist(const nlohmann::json &dump_settings) const {
@ -145,8 +159,25 @@ bool DataDumpParser::ParseDumpSetting(const nlohmann::json &dump_settings) {
auto kernel_str = kernel.dump();
kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end());
MS_LOG(INFO) << "[DataDump] Need dump kernel:" << kernel_str;
kernel_set_.insert(kernel_str);
kernel_map_.insert({kernel_str, 0});
}
return true;
}
void DataDumpParser::MatchKernel(const std::string &kernel_name) {
auto iter = kernel_map_.find(kernel_name);
if (iter == kernel_map_.end()) {
return;
}
iter->second = iter->second + 1;
MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second;
}
void DataDumpParser::PrintUnusedKernel() {
for (const auto &iter : kernel_map_) {
if (iter.second == 0) {
MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
}
}
}
} // namespace mindspore
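
A minimal sketch of how a dump configuration might be consumed and how the new kernel_map_ tracks which configured kernels were actually matched at run time. Only the "DumpSettings" key is confirmed by the parsing code above; the "kernels" field name and the example values are assumptions, not taken from a real MindSpore config.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include "nlohmann/json.hpp"

int main() {
  // Assumed shape of the dump settings block.
  nlohmann::json j = nlohmann::json::parse(R"({
    "DumpSettings": {
      "kernels": ["Default/Conv2D-op1", "Default/ReLU-op2"]
    }
  })");

  // Like ParseDumpSetting: record each configured kernel with a zero match count.
  std::map<std::string, uint32_t> kernel_map;
  for (const auto &kernel : j["DumpSettings"]["kernels"]) {
    kernel_map.insert({kernel.get<std::string>(), 0});
  }

  // Like MatchKernel: bump the count when a kernel with that full name is dumped.
  auto iter = kernel_map.find("Default/Conv2D-op1");
  if (iter != kernel_map.end()) {
    ++iter->second;
  }

  // Like PrintUnusedKernel: report config entries that never matched a real kernel.
  for (const auto &entry : kernel_map) {
    if (entry.second == 0) {
      std::cout << "Unused kernel in json: " << entry.first << std::endl;
    }
  }
  return 0;
}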

View File

@ -18,7 +18,7 @@
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_
#include <string>
#include <set>
#include <map>
#include <mutex>
#include <optional>
#include "nlohmann/json.hpp"
@ -39,7 +39,8 @@ class DataDumpParser {
const std::string &net_name() const { return net_name_; }
uint32_t dump_mode() const { return dump_mode_; }
uint32_t dump_step() const { return dump_step_; }
const std::set<std::string> &kernel_set() const { return kernel_set_; }
void MatchKernel(const std::string &kernel_name);
void PrintUnusedKernel();
private:
DataDumpParser() = default;
@ -55,7 +56,7 @@ class DataDumpParser {
std::string net_name_;
uint32_t dump_mode_{0};
uint32_t dump_step_{0};
std::set<std::string> kernel_set_;
std::map<std::string, uint32_t> kernel_map_;
};
} // namespace mindspore
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_

View File

@ -124,6 +124,8 @@ void ProtoExporter::SetNodeOutputType(const TypePtr &type, const BaseShapePtr &s
// Do Nothing
} else if (type->isa<UndeterminedType>()) {
// Do Nothing
} else if (type->isa<SparseTensorType>()) {
// Do Nothing
} else if (type->isa<Tuple>()) {
TuplePtr tuple_type = dyn_cast<Tuple>(type);
type_proto->set_data_type(irpb::DT_TUPLE);
