!14993 [GraphKernel][Ascend]Increase the rules for enabling atomic addition

From: @hanhuifeng2020
Reviewed-by: @gaoxiong1,@dylangeng
Signed-off-by: @dylangeng
This commit is contained in:
mindspore-ci-bot 2021-04-13 14:54:26 +08:00 committed by Gitee
commit 3c61a7c778
1 changed files with 48 additions and 7 deletions

View File

@ -37,13 +37,18 @@
namespace mindspore {
namespace opt {
namespace {
std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node) {
std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node, bool is_ascend = false) {
if (!IsPrimitiveCNode(node, prim::kPrimReduceSum)) {
MS_LOG(EXCEPTION) << "Only process for reduce sum!";
}
auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex);
auto src_shape_vec = GetShape(input);
ShapeVector src_shape_vec;
if (is_ascend) {
src_shape_vec = GetDeviceShape(input);
} else {
src_shape_vec = GetShape(input);
}
auto axis_vec = GetReduceAxis(node);
if (axis_vec.empty()) {
for (size_t i = 0; i < src_shape_vec.size(); ++i) {
@ -140,7 +145,8 @@ bool AtomicAddChecker::CanActivateAtomicAdd(const AnfNodePtr &anf_node) {
// which means it should be in the output list.
// 2. The reduce axis and reduce number should meet condition:
// (GPU) all-reduce or reduce-x when fuse number is greater than or equal to 1024, or reduce-y.
// (Ascend) all-reduce or non-reduce axes with dimension 1
// (Ascend) The first axis of the input data whose dimension is not 1 is a reduce axis, or the
// non-reduce axes in front of the first reduce axis are too small to make full use of multi-core.
// 3. No other ReduceSum as output ReduceSum's predecessors (reduce compile limitation).
// Rule 1.
@ -180,11 +186,46 @@ bool AtomicAddCheckerGPU::SuitableForAtomicAdd(const AnfNodePtr &node) {
}
// Decide whether the given ReduceSum node should use atomic addition on Ascend.
//
// Returns true only when the first data input is fp32 on device AND one of:
//   1. the first axis of the input device shape whose dimension is not 1 is a reduce axis, or
//   2. the product of the dimensions in front of the first reduce axis is smaller than the
//      number of processor cores, i.e. the leading non-reduce axes cannot keep every core busy.
// Returns false otherwise.
//
// `node` is dereferenced as a CNode without a null check here; GetUniqReduceAxes raises an
// exception when the node is not a ReduceSum primitive.
bool AtomicAddCheckerAscend::SuitableForAtomicAdd(const AnfNodePtr &node) {
  auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex);

  // Atomic addition is enabled only when the data type is fp32.
  if (AnfAlgo::GetOutputDeviceDataType(input, 0) != kNumberTypeFloat32) {
    return false;
  }

  const auto src_shape_vec = GetDeviceShape(input);
  const std::set<int64_t> reduce_axis_set = GetUniqReduceAxes(node, true);

  // Rule 1: skip leading axes of dimension 1; if the first remaining axis is a reduce axis,
  // enable atomic addition.
  for (size_t i = 0; i < src_shape_vec.size(); ++i) {
    if (src_shape_vec[i] == 1) {
      continue;
    }
    if (reduce_axis_set.count(static_cast<int64_t>(i)) != 0) {
      return true;
    }
    break;
  }

  // Rule 2: if the non-reduce axes in front of the first reduce axis cannot make full use of
  // multi-core, enable atomic addition.
  // NOTE(review): the core count is hard-coded to 32 — confirm it matches the target device.
  constexpr int64_t processor_core_num = 32;
  // 64-bit accumulator plus the early exit below keep the product from narrowing or overflowing,
  // which could otherwise make a huge leading extent look smaller than the core count.
  // Assumes shape dimensions are positive — TODO confirm for dynamic/empty shapes.
  int64_t start_non_reduce_dim = 1;
  for (size_t i = 0; i < src_shape_vec.size(); ++i) {
    if (reduce_axis_set.count(static_cast<int64_t>(i)) != 0) {
      break;
    }
    start_non_reduce_dim *= static_cast<int64_t>(src_shape_vec[i]);
    if (start_non_reduce_dim >= processor_core_num) {
      // The leading non-reduce axes already provide enough parallelism.
      return false;
    }
  }
  return true;
}
void AtomicCleanInsertter::CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input) {