!42061 Support DynamicBroadcastGradientArgs op on CPU

Merge pull request !42061 from hanhuifeng/dyn_broad_grad_cpu
This commit is contained in:
i-robot 2022-09-16 09:49:06 +00:00 committed by Gitee
commit a9d308be5a
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
2 changed files with 298 additions and 0 deletions

View File

@ -0,0 +1,216 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <utility>
#include "mindspore/core/ops/dynamic_broadcast_gradient_args.h"
#include "plugin/device/cpu/kernel/dynamic_broadcast_grad_args_cpu_kernel.h"
#include "plugin/device/cpu/hal/device/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kDynamicBroadcastGradientArgsInputsNum = 2;
constexpr size_t kDynamicBroadcastGradientArgsOutputsNum = 2;
constexpr char kKernelName[] = "DynamicBroadcastGradientArgs";
using KernelRunFunc = DynamicBroadcastGradientArgsCpuKernelMod::KernelRunFunc;
} // namespace
/**
 * For every input whose size is 1 on reversed axis j, append the forward-order
 * axis index (max_rank - 1 - j) to that input's reduce-index list.
 *
 * @param grad_reduce_idx Per-input lists of axes to reduce over; must not be null.
 * @param cur_one cur_one[i] is true iff input i has size 1 on the current reversed axis.
 * @param none_one True iff no input has size 1 on the current reversed axis.
 * @param max_rank Rank of the broadcast output shape.
 * @param j Current axis index, counted from the end of the shapes.
 */
template <typename T>
void AddGradReduceIdx(std::vector<std::vector<T>> *grad_reduce_idx, const std::vector<bool> &cur_one, bool none_one,
                      const size_t max_rank, size_t j) {
  MS_EXCEPTION_IF_NULL(grad_reduce_idx);
  // NOTE: cur_one is now taken by const reference; the original copied the
  // std::vector<bool> on every axis iteration of the caller's loop.
  for (size_t i = 0; i < kDynamicBroadcastGradientArgsInputsNum; i++) {
    if (cur_one[i] && !none_one) {
      (void)(*grad_reduce_idx)[i].emplace_back(SizeToLong(max_rank - 1 - j));
    }
  }
}
/**
 * Computes, for each of the two inputs, the axes (in forward order indices) that
 * its gradient must be reduced over to be summed back to the input's own shape.
 *
 * @param revers_shapes The two input shapes, reversed and padded with 1s up to max_rank.
 * @param max_rank Rank of the broadcast output shape.
 * @return Per-input vectors of reduce-axis indices.
 * @throws Logs EXCEPTION when the shapes are not broadcast-compatible.
 */
template <typename T>
std::vector<std::vector<T>> GetGradIndex(const std::vector<std::vector<T>> &revers_shapes, const size_t max_rank) {
  std::vector<std::vector<T>> grad_reduce_index(kDynamicBroadcastGradientArgsInputsNum);
  std::vector<bool> cur_one(kDynamicBroadcastGradientArgsInputsNum, false);
  for (size_t j = 0; j < max_rank; j++) {
    int out_dim = -1;
    bool out_dim_set = false;
    bool none_one = true;
    for (size_t i = 0; i < kDynamicBroadcastGradientArgsInputsNum; i++) {
      if (revers_shapes[i][j] == 1) {
        cur_one[i] = true;
        none_one = false;
      } else {
        cur_one[i] = false;
        // All non-1 sizes on one axis must agree, otherwise broadcasting is impossible.
        if (!out_dim_set || revers_shapes[i][j] == static_cast<T>(out_dim)) {
          out_dim = static_cast<int>(revers_shapes[i][j]);
          out_dim_set = true;
        } else {
          MS_LOG(EXCEPTION) << "Can not broadcast inputs[0] and inputs[1].";
        }
      }
    }
    if (!out_dim_set) {
      // Every input has size 1 on this axis: both gradients reduce over it.
      for (size_t i = 0; i < kDynamicBroadcastGradientArgsInputsNum; i++) {
        (void)grad_reduce_index[i].emplace_back(max_rank - 1 - j);
      }
      continue;
    }
    // The original code branched on whether cur_one equaled the previous axis's
    // pattern, but both branches made the identical call, leaving the
    // pre_one/set_one bookkeeping dead. Collapsed into one unconditional call.
    AddGradReduceIdx(&grad_reduce_index, cur_one, none_one, max_rank, j);
  }
  return grad_reduce_index;
}
/**
 * Writes the accumulated reduce indices into the output buffer in reversed
 * (i.e. forward axis) order, casting each index to the output dtype S.
 *
 * @param addr Destination buffer; must hold at least grad_reduce_idx.size() elements.
 * @param grad_reduce_idx Reduce-axis indices accumulated from the last axis backwards.
 * @param input_num Unused; kept only for interface compatibility with existing callers.
 * @return Number of indices written (the dynamic output size).
 */
template <typename T, typename S>
size_t SetOuputValue(S *addr, const std::vector<T> &grad_reduce_idx, [[maybe_unused]] size_t input_num) {
  const size_t index_num = grad_reduce_idx.size();
  // Indices were collected back-to-front; emit them front-to-back.
  for (size_t i = 0; i < index_num; i++) {
    addr[i] = static_cast<S>(grad_reduce_idx[index_num - 1 - i]);
  }
  return index_num;
}
/**
 * Reads the two input shape vectors, computes the broadcast-gradient reduce
 * axes for each input, and writes them (dtype S) into the two outputs.
 * Records the dynamic output lengths in r0_size_/r1_size_ for GetOutputs().
 */
template <typename T, typename S>
bool DynamicBroadcastGradientArgsCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
                                                            const std::vector<kernel::AddressPtr> &,
                                                            const std::vector<kernel::AddressPtr> &outputs) {
  const T *s0_addr = reinterpret_cast<T *>(inputs[0]->addr);
  const T *s1_addr = reinterpret_cast<T *>(inputs[1]->addr);
  S *r0_addr = reinterpret_cast<S *>(outputs[0]->addr);
  S *r1_addr = reinterpret_cast<S *>(outputs[1]->addr);
  // Each input buffer holds one shape vector; its rank is the element count.
  std::vector<size_t> ranks = {input_size_list_[0] / sizeof(T), input_size_list_[1] / sizeof(T)};
  std::vector<std::vector<T>> grad_reduce_idx(kDynamicBroadcastGradientArgsInputsNum);
  size_t max_rank = ranks[0] > ranks[1] ? ranks[0] : ranks[1];
  size_t min_rank = ranks[0] < ranks[1] ? ranks[0] : ranks[1];
  // Shapes are identical only when the ranks match AND every dimension matches.
  // The original compared just the first min_rank dims, so shapes of different
  // rank with an equal prefix (e.g. [3, 3] vs [3]) were wrongly treated as
  // equal and yielded empty reduce axes.
  bool all_equal = ranks[0] == ranks[1];
  for (size_t i = 0; all_equal && i < min_rank; i++) {
    if (s0_addr[i] != s1_addr[i]) {
      all_equal = false;
    }
  }
  if (!all_equal) {
    // Reverse both shapes and right-pad (i.e. pad the reversed form) with 1s so
    // they can be compared axis-by-axis from the trailing dimension.
    std::vector<std::vector<T>> reverse_shapes(kDynamicBroadcastGradientArgsInputsNum);
    for (size_t j = 0; j < ranks[0]; j++) {
      reverse_shapes[0].push_back(s0_addr[ranks[0] - j - 1]);
    }
    if (reverse_shapes[0].size() < max_rank) {
      reverse_shapes[0].resize(max_rank, 1);
    }
    for (size_t j = 0; j < ranks[1]; j++) {
      reverse_shapes[1].push_back(s1_addr[ranks[1] - j - 1]);
    }
    if (reverse_shapes[1].size() < max_rank) {
      reverse_shapes[1].resize(max_rank, 1);
    }
    grad_reduce_idx = GetGradIndex(reverse_shapes, max_rank);
  }
  // When the shapes are fully equal, both index lists stay empty and the
  // outputs get dynamic size 0.
  r0_size_ = SetOuputValue(r0_addr, grad_reduce_idx[0], input_size_list_[0] / sizeof(T));
  r1_size_ = SetOuputValue(r1_addr, grad_reduce_idx[1], input_size_list_[1] / sizeof(T));
  return true;
}
bool DynamicBroadcastGradientArgsCpuKernelMod::Init(const BaseOperatorPtr &base_operator,
const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
auto kernel_ptr = std::dynamic_pointer_cast<ops::DynamicBroadcastGradientArgs>(base_operator);
if (!kernel_ptr) {
MS_LOG(ERROR) << "cast DynamicBroadcastGradientArgs ops failed!";
return false;
}
kernel_name_ = kernel_ptr->name();
if (inputs.size() != kDynamicBroadcastGradientArgsInputsNum ||
outputs.size() != kDynamicBroadcastGradientArgsOutputsNum) {
MS_LOG(ERROR) << kernel_name_ << ": input and output size should be " << kDynamicBroadcastGradientArgsInputsNum
<< " and " << kDynamicBroadcastGradientArgsOutputsNum << ", but get " << inputs.size() << " and "
<< outputs.size();
return false;
}
if (!MatchKernelFunc(base_operator, inputs, outputs)) {
return false;
}
return true;
}
/**
 * Delegates size bookkeeping to the base class, then caches the output tensors
 * so GetOutputs() can patch in the value-dependent shapes after launch.
 */
int DynamicBroadcastGradientArgsCpuKernelMod::Resize(const BaseOperatorPtr &base_operator,
                                                     const std::vector<KernelTensorPtr> &inputs,
                                                     const std::vector<KernelTensorPtr> &outputs,
                                                     const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) {
  const int base_ret = KernelMod::Resize(base_operator, inputs, outputs, inputsOnHost);
  if (base_ret == static_cast<int>(KRET_RESIZE_FAILED)) {
    MS_LOG(WARNING) << kernel_name_ << " reinit failed.";
    return static_cast<int>(KRET_RESIZE_FAILED);
  }
  outputs_ = outputs;
  // Output shapes depend on the input values, so they must be fetched post-launch.
  is_need_retrieve_output_shape_ = true;
  return static_cast<int>(KRET_OK);
}
// Registration table of supported dtype combinations. Both inputs share one
// dtype T (the shape element type) and both outputs share one dtype S (the
// reduce-index type); each entry binds the matching LaunchKernel<T, S>.
const std::vector<std::pair<KernelAttr, KernelRunFunc>> &DynamicBroadcastGradientArgsCpuKernelMod::GetFuncList() const {
  static const std::vector<std::pair<KernelAttr, KernelRunFunc>> func_list = {
    // int32 shapes -> int32 reduce indices
    {KernelAttr()
       .AddInputAttr(kNumberTypeInt32)
       .AddInputAttr(kNumberTypeInt32)
       .AddOutputAttr(kNumberTypeInt32)
       .AddOutputAttr(kNumberTypeInt32),
     &DynamicBroadcastGradientArgsCpuKernelMod::LaunchKernel<int32_t, int32_t>},
    // int32 shapes -> int64 reduce indices
    {KernelAttr()
       .AddInputAttr(kNumberTypeInt32)
       .AddInputAttr(kNumberTypeInt32)
       .AddOutputAttr(kNumberTypeInt64)
       .AddOutputAttr(kNumberTypeInt64),
     &DynamicBroadcastGradientArgsCpuKernelMod::LaunchKernel<int32_t, int64_t>},
    // int64 shapes -> int32 reduce indices
    {KernelAttr()
       .AddInputAttr(kNumberTypeInt64)
       .AddInputAttr(kNumberTypeInt64)
       .AddOutputAttr(kNumberTypeInt32)
       .AddOutputAttr(kNumberTypeInt32),
     &DynamicBroadcastGradientArgsCpuKernelMod::LaunchKernel<int64_t, int32_t>},
    // int64 shapes -> int64 reduce indices
    {KernelAttr()
       .AddInputAttr(kNumberTypeInt64)
       .AddInputAttr(kNumberTypeInt64)
       .AddOutputAttr(kNumberTypeInt64)
       .AddOutputAttr(kNumberTypeInt64),
     &DynamicBroadcastGradientArgsCpuKernelMod::LaunchKernel<int64_t, int64_t>},
  };
  return func_list;
}
MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, DynamicBroadcastGradientArgs, DynamicBroadcastGradientArgsCpuKernelMod);
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,82 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DYN_BROADCAST_GRAD_ARGS_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DYN_BROADCAST_GRAD_ARGS_KERNEL_H_
#include <vector>
#include <map>
#include <memory>
#include <algorithm>
#include <utility>
#include "plugin/device/cpu/kernel/cpu_kernel.h"
#include "plugin/factory/ms_factory.h"
namespace mindspore {
namespace kernel {
// CPU kernel for DynamicBroadcastGradientArgs: given the shapes of two
// broadcastable inputs (as value tensors), computes for each input the axes its
// gradient must be reduced over to be summed back to that input's shape.
// The output shapes depend on the input *values*, so the framework retrieves
// them after launch via GetOutputs().
class DynamicBroadcastGradientArgsCpuKernelMod : public NativeCpuKernelMod,
                                                 public MatchKernelHelper<DynamicBroadcastGradientArgsCpuKernelMod> {
 public:
  DynamicBroadcastGradientArgsCpuKernelMod() : r0_size_(0), r1_size_(0) { ResetResource(); }
  ~DynamicBroadcastGradientArgsCpuKernelMod() override = default;
  // Validates arity and binds the dtype-specific launch function.
  bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
            const std::vector<KernelTensorPtr> &outputs) override;
  // Delegates to KernelMod::Resize and caches the outputs for GetOutputs().
  int Resize(
    const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
    const std::vector<KernelTensorPtr> &outputs,
    const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost = std::map<uint32_t, tensor::TensorPtr>()) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override {
    // kernel_func_ is bound by MatchKernelFunc during Init.
    return kernel_func_(this, inputs, workspace, outputs);
  }
  const std::vector<std::pair<KernelAttr, KernelRunFunc>> &GetFuncList() const override;
  std::vector<KernelAttr> GetOpSupport() override { return OpSupport(); }
  void ResetResource() noexcept {
    input_size_list_.clear();
    output_size_list_.clear();
    workspace_size_list_.clear();
  }
  // Called after launch: patches the real (value-dependent) output lengths
  // computed by LaunchKernel into the cached output tensors.
  std::vector<KernelTensorPtr> GetOutputs() override {
    ShapeVector r0_shape{SizeToLong(r0_size_)};
    ShapeVector r1_shape{SizeToLong(r1_size_)};
    outputs_[0]->SetShapeVector(r0_shape);
    outputs_[1]->SetShapeVector(r1_shape);
    return outputs_;
  }
 protected:
  template <typename T, typename S>
  bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
                    const std::vector<kernel::AddressPtr> &outputs);
  // Number of reduce axes written for output 0 / output 1 by the last launch.
  size_t r0_size_;
  size_t r1_size_;
  // Output tensors cached in Resize for GetOutputs to update.
  std::vector<KernelTensorPtr> outputs_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DYN_BROADCAST_GRAD_ARGS_KERNEL_H_