!30589 [DynamicShape][GPU] Add dynamic shape support to some ops for the new network

Merge pull request !30589 from hanhuifeng/dcn_dyn_ops_2_new
This commit is contained in:
i-robot 2022-03-01 06:42:59 +00:00 committed by Gitee
commit 9a5268f2be
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
14 changed files with 477 additions and 133 deletions

@ -56,5 +56,22 @@ MS_REG_GPU_KERNEL_ONE(ReduceProd, KernelAttr().AddInputAttr(kNumberTypeFloat32).
ArrayReduceGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(ReduceProd, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
ArrayReduceGpuKernelMod, double)
// dynamic
MS_REG_GPU_KERNEL_TWO(
ReduceSum,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
ArrayReduceGpuKernelMod, double, int64_t)
MS_REG_GPU_KERNEL_TWO(
ReduceSum,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
ArrayReduceGpuKernelMod, float, int64_t)
MS_REG_GPU_KERNEL_TWO(
ReduceSum,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
ArrayReduceGpuKernelMod, half, int64_t)
MS_REG_GPU_KERNEL_TWO(
ReduceSum, KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
ArrayReduceGpuKernelMod, bool, int64_t)
} // namespace kernel
} // namespace mindspore
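
The registrations above give ReduceSum a second int64 input that carries the reduction axis as a tensor instead of a compile-time attribute. As a rough front-end illustration, a cell along the lines of the ReduceSumNet test added at the end of this PR would dispatch to these kernels; the class name here is illustrative and, as in the tests, the graph is expected to run through the dynamic-shape sink path:

import mindspore.nn as nn
import mindspore.ops as ops

class DynAxisReduceSumNet(nn.Cell):
    """Sketch: ReduceSum whose axis arrives as an int64 tensor input."""
    def __init__(self):
        super(DynAxisReduceSumNet, self).__init__()
        self.reduce = ops.ReduceSum()

    def construct(self, x, axis):
        # 'axis' is a Tensor, so on GPU the kernel is selected through the
        # MS_REG_GPU_KERNEL_TWO(ReduceSum, ...) registrations above.
        return self.reduce(x, axis)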

@ -32,7 +32,7 @@ const std::map<std::string, cudnnReduceTensorOp_t> kReduceTypeMap = {
{"ReduceAny", CUDNN_REDUCE_TENSOR_MAX}, {"ReduceAll", CUDNN_REDUCE_TENSOR_MUL},
{"ReduceProd", CUDNN_REDUCE_TENSOR_MUL},
};
template <typename T>
template <typename T, typename S = int64_t>
class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
public:
ArrayReduceGpuKernelMod() { ResetResource(); }
@ -43,6 +43,9 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
if (is_null_input_) {
return true;
}
if (is_dynamic_axis_ && !get_dynamic_axis_value_) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', fail to get value of the axis when axis is dynamic!";
}
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
T *workspace_addr = GetPossiblyNullDeviceAddress<T>(workspace, 0);
@ -87,8 +90,13 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
}
data_type_ = GetCudnnDataType(type_name);
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
constexpr size_t kDynamicAxisInputNum = 2;
if (input_num != 1 && input_num != kDynamicAxisInputNum) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1 or " << kDynamicAxisInputNum
<< ", but got " << input_num;
}
if (input_num == kDynamicAxisInputNum) {
is_dynamic_axis_ = true;
}
size_t output_num = common::AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
@ -96,29 +104,28 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
}
int input_dim_length = SizeToInt(AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0).size());
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
if (prim->GetAttr("axis")->isa<ValueTuple>() || prim->GetAttr("axis")->isa<ValueList>()) {
std::vector<int> attr_axis;
std::vector<int64_t> attr_axis_me = GetAttr<std::vector<int64_t>>(kernel_node, "axis");
(void)std::transform(attr_axis_me.begin(), attr_axis_me.end(), std::back_inserter(attr_axis),
[](const int64_t &value) { return static_cast<int>(value); });
if (attr_axis.empty()) {
axis_.push_back(-1);
} else {
for (auto axis : attr_axis) {
axis < 0 ? axis_.push_back(axis + input_dim_length) : axis_.push_back(axis);
}
std::sort(axis_.begin(), axis_.end());
auto multiple_pos = std::unique(axis_.begin(), axis_.end());
axis_.erase(multiple_pos, axis_.end());
std::vector<int64_t> attr_axis;
if (is_dynamic_axis_) {
get_dynamic_axis_value_ = GetDynamicAttrIntValue(kernel_node, kAxisIndex_, &attr_axis);
if (!get_dynamic_axis_value_) {
InitSizeLists();
return true;
}
} else if (prim->GetAttr("axis")->isa<Int64Imm>()) {
int axis = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis"));
axis < 0 ? axis_.push_back(axis + input_dim_length) : axis_.push_back(axis);
dynamic_axis_size_ = attr_axis.size();
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute 'axis' type is invalid.";
attr_axis = GetAxisValue(kernel_node);
}
if (attr_axis.empty()) {
axis_.push_back(-1);
} else {
for (auto axis : attr_axis) {
axis < 0 ? axis_.push_back(axis + input_dim_length) : axis_.push_back(axis);
}
std::sort(axis_.begin(), axis_.end());
auto multiple_pos = std::unique(axis_.begin(), axis_.end());
axis_.erase(multiple_pos, axis_.end());
}
keep_dims_ = GetAttr<bool>(kernel_node, "keep_dims");
auto inputA_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
@ -156,6 +163,9 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
dynamic_axis_size_ = 0;
is_dynamic_axis_ = false;
get_dynamic_axis_value_ = false;
}
void DestroyResource() noexcept override {
@ -182,6 +192,10 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
"cudnnGetTensorSizeInBytes failed.");
input_size_list_.push_back(input_size_);
if (is_dynamic_axis_) {
input_size_list_.push_back(dynamic_axis_size_ * sizeof(S));
}
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(outputC_descriptor_, &output_size_),
"cudnnGetTensorSizeInBytes failed.");
output_size_list_.push_back(output_size_);
@ -279,6 +293,21 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
return;
}
std::vector<int64_t> GetAxisValue(const CNodePtr &kernel_node) {
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
std::vector<int64_t> attr_axis_me;
if (prim->GetAttr("axis")->isa<ValueTuple>() || prim->GetAttr("axis")->isa<ValueList>()) {
attr_axis_me = GetAttr<std::vector<int64_t>>(kernel_node, "axis");
} else if (prim->GetAttr("axis")->isa<Int64Imm>()) {
auto axis = GetAttr<int64_t>(kernel_node, "axis");
attr_axis_me.push_back(axis);
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute 'axis' type is invalid.";
}
return attr_axis_me;
}
cudnnHandle_t cudnn_handle_;
cudnnReduceTensorOp_t reduce_tensor_op_;
cudnnDataType_t data_type_;
@ -295,7 +324,10 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
size_t input_size_;
size_t output_size_;
size_t workspace_size_;
std::string kernel_name_;
size_t dynamic_axis_size_;
bool is_dynamic_axis_;
bool get_dynamic_axis_value_;
static constexpr size_t kAxisIndex_{1};
};
} // namespace kernel
} // namespace mindspore

@ -33,18 +33,18 @@ namespace kernel {
REG_SLICE_GPU_DTYPES(REG_SLICE_GPU)
#define REG_DYNAMIC_SLICE_GPU_ATTR(T0_MS_DTYPE, T0_DTYPE, T1_MS_DTYPE) \
MS_REG_GPU_KERNEL_ONE(Slice, \
KernelAttr() \
.AddInputAttr(T0_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddOutputAttr(T0_MS_DTYPE), \
SliceFwdGpuKernelMod, T0_DTYPE)
#define REG_DYNAMIC_SLICE_GPU_ATTR(T0_MS_DTYPE, T0_DTYPE, T1_MS_DTYPE, T1_DTYPE) \
MS_REG_GPU_KERNEL_TWO(Slice, \
KernelAttr() \
.AddInputAttr(T0_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddOutputAttr(T0_MS_DTYPE), \
SliceFwdGpuKernelMod, T0_DTYPE, T1_DTYPE)
#define REG_DYNAMIC_SLICE_GPU(MS_DTYPE, DTYPE) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt32) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt64)
#define REG_DYNAMIC_SLICE_GPU(MS_DTYPE, DTYPE) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt32, int32_t) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt64, int64_t)
REG_SLICE_GPU_DTYPES(REG_DYNAMIC_SLICE_GPU)
} // namespace kernel
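
With the second template argument added, the dynamic Slice registrations also record whether begin/size arrive as int32 or int64 tensors. A minimal front-end sketch, assuming the standard ops.Slice primitive (the class name is illustrative; tensor-valued begin/size are only resolvable once InferShape has filled depend_tensor_map_, as in the dynamic-shape tests of this PR):

import mindspore.nn as nn
import mindspore.ops as ops

class DynSliceNet(nn.Cell):
    """Sketch: Slice whose begin and size arrive as int32/int64 tensors."""
    def __init__(self):
        super(DynSliceNet, self).__init__()
        self.slice = ops.Slice()

    def construct(self, x, begin, size):
        # Tensor-valued begin/size match the extra T1_MS_DTYPE inputs added by
        # REG_DYNAMIC_SLICE_GPU_ATTR; their host values are read back through
        # GetDynamicAttrIntValue after InferShape has run.
        return self.slice(x, begin, size)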

@ -40,7 +40,7 @@ constexpr auto kIdx4 = 4;
constexpr auto kIdx5 = 5;
constexpr auto kIdx6 = 6;
template <typename T>
template <typename T, typename S = int64_t>
class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
public:
SliceFwdGpuKernelMod() { kernel_name_ = "Slice"; }
@ -106,7 +106,10 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
(void)CheckParam(kernel_node);
if (is_dynamic_attr_ && !get_dynamic_attr_value_) {
InitSizeLists();
return true;
}
auto input_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
auto out_shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, 0);
is_null_input_ =
@ -118,27 +121,15 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
(void)std::transform(input_shape.begin(), input_shape.end(), std::back_inserter(input_shape_),
[](const int64_t &e) { return static_cast<int32_t>(e); });
size_t input_size = sizeof(T);
input_size_ = sizeof(T);
for (size_t x : input_shape) {
input_size *= x;
}
input_size_list_.push_back(input_size);
if (is_dynamic_attr_) {
std::vector<size_t> dynamic_attr_indexs = {kBeginIndex_, kSizeIndex_};
for (size_t index : dynamic_attr_indexs) {
input_size = sizeof(T);
for (size_t x : AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, index)) {
input_size *= x;
}
input_size_list_.push_back(input_size);
}
input_size_ *= x;
}
size_t output_size = sizeof(T);
output_size_ = sizeof(T);
for (size_t x : out_shape) {
output_size *= x;
output_size_ *= x;
}
output_size_list_.push_back(output_size);
// transpose begin and size for NHWC data
auto data_format = AnfAlgo::GetInputFormat(kernel_node, 0);
@ -166,6 +157,8 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
begin_.clear();
size_.clear();
input_shape_.clear();
input_size_ = 0;
output_size_ = 0;
is_null_input_ = false;
kernel_name_ = "Slice";
is_dynamic_attr_ = false;
@ -173,7 +166,29 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
}
protected:
void InitSizeLists() override {}
void InitSizeLists() override {
input_size_list_.push_back(input_size_);
output_size_list_.push_back(output_size_);
if (is_dynamic_attr_) {
InitDynamicAttrSizeLists(kernel_node_.lock());
}
}
inline void InitDynamicAttrSizeLists(const CNodePtr &kernel_node) {
if (get_dynamic_attr_value_) {
std::vector<size_t> dynamic_attr_indexs = {kBeginIndex_, kSizeIndex_};
for (size_t index : dynamic_attr_indexs) {
size_t input_size = sizeof(S);
for (size_t x : AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, index)) {
input_size *= x;
}
input_size_list_.push_back(input_size);
}
} else {
input_size_list_.push_back(0);
input_size_list_.push_back(0);
}
}
private:
void CheckParam(const CNodePtr &kernel_node) {
@ -205,13 +220,13 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
size = GetAttr<std::vector<int64_t>>(kernel_node, "size");
begin = GetAttr<std::vector<int64_t>>(kernel_node, "begin");
} else {
// The value of dynamic attr can only be obtained after the InferShape() of dynamic kernel is executed
if (depend_tensor_map_.empty()) {
if (GetDynamicAttrIntValue(kernel_node, kBeginIndex_, &begin) &&
GetDynamicAttrIntValue(kernel_node, kSizeIndex_, &size)) {
get_dynamic_attr_value_ = true;
}
if (!get_dynamic_attr_value_) {
return;
}
begin = GetDynamicAttrIntValue(kernel_node, kBeginIndex_);
size = GetDynamicAttrIntValue(kernel_node, kSizeIndex_);
get_dynamic_attr_value_ = true;
}
if (size.size() != input_shape.size() || begin.size() != input_shape.size()) {
@ -220,9 +235,8 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
<< "of size: " << size.size() << ", the dimension of begin: " << begin.size()
<< ", the dimension of input_x: " << input_shape.size();
}
const int64_t NEG_ONE = -1;
for (size_t i = 0; i < input_shape.size(); i++) {
if (size[i] == NEG_ONE) {
if (size[i] == -1) {
size[i] = input_shape[i] - begin[i];
}
if (input_shape[i] <= 0 || size[i] <= 0) {
@ -245,6 +259,8 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
std::vector<int32_t> size_;
std::vector<int32_t> input_shape_;
size_t input_size_{0};
size_t output_size_{0};
bool is_null_input_{false};
bool is_dynamic_attr_{false};
bool get_dynamic_attr_value_{false};

@ -346,9 +346,12 @@ class NativeGpuKernelMod : public GpuKernelMod {
return std::equal(s1.begin(), s1.end(), s2_trans.begin(), s2_trans.end());
}
inline std::vector<int64_t> GetDynamicAttrIntValue(const CNodePtr &kernel_node, const size_t input_index) {
inline bool GetDynamicAttrIntValue(const CNodePtr &kernel_node, const size_t input_index,
std::vector<int64_t> *attr_value) {
// The value of dynamic attr can only be obtained after the InferShape() is executed
if (depend_tensor_map_.empty()) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the depend_tensor_map is empty!";
MS_LOG(DEBUG) << "For '" << kernel_name_ << "', the depend_tensor_map is currently empty";
return false;
}
auto depend_iter = depend_tensor_map_.find(input_index);
if (depend_iter == depend_tensor_map_.end()) {
@ -366,7 +369,8 @@ class NativeGpuKernelMod : public GpuKernelMod {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the format of the " << input_index
<< "th input currently should be the default format and does not support " << data_format;
}
return GetTensorIntValue(input_tensor, input_index);
*attr_value = GetTensorIntValue(input_tensor, input_index);
return true;
}
};
} // namespace kernel

@ -33,30 +33,7 @@ constexpr size_t NO_CUDNN_BATCHNORM_OPS_BN_INPUT_NUM = 8;
template <typename T>
class BatchNormGradGpuKernelMod : public NativeGpuKernelMod {
public:
BatchNormGradGpuKernelMod()
: batch_(0),
channel_(0),
height_(0),
width_(0),
x_size_(0),
para_size_(0),
workspace_size_(0),
reserve_size_(0),
mode_(CUDNN_BATCHNORM_SPATIAL),
bn_ops_(CUDNN_BATCHNORM_OPS_BN),
epsilon_(10e-5),
is_train_(false),
is_null_input_(false),
x_desc_(nullptr),
y_desc_(nullptr),
dy_desc_(nullptr),
dx_desc_(nullptr),
dz_desc_(nullptr),
scale_bias_diff_desc_(nullptr),
activation_desc_(nullptr),
handle_(nullptr),
cudnn_data_type_(CUDNN_DATA_FLOAT),
beta_data_diff_(0) {}
BatchNormGradGpuKernelMod() { ResetResource(); }
~BatchNormGradGpuKernelMod() override { DestroyResource(); }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
@ -171,6 +148,33 @@ class BatchNormGradGpuKernelMod : public NativeGpuKernelMod {
return true;
}
void ResetResource() noexcept override {
ResetSizeLists();
batch_ = 0;
channel_ = 0;
height_ = 0;
width_ = 0;
x_size_ = 0;
para_size_ = 0;
workspace_size_ = 0;
reserve_size_ = 0;
mode_ = CUDNN_BATCHNORM_SPATIAL;
bn_ops_ = CUDNN_BATCHNORM_OPS_BN;
epsilon_ = 10e-5;
is_train_ = false;
is_null_input_ = false;
x_desc_ = nullptr;
y_desc_ = nullptr;
dy_desc_ = nullptr;
dx_desc_ = nullptr;
dz_desc_ = nullptr;
scale_bias_diff_desc_ = nullptr;
activation_desc_ = nullptr;
handle_ = nullptr;
cudnn_data_type_ = CUDNN_DATA_FLOAT;
beta_data_diff_ = 0;
}
protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();

@ -18,6 +18,7 @@
namespace mindspore {
namespace kernel {
namespace {
// float64
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
FlattenFwdGpuKernelMod, double)
@ -25,6 +26,7 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeFloat64).Add
FlattenFwdGpuKernelMod, double)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
FlattenFwdGpuKernelMod, double)
// float32
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
FlattenFwdGpuKernelMod, float)
@ -32,6 +34,7 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeFloat32).Add
FlattenFwdGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
FlattenFwdGpuKernelMod, float)
// float16
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
FlattenFwdGpuKernelMod, half)
@ -39,6 +42,7 @@ MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeFloat16).
FlattenFwdGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
FlattenFwdGpuKernelMod, half)
// int64
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
FlattenFwdGpuKernelMod, int64_t)
@ -46,13 +50,15 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOu
FlattenFwdGpuKernelMod, int64_t)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
FlattenFwdGpuKernelMod, int64_t)
// int32
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int)
FlattenFwdGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int)
FlattenFwdGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int)
FlattenFwdGpuKernelMod, int32_t)
// int16
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
FlattenFwdGpuKernelMod, int16_t)
@ -107,5 +113,71 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeBool).AddOut
FlattenFwdGpuKernelMod, bool)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
FlattenFwdGpuKernelMod, bool)
} // namespace
// dynamic kernel:
namespace {
// float64
MS_REG_GPU_KERNEL_TWO(
Reshape,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
FlattenFwdGpuKernelMod, double, int64_t)
// float32
MS_REG_GPU_KERNEL_TWO(
Reshape,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
FlattenFwdGpuKernelMod, float, int64_t)
// float16
MS_REG_GPU_KERNEL_TWO(
Reshape,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
FlattenFwdGpuKernelMod, half, int64_t)
// int64
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
FlattenFwdGpuKernelMod, int64_t, int64_t)
// int32
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int32_t, int64_t)
// int16
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16),
FlattenFwdGpuKernelMod, int16_t, int64_t)
// int8
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt8),
FlattenFwdGpuKernelMod, char, int64_t)
// uint64
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt64),
FlattenFwdGpuKernelMod, uint64_t, int64_t)
// uint32
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt32),
FlattenFwdGpuKernelMod, uint, int64_t)
// uint16
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt16),
FlattenFwdGpuKernelMod, uint16_t, int64_t)
// uint8
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt8),
FlattenFwdGpuKernelMod, uchar, int64_t)
// bool
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
FlattenFwdGpuKernelMod, bool, int64_t)
} // namespace
} // namespace kernel
} // namespace mindspore
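
The dynamic Reshape registrations above accept the target shape as an int64 tensor input. The ReshapeNet test added at the end of this PR exercises exactly this path; condensed here as a sketch for reference (standard MindSpore ops API assumed):

import mindspore.nn as nn
import mindspore.ops as ops

class ReshapeNet(nn.Cell):
    """Sketch: Reshape whose target shape is computed at run time."""
    def construct(self, x, y):
        # DynamicShape yields y's shape as an int64 tensor, so Reshape resolves
        # to the MS_REG_GPU_KERNEL_TWO(Reshape, ...) registrations above.
        shape_of_y = ops.DynamicShape()(y)
        return ops.Reshape()(x, shape_of_y)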

@ -25,7 +25,7 @@
namespace mindspore {
namespace kernel {
template <typename T>
template <typename T, typename S = int64_t>
class FlattenFwdGpuKernelMod : public NativeGpuKernelMod {
public:
FlattenFwdGpuKernelMod() : input_size_(0), is_null_input_(false) {}
@ -48,6 +48,7 @@ class FlattenFwdGpuKernelMod : public NativeGpuKernelMod {
}
bool Init(const CNodePtr &kernel_node) override {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
auto shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
kernel_node_ = kernel_node;
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "input");
@ -75,6 +76,21 @@ class FlattenFwdGpuKernelMod : public NativeGpuKernelMod {
void InitSizeLists() override {
input_size_list_.push_back(input_size_);
output_size_list_.push_back(input_size_);
InitDynamicAttrSizeLists(kernel_node_.lock());
}
inline void InitDynamicAttrSizeLists(const CNodePtr &kernel_node) {
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num == 1) {
return;
}
for (size_t index = 1; index < input_num; ++index) {
size_t input_size = sizeof(S);
for (size_t x : AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, index)) {
input_size *= x;
}
input_size_list_.push_back(input_size);
}
}
private:

@ -49,6 +49,8 @@ class DynamicBroadcastGradientArgsGpuKernelMod : public NativeGpuKernelMod {
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_, cudaMemcpyAsync(&x1_value[0], s1_addr, input_size_list_[1], cudaMemcpyDeviceToHost, cuda_stream),
"DynamicBroadcastGradientArgs copy s1 value failed");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(cuda_stream),
"DynamicBroadcastGradientArgs cudaStreamSynchronized failed");
auto grad_reduce_idx = CalOut({x0_value, x1_value});
r0_size_ = SetOuputValue(r0_addr, grad_reduce_idx[0], x0_value.size(), cuda_stream);
r1_size_ = SetOuputValue(r1_addr, grad_reduce_idx[1], x1_value.size(), cuda_stream);

@ -84,7 +84,8 @@ std::set<int64_t> GetDependsFormMap(const CNodePtr &cnode) {
{kReshape, ShapeSet{1}},
{kSlice, ShapeSet{1, 2}},
{kSliceGrad, ShapeSet{2, 3}},
{kDynamicBroadcastTo, ShapeSet{1}}};
{kDynamicBroadcastTo, ShapeSet{1}},
{kReduceSum, ShapeSet{1}}};
MS_EXCEPTION_IF_NULL(cnode);
if (cnode->inputs().empty()) {
@ -98,7 +99,7 @@ std::set<int64_t> GetDependsFormMap(const CNodePtr &cnode) {
MS_EXCEPTION_IF_NULL(ms_context);
auto device = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
// Special dynamic shape depends for Ascend.
if (device == kAscendDevice && (prim_name == kReduceSum || prim_name == kTranspose)) {
if (device == kAscendDevice && prim_name == kTranspose) {
return {1};
}

@ -45,27 +45,34 @@ bool BatchNormGrad::get_is_training() const {
return GetValue<bool>(value_ptr);
}
constexpr auto kInputNum = 6;
abstract::TupleShapePtr BatchNormGradInferShape(const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto prim_name = primitive->name();
(void)CheckAndConvertUtils::CheckInputArgs(input_args, kGreaterEqual, kInputNum, prim_name);
auto x_shape_ptr = input_args[kInputIndex1]->BuildShape();
auto scale_shape_ptr = input_args[kInputIndex2]->BuildShape();
return std::make_shared<abstract::TupleShape>(
std::vector<abstract::BaseShapePtr>{x_shape_ptr, scale_shape_ptr, scale_shape_ptr});
}
TuplePtr BatchNormGradInferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto prim_name = primitive->name();
(void)CheckAndConvertUtils::CheckInputArgs(input_args, kGreaterEqual, kInputNum, prim_name);
auto x_type_ptr = input_args[kInputIndex1]->BuildType();
auto scale_type_ptr = input_args[kInputIndex2]->BuildType();
return std::make_shared<Tuple>(std::vector<TypePtr>{x_type_ptr, scale_type_ptr, scale_type_ptr});
}
AbstractBasePtr BatchNormGradInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
for (auto item : input_args) {
MS_EXCEPTION_IF_NULL(item);
}
const int64_t input_num = 5;
(void)CheckAndConvertUtils::CheckInteger("BatchNormGrad infer", SizeToLong(input_args.size()), kGreaterEqual,
input_num, primitive->name());
auto y_backprop_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape())[kShape];
auto x_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[1]->BuildShape())[kShape];
CheckAndConvertUtils::Check("BatchNorm y_backprop_shape", y_backprop_shape, kEqual, x_shape);
auto dx = input_args[kInputIndex1]->Broaden();
auto dscale = input_args[kInputIndex2]->Broaden();
auto reserve_1 = input_args[kInputIndex3]->Broaden();
auto reserve_2 = input_args[kInputIndex4]->Broaden();
AbstractBasePtrList rets = {dx, dscale, dscale, reserve_1, reserve_2};
return std::make_shared<abstract::AbstractTuple>(rets);
return abstract::MakeAbstract(BatchNormGradInferShape(primitive, input_args),
BatchNormGradInferType(primitive, input_args));
}
REGISTER_PRIMITIVE_C(kNameBatchNormGrad, BatchNormGrad);
REGISTER_PRIMITIVE_EVAL_IMPL(BatchNormGrad, prim::kPrimBatchNormGrad, BatchNormGradInfer, nullptr, true);
} // namespace ops
} // namespace mindspore

@ -118,7 +118,7 @@ class SqrtGrad(PrimitiveWithInfer):
return x_dtype
class BatchNormGrad(PrimitiveWithInfer):
class BatchNormGrad(Primitive):
"""Performs grad of BatchNorm operation."""
@prim_attr_register
@ -127,13 +127,6 @@ class BatchNormGrad(PrimitiveWithInfer):
self.epsilon = validator.check_float_range(epsilon, 0, 1, Rel.INC_RIGHT, 'epsilon', self.name)
self.data_format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.name)
def infer_shape(self, y_backprop_shape, x_shape, scale_shape, save_mean_shape, save_variance_shape, reserve):
validator.check("BatchNorm y_backprop_shape", y_backprop_shape, "BatchNorm x_shape", x_shape)
return (x_shape, scale_shape, scale_shape)
def infer_dtype(self, y_backprop_type, x_type, scale_type, save_mean_shape, save_variance_shape, reserve):
return (x_type, scale_type, scale_type)
class SyncBatchNormGrad(PrimitiveWithInfer):
"""Performs grad of SyncBatchNorm operation."""

@ -0,0 +1,52 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.nn as nn
import mindspore.context as context
from mindspore.ops.operations import _inner_ops
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.args = _inner_ops.DynamicBroadcastGradientArgs()
def construct(self, s0, s1):
return self.args(s0, s1)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_net():
"""
Feature: DynamicBroadcastGradientArgs op.
Description: test cases for DynamicBroadcastGradientArgs op.
Expectation: the result match expected array.
"""
shape0 = (4, 2, 1)
shape1 = (2, 7)
net = Net()
r0, r1 = net(shape0, shape1)
r0_expected = [2]
r1_expected = [0]
assert np.array_equal(r0_expected, r0.asnumpy())
assert np.array_equal(r1_expected, r1.asnumpy())

@ -102,15 +102,7 @@ class ConcatNet(nn.Cell):
return self.op((x1, x2))
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_concat():
"""
Feature: Test Concat and its backward.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with fixed shape.
"""
def dynamic_concat_run(is_grad):
axis = 1
dtype = np.float32
data_list = []
@ -118,15 +110,151 @@ def test_dynamic_concat():
data = []
data.append(np.random.rand(i, 16).astype(dtype))
data.append(np.random.rand(i, 32).astype(dtype))
data.append(np.random.rand(i, 48).astype(dtype))
if is_grad:
data.append(np.random.rand(i, 48).astype(dtype))
data_list.append(tuple(data))
column_names = get_columns(len(data_list[0]))
dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
dataset.set_dynamic_columns(
columns={column_names[0]: [None, 16], column_names[1]: [None, 32], column_names[2]: [None, 48]})
dynamic_columns = {column_names[0]: [
None, 16], column_names[1]: [None, 32]}
if is_grad:
dynamic_columns[column_names[-1]] = [None, 48]
dataset.set_dynamic_columns(columns=dynamic_columns)
net = ConcatNet(axis)
if is_grad:
net = GradNetWrtX(net)
output = dynamic_shape_sink_process(net, dataset)
output_cmp = fixed_shape_process(net, dataset)
assert compare(output, output_cmp)
net = GradNetWrtX(ConcatNet(axis))
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_concat_forward():
"""
Feature: Test Concat.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with fixed shape.
"""
dynamic_concat_run(False)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_concat_backward():
"""
Feature: Test backward of Concat.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with fixed shape.
"""
dynamic_concat_run(True)
class BatchNormNet(nn.Cell):
def __init__(self, c):
super(BatchNormNet, self).__init__()
self.bn = nn.BatchNorm1d(c)
def construct(self, input_data):
x = self.bn(input_data)
return x
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_batchnorm():
"""
Feature: Test BatchNorm and its backward.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with fixed shape.
"""
c = 256
dtype = np.float32
data_list = []
for i in [2, 64]:
data = []
data.append(np.random.rand(i, c).astype(dtype))
data.append(np.random.rand(i, c).astype(dtype))
data_list.append(tuple(data))
column_names = get_columns(len(data_list[0]))
dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
dynamic_columns = {column_names[0]: [None, c], column_names[1]: [None, c]}
dataset.set_dynamic_columns(columns=dynamic_columns)
net = GradNetWrtX(BatchNormNet(c))
gradients = dynamic_shape_sink_process(net, dataset)
gradients_cmp = fixed_shape_process(net, dataset)
assert compare(gradients, gradients_cmp)
class ReshapeNet(nn.Cell):
def construct(self, x, y):
shape_of_y = ops.DynamicShape()(y)
return ops.Reshape()(x, shape_of_y)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_reshape():
"""
Feature: Test Reshape.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with fixed shape.
"""
dtype = np.float32
data_list = []
for i in [2, 96]:
data = []
data.append(np.random.rand(i, 64, 1).astype(dtype))
data.append(np.random.rand(i, 64).astype(dtype))
data_list.append(tuple(data))
column_names = get_columns(len(data_list[0]))
dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
dynamic_columns = {column_names[0]: [
None, 64, 1], column_names[1]: [None, 64]}
dataset.set_dynamic_columns(columns=dynamic_columns)
net = ReshapeNet()
output = dynamic_shape_sink_process(net, dataset)
output_cmp = fixed_shape_process(net, dataset)
assert compare(output, output_cmp)
class ReduceSumNet(nn.Cell):
def __init__(self):
super(ReduceSumNet, self).__init__()
self.reduce = ops.ReduceSum()
def construct(self, x, y):
return self.reduce(x, y)
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_reduce_sum():
"""
Feature: Test ReduceSum.
Description: The shape of inputs is dynamic.
Expectation: Assert that results are consistent with the result of the numpy computation.
"""
dtype = np.float32
data_list = []
for i in [2, 96]:
data = []
data.append(np.random.rand(i, 256).astype(dtype))
data.append(np.array([1], dtype=np.int64))
data_list.append(tuple(data))
column_names = get_columns(len(data_list[0]))
dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
dynamic_columns = {column_names[0]: [None, 256]}
dataset.set_dynamic_columns(columns=dynamic_columns)
net = ReduceSumNet()
output = dynamic_shape_sink_process(net, dataset)
# Currently, a dynamic (tensor) axis for the ReduceSum operator is not supported
# under fixed shape, so numpy is used for comparison
inputs = data_list[0]
output_cmp = np.sum(inputs[0], inputs[1][0])
assert np.allclose(output.asnumpy(), output_cmp, rtol=1.0e-4, atol=1.0e-4)