forked from mindspore-Ecosystem/mindspore
[DynamicShape][GPU]add dynamic shape support of Concat and its backward for DCN
This commit is contained in:
parent
81260a2319
commit
662c51c019
|
@ -75,7 +75,7 @@ class ConcatV2FwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
if (!CheckParam(kernel_node)) {
|
if (!CheckParam(kernel_node)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||||
int dims = SizeToInt(input_shape.size());
|
int dims = SizeToInt(input_shape.size());
|
||||||
axis_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis"));
|
axis_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis"));
|
||||||
if (axis_ < -dims || axis_ >= dims) {
|
if (axis_ < -dims || axis_ >= dims) {
|
||||||
|
@ -95,7 +95,7 @@ class ConcatV2FwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
int current_dim = 0;
|
int current_dim = 0;
|
||||||
for (int i = 0; i < input_num_; i++) {
|
for (int i = 0; i < input_num_; i++) {
|
||||||
size_t input_size = 1;
|
size_t input_size = 1;
|
||||||
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, i);
|
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, i);
|
||||||
for (size_t j = 0; j < input_shape.size(); j++) {
|
for (size_t j = 0; j < input_shape.size(); j++) {
|
||||||
input_size *= input_shape[j];
|
input_size *= input_shape[j];
|
||||||
}
|
}
|
||||||
|
@ -128,6 +128,18 @@ class ConcatV2FwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ResetResource() noexcept override {
|
||||||
|
ResetSizeLists();
|
||||||
|
axis_ = 0;
|
||||||
|
input_num_ = 1;
|
||||||
|
output_size_ = 0;
|
||||||
|
all_size_before_axis_ = 1;
|
||||||
|
all_size_axis_ = 1;
|
||||||
|
kernel_name_ = "ConcatV2";
|
||||||
|
inputs_host_ = nullptr;
|
||||||
|
len_axis_ = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void InitSizeLists() override {}
|
void InitSizeLists() override {}
|
||||||
|
|
||||||
|
|
|
@ -18,21 +18,34 @@
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
|
#define REG_SLICE_GPU(MS_DTYPE, DTYPE) \
|
||||||
SliceFwdGpuKernelMod, double)
|
MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(MS_DTYPE).AddOutputAttr(MS_DTYPE), SliceFwdGpuKernelMod, DTYPE)
|
||||||
MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
|
||||||
SliceFwdGpuKernelMod, float)
|
#define REG_SLICE_GPU_DTYPES(F) \
|
||||||
MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
F(kNumberTypeFloat64, double) \
|
||||||
SliceFwdGpuKernelMod, half)
|
F(kNumberTypeFloat32, float) \
|
||||||
MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
|
F(kNumberTypeFloat16, half) \
|
||||||
SliceFwdGpuKernelMod, int64_t)
|
F(kNumberTypeInt64, int64_t) \
|
||||||
MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
|
F(kNumberTypeInt32, int32_t) \
|
||||||
SliceFwdGpuKernelMod, int32_t)
|
F(kNumberTypeInt16, int16_t) \
|
||||||
MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
|
F(kNumberTypeUInt8, uchar) \
|
||||||
SliceFwdGpuKernelMod, int16_t)
|
F(kNumberTypeBool, bool)
|
||||||
MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
|
|
||||||
SliceFwdGpuKernelMod, uchar)
|
REG_SLICE_GPU_DTYPES(REG_SLICE_GPU)
|
||||||
MS_REG_GPU_KERNEL_ONE(Slice, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
|
|
||||||
SliceFwdGpuKernelMod, bool)
|
#define REG_DYNAMIC_SLICE_GPU_ATTR(T0_MS_DTYPE, T0_DTYPE, T1_MS_DTYPE) \
|
||||||
|
MS_REG_GPU_KERNEL_ONE(Slice, \
|
||||||
|
KernelAttr() \
|
||||||
|
.AddInputAttr(T0_MS_DTYPE) \
|
||||||
|
.AddInputAttr(T1_MS_DTYPE) \
|
||||||
|
.AddInputAttr(T1_MS_DTYPE) \
|
||||||
|
.AddOutputAttr(T0_MS_DTYPE), \
|
||||||
|
SliceFwdGpuKernelMod, T0_DTYPE)
|
||||||
|
|
||||||
|
#define REG_DYNAMIC_SLICE_GPU(MS_DTYPE, DTYPE) \
|
||||||
|
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt32) \
|
||||||
|
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt64)
|
||||||
|
|
||||||
|
REG_SLICE_GPU_DTYPES(REG_DYNAMIC_SLICE_GPU)
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -14,9 +14,8 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SLICE_GPU_KERNEL_H_
|
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_ARRAYS_SLICE_GPU_KERNEL_H_
|
||||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SLICE_GPU_KERNEL_H_
|
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_ARRAYS_SLICE_GPU_KERNEL_H_
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
@ -44,8 +43,7 @@ constexpr auto kIdx6 = 6;
|
||||||
template <typename T>
|
template <typename T>
|
||||||
class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
|
class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
public:
|
public:
|
||||||
SliceFwdGpuKernelMod()
|
SliceFwdGpuKernelMod() { kernel_name_ = "Slice"; }
|
||||||
: is_null_input_(false), input_size_(0), output_size_(0), workspace_size_(0), kernel_name_("Slice") {}
|
|
||||||
~SliceFwdGpuKernelMod() override = default;
|
~SliceFwdGpuKernelMod() override = default;
|
||||||
|
|
||||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
|
||||||
|
@ -56,6 +54,9 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
|
|
||||||
T *input = GetDeviceAddress<T>(inputs, 0);
|
T *input = GetDeviceAddress<T>(inputs, 0);
|
||||||
T *output = GetDeviceAddress<T>(outputs, 0);
|
T *output = GetDeviceAddress<T>(outputs, 0);
|
||||||
|
if (is_dynamic_attr_ && !get_dynamic_attr_value_) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', fail to get value of the dynamic attr!";
|
||||||
|
}
|
||||||
|
|
||||||
size_t input_rank = input_shape_.size();
|
size_t input_rank = input_shape_.size();
|
||||||
switch (input_rank) {
|
switch (input_rank) {
|
||||||
|
@ -105,8 +106,8 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||||
(void)CheckParam(kernel_node);
|
(void)CheckParam(kernel_node);
|
||||||
|
|
||||||
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||||
auto out_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
auto out_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
|
||||||
is_null_input_ =
|
is_null_input_ =
|
||||||
CHECK_SHAPE_NULL(input_shape, kernel_name_, "input") || CHECK_SHAPE_NULL(out_shape, kernel_name_, "output");
|
CHECK_SHAPE_NULL(input_shape, kernel_name_, "input") || CHECK_SHAPE_NULL(out_shape, kernel_name_, "output");
|
||||||
if (is_null_input_) {
|
if (is_null_input_) {
|
||||||
|
@ -116,24 +117,36 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
(void)std::transform(input_shape.begin(), input_shape.end(), std::back_inserter(input_shape_),
|
(void)std::transform(input_shape.begin(), input_shape.end(), std::back_inserter(input_shape_),
|
||||||
[](const int64_t &e) { return static_cast<int32_t>(e); });
|
[](const int64_t &e) { return static_cast<int32_t>(e); });
|
||||||
|
|
||||||
input_size_ = sizeof(T);
|
size_t input_size = sizeof(T);
|
||||||
for (size_t x : input_shape) {
|
for (size_t x : input_shape) {
|
||||||
input_size_ *= x;
|
input_size *= x;
|
||||||
|
}
|
||||||
|
input_size_list_.push_back(input_size);
|
||||||
|
if (is_dynamic_attr_) {
|
||||||
|
std::vector<size_t> dynamic_attr_indexs = {kBeginIndex_, kSizeIndex_};
|
||||||
|
for (size_t index : dynamic_attr_indexs) {
|
||||||
|
input_size = sizeof(T);
|
||||||
|
for (size_t x : AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, index)) {
|
||||||
|
input_size *= x;
|
||||||
|
}
|
||||||
|
input_size_list_.push_back(input_size);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
output_size_ = sizeof(T);
|
size_t output_size = sizeof(T);
|
||||||
for (size_t x : out_shape) {
|
for (size_t x : out_shape) {
|
||||||
output_size_ *= x;
|
output_size *= x;
|
||||||
}
|
}
|
||||||
|
output_size_list_.push_back(output_size);
|
||||||
|
|
||||||
// transpose begin and size for NHWC data
|
// transpose begin and size for NHWC data
|
||||||
auto data_format = AnfAlgo::GetInputFormat(kernel_node, 0);
|
auto data_format = AnfAlgo::GetInputFormat(kernel_node, 0);
|
||||||
if (data_format == "NHWC") {
|
if (data_format == kOpFormat_NHWC) {
|
||||||
std::swap(begin_[1], begin_[kIdx3]);
|
std::swap(begin_[1], begin_[kIdx3]);
|
||||||
std::swap(begin_[1], begin_[kIdx2]);
|
std::swap(begin_[1], begin_[kIdx2]);
|
||||||
std::swap(size_[1], size_[kIdx3]);
|
std::swap(size_[1], size_[kIdx3]);
|
||||||
std::swap(size_[1], size_[kIdx2]);
|
std::swap(size_[1], size_[kIdx2]);
|
||||||
} else if (data_format == "NDHWC") {
|
} else if (data_format == kOpFormat_NDHWC) {
|
||||||
std::swap(begin_[1], begin_[kIdx4]);
|
std::swap(begin_[1], begin_[kIdx4]);
|
||||||
std::swap(begin_[1], begin_[kIdx3]);
|
std::swap(begin_[1], begin_[kIdx3]);
|
||||||
std::swap(begin_[1], begin_[kIdx2]);
|
std::swap(begin_[1], begin_[kIdx2]);
|
||||||
|
@ -147,17 +160,30 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected:
|
void ResetResource() noexcept override {
|
||||||
void InitSizeLists() override {
|
ResetSizeLists();
|
||||||
input_size_list_.push_back(input_size_);
|
begin_.clear();
|
||||||
output_size_list_.push_back(output_size_);
|
size_.clear();
|
||||||
|
input_shape_.clear();
|
||||||
|
is_null_input_ = false;
|
||||||
|
kernel_name_ = "Slice";
|
||||||
|
is_dynamic_attr_ = false;
|
||||||
|
get_dynamic_attr_value_ = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void InitSizeLists() override {}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void CheckParam(const CNodePtr &kernel_node) {
|
void CheckParam(const CNodePtr &kernel_node) {
|
||||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||||
if (input_num != 1) {
|
constexpr size_t kDynamicSliceInputNum = 3;
|
||||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
|
if (input_num != 1 && input_num != kDynamicSliceInputNum) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1 or " << kDynamicSliceInputNum
|
||||||
|
<< ", but got " << input_num;
|
||||||
|
}
|
||||||
|
if (input_num == kDynamicSliceInputNum) {
|
||||||
|
is_dynamic_attr_ = true;
|
||||||
}
|
}
|
||||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||||
if (output_num != 1) {
|
if (output_num != 1) {
|
||||||
|
@ -173,8 +199,19 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be equal to 0, but got "
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be equal to 0, but got "
|
||||||
<< input_shape.size();
|
<< input_shape.size();
|
||||||
}
|
}
|
||||||
auto size = GetAttr<std::vector<int64_t>>(kernel_node, "size");
|
std::vector<int64_t> size, begin;
|
||||||
auto begin = GetAttr<std::vector<int64_t>>(kernel_node, "begin");
|
if (!is_dynamic_attr_) {
|
||||||
|
size = GetAttr<std::vector<int64_t>>(kernel_node, "size");
|
||||||
|
begin = GetAttr<std::vector<int64_t>>(kernel_node, "begin");
|
||||||
|
} else {
|
||||||
|
// The value of dynamic attr can only be obtained after the InferShape() of dynamic kernel is executed
|
||||||
|
if (DynamicKernel() == nullptr) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
begin = GetDynamicAttrIntValue(kernel_node, kBeginIndex_);
|
||||||
|
size = GetDynamicAttrIntValue(kernel_node, kSizeIndex_);
|
||||||
|
get_dynamic_attr_value_ = true;
|
||||||
|
}
|
||||||
|
|
||||||
if (size.size() != input_shape.size() || begin.size() != input_shape.size()) {
|
if (size.size() != input_shape.size() || begin.size() != input_shape.size()) {
|
||||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
||||||
|
@ -182,9 +219,9 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
<< "of size: " << size.size() << ", the dimension of begin: " << begin.size()
|
<< "of size: " << size.size() << ", the dimension of begin: " << begin.size()
|
||||||
<< ", the dimension of input_x: " << input_shape.size();
|
<< ", the dimension of input_x: " << input_shape.size();
|
||||||
}
|
}
|
||||||
const int64_t kDynamicShape = -1;
|
const int64_t NEG_ONE = -1;
|
||||||
for (size_t i = 0; i < input_shape.size(); i++) {
|
for (size_t i = 0; i < input_shape.size(); i++) {
|
||||||
if (size[i] == kDynamicShape) {
|
if (size[i] == NEG_ONE) {
|
||||||
size[i] = input_shape[i] - begin[i];
|
size[i] = input_shape[i] - begin[i];
|
||||||
}
|
}
|
||||||
if (input_shape[i] <= 0 || size[i] <= 0) {
|
if (input_shape[i] <= 0 || size[i] <= 0) {
|
||||||
|
@ -207,14 +244,13 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
|
||||||
std::vector<int32_t> size_;
|
std::vector<int32_t> size_;
|
||||||
std::vector<int32_t> input_shape_;
|
std::vector<int32_t> input_shape_;
|
||||||
|
|
||||||
bool is_null_input_;
|
bool is_null_input_{false};
|
||||||
|
bool is_dynamic_attr_{false};
|
||||||
size_t input_size_;
|
bool get_dynamic_attr_value_{false};
|
||||||
size_t output_size_;
|
static constexpr size_t kBeginIndex_{1};
|
||||||
size_t workspace_size_;
|
static constexpr size_t kSizeIndex_{2};
|
||||||
std::string kernel_name_;
|
|
||||||
};
|
};
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
||||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_SLICE_GPU_KERNEL_H_
|
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_ARRAYS_SLICE_GPU_KERNEL_H_
|
||||||
|
|
|
@ -80,6 +80,8 @@ class GpuDynamicKernel : public device::DynamicKernel {
|
||||||
void UpdateArgs() override;
|
void UpdateArgs() override;
|
||||||
void PostExecute() final { MS_LOG(EXCEPTION) << "`PostExecute()` should not invoked with gpu backend"; };
|
void PostExecute() final { MS_LOG(EXCEPTION) << "`PostExecute()` should not invoked with gpu backend"; };
|
||||||
void Execute() final { MS_LOG(EXCEPTION) << "`Execute()` should not invoked with gpu backend"; }
|
void Execute() final { MS_LOG(EXCEPTION) << "`Execute()` should not invoked with gpu backend"; }
|
||||||
|
|
||||||
|
std::map<uint32_t, tensor::TensorPtr> GetDependTensorMap() { return depend_tensor_map_; }
|
||||||
};
|
};
|
||||||
|
|
||||||
class NativeGpuKernelMod : public GpuKernelMod {
|
class NativeGpuKernelMod : public GpuKernelMod {
|
||||||
|
@ -100,6 +102,12 @@ class NativeGpuKernelMod : public GpuKernelMod {
|
||||||
virtual void InitSizeLists() = 0;
|
virtual void InitSizeLists() = 0;
|
||||||
std::weak_ptr<CNode> kernel_node_;
|
std::weak_ptr<CNode> kernel_node_;
|
||||||
|
|
||||||
|
inline void ResetSizeLists() {
|
||||||
|
input_size_list_.clear();
|
||||||
|
output_size_list_.clear();
|
||||||
|
workspace_size_list_.clear();
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
|
inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
|
||||||
if (index >= addr_list.size()) {
|
if (index >= addr_list.size()) {
|
||||||
|
@ -321,7 +329,66 @@ class NativeGpuKernelMod : public GpuKernelMod {
|
||||||
return type->second;
|
return type->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
device::DynamicKernelPtr dynamic_kernel_;
|
inline std::map<uint32_t, tensor::TensorPtr> GetDependTensorMap() {
|
||||||
|
auto gpu_dynamic_kernel = dynamic_cast<GpuDynamicKernel *>(DynamicKernel().get());
|
||||||
|
if (gpu_dynamic_kernel != nullptr) {
|
||||||
|
return gpu_dynamic_kernel->GetDependTensorMap();
|
||||||
|
}
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::vector<int64_t> GetTensorIntValue(const tensor::TensorPtr input_tensor, const size_t input_index) {
|
||||||
|
std::vector<int64_t> tensor_value;
|
||||||
|
MS_EXCEPTION_IF_NULL(input_tensor);
|
||||||
|
size_t data_size = input_tensor->DataSize();
|
||||||
|
auto tensor_type = input_tensor->Dtype();
|
||||||
|
if (tensor_type->type_id() == kNumberTypeInt32) {
|
||||||
|
auto tensor_data = reinterpret_cast<int32_t *>(input_tensor->data_c());
|
||||||
|
MS_EXCEPTION_IF_NULL(tensor_data);
|
||||||
|
tensor_value.assign(tensor_data, tensor_data + data_size);
|
||||||
|
} else if (tensor_type->type_id() == kNumberTypeInt64) {
|
||||||
|
auto tensor_data = reinterpret_cast<int64_t *>(input_tensor->data_c());
|
||||||
|
MS_EXCEPTION_IF_NULL(tensor_data);
|
||||||
|
tensor_value.assign(tensor_data, tensor_data + data_size);
|
||||||
|
} else {
|
||||||
|
MS_EXCEPTION(TypeError) << "For '" << kernel_name_ << "', the " << input_index
|
||||||
|
<< "th input must be a Tensor[Int64] or Tensor[Int32] type, but got "
|
||||||
|
<< input_tensor->ToString();
|
||||||
|
}
|
||||||
|
return tensor_value;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool ShapeEqual(const std::vector<size_t> &s1, const std::vector<int64_t> &s2) {
|
||||||
|
std::vector<size_t> s2_trans;
|
||||||
|
std::transform(s2.begin(), s2.end(), std::back_inserter(s2_trans), [](const int64_t &e) { return LongToSize(e); });
|
||||||
|
return std::equal(s1.begin(), s1.end(), s2_trans.begin(), s2_trans.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::vector<int64_t> GetDynamicAttrIntValue(const CNodePtr &kernel_node, const size_t input_index) {
|
||||||
|
const auto &depend_tensor_map = GetDependTensorMap();
|
||||||
|
if (depend_tensor_map.empty()) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the depend_tensor_map is empty!";
|
||||||
|
}
|
||||||
|
auto depend_iter = depend_tensor_map.find(input_index);
|
||||||
|
if (depend_iter == depend_tensor_map.end()) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', fail to find the " << input_index
|
||||||
|
<< "th input in the depend_tensor_map";
|
||||||
|
}
|
||||||
|
auto input_tensor = depend_iter->second;
|
||||||
|
const auto &input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, input_index);
|
||||||
|
if (!ShapeEqual(input_shape, input_tensor->shape())) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the " << input_index
|
||||||
|
<< "th input is different between the InferShape and the TensorShape";
|
||||||
|
}
|
||||||
|
const auto &data_format = AnfAlgo::GetInputFormat(kernel_node, input_index);
|
||||||
|
if (data_format != kOpFormat_DEFAULT && data_format != kOpFormat_NCHW) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the format of the " << input_index
|
||||||
|
<< "th input currently should be the default format and does not support " << data_format;
|
||||||
|
}
|
||||||
|
return GetTensorIntValue(input_tensor, input_index);
|
||||||
|
}
|
||||||
|
|
||||||
|
device::DynamicKernelPtr dynamic_kernel_{nullptr};
|
||||||
};
|
};
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -0,0 +1,47 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "plugin/device/gpu/kernel/other/concat_offset_gpu_kernel.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace kernel {
|
||||||
|
#define REG_CONCAT_OFFSET_GPU_INPUT_OUTPUT(T0_MS_DTYPE, T0_DTYPE, T1_MS_DTYPE, T1_DTYPE) \
|
||||||
|
MS_REG_GPU_KERNEL_TWO(ConcatOffset, \
|
||||||
|
KernelAttr().AddAllSameAttr(true).AddInputAttr(T0_MS_DTYPE).AddOutputAttr(T1_MS_DTYPE), \
|
||||||
|
ConcatOffsetGpuKernelMod, T0_DTYPE, T1_DTYPE)
|
||||||
|
|
||||||
|
#define REG_CONCAT_OFFSET_GPU_INPUT(MS_DTYPE, DTYPE) \
|
||||||
|
REG_CONCAT_OFFSET_GPU_INPUT_OUTPUT(MS_DTYPE, DTYPE, kNumberTypeInt64, int64_t) \
|
||||||
|
REG_CONCAT_OFFSET_GPU_INPUT_OUTPUT(MS_DTYPE, DTYPE, kNumberTypeInt32, int32_t)
|
||||||
|
|
||||||
|
#define REG_CONCAT_OFFSET_GPU_INPUT_FLOAT(F) \
|
||||||
|
F(kNumberTypeFloat64, double) F(kNumberTypeFloat32, float) F(kNumberTypeFloat16, half)
|
||||||
|
|
||||||
|
#define REG_CONCAT_OFFSET_GPU_INPUT_INT(F) \
|
||||||
|
F(kNumberTypeInt64, int64_t) \
|
||||||
|
F(kNumberTypeInt32, int32_t) \
|
||||||
|
F(kNumberTypeInt16, int16_t) \
|
||||||
|
F(kNumberTypeInt8, char) \
|
||||||
|
F(kNumberTypeUInt64, uint64_t) \
|
||||||
|
F(kNumberTypeUInt32, uint32_t) \
|
||||||
|
F(kNumberTypeUInt16, uint16_t) \
|
||||||
|
F(kNumberTypeUInt8, uchar) \
|
||||||
|
F(kNumberTypeBool, bool)
|
||||||
|
|
||||||
|
REG_CONCAT_OFFSET_GPU_INPUT_FLOAT(REG_CONCAT_OFFSET_GPU_INPUT)
|
||||||
|
REG_CONCAT_OFFSET_GPU_INPUT_INT(REG_CONCAT_OFFSET_GPU_INPUT)
|
||||||
|
} // namespace kernel
|
||||||
|
} // namespace mindspore
|
|
@ -0,0 +1,136 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_OTHER_CONCAT_OFFSET_GPU_KERNEL_H_
|
||||||
|
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_OTHER_CONCAT_OFFSET_GPU_KERNEL_H_
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||||
|
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||||
|
|
||||||
|
namespace mindspore {
|
||||||
|
namespace kernel {
|
||||||
|
template <typename T, typename S>
|
||||||
|
class ConcatOffsetGpuKernelMod : public NativeGpuKernelMod {
|
||||||
|
public:
|
||||||
|
ConcatOffsetGpuKernelMod() {}
|
||||||
|
~ConcatOffsetGpuKernelMod() override = default;
|
||||||
|
|
||||||
|
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||||
|
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||||
|
S *output_device_address = GetDeviceAddress<S>(outputs, 0);
|
||||||
|
size_t out_size = out_offset_.size() * sizeof(S);
|
||||||
|
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
|
||||||
|
cudaMemcpyAsync(output_device_address, out_offset_.data(), out_size,
|
||||||
|
cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||||
|
"cudaMemcpyAsync error in ConcatOffsetGpuKernelMod::Launch");
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool Init(const CNodePtr &kernel_node) override {
|
||||||
|
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||||
|
kernel_node_ = kernel_node;
|
||||||
|
if (!CheckParam(kernel_node)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||||
|
auto rank = input_shape.size();
|
||||||
|
auto rank_int = SizeToInt(rank);
|
||||||
|
auto axis = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis"));
|
||||||
|
if (axis < -rank_int || axis >= rank_int) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the 'axis' should be in the range [-" << rank << "," << rank
|
||||||
|
<< "), but got " << axis;
|
||||||
|
}
|
||||||
|
if (axis < 0) {
|
||||||
|
axis += rank_int;
|
||||||
|
}
|
||||||
|
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||||
|
if (input_num == 0) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of input should be greater than 0";
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < input_num; i++) {
|
||||||
|
size_t input_size = 1;
|
||||||
|
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, i);
|
||||||
|
for (size_t j = 0; j < input_shape.size(); j++) {
|
||||||
|
input_size *= input_shape[j];
|
||||||
|
}
|
||||||
|
input_size_list_.push_back(input_size * sizeof(T));
|
||||||
|
}
|
||||||
|
// cal offset
|
||||||
|
size_t shape_offset = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0)[axis];
|
||||||
|
std::vector<size_t> offset(input_num, 0);
|
||||||
|
for (size_t i = 1; i < input_num; i++) {
|
||||||
|
input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
|
||||||
|
if (input_shape.size() != rank) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << " the dimension of input should be equal, but got:"
|
||||||
|
<< " the dimension of the " << i << "'th input: " << input_shape.size()
|
||||||
|
<< " and the dimension of the first input: " << rank;
|
||||||
|
}
|
||||||
|
offset[i] = shape_offset;
|
||||||
|
shape_offset += input_shape[axis];
|
||||||
|
}
|
||||||
|
constexpr size_t kConcatOffsetOutputShapeSize = 2;
|
||||||
|
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||||
|
if (output_shape.size() != kConcatOffsetOutputShapeSize) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of output should be "
|
||||||
|
<< kConcatOffsetOutputShapeSize << ", but got:" << output_shape.size();
|
||||||
|
}
|
||||||
|
if (output_shape[0] != input_num) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
||||||
|
<< "', the first dimension value of output should be equal to "
|
||||||
|
"the number of input, but got the first dimension value of output: "
|
||||||
|
<< output_shape[0] << ", and the number of input: " << input_num;
|
||||||
|
}
|
||||||
|
if (output_shape[1] != rank) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
||||||
|
<< "', the second dimension value of output should be equal to "
|
||||||
|
"the dimension of input, but got the second dimension value of output: "
|
||||||
|
<< output_shape[1] << ", and the dimension of input: " << rank;
|
||||||
|
}
|
||||||
|
auto output_size = input_num * rank;
|
||||||
|
out_offset_.assign(output_size, 0);
|
||||||
|
for (size_t i = 0; i < input_num; ++i) {
|
||||||
|
out_offset_[i * rank + axis] = offset[i];
|
||||||
|
}
|
||||||
|
output_size_list_.push_back(out_offset_.size() * sizeof(S));
|
||||||
|
InitSizeLists();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ResetResource() noexcept override {
|
||||||
|
ResetSizeLists();
|
||||||
|
out_offset_.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void InitSizeLists() override {}
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool CheckParam(const CNodePtr &kernel_node) {
|
||||||
|
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||||
|
if (output_num != 1) {
|
||||||
|
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 1, but got " << output_num;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<S> out_offset_;
|
||||||
|
};
|
||||||
|
} // namespace kernel
|
||||||
|
} // namespace mindspore
|
||||||
|
|
||||||
|
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_OTHER_CONCAT_OFFSET_GPU_KERNEL_H_
|
|
@ -0,0 +1,132 @@
|
||||||
|
# Copyright 2022 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from mindspore import ops, nn, ParameterTuple, context, set_seed
|
||||||
|
from mindspore.train import DatasetHelper, connect_network_with_dataset
|
||||||
|
import mindspore.dataset as ds
|
||||||
|
|
||||||
|
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
|
||||||
|
set_seed(2)
|
||||||
|
|
||||||
|
|
||||||
|
def _exec_preprocess(network, is_train, dataset, dataset_sink_mode, epoch_num, sink_size):
|
||||||
|
if dataset_sink_mode and not is_train:
|
||||||
|
dataset.__loop_size__ = 1
|
||||||
|
|
||||||
|
dataset_helper = DatasetHelper(
|
||||||
|
dataset, dataset_sink_mode, sink_size, epoch_num)
|
||||||
|
|
||||||
|
if dataset_sink_mode:
|
||||||
|
network = connect_network_with_dataset(network, dataset_helper)
|
||||||
|
|
||||||
|
return dataset_helper, network
|
||||||
|
|
||||||
|
|
||||||
|
def dynamic_shape_sink_process(network, dataset, is_train=True):
|
||||||
|
dataset_sink_mode = True
|
||||||
|
sink_size = 1
|
||||||
|
epoch_num = 1
|
||||||
|
dataset_helper, network = _exec_preprocess(
|
||||||
|
network, is_train, dataset, dataset_sink_mode, epoch_num, sink_size)
|
||||||
|
network.set_train(is_train)
|
||||||
|
for inputs in dataset_helper:
|
||||||
|
outputs = network(*inputs)
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
|
def fixed_shape_process(network, dataset, is_train=True):
|
||||||
|
network.set_train(is_train)
|
||||||
|
for inputs in dataset.create_tuple_iterator():
|
||||||
|
outputs = network(*inputs)
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
|
def dataset_generator(data_list):
|
||||||
|
for data in data_list:
|
||||||
|
yield data
|
||||||
|
|
||||||
|
|
||||||
|
def get_columns(tensor_num):
|
||||||
|
columns = []
|
||||||
|
for i in range(tensor_num):
|
||||||
|
columns.append("data" + str(i))
|
||||||
|
return columns
|
||||||
|
|
||||||
|
|
||||||
|
def compare(output, expect):
|
||||||
|
if isinstance(output, (tuple, list)):
|
||||||
|
assert isinstance(expect, (tuple, list))
|
||||||
|
for output_, expect_ in zip(output, expect):
|
||||||
|
if not compare(output_, expect_):
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
if not np.allclose(output.asnumpy(), expect.asnumpy(), rtol=1.0e-4, atol=1.0e-4):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
class GradNetWrtX(nn.Cell):
|
||||||
|
def __init__(self, net):
|
||||||
|
super(GradNetWrtX, self).__init__()
|
||||||
|
self.net = net
|
||||||
|
self.grad_op = ops.GradOperation(
|
||||||
|
get_all=True, get_by_list=True, sens_param=True)
|
||||||
|
self.params = ParameterTuple(net.trainable_params())
|
||||||
|
|
||||||
|
def construct(self, *inputs):
|
||||||
|
gradient_function = self.grad_op(self.net, self.params)
|
||||||
|
return gradient_function(*inputs)
|
||||||
|
|
||||||
|
|
||||||
|
class ConcatNet(nn.Cell):
|
||||||
|
def __init__(self, axis):
|
||||||
|
super(ConcatNet, self).__init__()
|
||||||
|
self.op = ops.Concat(axis)
|
||||||
|
|
||||||
|
def construct(self, x1, x2):
|
||||||
|
return self.op((x1, x2))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.level0
|
||||||
|
@pytest.mark.platform_x86_gpu_training
|
||||||
|
@pytest.mark.env_onecard
|
||||||
|
def test_dynamic_concat():
|
||||||
|
"""
|
||||||
|
Feature: Test Concat and its backward.
|
||||||
|
Description: The shape of inputs is dynamic.
|
||||||
|
Expectation: Assert that results are consistent with fixed shape.
|
||||||
|
"""
|
||||||
|
axis = 1
|
||||||
|
dtype = np.float32
|
||||||
|
data_list = []
|
||||||
|
for i in [2, 64]:
|
||||||
|
data = []
|
||||||
|
data.append(np.random.rand(i, 16).astype(dtype))
|
||||||
|
data.append(np.random.rand(i, 32).astype(dtype))
|
||||||
|
data.append(np.random.rand(i, 48).astype(dtype))
|
||||||
|
data_list.append(tuple(data))
|
||||||
|
column_names = get_columns(len(data_list[0]))
|
||||||
|
dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
|
||||||
|
dataset.set_dynamic_columns(
|
||||||
|
columns={column_names[0]: [None, 16], column_names[1]: [None, 32], column_names[2]: [None, 48]})
|
||||||
|
|
||||||
|
net = GradNetWrtX(ConcatNet(axis))
|
||||||
|
|
||||||
|
gradients = dynamic_shape_sink_process(net, dataset)
|
||||||
|
gradients_cmp = fixed_shape_process(net, dataset)
|
||||||
|
assert compare(gradients, gradients_cmp)
|
Loading…
Reference in New Issue