From d954f45dd7ed382f864c99f05a61d88ea4712281 Mon Sep 17 00:00:00 2001 From: ZhidanLiu Date: Wed, 14 Sep 2022 18:12:51 +0800 Subject: [PATCH] reconstruct and add data type of BroadcastTo --- .jenkins/check/config/filter_cpplint.txt | 1 + .../cpu/kernel/broadcast_to_cpu_kernel.cc | 52 +++--- .../cpu/kernel/broadcast_to_cpu_kernel.h | 15 +- .../kernel/arrays/broadcast_to_gpu_kernel.cc | 168 ++++++++++++++++-- .../kernel/arrays/broadcast_to_gpu_kernel.h | 107 ++++------- .../cuda_impl/cuda_ops/broadcast_impl.cu | 81 +++++---- .../other/dynamic_broadcastto_gpu_kernel.cc | 77 -------- tests/st/ops/cpu/test_broadcast_to_op.py | 26 +++ tests/st/ops/gpu/test_broadcast_to_ops.py | 26 +++ 9 files changed, 314 insertions(+), 239 deletions(-) delete mode 100644 mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.cc diff --git a/.jenkins/check/config/filter_cpplint.txt b/.jenkins/check/config/filter_cpplint.txt index f81d8e9fb79..24eb9a7d15d 100644 --- a/.jenkins/check/config/filter_cpplint.txt +++ b/.jenkins/check/config/filter_cpplint.txt @@ -9,6 +9,7 @@ "mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_callback_register.cc" "runtime/references" "mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_manager.h" "runtime/references" "mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_reporter.h" "runtime/references" +"mindspore/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc" "whitespace/braces" "mindspore/mindspore/core/mindrt/src/actor/actormgr.h" "runtime/references" "mindspore/mindspore/core/mindrt/src/actor/actorpolicyinterface.h" "runtime/references" "mindspore/mindspore/core/mindrt/src/actor/actorthread.h" "runtime/references" diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.cc index 2742fbe1be1..60bc3f2cec0 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.cc @@ -14,18 +14,16 @@ * limitations under the License. */ -#include "plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h" #include #include #include "plugin/device/cpu/kernel/nnacl/errorcode.h" +#include "plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h" namespace mindspore { namespace kernel { namespace { -#ifndef _MSC_VER using complex64 = __complex__ float; using complex128 = __complex__ double; -#endif constexpr size_t kBroadcastToOutputsNum = 1; } // namespace @@ -48,16 +46,16 @@ std::map}, {KernelAttr().AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64), &BroadcastToCpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + &BroadcastToCpuKernelMod::LaunchKernel}, {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), &BroadcastToCpuKernelMod::LaunchKernel}, {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64), &BroadcastToCpuKernelMod::LaunchKernel}, -#ifndef _MSC_VER {KernelAttr().AddInputAttr(kNumberTypeComplex64).AddOutputAttr(kNumberTypeComplex64), &BroadcastToCpuKernelMod::LaunchKernel}, {KernelAttr().AddInputAttr(kNumberTypeComplex128).AddOutputAttr(kNumberTypeComplex128), &BroadcastToCpuKernelMod::LaunchKernel}, -#endif {KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), &BroadcastToCpuKernelMod::LaunchKernel}}}, {kDynamicBroadcastTo, @@ -66,16 +64,13 @@ std::map}, {KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool), - &BroadcastToCpuKernelMod::LaunchKernel}, - {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32), - &BroadcastToCpuKernelMod::LaunchKernel}, - {KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32), - &BroadcastToCpuKernelMod::LaunchKernel}, - {KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool), &BroadcastToCpuKernelMod::LaunchKernel}}}}; -void BroadcastToCpuKernelMod::InitTaskFunc(const CNodePtr &kernel_node) { - kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node); +bool BroadcastToCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector &inputs, + const std::vector &outputs) { + MS_EXCEPTION_IF_NULL(base_operator); + kernel_name_ = base_operator->name(); + if (kernel_name_ != kernel_type_) { MS_LOG(EXCEPTION) << "Suppose to be " << kernel_type_ << " but got " << kernel_name_; } @@ -85,22 +80,27 @@ void BroadcastToCpuKernelMod::InitTaskFunc(const CNodePtr &kernel_node) { MS_LOG(EXCEPTION) << "BroadcastTo cpu does not support " << kernel_type_; } - auto kernel_attr = GetKernelAttrFromNode(kernel_node); + auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs); auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport()); if (!is_match) { - MS_LOG(EXCEPTION) << "BroadcastTo does not support this kernel data type: " << kernel_attr; + MS_LOG(ERROR) << "For '" << kernel_name_ << "' does not support this kernel type: " << kernel_attr; + return false; } kernel_func_ = func_list_[kernel_type_][index].second; + return true; } -void BroadcastToCpuKernelMod::InitKernel(const CNodePtr &kernel_node) { - MS_EXCEPTION_IF_NULL(kernel_node); - kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node); - input_shape_ = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); - output_shape_ = common::AnfAlgo::GetOutputInferShape(kernel_node, 0); - if (AnfAlgo::IsShapesDynamic({input_shape_, output_shape_})) { - return; +int BroadcastToCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector &inputs, + const std::vector &outputs, + const std::map &) { + input_shape_ = inputs[kIndex0]->GetShapeVector(); + output_shape_ = outputs[kIndex0]->GetShapeVector(); + + auto it_x = std::find_if(input_shape_.begin(), input_shape_.end(), [](int64_t sh) { return sh < 0; }); + if (it_x != input_shape_.end()) { + return KRET_UNKNOWN_SHAPE; } + size_t input_shape_size = input_shape_.size(); size_t output_shape_size = output_shape_.size(); @@ -112,8 +112,8 @@ void BroadcastToCpuKernelMod::InitKernel(const CNodePtr &kernel_node) { } shape_info_.input_shape_size_ = SizeToInt(input_shape_size); shape_info_.output_shape_size_ = SizeToInt(output_shape_size); - - InitTaskFunc(kernel_node); + int ret = KernelMod::Resize(base_operator, inputs, outputs); + return ret; } void BroadcastToCpuKernelMod::CheckArgs() { @@ -175,16 +175,16 @@ bool BroadcastToCpuKernelMod::LaunchKernel(const std::vector &inputs status = BroadcastToSize32(input_addr, &shape_info_, output_addr); } else if constexpr (std::is_same_v) { status = BroadcastToSize64(input_addr, &shape_info_, output_addr); + } else if constexpr (std::is_same_v) { + status = BroadcastToSize16(input_addr, &shape_info_, output_addr); } else if constexpr (std::is_same_v) { status = BroadcastToSize32(input_addr, &shape_info_, output_addr); } else if constexpr (std::is_same_v) { status = BroadcastToSize64(input_addr, &shape_info_, output_addr); -#ifndef _MSC_VER } else if constexpr (std::is_same_v) { status = BroadcastToSize64(input_addr, &shape_info_, output_addr); } else if constexpr (std::is_same_v) { status = BroadcastToSize128(input_addr, &shape_info_, output_addr); -#endif } else { MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', not supported data type, the dtype of input must be bool, int, complex, float or double"; diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h index c38b05a291c..8906afc6fc9 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_ #include #include @@ -32,7 +32,7 @@ namespace kernel { constexpr auto kBroadcastTo = "BroadcastTo"; constexpr auto kDynamicBroadcastTo = "DynamicBroadcastTo"; constexpr auto kUnknown = "Unknown"; -class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod { +class BroadcastToCpuKernelMod : public NativeCpuKernelMod { public: BroadcastToCpuKernelMod() = default; explicit BroadcastToCpuKernelMod(const std::string &kernel_type) : kernel_type_(kernel_type) {} @@ -43,7 +43,12 @@ class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod { return kernel_func_(this, inputs, workspace, outputs); } - void InitKernel(const CNodePtr &kernel_node) override; + bool Init(const BaseOperatorPtr &base_operator, const std::vector &inputs, + const std::vector &outputs) override; + + int Resize(const BaseOperatorPtr &base_operator, const std::vector &inputs, + const std::vector &outputs, + const std::map &inputsOnHost) override; void CheckArgs(); @@ -68,4 +73,4 @@ class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod { } // namespace kernel } // namespace mindspore -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc index f68840c9d51..ced6cb6eaaa 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc @@ -15,22 +15,162 @@ */ #include "plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h" namespace mindspore { namespace kernel { -MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64), - BroadcastToGpuKernelMod, double) -MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), - BroadcastToGpuKernelMod, float) -MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), - BroadcastToGpuKernelMod, half) -MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), - BroadcastToGpuKernelMod, int16_t) -MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), - BroadcastToGpuKernelMod, int32_t) -MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), - BroadcastToGpuKernelMod, int64_t) -MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), - BroadcastToGpuKernelMod, bool) +bool BroadcastToGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector &inputs, + const std::vector &outputs) { + kernel_name_ = base_operator->GetPrim()->name(); + auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs); + auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport()); + if (!is_match) { + MS_LOG(ERROR) << "For '" << kernel_name_ << "', it does not support this kernel data type: " << kernel_attr; + return false; + } + kernel_func_ = func_list_[index].second; + input_type_size_ = abstract::TypeIdSize(kernel_attr.GetInputAttr(kIndex0).first); + return true; +} +void BroadcastToGpuKernelMod::ResetResource() noexcept { + input_size_ = 1; + output_size_ = 1; + for (size_t i = 0; i < SHAPE_SIZE; ++i) { + input_shape_[i] = 1; + output_shape_[i] = 1; + } + is_null_input_ = false; +} + +int BroadcastToGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector &inputs, + const std::vector &outputs, + const std::map &inputsOnHost) { + ResetResource(); + auto input_shapes = inputs[kIndex0]->GetShapeVector(); + auto output_shapes = outputs[kIndex0]->GetShapeVector(); + + auto it_x = std::find_if(input_shapes.begin(), input_shapes.end(), [](int64_t sh) { return sh < 0; }); + if (it_x != input_shapes.end()) { + return KRET_UNKNOWN_SHAPE; + } + + if (input_shapes.size() > SHAPE_SIZE || output_shapes.size() > SHAPE_SIZE) { + MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input and output cannot be greater than " + << SHAPE_SIZE << ", but got the dimension of input: " << input_shapes.size() + << ", the dimension of output: " << output_shapes.size(); + } + + if (output_shapes.size() < input_shapes.size()) { + MS_LOG(EXCEPTION) << "For '" << kernel_name_ + << "', the dimension of output cannot be less than the dimension of input " + << ", but got the dimension of input: " << input_shapes.size() + << ", the dimension of output: " << output_shapes.size(); + } + size_t offset = output_shapes.size() - input_shapes.size(); + for (size_t i = 0; i < input_shapes.size(); i++) { + input_shape_[i + offset] = LongToSizeClipNeg(input_shapes[i]); + } + + for (size_t j = 0; j < output_shapes.size(); j++) { + output_shape_[j] = LongToSizeClipNeg(output_shapes[j]); + } + + input_size_ = std::accumulate(input_shape_.begin(), input_shape_.end(), size_t(1), std::multiplies{}); + output_size_ = std::accumulate(output_shape_.begin(), output_shape_.end(), size_t(1), std::multiplies{}); + + input_size_list_.clear(); + output_size_list_.clear(); + input_size_list_.push_back(input_size_ * input_type_size_); + output_size_list_.push_back(output_size_ * input_type_size_); + + // if (auto ret = KernelMod::Resize(base_operator, inputs, outputs, inputsOnHost); ret != KRET_OK) { + // return ret; + // } + return KRET_OK; +} + +template +bool BroadcastToGpuKernelMod::LaunchKernel(const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) { + if (is_null_input_) { + return true; + } + T *input_addr = GetDeviceAddress(inputs, 0); + T *output_addr = GetDeviceAddress(outputs, 0); + + BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], input_shape_[4], input_shape_[5], + input_shape_[6], input_shape_[7], output_shape_[0], output_shape_[1], output_shape_[2], output_shape_[3], + output_shape_[4], output_shape_[5], output_shape_[6], output_shape_[7], input_addr, output_addr, + reinterpret_cast(stream_ptr)); + return true; +} + +std::vector> BroadcastToGpuKernelMod::func_list_ = + { + {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeComplex64).AddOutputAttr(kNumberTypeComplex64), + &BroadcastToGpuKernelMod::LaunchKernel>}, + {KernelAttr().AddInputAttr(kNumberTypeComplex128).AddOutputAttr(kNumberTypeComplex128), + &BroadcastToGpuKernelMod::LaunchKernel>}, + {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat64), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt16), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + &BroadcastToGpuKernelMod::LaunchKernel}, + {KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt64), + &BroadcastToGpuKernelMod::LaunchKernel}, +}; + +std::vector BroadcastToGpuKernelMod::GetOpSupport() { + std::vector support_list; + (void)std::transform( + func_list_.begin(), func_list_.end(), std::back_inserter(support_list), + [](const std::pair &pair) { return pair.first; }); + return support_list; +} + +MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, BroadcastTo, BroadcastToGpuKernelMod); +MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, DynamicBroadcastTo, BroadcastToGpuKernelMod); } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h index 3d4d869bd58..531ca400b26 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h @@ -14,11 +14,15 @@ * limitations under the License. */ -#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_ -#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_ #include #include +#include +#include +#include +#include #include "plugin/device/gpu/kernel/gpu_kernel.h" #include "plugin/device/gpu/kernel/gpu_kernel_factory.h" #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh" @@ -26,93 +30,44 @@ namespace mindspore { namespace kernel { constexpr size_t SHAPE_SIZE = 8; -template -class BroadcastToGpuKernelMod : public DeprecatedNativeGpuKernelMod { +class BroadcastToGpuKernelMod : public NativeGpuKernelMod { public: BroadcastToGpuKernelMod() : kernel_name_("BroadcastTo") {} ~BroadcastToGpuKernelMod() = default; - bool Launch(const std::vector &inputs, const std::vector &, + bool Launch(const std::vector &inputs, const std::vector &workspace, const std::vector &outputs, void *stream_ptr) override { - if (is_null_input_) { - return true; - } - T *input_addr = GetDeviceAddress(inputs, 0); - T *output_addr = GetDeviceAddress(outputs, 0); - - BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], input_shape_[4], input_shape_[5], - input_shape_[6], input_shape_[7], output_shape_[0], output_shape_[1], output_shape_[2], - output_shape_[3], output_shape_[4], output_shape_[5], output_shape_[6], output_shape_[7], input_addr, - output_addr, reinterpret_cast(stream_ptr)); - return true; - } - bool Init(const CNodePtr &kernel_node) override { - kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node); - auto input_shapes = AnfAlgo::GetInputDeviceShape(kernel_node, 0); - auto output_shapes = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); - kernel_node_ = kernel_node; - if (AnfAlgo::IsShapesDynamic({input_shapes, output_shapes})) { - return true; - } - - is_null_input_ = - CHECK_SHAPE_NULL(input_shapes, kernel_name_, "input") || CHECK_SHAPE_NULL(output_shapes, kernel_name_, "output"); - if (is_null_input_) { - InitSizeLists(); - return true; - } - if (input_shapes.size() > SHAPE_SIZE || output_shapes.size() > SHAPE_SIZE) { - MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input and output cannot be greater than " - << SHAPE_SIZE << ", but got the dimension of input: " << input_shapes.size() - << ", the dimension of output: " << output_shapes.size(); - } - - if (output_shapes.size() < input_shapes.size()) { - MS_LOG(EXCEPTION) << "For '" << kernel_name_ - << "', the dimension of output cannot be less than the dimension of input " - << ", but got the dimension of input: " << input_shapes.size() - << ", the dimension of output: " << output_shapes.size(); - } - - size_t offset = output_shapes.size() - input_shapes.size(); - for (size_t i = 0; i < input_shapes.size(); i++) { - input_shape_[i + offset] = LongToSizeClipNeg(input_shapes[i]); - } - - for (size_t j = 0; j < output_shapes.size(); j++) { - output_shape_[j] = LongToSizeClipNeg(output_shapes[j]); - } - - InitSizeLists(); - return true; + return kernel_func_(this, inputs, workspace, outputs, stream_ptr); } - void ResetResource() noexcept override { - ResetSizeLists(); - for (size_t i = 0; i < SHAPE_SIZE; ++i) { - input_shape_[i] = 1; - output_shape_[i] = 1; - } - is_null_input_ = false; - } + bool Init(const BaseOperatorPtr &base_operator, const std::vector &inputs, + const std::vector &outputs) override; + + int Resize(const BaseOperatorPtr &base_operator, const std::vector &inputs, + const std::vector &outputs, const std::map &) override; protected: - void InitSizeLists() override { - input_size_list_.clear(); - output_size_list_.clear(); - input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * input_shape_[4] * - input_shape_[5] * input_shape_[6] * input_shape_[7] * sizeof(T)); - output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * - output_shape_[4] * output_shape_[5] * output_shape_[6] * output_shape_[7] * sizeof(T)); - } + std::vector GetOpSupport() override; + template + bool LaunchKernel(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, void *stream_ptr); + using BroadcastToLaunchFunc = + std::function &, + const std::vector &, const std::vector &, void *)>; private: - size_t input_shape_[SHAPE_SIZE] = {1, 1, 1, 1, 1, 1, 1, 1}; - size_t output_shape_[SHAPE_SIZE] = {1, 1, 1, 1, 1, 1, 1, 1}; + std::string kernel_name_{}; + BroadcastToLaunchFunc kernel_func_; + void ResetResource() noexcept; + static std::vector> func_list_; + size_t input_size_; + size_t output_size_; + size_t input_type_size_; // sizeof(T) + std::vector input_shape_ = {1, 1, 1, 1, 1, 1, 1, 1}; + std::vector output_shape_ = {1, 1, 1, 1, 1, 1, 1, 1}; bool is_null_input_ = false; - std::string kernel_name_; }; } // namespace kernel } // namespace mindspore -#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_ +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_ diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu index 2b53763d88c..d8b04172f48 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu @@ -126,31 +126,11 @@ struct MaximumFunc { __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return lhs > rhs ? lhs : rhs; } }; -#ifndef _WIN32 template struct PowerFunc { __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return pow(lhs, rhs); } }; -#else -template -struct PowerFunc { - __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { - return static_cast(pow(static_cast(lhs), static_cast(rhs))); - } -}; - -template <> -struct PowerFunc { - __device__ __host__ __forceinline__ float operator()(const float &lhs, const float &rhs) { return pow(lhs, rhs); } -}; - -template <> -struct PowerFunc { - __device__ __host__ __forceinline__ double operator()(const double &lhs, const double &rhs) { return pow(lhs, rhs); } -}; -#endif - template <> struct PowerFunc { __device__ __host__ __forceinline__ half operator()(const half &lhs, const half &rhs) { @@ -479,7 +459,7 @@ struct XDivyFunc { // XLogy check if lhs is less than epsilon, XLogy support half, float, double -template +template struct XLogyFunc { // default T is float __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { @@ -494,26 +474,6 @@ struct XLogyFunc { return res; } }; -#ifdef _WIN32 -template -struct XLogyFunc::value>::type> { - __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { - double tmpLhs = static_cast(lhs); - double tmpRhs = static_cast(rhs); - return tmpLhs < kFloatEplison && tmpLhs > -kFloatEplison ? 0.0 : (tmpLhs * log(tmpRhs)); - } -}; - -template <> -struct XLogyFunc { - __device__ __host__ __forceinline__ bool operator()(const bool &lhs, const bool &rhs) { - if (!lhs || !rhs) { - return false; - } - return true; - } -}; -#endif template <> struct XLogyFunc> { @@ -1802,6 +1762,12 @@ template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, co const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7, const half *input_addr, half *output_addr, cudaStream_t stream); + +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7, + const int8_t *input_addr, int8_t *output_addr, cudaStream_t stream); template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7, const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, @@ -1817,8 +1783,41 @@ template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, co const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7, const int64_t *input_addr, int64_t *output_addr, cudaStream_t stream); + +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7, + const uint8_t *input_addr, uint8_t *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7, + const uint16_t *input_addr, uint16_t *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7, + const uint32_t *input_addr, uint32_t *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7, + const uint64_t *input_addr, uint64_t *output_addr, cudaStream_t stream); template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7, const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7, const bool *input_addr, bool *output_addr, cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7, + const Complex *input_addr, Complex *output_addr, + cudaStream_t stream); +template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3, + const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7, + const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3, + const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7, + const Complex *input_addr, Complex *output_addr, + cudaStream_t stream); diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.cc deleted file mode 100644 index 7db342b0d6f..00000000000 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.cc +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright 2021-2022 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h" - -namespace mindspore { -namespace kernel { -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64), - BroadcastToGpuKernelMod, double) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32), - BroadcastToGpuKernelMod, float) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16), - BroadcastToGpuKernelMod, half) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16), - BroadcastToGpuKernelMod, int16_t) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32), - BroadcastToGpuKernelMod, int32_t) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), - BroadcastToGpuKernelMod, int64_t) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat64), - BroadcastToGpuKernelMod, double) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), - BroadcastToGpuKernelMod, float) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16), - BroadcastToGpuKernelMod, half) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt16), - BroadcastToGpuKernelMod, int16_t) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), - BroadcastToGpuKernelMod, int32_t) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt64), - BroadcastToGpuKernelMod, int64_t) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool), - BroadcastToGpuKernelMod, bool) -MS_REG_GPU_KERNEL_ONE( - DynamicBroadcastTo, - KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool), - BroadcastToGpuKernelMod, bool) -} // namespace kernel -} // namespace mindspore diff --git a/tests/st/ops/cpu/test_broadcast_to_op.py b/tests/st/ops/cpu/test_broadcast_to_op.py index 8f0f9a24067..bbd166cdaa1 100644 --- a/tests/st/ops/cpu/test_broadcast_to_op.py +++ b/tests/st/ops/cpu/test_broadcast_to_op.py @@ -76,6 +76,32 @@ def test_broadcast(): assert np.allclose(output.asnumpy(), expect) +def broadcast_to_dtype(dtype): + """ + Basic function to test data type of BroadcastTo. + """ + shape = (2, 3, 4, 5) + x1_np = np.random.rand(4, 5).astype(dtype) + output = P.BroadcastTo(shape)(Tensor(x1_np)) + expect = np.broadcast_to(x1_np, shape) + assert np.allclose(output.asnumpy(), expect) + + +@pytest.mark.level1 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_broadcast_to_dtype(): + """ + Feature: Test supported data types of BroadCastTo. + Description: all data types + Expectation: success. + """ + types = [np.float16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64, + np.uint8, np.uint16, np.uint32, np.uint64, np.complex64, np.complex128] + for dtype in types: + broadcast_to_dtype(dtype=dtype) + + @pytest.mark.level0 @pytest.mark.platform_x86_cpu @pytest.mark.env_onecard diff --git a/tests/st/ops/gpu/test_broadcast_to_ops.py b/tests/st/ops/gpu/test_broadcast_to_ops.py index 38e040f51ec..416bfd45b6f 100644 --- a/tests/st/ops/gpu/test_broadcast_to_ops.py +++ b/tests/st/ops/gpu/test_broadcast_to_ops.py @@ -70,6 +70,32 @@ def test_broadcast(): assert np.allclose(output.asnumpy(), expect) +def broadcast_to_dtype(dtype): + """ + Basic function to test data type of BroadcastTo. + """ + shape = (2, 3, 4, 5) + x1_np = np.random.rand(4, 5).astype(dtype) + output = P.BroadcastTo(shape)(Tensor(x1_np)) + expect = np.broadcast_to(x1_np, shape) + assert np.allclose(output.asnumpy(), expect) + + +@pytest.mark.level1 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_broadcast_to_dtype(): + """ + Feature: Test supported data types of BroadCastTo. + Description: all data types + Expectation: success. + """ + types = [np.float16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64, + np.uint8, np.uint16, np.uint32, np.uint64, np.complex64, np.complex128] + for dtype in types: + broadcast_to_dtype(dtype=dtype) + + @pytest.mark.level1 @pytest.mark.platform_x86_gpu_training @pytest.mark.env_onecard