From d954f45dd7ed382f864c99f05a61d88ea4712281 Mon Sep 17 00:00:00 2001
From: ZhidanLiu <liuzhidan5@huawei.com>
Date: Wed, 14 Sep 2022 18:12:51 +0800
Subject: [PATCH] reconstruct and add data type of BroadcastTo

---
 .jenkins/check/config/filter_cpplint.txt      |   1 +
 .../cpu/kernel/broadcast_to_cpu_kernel.cc     |  52 +++---
 .../cpu/kernel/broadcast_to_cpu_kernel.h      |  15 +-
 .../kernel/arrays/broadcast_to_gpu_kernel.cc  | 168 ++++++++++++++++--
 .../kernel/arrays/broadcast_to_gpu_kernel.h   | 107 ++++-------
 .../cuda_impl/cuda_ops/broadcast_impl.cu      |  81 +++++----
 .../other/dynamic_broadcastto_gpu_kernel.cc   |  77 --------
 tests/st/ops/cpu/test_broadcast_to_op.py      |  26 +++
 tests/st/ops/gpu/test_broadcast_to_ops.py     |  26 +++
 9 files changed, 314 insertions(+), 239 deletions(-)
 delete mode 100644 mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.cc
diff --git a/.jenkins/check/config/filter_cpplint.txt b/.jenkins/check/config/filter_cpplint.txt
index f81d8e9fb79..24eb9a7d15d 100644
--- a/.jenkins/check/config/filter_cpplint.txt
+++ b/.jenkins/check/config/filter_cpplint.txt
@@ -9,6 +9,7 @@
 "mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_callback_register.cc"      "runtime/references"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_manager.h"                 "runtime/references"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_reporter.h"                 "runtime/references"
+"mindspore/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc"                     "whitespace/braces"
 "mindspore/mindspore/core/mindrt/src/actor/actormgr.h"                                             "runtime/references"
 "mindspore/mindspore/core/mindrt/src/actor/actorpolicyinterface.h"                                 "runtime/references"
 "mindspore/mindspore/core/mindrt/src/actor/actorthread.h"                                          "runtime/references"
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.cc
index 2742fbe1be1..60bc3f2cec0 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.cc
@@ -14,18 +14,16 @@
  * limitations under the License.
  */
 
-#include "plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h"
 #include <algorithm>
 #include <utility>
 #include "plugin/device/cpu/kernel/nnacl/errorcode.h"
+#include "plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h"
 
 namespace mindspore {
 namespace kernel {
 namespace {
-#ifndef _MSC_VER
 using complex64 = __complex__ float;
 using complex128 = __complex__ double;
-#endif
 constexpr size_t kBroadcastToOutputsNum = 1;
 }  // namespace
 
@@ -48,16 +46,16 @@ std::map<std::string, std::vector<std::pair<KernelAttr, BroadcastToCpuKernelMod:
        &BroadcastToCpuKernelMod::LaunchKernel<uint32_t>},
       {KernelAttr().AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
        &BroadcastToCpuKernelMod::LaunchKernel<uint64_t>},
+      {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+       &BroadcastToCpuKernelMod::LaunchKernel<float16>},
       {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
        &BroadcastToCpuKernelMod::LaunchKernel<float>},
       {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
        &BroadcastToCpuKernelMod::LaunchKernel<double>},
-#ifndef _MSC_VER
       {KernelAttr().AddInputAttr(kNumberTypeComplex64).AddOutputAttr(kNumberTypeComplex64),
        &BroadcastToCpuKernelMod::LaunchKernel<complex64>},
       {KernelAttr().AddInputAttr(kNumberTypeComplex128).AddOutputAttr(kNumberTypeComplex128),
        &BroadcastToCpuKernelMod::LaunchKernel<complex128>},
-#endif
       {KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
        &BroadcastToCpuKernelMod::LaunchKernel<bool>}}},
     {kDynamicBroadcastTo,
@@ -66,16 +64,13 @@ std::map<std::string, std::vector<std::pair<KernelAttr, BroadcastToCpuKernelMod:
       {KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
        &BroadcastToCpuKernelMod::LaunchKernel<int>},
       {KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
-       &BroadcastToCpuKernelMod::LaunchKernel<bool>},
-      {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
-       &BroadcastToCpuKernelMod::LaunchKernel<float>},
-      {KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
-       &BroadcastToCpuKernelMod::LaunchKernel<int>},
-      {KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
        &BroadcastToCpuKernelMod::LaunchKernel<bool>}}}};
 
-void BroadcastToCpuKernelMod::InitTaskFunc(const CNodePtr &kernel_node) {
-  kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
+bool BroadcastToCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                                   const std::vector<KernelTensorPtr> &outputs) {
+  MS_EXCEPTION_IF_NULL(base_operator);
+  kernel_name_ = base_operator->name();
+
   if (kernel_name_ != kernel_type_) {
     MS_LOG(EXCEPTION) << "Suppose to be " << kernel_type_ << " but got " << kernel_name_;
   }
@@ -85,22 +80,27 @@ void BroadcastToCpuKernelMod::InitTaskFunc(const CNodePtr &kernel_node) {
     MS_LOG(EXCEPTION) << "BroadcastTo cpu does not support " << kernel_type_;
   }
 
-  auto kernel_attr = GetKernelAttrFromNode(kernel_node);
+  auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
   auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
   if (!is_match) {
-    MS_LOG(EXCEPTION) << "BroadcastTo does not support this kernel data type: " << kernel_attr;
+    MS_LOG(ERROR) << "For '" << kernel_name_ << "' does not support this kernel type: " << kernel_attr;
+    return false;
   }
   kernel_func_ = func_list_[kernel_type_][index].second;
+  return true;
 }
 
-void BroadcastToCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
-  MS_EXCEPTION_IF_NULL(kernel_node);
-  kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
-  input_shape_ = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-  output_shape_ = common::AnfAlgo::GetOutputInferShape(kernel_node, 0);
-  if (AnfAlgo::IsShapesDynamic({input_shape_, output_shape_})) {
-    return;
+int BroadcastToCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                                    const std::vector<KernelTensorPtr> &outputs,
+                                    const std::map<uint32_t, tensor::TensorPtr> &) {
+  input_shape_ = inputs[kIndex0]->GetShapeVector();
+  output_shape_ = outputs[kIndex0]->GetShapeVector();
+
+  auto it_x = std::find_if(input_shape_.begin(), input_shape_.end(), [](int64_t sh) { return sh < 0; });
+  if (it_x != input_shape_.end()) {
+    return KRET_UNKNOWN_SHAPE;
   }
+
   size_t input_shape_size = input_shape_.size();
   size_t output_shape_size = output_shape_.size();
 
@@ -112,8 +112,8 @@ void BroadcastToCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
   }
   shape_info_.input_shape_size_ = SizeToInt(input_shape_size);
   shape_info_.output_shape_size_ = SizeToInt(output_shape_size);
-
-  InitTaskFunc(kernel_node);
+  int ret = KernelMod::Resize(base_operator, inputs, outputs);
+  return ret;
 }
 
 void BroadcastToCpuKernelMod::CheckArgs() {
@@ -175,16 +175,16 @@ bool BroadcastToCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs
     status = BroadcastToSize32(input_addr, &shape_info_, output_addr);
   } else if constexpr (std::is_same_v<T, uint64_t>) {
     status = BroadcastToSize64(input_addr, &shape_info_, output_addr);
+  } else if constexpr (std::is_same_v<T, float16>) {
+    status = BroadcastToSize16(input_addr, &shape_info_, output_addr);
   } else if constexpr (std::is_same_v<T, float>) {
     status = BroadcastToSize32(input_addr, &shape_info_, output_addr);
   } else if constexpr (std::is_same_v<T, double>) {
     status = BroadcastToSize64(input_addr, &shape_info_, output_addr);
-#ifndef _MSC_VER
   } else if constexpr (std::is_same_v<T, complex64>) {
     status = BroadcastToSize64(input_addr, &shape_info_, output_addr);
   } else if constexpr (std::is_same_v<T, complex128>) {
     status = BroadcastToSize128(input_addr, &shape_info_, output_addr);
-#endif
   } else {
     MS_LOG(EXCEPTION) << "For '" << kernel_name_
                       << "', not supported data type, the dtype of input must be bool, int, complex, float or double";
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h
index c38b05a291c..8906afc6fc9 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_
 
 #include <vector>
 #include <map>
@@ -32,7 +32,7 @@ namespace kernel {
 constexpr auto kBroadcastTo = "BroadcastTo";
 constexpr auto kDynamicBroadcastTo = "DynamicBroadcastTo";
 constexpr auto kUnknown = "Unknown";
-class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod {
+class BroadcastToCpuKernelMod : public NativeCpuKernelMod {
  public:
   BroadcastToCpuKernelMod() = default;
   explicit BroadcastToCpuKernelMod(const std::string &kernel_type) : kernel_type_(kernel_type) {}
@@ -43,7 +43,12 @@ class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod {
     return kernel_func_(this, inputs, workspace, outputs);
   }
 
-  void InitKernel(const CNodePtr &kernel_node) override;
+  bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+            const std::vector<KernelTensorPtr> &outputs) override;
+
+  int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+             const std::vector<KernelTensorPtr> &outputs,
+             const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) override;
 
   void CheckArgs();
 
@@ -68,4 +73,4 @@ class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod {
 }  // namespace kernel
 }  // namespace mindspore
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc
index f68840c9d51..ced6cb6eaaa 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc
@@ -15,22 +15,162 @@
  */
 
 #include "plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h"
+#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
 
 namespace mindspore {
 namespace kernel {
-MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
-                      BroadcastToGpuKernelMod, double)
-MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-                      BroadcastToGpuKernelMod, float)
-MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
-                      BroadcastToGpuKernelMod, half)
-MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
-                      BroadcastToGpuKernelMod, int16_t)
-MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
-                      BroadcastToGpuKernelMod, int32_t)
-MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
-                      BroadcastToGpuKernelMod, int64_t)
-MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
-                      BroadcastToGpuKernelMod, bool)
+bool BroadcastToGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                                   const std::vector<KernelTensorPtr> &outputs) {
+  kernel_name_ = base_operator->GetPrim()->name();
+  auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
+  auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
+  if (!is_match) {
+    MS_LOG(ERROR) << "For '" << kernel_name_ << "', it does not support this kernel data type: " << kernel_attr;
+    return false;
+  }
+  kernel_func_ = func_list_[index].second;
+  input_type_size_ = abstract::TypeIdSize(kernel_attr.GetInputAttr(kIndex0).first);
+  return true;
+}
+void BroadcastToGpuKernelMod::ResetResource() noexcept {
+  input_size_ = 1;
+  output_size_ = 1;
+  for (size_t i = 0; i < SHAPE_SIZE; ++i) {
+    input_shape_[i] = 1;
+    output_shape_[i] = 1;
+  }
+  is_null_input_ = false;
+}
+
+int BroadcastToGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                                    const std::vector<KernelTensorPtr> &outputs,
+                                    const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) {
+  ResetResource();
+  auto input_shapes = inputs[kIndex0]->GetShapeVector();
+  auto output_shapes = outputs[kIndex0]->GetShapeVector();
+
+  auto it_x = std::find_if(input_shapes.begin(), input_shapes.end(), [](int64_t sh) { return sh < 0; });
+  if (it_x != input_shapes.end()) {
+    return KRET_UNKNOWN_SHAPE;
+  }
+
+  if (input_shapes.size() > SHAPE_SIZE || output_shapes.size() > SHAPE_SIZE) {
+    MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input and output cannot be greater than "
+                      << SHAPE_SIZE << ", but got the dimension of input: " << input_shapes.size()
+                      << ", the dimension of output: " << output_shapes.size();
+  }
+
+  if (output_shapes.size() < input_shapes.size()) {
+    MS_LOG(EXCEPTION) << "For '" << kernel_name_
+                      << "', the dimension of output cannot be less than the dimension of input "
+                      << ", but got the dimension of input: " << input_shapes.size()
+                      << ", the dimension of output: " << output_shapes.size();
+  }
+  size_t offset = output_shapes.size() - input_shapes.size();
+  for (size_t i = 0; i < input_shapes.size(); i++) {
+    input_shape_[i + offset] = LongToSizeClipNeg(input_shapes[i]);
+  }
+
+  for (size_t j = 0; j < output_shapes.size(); j++) {
+    output_shape_[j] = LongToSizeClipNeg(output_shapes[j]);
+  }
+
+  input_size_ = std::accumulate(input_shape_.begin(), input_shape_.end(), size_t(1), std::multiplies{});
+  output_size_ = std::accumulate(output_shape_.begin(), output_shape_.end(), size_t(1), std::multiplies{});
+
+  input_size_list_.clear();
+  output_size_list_.clear();
+  input_size_list_.push_back(input_size_ * input_type_size_);
+  output_size_list_.push_back(output_size_ * input_type_size_);
+
+  //  if (auto ret = KernelMod::Resize(base_operator, inputs, outputs, inputsOnHost); ret != KRET_OK) {
+  //    return ret;
+  //  }
+  return KRET_OK;
+}
+
+template <typename T>
+bool BroadcastToGpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs,
+                                           const std::vector<AddressPtr> &workspace,
+                                           const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+  if (is_null_input_) {
+    return true;
+  }
+  T *input_addr = GetDeviceAddress<T>(inputs, 0);
+  T *output_addr = GetDeviceAddress<T>(outputs, 0);
+
+  BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], input_shape_[4], input_shape_[5],
+              input_shape_[6], input_shape_[7], output_shape_[0], output_shape_[1], output_shape_[2], output_shape_[3],
+              output_shape_[4], output_shape_[5], output_shape_[6], output_shape_[7], input_addr, output_addr,
+              reinterpret_cast<cudaStream_t>(stream_ptr));
+  return true;
+}
+
+std::vector<std::pair<KernelAttr, BroadcastToGpuKernelMod::BroadcastToLaunchFunc>> BroadcastToGpuKernelMod::func_list_ =
+  {
+    {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
+     &BroadcastToGpuKernelMod::LaunchKernel<double>},
+    {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+     &BroadcastToGpuKernelMod::LaunchKernel<float>},
+    {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+     &BroadcastToGpuKernelMod::LaunchKernel<half>},
+    {KernelAttr().AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
+     &BroadcastToGpuKernelMod::LaunchKernel<int8_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
+     &BroadcastToGpuKernelMod::LaunchKernel<int16_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+     &BroadcastToGpuKernelMod::LaunchKernel<int32_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
+     &BroadcastToGpuKernelMod::LaunchKernel<int64_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
+     &BroadcastToGpuKernelMod::LaunchKernel<uint8_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
+     &BroadcastToGpuKernelMod::LaunchKernel<uint16_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
+     &BroadcastToGpuKernelMod::LaunchKernel<uint32_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
+     &BroadcastToGpuKernelMod::LaunchKernel<uint64_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
+     &BroadcastToGpuKernelMod::LaunchKernel<bool>},
+    {KernelAttr().AddInputAttr(kNumberTypeComplex64).AddOutputAttr(kNumberTypeComplex64),
+     &BroadcastToGpuKernelMod::LaunchKernel<utils::Complex<float>>},
+    {KernelAttr().AddInputAttr(kNumberTypeComplex128).AddOutputAttr(kNumberTypeComplex128),
+     &BroadcastToGpuKernelMod::LaunchKernel<utils::Complex<double>>},
+    {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
+     &BroadcastToGpuKernelMod::LaunchKernel<double>},
+    {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
+     &BroadcastToGpuKernelMod::LaunchKernel<float>},
+    {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
+     &BroadcastToGpuKernelMod::LaunchKernel<half>},
+    {KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16),
+     &BroadcastToGpuKernelMod::LaunchKernel<int16_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
+     &BroadcastToGpuKernelMod::LaunchKernel<int32_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
+     &BroadcastToGpuKernelMod::LaunchKernel<int64_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat64),
+     &BroadcastToGpuKernelMod::LaunchKernel<double>},
+    {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
+     &BroadcastToGpuKernelMod::LaunchKernel<float>},
+    {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
+     &BroadcastToGpuKernelMod::LaunchKernel<half>},
+    {KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt16),
+     &BroadcastToGpuKernelMod::LaunchKernel<int16_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
+     &BroadcastToGpuKernelMod::LaunchKernel<int32_t>},
+    {KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt64),
+     &BroadcastToGpuKernelMod::LaunchKernel<int64_t>},
+};
+
+std::vector<KernelAttr> BroadcastToGpuKernelMod::GetOpSupport() {
+  std::vector<KernelAttr> support_list;
+  (void)std::transform(
+    func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
+    [](const std::pair<KernelAttr, BroadcastToGpuKernelMod::BroadcastToLaunchFunc> &pair) { return pair.first; });
+  return support_list;
+}
+
+MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, BroadcastTo, BroadcastToGpuKernelMod);
+MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, DynamicBroadcastTo, BroadcastToGpuKernelMod);
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h
index 3d4d869bd58..531ca400b26 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h
@@ -14,11 +14,15 @@
  * limitations under the License.
  */
 
-#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_
-#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_
 
 #include <vector>
 #include <string>
+#include <functional>
+#include <utility>
+#include <algorithm>
+#include <map>
 #include "plugin/device/gpu/kernel/gpu_kernel.h"
 #include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
 #include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
@@ -26,93 +30,44 @@
 namespace mindspore {
 namespace kernel {
 constexpr size_t SHAPE_SIZE = 8;
-template <typename T>
-class BroadcastToGpuKernelMod : public DeprecatedNativeGpuKernelMod {
+class BroadcastToGpuKernelMod : public NativeGpuKernelMod {
  public:
   BroadcastToGpuKernelMod() : kernel_name_("BroadcastTo") {}
   ~BroadcastToGpuKernelMod() = default;
 
-  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    if (is_null_input_) {
-      return true;
-    }
-    T *input_addr = GetDeviceAddress<T>(inputs, 0);
-    T *output_addr = GetDeviceAddress<T>(outputs, 0);
-
-    BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], input_shape_[4], input_shape_[5],
-                input_shape_[6], input_shape_[7], output_shape_[0], output_shape_[1], output_shape_[2],
-                output_shape_[3], output_shape_[4], output_shape_[5], output_shape_[6], output_shape_[7], input_addr,
-                output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
-    return true;
-  }
-  bool Init(const CNodePtr &kernel_node) override {
-    kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
-    auto input_shapes = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
-    auto output_shapes = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
-    kernel_node_ = kernel_node;
-    if (AnfAlgo::IsShapesDynamic({input_shapes, output_shapes})) {
-      return true;
-    }
-
-    is_null_input_ =
-      CHECK_SHAPE_NULL(input_shapes, kernel_name_, "input") || CHECK_SHAPE_NULL(output_shapes, kernel_name_, "output");
-    if (is_null_input_) {
-      InitSizeLists();
-      return true;
-    }
-    if (input_shapes.size() > SHAPE_SIZE || output_shapes.size() > SHAPE_SIZE) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input and output cannot be greater than "
-                        << SHAPE_SIZE << ", but got the dimension of input: " << input_shapes.size()
-                        << ", the dimension of output: " << output_shapes.size();
-    }
-
-    if (output_shapes.size() < input_shapes.size()) {
-      MS_LOG(EXCEPTION) << "For '" << kernel_name_
-                        << "', the dimension of output cannot be less than the dimension of input "
-                        << ", but got the dimension of input: " << input_shapes.size()
-                        << ", the dimension of output: " << output_shapes.size();
-    }
-
-    size_t offset = output_shapes.size() - input_shapes.size();
-    for (size_t i = 0; i < input_shapes.size(); i++) {
-      input_shape_[i + offset] = LongToSizeClipNeg(input_shapes[i]);
-    }
-
-    for (size_t j = 0; j < output_shapes.size(); j++) {
-      output_shape_[j] = LongToSizeClipNeg(output_shapes[j]);
-    }
-
-    InitSizeLists();
-    return true;
+    return kernel_func_(this, inputs, workspace, outputs, stream_ptr);
   }
 
-  void ResetResource() noexcept override {
-    ResetSizeLists();
-    for (size_t i = 0; i < SHAPE_SIZE; ++i) {
-      input_shape_[i] = 1;
-      output_shape_[i] = 1;
-    }
-    is_null_input_ = false;
-  }
+  bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+            const std::vector<KernelTensorPtr> &outputs) override;
+
+  int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+             const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) override;
 
  protected:
-  void InitSizeLists() override {
-    input_size_list_.clear();
-    output_size_list_.clear();
-    input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * input_shape_[4] *
-                               input_shape_[5] * input_shape_[6] * input_shape_[7] * sizeof(T));
-    output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] *
-                                output_shape_[4] * output_shape_[5] * output_shape_[6] * output_shape_[7] * sizeof(T));
-  }
+  std::vector<KernelAttr> GetOpSupport() override;
+  template <typename T>
+  bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                    const std::vector<AddressPtr> &outputs, void *stream_ptr);
+  using BroadcastToLaunchFunc =
+    std::function<bool(BroadcastToGpuKernelMod *, const std::vector<kernel::AddressPtr> &,
+                       const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &, void *)>;
 
  private:
-  size_t input_shape_[SHAPE_SIZE] = {1, 1, 1, 1, 1, 1, 1, 1};
-  size_t output_shape_[SHAPE_SIZE] = {1, 1, 1, 1, 1, 1, 1, 1};
+  std::string kernel_name_{};
+  BroadcastToLaunchFunc kernel_func_;
+  void ResetResource() noexcept;
+  static std::vector<std::pair<KernelAttr, BroadcastToLaunchFunc>> func_list_;
+  size_t input_size_;
+  size_t output_size_;
+  size_t input_type_size_;  // sizeof(T)
+  std::vector<size_t> input_shape_ = {1, 1, 1, 1, 1, 1, 1, 1};
+  std::vector<size_t> output_shape_ = {1, 1, 1, 1, 1, 1, 1, 1};
   bool is_null_input_ = false;
-  std::string kernel_name_;
 };
 }  // namespace kernel
 }  // namespace mindspore
 
-#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu
index 2b53763d88c..d8b04172f48 100644
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu
+++ b/mindspore/ccsrc/plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cu
@@ -126,31 +126,11 @@ struct MaximumFunc {
   __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return lhs > rhs ? lhs : rhs; }
 };
 
-#ifndef _WIN32
 template <typename T>
 struct PowerFunc {
   __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return pow(lhs, rhs); }
 };
 
-#else
-template <typename T>
-struct PowerFunc {
-  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
-    return static_cast<T>(pow(static_cast<double>(lhs), static_cast<double>(rhs)));
-  }
-};
-
-template <>
-struct PowerFunc<float> {
-  __device__ __host__ __forceinline__ float operator()(const float &lhs, const float &rhs) { return pow(lhs, rhs); }
-};
-
-template <>
-struct PowerFunc<double> {
-  __device__ __host__ __forceinline__ double operator()(const double &lhs, const double &rhs) { return pow(lhs, rhs); }
-};
-#endif
-
 template <>
 struct PowerFunc<half> {
   __device__ __host__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
@@ -479,7 +459,7 @@ struct XDivyFunc<half2> {
 
 
 // XLogy check if lhs is less than epsilon, XLogy support half, float, double
-template <typename T, typename IsInteger = void>
+template <typename T>
 struct XLogyFunc {
   // default T is float
   __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
@@ -494,26 +474,6 @@ struct XLogyFunc {
     return res;
   }
 };
-#ifdef _WIN32
-template <typename T>
-struct XLogyFunc<T, typename std::enable_if<std::is_integral<T>::value>::type> {
-  __device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
-    double tmpLhs = static_cast<double>(lhs);
-    double tmpRhs = static_cast<double>(rhs);
-    return tmpLhs < kFloatEplison && tmpLhs > -kFloatEplison ? 0.0 : (tmpLhs * log(tmpRhs));
-  }
-};
-
-template <>
-struct XLogyFunc<bool> {
-  __device__ __host__ __forceinline__ bool operator()(const bool &lhs, const bool &rhs) {
-    if (!lhs || !rhs) {
-      return false;
-    }
-    return true;
-  }
-};
-#endif
 
 template <>
 struct XLogyFunc<Complex<float>> {
@@ -1802,6 +1762,12 @@ template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, co
                                           const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
                                           const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
                                           const half *input_addr, half *output_addr, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
+                                          const int8_t *input_addr, int8_t *output_addr, cudaStream_t stream);
 template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
                                           const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
                                           const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
@@ -1817,8 +1783,41 @@ template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, co
                                           const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
                                           const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
                                           const int64_t *input_addr, int64_t *output_addr, cudaStream_t stream);
+
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
+                                          const uint8_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
+                                          const uint16_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
+                                          const uint32_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
+                                          const uint64_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
 template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
                                           const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
                                           const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
                                           const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
                                           const bool *input_addr, bool *output_addr, cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
+                                          const Complex<float> *input_addr, Complex<float> *output_addr,
+                                          cudaStream_t stream);
+template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
+                                          const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
+                                          const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
+                                          const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
+                                          const Complex<double> *input_addr, Complex<double> *output_addr,
+                                          cudaStream_t stream);
diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.cc
deleted file mode 100644
index 7db342b0d6f..00000000000
--- a/mindspore/ccsrc/plugin/device/gpu/kernel/other/dynamic_broadcastto_gpu_kernel.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Copyright 2021-2022 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h"
-
-namespace mindspore {
-namespace kernel {
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
-  BroadcastToGpuKernelMod, double)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
-  BroadcastToGpuKernelMod, float)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
-  BroadcastToGpuKernelMod, half)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16),
-  BroadcastToGpuKernelMod, int16_t)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
-  BroadcastToGpuKernelMod, int32_t)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
-  BroadcastToGpuKernelMod, int64_t)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat64),
-  BroadcastToGpuKernelMod, double)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
-  BroadcastToGpuKernelMod, float)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
-  BroadcastToGpuKernelMod, half)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt16),
-  BroadcastToGpuKernelMod, int16_t)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
-  BroadcastToGpuKernelMod, int32_t)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt64),
-  BroadcastToGpuKernelMod, int64_t)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
-  BroadcastToGpuKernelMod, bool)
-MS_REG_GPU_KERNEL_ONE(
-  DynamicBroadcastTo,
-  KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
-  BroadcastToGpuKernelMod, bool)
-}  // namespace kernel
-}  // namespace mindspore
diff --git a/tests/st/ops/cpu/test_broadcast_to_op.py b/tests/st/ops/cpu/test_broadcast_to_op.py
index 8f0f9a24067..bbd166cdaa1 100644
--- a/tests/st/ops/cpu/test_broadcast_to_op.py
+++ b/tests/st/ops/cpu/test_broadcast_to_op.py
@@ -76,6 +76,32 @@ def test_broadcast():
     assert np.allclose(output.asnumpy(), expect)
 
 
+def broadcast_to_dtype(dtype):
+    """
+    Basic function to test data type of BroadcastTo.
+    """
+    shape = (2, 3, 4, 5)
+    x1_np = np.random.rand(4, 5).astype(dtype)
+    output = P.BroadcastTo(shape)(Tensor(x1_np))
+    expect = np.broadcast_to(x1_np, shape)
+    assert np.allclose(output.asnumpy(), expect)
+
+
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_broadcast_to_dtype():
+    """
+    Feature: Test supported data types of BroadCastTo.
+    Description: all data types
+    Expectation: success.
+    """
+    types = [np.float16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64,
+             np.uint8, np.uint16, np.uint32, np.uint64, np.complex64, np.complex128]
+    for dtype in types:
+        broadcast_to_dtype(dtype=dtype)
+
+
 @pytest.mark.level0
 @pytest.mark.platform_x86_cpu
 @pytest.mark.env_onecard
diff --git a/tests/st/ops/gpu/test_broadcast_to_ops.py b/tests/st/ops/gpu/test_broadcast_to_ops.py
index 38e040f51ec..416bfd45b6f 100644
--- a/tests/st/ops/gpu/test_broadcast_to_ops.py
+++ b/tests/st/ops/gpu/test_broadcast_to_ops.py
@@ -70,6 +70,32 @@ def test_broadcast():
     assert np.allclose(output.asnumpy(), expect)
 
 
+def broadcast_to_dtype(dtype):
+    """
+    Basic function to test data type of BroadcastTo.
+    """
+    shape = (2, 3, 4, 5)
+    x1_np = np.random.rand(4, 5).astype(dtype)
+    output = P.BroadcastTo(shape)(Tensor(x1_np))
+    expect = np.broadcast_to(x1_np, shape)
+    assert np.allclose(output.asnumpy(), expect)
+
+
+@pytest.mark.level1
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_broadcast_to_dtype():
+    """
+    Feature: Test supported data types of BroadCastTo.
+    Description: all data types
+    Expectation: success.
+    """
+    types = [np.float16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64,
+             np.uint8, np.uint16, np.uint32, np.uint64, np.complex64, np.complex128]
+    for dtype in types:
+        broadcast_to_dtype(dtype=dtype)
+
+
 @pytest.mark.level1
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard