forked from mindspore-Ecosystem/mindspore
!41982 reconstruct and add data type of BroadcastTo
Merge pull request !41982 from ZhidanLiu/master
commit 9876005e52
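A minimal usage sketch (not part of the change set) of what the added data-type coverage enables, mirroring the broadcast_to_dtype test added below; the shape and dtype here are illustrative:

import numpy as np
from mindspore import Tensor
from mindspore.ops import operations as P

# complex64/complex128 are among the data types this commit registers for the
# BroadcastTo CPU and GPU kernels; broadcast a (4, 5) array to (2, 3, 4, 5).
x = np.random.rand(4, 5).astype(np.complex64)
out = P.BroadcastTo((2, 3, 4, 5))(Tensor(x))
assert np.allclose(out.asnumpy(), np.broadcast_to(x, (2, 3, 4, 5)))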
@@ -9,6 +9,7 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_callback_register.cc" "runtime/references"
"mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_manager.h" "runtime/references"
"mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_reporter.h" "runtime/references"
"mindspore/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc" "whitespace/braces"
"mindspore/mindspore/core/mindrt/src/actor/actormgr.h" "runtime/references"
"mindspore/mindspore/core/mindrt/src/actor/actorpolicyinterface.h" "runtime/references"
"mindspore/mindspore/core/mindrt/src/actor/actorthread.h" "runtime/references"
@@ -14,18 +14,16 @@
 * limitations under the License.
 */

#include "plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h"
#include <algorithm>
#include <utility>
#include "plugin/device/cpu/kernel/nnacl/errorcode.h"
#include "plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h"

namespace mindspore {
namespace kernel {
namespace {
#ifndef _MSC_VER
using complex64 = __complex__ float;
using complex128 = __complex__ double;
#endif
constexpr size_t kBroadcastToOutputsNum = 1;
}  // namespace

@@ -48,16 +46,16 @@ std::map<std::string, std::vector<std::pair<KernelAttr, BroadcastToCpuKernelMod:
&BroadcastToCpuKernelMod::LaunchKernel<uint32_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
&BroadcastToCpuKernelMod::LaunchKernel<uint64_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
&BroadcastToCpuKernelMod::LaunchKernel<float16>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
&BroadcastToCpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
&BroadcastToCpuKernelMod::LaunchKernel<double>},
#ifndef _MSC_VER
{KernelAttr().AddInputAttr(kNumberTypeComplex64).AddOutputAttr(kNumberTypeComplex64),
&BroadcastToCpuKernelMod::LaunchKernel<complex64>},
{KernelAttr().AddInputAttr(kNumberTypeComplex128).AddOutputAttr(kNumberTypeComplex128),
&BroadcastToCpuKernelMod::LaunchKernel<complex128>},
#endif
{KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
&BroadcastToCpuKernelMod::LaunchKernel<bool>}}},
{kDynamicBroadcastTo,

@@ -66,16 +64,13 @@ std::map<std::string, std::vector<std::pair<KernelAttr, BroadcastToCpuKernelMod:
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
&BroadcastToCpuKernelMod::LaunchKernel<int>},
{KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
&BroadcastToCpuKernelMod::LaunchKernel<bool>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
&BroadcastToCpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
&BroadcastToCpuKernelMod::LaunchKernel<int>},
{KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
&BroadcastToCpuKernelMod::LaunchKernel<bool>}}}};

void BroadcastToCpuKernelMod::InitTaskFunc(const CNodePtr &kernel_node) {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
bool BroadcastToCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
MS_EXCEPTION_IF_NULL(base_operator);
kernel_name_ = base_operator->name();

if (kernel_name_ != kernel_type_) {
MS_LOG(EXCEPTION) << "Suppose to be " << kernel_type_ << " but got " << kernel_name_;
}

@@ -85,22 +80,27 @@ void BroadcastToCpuKernelMod::InitTaskFunc(const CNodePtr &kernel_node) {
MS_LOG(EXCEPTION) << "BroadcastTo cpu does not support " << kernel_type_;
}

auto kernel_attr = GetKernelAttrFromNode(kernel_node);
auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
if (!is_match) {
MS_LOG(EXCEPTION) << "BroadcastTo does not support this kernel data type: " << kernel_attr;
MS_LOG(ERROR) << "For '" << kernel_name_ << "' does not support this kernel type: " << kernel_attr;
return false;
}
kernel_func_ = func_list_[kernel_type_][index].second;
return true;
}

void BroadcastToCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
input_shape_ = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
output_shape_ = common::AnfAlgo::GetOutputInferShape(kernel_node, 0);
if (AnfAlgo::IsShapesDynamic({input_shape_, output_shape_})) {
return;
int BroadcastToCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &) {
input_shape_ = inputs[kIndex0]->GetShapeVector();
output_shape_ = outputs[kIndex0]->GetShapeVector();

auto it_x = std::find_if(input_shape_.begin(), input_shape_.end(), [](int64_t sh) { return sh < 0; });
if (it_x != input_shape_.end()) {
return KRET_UNKNOWN_SHAPE;
}

size_t input_shape_size = input_shape_.size();
size_t output_shape_size = output_shape_.size();

@@ -112,8 +112,8 @@ void BroadcastToCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
}
shape_info_.input_shape_size_ = SizeToInt(input_shape_size);
shape_info_.output_shape_size_ = SizeToInt(output_shape_size);

InitTaskFunc(kernel_node);
int ret = KernelMod::Resize(base_operator, inputs, outputs);
return ret;
}

void BroadcastToCpuKernelMod::CheckArgs() {

@@ -175,16 +175,16 @@ bool BroadcastToCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs
status = BroadcastToSize32(input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, uint64_t>) {
status = BroadcastToSize64(input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, float16>) {
status = BroadcastToSize16(input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, float>) {
status = BroadcastToSize32(input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, double>) {
status = BroadcastToSize64(input_addr, &shape_info_, output_addr);
#ifndef _MSC_VER
} else if constexpr (std::is_same_v<T, complex64>) {
status = BroadcastToSize64(input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, complex128>) {
status = BroadcastToSize128(input_addr, &shape_info_, output_addr);
#endif
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_
<< "', not supported data type, the dtype of input must be bool, int, complex, float or double";
@@ -14,8 +14,8 @@
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_

#include <vector>
#include <map>

@@ -32,7 +32,7 @@ namespace kernel {
constexpr auto kBroadcastTo = "BroadcastTo";
constexpr auto kDynamicBroadcastTo = "DynamicBroadcastTo";
constexpr auto kUnknown = "Unknown";
class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod {
class BroadcastToCpuKernelMod : public NativeCpuKernelMod {
public:
BroadcastToCpuKernelMod() = default;
explicit BroadcastToCpuKernelMod(const std::string &kernel_type) : kernel_type_(kernel_type) {}

@@ -43,7 +43,12 @@ class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod {
return kernel_func_(this, inputs, workspace, outputs);
}

void InitKernel(const CNodePtr &kernel_node) override;
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override;

int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) override;

void CheckArgs();

@@ -68,4 +73,4 @@ class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod {
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_
@@ -15,22 +15,162 @@
 */

#include "plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
BroadcastToGpuKernelMod, double)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
BroadcastToGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BroadcastToGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
BroadcastToGpuKernelMod, int16_t)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastToGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
BroadcastToGpuKernelMod, int64_t)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
BroadcastToGpuKernelMod, bool)
bool BroadcastToGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
kernel_name_ = base_operator->GetPrim()->name();
auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
if (!is_match) {
MS_LOG(ERROR) << "For '" << kernel_name_ << "', it does not support this kernel data type: " << kernel_attr;
return false;
}
kernel_func_ = func_list_[index].second;
input_type_size_ = abstract::TypeIdSize(kernel_attr.GetInputAttr(kIndex0).first);
return true;
}
void BroadcastToGpuKernelMod::ResetResource() noexcept {
input_size_ = 1;
output_size_ = 1;
for (size_t i = 0; i < SHAPE_SIZE; ++i) {
input_shape_[i] = 1;
output_shape_[i] = 1;
}
is_null_input_ = false;
}

int BroadcastToGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) {
ResetResource();
auto input_shapes = inputs[kIndex0]->GetShapeVector();
auto output_shapes = outputs[kIndex0]->GetShapeVector();

auto it_x = std::find_if(input_shapes.begin(), input_shapes.end(), [](int64_t sh) { return sh < 0; });
if (it_x != input_shapes.end()) {
return KRET_UNKNOWN_SHAPE;
}

if (input_shapes.size() > SHAPE_SIZE || output_shapes.size() > SHAPE_SIZE) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input and output cannot be greater than "
<< SHAPE_SIZE << ", but got the dimension of input: " << input_shapes.size()
<< ", the dimension of output: " << output_shapes.size();
}

if (output_shapes.size() < input_shapes.size()) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_
<< "', the dimension of output cannot be less than the dimension of input "
<< ", but got the dimension of input: " << input_shapes.size()
<< ", the dimension of output: " << output_shapes.size();
}
size_t offset = output_shapes.size() - input_shapes.size();
for (size_t i = 0; i < input_shapes.size(); i++) {
input_shape_[i + offset] = LongToSizeClipNeg(input_shapes[i]);
}

for (size_t j = 0; j < output_shapes.size(); j++) {
output_shape_[j] = LongToSizeClipNeg(output_shapes[j]);
}

input_size_ = std::accumulate(input_shape_.begin(), input_shape_.end(), size_t(1), std::multiplies{});
output_size_ = std::accumulate(output_shape_.begin(), output_shape_.end(), size_t(1), std::multiplies{});

input_size_list_.clear();
output_size_list_.clear();
input_size_list_.push_back(input_size_ * input_type_size_);
output_size_list_.push_back(output_size_ * input_type_size_);

// if (auto ret = KernelMod::Resize(base_operator, inputs, outputs, inputsOnHost); ret != KRET_OK) {
//   return ret;
// }
return KRET_OK;
}

template <typename T>
bool BroadcastToGpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
if (is_null_input_) {
return true;
}
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);

BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], input_shape_[4], input_shape_[5],
input_shape_[6], input_shape_[7], output_shape_[0], output_shape_[1], output_shape_[2], output_shape_[3],
output_shape_[4], output_shape_[5], output_shape_[6], output_shape_[7], input_addr, output_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}

std::vector<std::pair<KernelAttr, BroadcastToGpuKernelMod::BroadcastToLaunchFunc>> BroadcastToGpuKernelMod::func_list_ =
{
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
&BroadcastToGpuKernelMod::LaunchKernel<double>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
&BroadcastToGpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
&BroadcastToGpuKernelMod::LaunchKernel<half>},
{KernelAttr().AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
&BroadcastToGpuKernelMod::LaunchKernel<int8_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
&BroadcastToGpuKernelMod::LaunchKernel<int16_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
&BroadcastToGpuKernelMod::LaunchKernel<int32_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
&BroadcastToGpuKernelMod::LaunchKernel<int64_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
&BroadcastToGpuKernelMod::LaunchKernel<uint8_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
&BroadcastToGpuKernelMod::LaunchKernel<uint16_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
&BroadcastToGpuKernelMod::LaunchKernel<uint32_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
&BroadcastToGpuKernelMod::LaunchKernel<uint64_t>},
{KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
&BroadcastToGpuKernelMod::LaunchKernel<bool>},
{KernelAttr().AddInputAttr(kNumberTypeComplex64).AddOutputAttr(kNumberTypeComplex64),
&BroadcastToGpuKernelMod::LaunchKernel<utils::Complex<float>>},
{KernelAttr().AddInputAttr(kNumberTypeComplex128).AddOutputAttr(kNumberTypeComplex128),
&BroadcastToGpuKernelMod::LaunchKernel<utils::Complex<double>>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
&BroadcastToGpuKernelMod::LaunchKernel<double>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
&BroadcastToGpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
&BroadcastToGpuKernelMod::LaunchKernel<half>},
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16),
&BroadcastToGpuKernelMod::LaunchKernel<int16_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
&BroadcastToGpuKernelMod::LaunchKernel<int32_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
&BroadcastToGpuKernelMod::LaunchKernel<int64_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat64),
&BroadcastToGpuKernelMod::LaunchKernel<double>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
&BroadcastToGpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
&BroadcastToGpuKernelMod::LaunchKernel<half>},
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt16),
&BroadcastToGpuKernelMod::LaunchKernel<int16_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
&BroadcastToGpuKernelMod::LaunchKernel<int32_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt64),
&BroadcastToGpuKernelMod::LaunchKernel<int64_t>},
};

std::vector<KernelAttr> BroadcastToGpuKernelMod::GetOpSupport() {
std::vector<KernelAttr> support_list;
(void)std::transform(
func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
[](const std::pair<KernelAttr, BroadcastToGpuKernelMod::BroadcastToLaunchFunc> &pair) { return pair.first; });
return support_list;
}

MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, BroadcastTo, BroadcastToGpuKernelMod);
MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, DynamicBroadcastTo, BroadcastToGpuKernelMod);
} // namespace kernel
} // namespace mindspore
@@ -14,11 +14,15 @@
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_

#include <vector>
#include <string>
#include <functional>
#include <utility>
#include <algorithm>
#include <map>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"

@@ -26,93 +30,44 @@
namespace mindspore {
namespace kernel {
constexpr size_t SHAPE_SIZE = 8;
template <typename T>
class BroadcastToGpuKernelMod : public DeprecatedNativeGpuKernelMod {
class BroadcastToGpuKernelMod : public NativeGpuKernelMod {
public:
BroadcastToGpuKernelMod() : kernel_name_("BroadcastTo") {}
~BroadcastToGpuKernelMod() = default;

bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
if (is_null_input_) {
return true;
}
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);

BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], input_shape_[4], input_shape_[5],
input_shape_[6], input_shape_[7], output_shape_[0], output_shape_[1], output_shape_[2],
output_shape_[3], output_shape_[4], output_shape_[5], output_shape_[6], output_shape_[7], input_addr,
output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
auto input_shapes = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
auto output_shapes = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
kernel_node_ = kernel_node;
if (AnfAlgo::IsShapesDynamic({input_shapes, output_shapes})) {
return true;
return kernel_func_(this, inputs, workspace, outputs, stream_ptr);
}

is_null_input_ =
CHECK_SHAPE_NULL(input_shapes, kernel_name_, "input") || CHECK_SHAPE_NULL(output_shapes, kernel_name_, "output");
if (is_null_input_) {
InitSizeLists();
return true;
}
if (input_shapes.size() > SHAPE_SIZE || output_shapes.size() > SHAPE_SIZE) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input and output cannot be greater than "
<< SHAPE_SIZE << ", but got the dimension of input: " << input_shapes.size()
<< ", the dimension of output: " << output_shapes.size();
}
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override;

if (output_shapes.size() < input_shapes.size()) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_
<< "', the dimension of output cannot be less than the dimension of input "
<< ", but got the dimension of input: " << input_shapes.size()
<< ", the dimension of output: " << output_shapes.size();
}

size_t offset = output_shapes.size() - input_shapes.size();
for (size_t i = 0; i < input_shapes.size(); i++) {
input_shape_[i + offset] = LongToSizeClipNeg(input_shapes[i]);
}

for (size_t j = 0; j < output_shapes.size(); j++) {
output_shape_[j] = LongToSizeClipNeg(output_shapes[j]);
}

InitSizeLists();
return true;
}

void ResetResource() noexcept override {
ResetSizeLists();
for (size_t i = 0; i < SHAPE_SIZE; ++i) {
input_shape_[i] = 1;
output_shape_[i] = 1;
}
is_null_input_ = false;
}
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) override;

protected:
void InitSizeLists() override {
input_size_list_.clear();
output_size_list_.clear();
input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * input_shape_[4] *
input_shape_[5] * input_shape_[6] * input_shape_[7] * sizeof(T));
output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] *
output_shape_[4] * output_shape_[5] * output_shape_[6] * output_shape_[7] * sizeof(T));
}
std::vector<KernelAttr> GetOpSupport() override;
template <typename T>
bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr);
using BroadcastToLaunchFunc =
std::function<bool(BroadcastToGpuKernelMod *, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &, void *)>;

private:
size_t input_shape_[SHAPE_SIZE] = {1, 1, 1, 1, 1, 1, 1, 1};
size_t output_shape_[SHAPE_SIZE] = {1, 1, 1, 1, 1, 1, 1, 1};
std::string kernel_name_{};
BroadcastToLaunchFunc kernel_func_;
void ResetResource() noexcept;
static std::vector<std::pair<KernelAttr, BroadcastToLaunchFunc>> func_list_;
size_t input_size_;
size_t output_size_;
size_t input_type_size_; // sizeof(T)
std::vector<size_t> input_shape_ = {1, 1, 1, 1, 1, 1, 1, 1};
std::vector<size_t> output_shape_ = {1, 1, 1, 1, 1, 1, 1, 1};
bool is_null_input_ = false;
std::string kernel_name_;
};
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_
@@ -126,31 +126,11 @@ struct MaximumFunc {
__device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return lhs > rhs ? lhs : rhs; }
};

#ifndef _WIN32
template <typename T>
struct PowerFunc {
__device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return pow(lhs, rhs); }
};

#else
template <typename T>
struct PowerFunc {
__device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
return static_cast<T>(pow(static_cast<double>(lhs), static_cast<double>(rhs)));
}
};

template <>
struct PowerFunc<float> {
__device__ __host__ __forceinline__ float operator()(const float &lhs, const float &rhs) { return pow(lhs, rhs); }
};

template <>
struct PowerFunc<double> {
__device__ __host__ __forceinline__ double operator()(const double &lhs, const double &rhs) { return pow(lhs, rhs); }
};
#endif

template <>
struct PowerFunc<half> {
__device__ __host__ __forceinline__ half operator()(const half &lhs, const half &rhs) {

@@ -479,7 +459,7 @@ struct XDivyFunc<half2> {

// XLogy check if lhs is less than epsilon, XLogy support half, float, double
template <typename T, typename IsInteger = void>
template <typename T>
struct XLogyFunc {
// default T is float
__device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {

@@ -494,26 +474,6 @@ struct XLogyFunc {
return res;
}
};
#ifdef _WIN32
template <typename T>
struct XLogyFunc<T, typename std::enable_if<std::is_integral<T>::value>::type> {
__device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
double tmpLhs = static_cast<double>(lhs);
double tmpRhs = static_cast<double>(rhs);
return tmpLhs < kFloatEplison && tmpLhs > -kFloatEplison ? 0.0 : (tmpLhs * log(tmpRhs));
}
};

template <>
struct XLogyFunc<bool> {
__device__ __host__ __forceinline__ bool operator()(const bool &lhs, const bool &rhs) {
if (!lhs || !rhs) {
return false;
}
return true;
}
};
#endif

template <>
struct XLogyFunc<Complex<float>> {

@@ -1802,6 +1762,12 @@ template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, co
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const half *input_addr, half *output_addr, cudaStream_t stream);

template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const int8_t *input_addr, int8_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,

@@ -1817,8 +1783,41 @@ template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, co
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const int64_t *input_addr, int64_t *output_addr, cudaStream_t stream);

template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const uint8_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const uint16_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const uint32_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const uint64_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const bool *input_addr, bool *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const Complex<float> *input_addr, Complex<float> *output_addr,
cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const Complex<double> *input_addr, Complex<double> *output_addr,
cudaStream_t stream);
@@ -1,77 +0,0 @@
/**
 * Copyright 2021-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h"

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
BroadcastToGpuKernelMod, double)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
BroadcastToGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
BroadcastToGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16),
BroadcastToGpuKernelMod, int16_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
BroadcastToGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
BroadcastToGpuKernelMod, int64_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat64),
BroadcastToGpuKernelMod, double)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
BroadcastToGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
BroadcastToGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt16),
BroadcastToGpuKernelMod, int16_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastToGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt64),
BroadcastToGpuKernelMod, int64_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
BroadcastToGpuKernelMod, bool)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
BroadcastToGpuKernelMod, bool)
} // namespace kernel
} // namespace mindspore
@@ -76,6 +76,32 @@ def test_broadcast():
    assert np.allclose(output.asnumpy(), expect)


def broadcast_to_dtype(dtype):
    """
    Basic function to test data type of BroadcastTo.
    """
    shape = (2, 3, 4, 5)
    x1_np = np.random.rand(4, 5).astype(dtype)
    output = P.BroadcastTo(shape)(Tensor(x1_np))
    expect = np.broadcast_to(x1_np, shape)
    assert np.allclose(output.asnumpy(), expect)


@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_broadcast_to_dtype():
    """
    Feature: Test supported data types of BroadCastTo.
    Description: all data types
    Expectation: success.
    """
    types = [np.float16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64,
             np.uint8, np.uint16, np.uint32, np.uint64, np.complex64, np.complex128]
    for dtype in types:
        broadcast_to_dtype(dtype=dtype)


@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
@@ -70,6 +70,32 @@ def test_broadcast():
    assert np.allclose(output.asnumpy(), expect)


def broadcast_to_dtype(dtype):
    """
    Basic function to test data type of BroadcastTo.
    """
    shape = (2, 3, 4, 5)
    x1_np = np.random.rand(4, 5).astype(dtype)
    output = P.BroadcastTo(shape)(Tensor(x1_np))
    expect = np.broadcast_to(x1_np, shape)
    assert np.allclose(output.asnumpy(), expect)


@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_broadcast_to_dtype():
    """
    Feature: Test supported data types of BroadCastTo.
    Description: all data types
    Expectation: success.
    """
    types = [np.float16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64,
             np.uint8, np.uint16, np.uint32, np.uint64, np.complex64, np.complex128]
    for dtype in types:
        broadcast_to_dtype(dtype=dtype)


@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard