!41982 reconstruct BroadcastTo and add data types

Merge pull request !41982 from ZhidanLiu/master
i-robot 2022-09-16 08:23:39 +00:00 committed by Gitee
commit 9876005e52
9 changed files with 314 additions and 239 deletions
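
The diff below ports both the CPU and GPU BroadcastTo kernel mods off the deprecated CNode-based interface (InitKernel / Init(const CNodePtr &)) onto the BaseOperator-driven Init/Resize/Launch protocol, and the new tests sweep every supported dtype from float16 through complex128. As orientation, here is a minimal, self-contained sketch of that lifecycle, with simplified, hypothetical types rather than MindSpore's actual signatures:

#include <cstdio>
#include <vector>

using Shape = std::vector<long long>;  // stand-in for KernelTensorPtr::GetShapeVector()

constexpr int KRET_OK = 0;
constexpr int KRET_UNKNOWN_SHAPE = 1;  // a dim is still -1; caller retries after shape infer

class SketchKernelMod {
 public:
  bool Init() {
    // Runs once: resolve the typed launch function from the kernel attr.
    initialized_ = true;
    return true;
  }
  int Resize(const Shape &in, const Shape &out) {
    // Runs every step: re-read shapes, bail out while they are still dynamic.
    for (long long d : in) {
      if (d < 0) return KRET_UNKNOWN_SHAPE;
    }
    in_ = in;
    out_ = out;
    return KRET_OK;
  }
  bool Launch() { return initialized_; }  // run with the shapes captured by Resize

 private:
  bool initialized_ = false;
  Shape in_, out_;
};

int main() {
  SketchKernelMod k;
  k.Init();
  if (k.Resize({-1, 5}, {2, 3, 4, 5}) == KRET_UNKNOWN_SHAPE) std::puts("shape unknown, deferred");
  if (k.Resize({4, 5}, {2, 3, 4, 5}) == KRET_OK && k.Launch()) std::puts("launched");
  return 0;
}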

View File

@@ -9,6 +9,7 @@
"mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_callback_register.cc" "runtime/references"
"mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_manager.h" "runtime/references"
"mindspore/mindspore/ccsrc/plugin/device/ascend/hal/device/profiling/profiling_reporter.h" "runtime/references"
"mindspore/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.cc" "whitespace/braces"
"mindspore/mindspore/core/mindrt/src/actor/actormgr.h" "runtime/references"
"mindspore/mindspore/core/mindrt/src/actor/actorpolicyinterface.h" "runtime/references"
"mindspore/mindspore/core/mindrt/src/actor/actorthread.h" "runtime/references"

View File

@@ -14,18 +14,16 @@
* limitations under the License.
*/
#include "plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h"
#include <algorithm>
#include <utility>
#include "plugin/device/cpu/kernel/nnacl/errorcode.h"
#include "plugin/device/cpu/kernel/broadcast_to_cpu_kernel.h"
namespace mindspore {
namespace kernel {
namespace {
#ifndef _MSC_VER
using complex64 = __complex__ float;
using complex128 = __complex__ double;
#endif
constexpr size_t kBroadcastToOutputsNum = 1;
} // namespace
@@ -48,16 +46,16 @@ std::map<std::string, std::vector<std::pair<KernelAttr, BroadcastToCpuKernelMod:
&BroadcastToCpuKernelMod::LaunchKernel<uint32_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
&BroadcastToCpuKernelMod::LaunchKernel<uint64_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
&BroadcastToCpuKernelMod::LaunchKernel<float16>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
&BroadcastToCpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
&BroadcastToCpuKernelMod::LaunchKernel<double>},
#ifndef _MSC_VER
{KernelAttr().AddInputAttr(kNumberTypeComplex64).AddOutputAttr(kNumberTypeComplex64),
&BroadcastToCpuKernelMod::LaunchKernel<complex64>},
{KernelAttr().AddInputAttr(kNumberTypeComplex128).AddOutputAttr(kNumberTypeComplex128),
&BroadcastToCpuKernelMod::LaunchKernel<complex128>},
#endif
{KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
&BroadcastToCpuKernelMod::LaunchKernel<bool>}}},
{kDynamicBroadcastTo,
@@ -66,16 +64,13 @@ std::map<std::string, std::vector<std::pair<KernelAttr, BroadcastToCpuKernelMod:
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
&BroadcastToCpuKernelMod::LaunchKernel<int>},
{KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
&BroadcastToCpuKernelMod::LaunchKernel<bool>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
&BroadcastToCpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
&BroadcastToCpuKernelMod::LaunchKernel<int>},
{KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
&BroadcastToCpuKernelMod::LaunchKernel<bool>}}}};
void BroadcastToCpuKernelMod::InitTaskFunc(const CNodePtr &kernel_node) {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
bool BroadcastToCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
MS_EXCEPTION_IF_NULL(base_operator);
kernel_name_ = base_operator->name();
if (kernel_name_ != kernel_type_) {
MS_LOG(EXCEPTION) << "Suppose to be " << kernel_type_ << " but got " << kernel_name_;
}
@@ -85,22 +80,27 @@ void BroadcastToCpuKernelMod::InitTaskFunc(const CNodePtr &kernel_node) {
MS_LOG(EXCEPTION) << "BroadcastTo cpu does not support " << kernel_type_;
}
auto kernel_attr = GetKernelAttrFromNode(kernel_node);
auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
if (!is_match) {
MS_LOG(EXCEPTION) << "BroadcastTo does not support this kernel data type: " << kernel_attr;
MS_LOG(ERROR) << "For '" << kernel_name_ << "', it does not support this kernel data type: " << kernel_attr;
return false;
}
kernel_func_ = func_list_[kernel_type_][index].second;
return true;
}
void BroadcastToCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
input_shape_ = common::AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
output_shape_ = common::AnfAlgo::GetOutputInferShape(kernel_node, 0);
if (AnfAlgo::IsShapesDynamic({input_shape_, output_shape_})) {
return;
int BroadcastToCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &) {
input_shape_ = inputs[kIndex0]->GetShapeVector();
output_shape_ = outputs[kIndex0]->GetShapeVector();
auto it_x = std::find_if(input_shape_.begin(), input_shape_.end(), [](int64_t sh) { return sh < 0; });
if (it_x != input_shape_.end()) {
return KRET_UNKNOWN_SHAPE;
}
size_t input_shape_size = input_shape_.size();
size_t output_shape_size = output_shape_.size();
@@ -112,8 +112,8 @@ void BroadcastToCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
}
shape_info_.input_shape_size_ = SizeToInt(input_shape_size);
shape_info_.output_shape_size_ = SizeToInt(output_shape_size);
InitTaskFunc(kernel_node);
int ret = KernelMod::Resize(base_operator, inputs, outputs);
return ret;
}
void BroadcastToCpuKernelMod::CheckArgs() {
@@ -175,16 +175,16 @@ bool BroadcastToCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs
status = BroadcastToSize32(input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, uint64_t>) {
status = BroadcastToSize64(input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, float16>) {
status = BroadcastToSize16(input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, float>) {
status = BroadcastToSize32(input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, double>) {
status = BroadcastToSize64(input_addr, &shape_info_, output_addr);
#ifndef _MSC_VER
} else if constexpr (std::is_same_v<T, complex64>) {
status = BroadcastToSize64(input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, complex128>) {
status = BroadcastToSize128(input_addr, &shape_info_, output_addr);
#endif
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_
<< "', not supported data type, the dtype of input must be bool, int, complex, float or double";

View File

@@ -14,8 +14,8 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_
#include <vector>
#include <map>
@@ -32,7 +32,7 @@ namespace kernel {
constexpr auto kBroadcastTo = "BroadcastTo";
constexpr auto kDynamicBroadcastTo = "DynamicBroadcastTo";
constexpr auto kUnknown = "Unknown";
class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod {
class BroadcastToCpuKernelMod : public NativeCpuKernelMod {
public:
BroadcastToCpuKernelMod() = default;
explicit BroadcastToCpuKernelMod(const std::string &kernel_type) : kernel_type_(kernel_type) {}
@@ -43,7 +43,12 @@ class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod {
return kernel_func_(this, inputs, workspace, outputs);
}
void InitKernel(const CNodePtr &kernel_node) override;
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override;
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) override;
void CheckArgs();
@@ -68,4 +73,4 @@ class BroadcastToCpuKernelMod : public DeprecatedNativeCpuKernelMod {
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_BROADCAST_TO_CPU_KERNEL_H_

View File

@@ -15,22 +15,162 @@
*/
#include "plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/complex.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
BroadcastToGpuKernelMod, double)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
BroadcastToGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
BroadcastToGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
BroadcastToGpuKernelMod, int16_t)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastToGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
BroadcastToGpuKernelMod, int64_t)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
BroadcastToGpuKernelMod, bool)
bool BroadcastToGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
kernel_name_ = base_operator->GetPrim()->name();
auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
if (!is_match) {
MS_LOG(ERROR) << "For '" << kernel_name_ << "', it does not support this kernel data type: " << kernel_attr;
return false;
}
kernel_func_ = func_list_[index].second;
input_type_size_ = abstract::TypeIdSize(kernel_attr.GetInputAttr(kIndex0).first);
return true;
}
void BroadcastToGpuKernelMod::ResetResource() noexcept {
input_size_ = 1;
output_size_ = 1;
for (size_t i = 0; i < SHAPE_SIZE; ++i) {
input_shape_[i] = 1;
output_shape_[i] = 1;
}
is_null_input_ = false;
}
int BroadcastToGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) {
ResetResource();
auto input_shapes = inputs[kIndex0]->GetShapeVector();
auto output_shapes = outputs[kIndex0]->GetShapeVector();
auto it_x = std::find_if(input_shapes.begin(), input_shapes.end(), [](int64_t sh) { return sh < 0; });
if (it_x != input_shapes.end()) {
return KRET_UNKNOWN_SHAPE;
}
if (input_shapes.size() > SHAPE_SIZE || output_shapes.size() > SHAPE_SIZE) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input and output cannot be greater than "
<< SHAPE_SIZE << ", but got the dimension of input: " << input_shapes.size()
<< ", the dimension of output: " << output_shapes.size();
}
if (output_shapes.size() < input_shapes.size()) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_
<< "', the dimension of output cannot be less than the dimension of input "
<< ", but got the dimension of input: " << input_shapes.size()
<< ", the dimension of output: " << output_shapes.size();
}
size_t offset = output_shapes.size() - input_shapes.size();
for (size_t i = 0; i < input_shapes.size(); i++) {
input_shape_[i + offset] = LongToSizeClipNeg(input_shapes[i]);
}
for (size_t j = 0; j < output_shapes.size(); j++) {
output_shape_[j] = LongToSizeClipNeg(output_shapes[j]);
}
input_size_ = std::accumulate(input_shape_.begin(), input_shape_.end(), size_t(1), std::multiplies{});
output_size_ = std::accumulate(output_shape_.begin(), output_shape_.end(), size_t(1), std::multiplies{});
input_size_list_.clear();
output_size_list_.clear();
input_size_list_.push_back(input_size_ * input_type_size_);
output_size_list_.push_back(output_size_ * input_type_size_);
// if (auto ret = KernelMod::Resize(base_operator, inputs, outputs, inputsOnHost); ret != KRET_OK) {
// return ret;
// }
return KRET_OK;
}
template <typename T>
bool BroadcastToGpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) {
if (is_null_input_) {
return true;
}
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], input_shape_[4], input_shape_[5],
input_shape_[6], input_shape_[7], output_shape_[0], output_shape_[1], output_shape_[2], output_shape_[3],
output_shape_[4], output_shape_[5], output_shape_[6], output_shape_[7], input_addr, output_addr,
reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
std::vector<std::pair<KernelAttr, BroadcastToGpuKernelMod::BroadcastToLaunchFunc>> BroadcastToGpuKernelMod::func_list_ =
{
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
&BroadcastToGpuKernelMod::LaunchKernel<double>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
&BroadcastToGpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
&BroadcastToGpuKernelMod::LaunchKernel<half>},
{KernelAttr().AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
&BroadcastToGpuKernelMod::LaunchKernel<int8_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
&BroadcastToGpuKernelMod::LaunchKernel<int16_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
&BroadcastToGpuKernelMod::LaunchKernel<int32_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
&BroadcastToGpuKernelMod::LaunchKernel<int64_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
&BroadcastToGpuKernelMod::LaunchKernel<uint8_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
&BroadcastToGpuKernelMod::LaunchKernel<uint16_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
&BroadcastToGpuKernelMod::LaunchKernel<uint32_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
&BroadcastToGpuKernelMod::LaunchKernel<uint64_t>},
{KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
&BroadcastToGpuKernelMod::LaunchKernel<bool>},
{KernelAttr().AddInputAttr(kNumberTypeComplex64).AddOutputAttr(kNumberTypeComplex64),
&BroadcastToGpuKernelMod::LaunchKernel<utils::Complex<float>>},
{KernelAttr().AddInputAttr(kNumberTypeComplex128).AddOutputAttr(kNumberTypeComplex128),
&BroadcastToGpuKernelMod::LaunchKernel<utils::Complex<double>>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
&BroadcastToGpuKernelMod::LaunchKernel<double>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
&BroadcastToGpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
&BroadcastToGpuKernelMod::LaunchKernel<half>},
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16),
&BroadcastToGpuKernelMod::LaunchKernel<int16_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
&BroadcastToGpuKernelMod::LaunchKernel<int32_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
&BroadcastToGpuKernelMod::LaunchKernel<int64_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat64),
&BroadcastToGpuKernelMod::LaunchKernel<double>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
&BroadcastToGpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
&BroadcastToGpuKernelMod::LaunchKernel<half>},
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt16),
&BroadcastToGpuKernelMod::LaunchKernel<int16_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
&BroadcastToGpuKernelMod::LaunchKernel<int32_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt64),
&BroadcastToGpuKernelMod::LaunchKernel<int64_t>},
};
std::vector<KernelAttr> BroadcastToGpuKernelMod::GetOpSupport() {
std::vector<KernelAttr> support_list;
(void)std::transform(
func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
[](const std::pair<KernelAttr, BroadcastToGpuKernelMod::BroadcastToLaunchFunc> &pair) { return pair.first; });
return support_list;
}
MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, BroadcastTo, BroadcastToGpuKernelMod);
MS_KERNEL_FACTORY_REG(NativeGpuKernelMod, DynamicBroadcastTo, BroadcastToGpuKernelMod);
} // namespace kernel
} // namespace mindspore
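
func_list_ together with MatchKernelAttr implements a table-driven dispatch: each supported (input dtype, output dtype) signature is paired with a std::function bound to the matching LaunchKernel<T> instantiation, and GetOpSupport simply projects out the first column of the table. A minimal standalone sketch of the pattern, with simplified, hypothetical types:

#include <functional>
#include <iostream>
#include <utility>
#include <vector>

enum class DType { kFloat32, kInt32, kBool };

struct Attr {
  DType in, out;
  bool operator==(const Attr &o) const { return in == o.in && out == o.out; }
};

using LaunchFunc = std::function<bool()>;

// Plays the role of func_list_: one row per supported signature.
static const std::vector<std::pair<Attr, LaunchFunc>> kFuncTable = {
    {{DType::kFloat32, DType::kFloat32}, [] { std::cout << "Launch<float>\n"; return true; }},
    {{DType::kInt32, DType::kInt32}, [] { std::cout << "Launch<int32_t>\n"; return true; }},
};

int main() {
  Attr query{DType::kInt32, DType::kInt32};
  for (const auto &[attr, fn] : kFuncTable) {  // MatchKernelAttr, in spirit
    if (attr == query) return fn() ? 0 : 1;
  }
  std::cerr << "unsupported kernel attr\n";
  return 1;
}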

View File

@@ -14,11 +14,15 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_
#include <vector>
#include <string>
#include <functional>
#include <utility>
#include <algorithm>
#include <map>
#include "plugin/device/gpu/kernel/gpu_kernel.h"
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/broadcast_impl.cuh"
@@ -26,93 +30,44 @@
namespace mindspore {
namespace kernel {
constexpr size_t SHAPE_SIZE = 8;
template <typename T>
class BroadcastToGpuKernelMod : public DeprecatedNativeGpuKernelMod {
class BroadcastToGpuKernelMod : public NativeGpuKernelMod {
public:
BroadcastToGpuKernelMod() : kernel_name_("BroadcastTo") {}
~BroadcastToGpuKernelMod() = default;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
if (is_null_input_) {
return true;
}
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], input_shape_[4], input_shape_[5],
input_shape_[6], input_shape_[7], output_shape_[0], output_shape_[1], output_shape_[2],
output_shape_[3], output_shape_[4], output_shape_[5], output_shape_[6], output_shape_[7], input_addr,
output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
return true;
}
bool Init(const CNodePtr &kernel_node) override {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
auto input_shapes = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
auto output_shapes = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
kernel_node_ = kernel_node;
if (AnfAlgo::IsShapesDynamic({input_shapes, output_shapes})) {
return true;
return kernel_func_(this, inputs, workspace, outputs, stream_ptr);
}
is_null_input_ =
CHECK_SHAPE_NULL(input_shapes, kernel_name_, "input") || CHECK_SHAPE_NULL(output_shapes, kernel_name_, "output");
if (is_null_input_) {
InitSizeLists();
return true;
}
if (input_shapes.size() > SHAPE_SIZE || output_shapes.size() > SHAPE_SIZE) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input and output cannot be greater than "
<< SHAPE_SIZE << ", but got the dimension of input: " << input_shapes.size()
<< ", the dimension of output: " << output_shapes.size();
}
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override;
if (output_shapes.size() < input_shapes.size()) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_
<< "', the dimension of output cannot be less than the dimension of input "
<< ", but got the dimension of input: " << input_shapes.size()
<< ", the dimension of output: " << output_shapes.size();
}
size_t offset = output_shapes.size() - input_shapes.size();
for (size_t i = 0; i < input_shapes.size(); i++) {
input_shape_[i + offset] = LongToSizeClipNeg(input_shapes[i]);
}
for (size_t j = 0; j < output_shapes.size(); j++) {
output_shape_[j] = LongToSizeClipNeg(output_shapes[j]);
}
InitSizeLists();
return true;
}
void ResetResource() noexcept override {
ResetSizeLists();
for (size_t i = 0; i < SHAPE_SIZE; ++i) {
input_shape_[i] = 1;
output_shape_[i] = 1;
}
is_null_input_ = false;
}
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) override;
protected:
void InitSizeLists() override {
input_size_list_.clear();
output_size_list_.clear();
input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * input_shape_[4] *
input_shape_[5] * input_shape_[6] * input_shape_[7] * sizeof(T));
output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] *
output_shape_[4] * output_shape_[5] * output_shape_[6] * output_shape_[7] * sizeof(T));
}
std::vector<KernelAttr> GetOpSupport() override;
template <typename T>
bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, void *stream_ptr);
using BroadcastToLaunchFunc =
std::function<bool(BroadcastToGpuKernelMod *, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &, void *)>;
private:
size_t input_shape_[SHAPE_SIZE] = {1, 1, 1, 1, 1, 1, 1, 1};
size_t output_shape_[SHAPE_SIZE] = {1, 1, 1, 1, 1, 1, 1, 1};
std::string kernel_name_{};
BroadcastToLaunchFunc kernel_func_;
void ResetResource() noexcept;
static std::vector<std::pair<KernelAttr, BroadcastToLaunchFunc>> func_list_;
size_t input_size_;
size_t output_size_;
size_t input_type_size_; // sizeof(T)
std::vector<size_t> input_shape_ = {1, 1, 1, 1, 1, 1, 1, 1};
std::vector<size_t> output_shape_ = {1, 1, 1, 1, 1, 1, 1, 1};
bool is_null_input_ = false;
std::string kernel_name_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_BROADCAST_TO_GPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_BROADCAST_TO_GPU_KERNEL_H_

View File

@@ -126,31 +126,11 @@ struct MaximumFunc {
__device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return lhs > rhs ? lhs : rhs; }
};
#ifndef _WIN32
template <typename T>
struct PowerFunc {
__device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) { return pow(lhs, rhs); }
};
#else
template <typename T>
struct PowerFunc {
__device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
return static_cast<T>(pow(static_cast<double>(lhs), static_cast<double>(rhs)));
}
};
template <>
struct PowerFunc<float> {
__device__ __host__ __forceinline__ float operator()(const float &lhs, const float &rhs) { return pow(lhs, rhs); }
};
template <>
struct PowerFunc<double> {
__device__ __host__ __forceinline__ double operator()(const double &lhs, const double &rhs) { return pow(lhs, rhs); }
};
#endif
template <>
struct PowerFunc<half> {
__device__ __host__ __forceinline__ half operator()(const half &lhs, const half &rhs) {
@@ -479,7 +459,7 @@ struct XDivyFunc<half2> {
// XLogy check if lhs is less than epsilon, XLogy support half, float, double
template <typename T, typename IsInteger = void>
template <typename T>
struct XLogyFunc {
// default T is float
__device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
@@ -494,26 +474,6 @@ struct XLogyFunc {
return res;
}
};
#ifdef _WIN32
template <typename T>
struct XLogyFunc<T, typename std::enable_if<std::is_integral<T>::value>::type> {
__device__ __host__ __forceinline__ T operator()(const T &lhs, const T &rhs) {
double tmpLhs = static_cast<double>(lhs);
double tmpRhs = static_cast<double>(rhs);
return tmpLhs < kFloatEplison && tmpLhs > -kFloatEplison ? 0.0 : (tmpLhs * log(tmpRhs));
}
};
template <>
struct XLogyFunc<bool> {
__device__ __host__ __forceinline__ bool operator()(const bool &lhs, const bool &rhs) {
if (!lhs || !rhs) {
return false;
}
return true;
}
};
#endif
template <>
struct XLogyFunc<Complex<float>> {
@@ -1802,6 +1762,12 @@ template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, co
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const half *input_addr, half *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const int8_t *input_addr, int8_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
@@ -1817,8 +1783,41 @@ template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, co
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const int64_t *input_addr, int64_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const uint8_t *input_addr, uint8_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const uint16_t *input_addr, uint16_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const uint32_t *input_addr, uint32_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const uint64_t *input_addr, uint64_t *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const bool *input_addr, bool *output_addr, cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const Complex<float> *input_addr, Complex<float> *output_addr,
cudaStream_t stream);
template CUDA_LIB_EXPORT void BroadcastTo(const size_t &i0, const size_t &i1, const size_t &i2, const size_t &i3,
const size_t &i4, const size_t &i5, const size_t &i6, const size_t &i7,
const size_t &o0, const size_t &o1, const size_t &o2, const size_t &o3,
const size_t &o4, const size_t &o5, const size_t &o6, const size_t &o7,
const Complex<double> *input_addr, Complex<double> *output_addr,
cudaStream_t stream);
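
Because the BroadcastTo template is defined in a .cu translation unit, every element type the kernel mod can launch needs an explicit instantiation here; that is why int8_t, the unsigned family, bool, and the Complex variants are added alongside the new registrations. Per element, the kernel's work is pure index arithmetic: each output coordinate is folded back onto the 1-padded input shape. A host-side sketch of that mapping (illustrative only, not the kernel's actual code):

#include <array>
#include <cstddef>
#include <iostream>

constexpr size_t kDims = 8;  // SHAPE_SIZE: shapes are left-padded with 1s

// Row-major offset of the input element that an output coordinate reads.
// On a broadcast axis the input extent is 1, so coord % 1 == 0 pins it.
size_t SourceOffset(const std::array<size_t, kDims> &in_shape,
                    const std::array<size_t, kDims> &out_coord) {
  size_t offset = 0;
  for (size_t d = 0; d < kDims; ++d) {
    offset = offset * in_shape[d] + out_coord[d] % in_shape[d];
  }
  return offset;
}

int main() {
  // Input (4, 5) broadcast to output (2, 3, 4, 5): leading axes are padded.
  std::array<size_t, kDims> in_shape{1, 1, 1, 1, 1, 1, 4, 5};
  std::array<size_t, kDims> out_coord{0, 0, 0, 0, 1, 2, 3, 4};
  std::cout << SourceOffset(in_shape, out_coord) << "\n";  // prints 19 (= 3 * 5 + 4)
  return 0;
}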

View File

@@ -1,77 +0,0 @@
/**
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/gpu/kernel/arrays/broadcast_to_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
BroadcastToGpuKernelMod, double)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
BroadcastToGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
BroadcastToGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16),
BroadcastToGpuKernelMod, int16_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
BroadcastToGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
BroadcastToGpuKernelMod, int64_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat64),
BroadcastToGpuKernelMod, double)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
BroadcastToGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
BroadcastToGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt16),
BroadcastToGpuKernelMod, int16_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
BroadcastToGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt64),
BroadcastToGpuKernelMod, int64_t)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
BroadcastToGpuKernelMod, bool)
MS_REG_GPU_KERNEL_ONE(
DynamicBroadcastTo,
KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
BroadcastToGpuKernelMod, bool)
} // namespace kernel
} // namespace mindspore

View File

@@ -76,6 +76,32 @@ def test_broadcast():
    assert np.allclose(output.asnumpy(), expect)


def broadcast_to_dtype(dtype):
    """
    Basic function to test data type of BroadcastTo.
    """
    shape = (2, 3, 4, 5)
    x1_np = np.random.rand(4, 5).astype(dtype)
    output = P.BroadcastTo(shape)(Tensor(x1_np))
    expect = np.broadcast_to(x1_np, shape)
    assert np.allclose(output.asnumpy(), expect)


@pytest.mark.level1
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_broadcast_to_dtype():
    """
    Feature: Test supported data types of BroadcastTo.
    Description: all supported data types
    Expectation: success.
    """
    types = [np.float16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64,
             np.uint8, np.uint16, np.uint32, np.uint64, np.complex64, np.complex128]
    for dtype in types:
        broadcast_to_dtype(dtype=dtype)


@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard

View File

@@ -70,6 +70,32 @@ def test_broadcast():
    assert np.allclose(output.asnumpy(), expect)


def broadcast_to_dtype(dtype):
    """
    Basic function to test data type of BroadcastTo.
    """
    shape = (2, 3, 4, 5)
    x1_np = np.random.rand(4, 5).astype(dtype)
    output = P.BroadcastTo(shape)(Tensor(x1_np))
    expect = np.broadcast_to(x1_np, shape)
    assert np.allclose(output.asnumpy(), expect)


@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_broadcast_to_dtype():
    """
    Feature: Test supported data types of BroadcastTo.
    Description: all supported data types
    Expectation: success.
    """
    types = [np.float16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64,
             np.uint8, np.uint16, np.uint32, np.uint64, np.complex64, np.complex128]
    for dtype in types:
        broadcast_to_dtype(dtype=dtype)


@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard