forked from mindspore-Ecosystem/mindspore
!31271 Reconstruct the GPU invoking mode
Merge pull request !31271 from zong_shuai/reconstruct_gpu_kernel
This commit is contained in:
commit
27e417de6a
|
@ -19,15 +19,13 @@
|
|||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/batchtospace_helper.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr size_t SHAPE_SIZE = 4;
|
||||
constexpr size_t CROPS_SHAPE_0 = 2;
|
||||
constexpr size_t CROPS_SHAPE_1 = 2;
|
||||
template <typename T>
|
||||
class BatchToSpaceGpuKernelMod : public NativeGpuKernelMod {
|
||||
public:
|
||||
|
@ -36,139 +34,55 @@ class BatchToSpaceGpuKernelMod : public NativeGpuKernelMod {
|
|||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
T *input = GetDeviceAddress<T>(inputs, 0);
|
||||
T *output = GetDeviceAddress<T>(outputs, 0);
|
||||
|
||||
size_t size = output_size_ / sizeof(T);
|
||||
|
||||
CalBatchToSpace<T>(size, input, in_, ih_, iw_, ic_, on_, oh_, ow_, oc_, crops_[0][0], crops_[0][1], crops_[1][0],
|
||||
crops_[1][1], block_size_, output, reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
std::vector<void *> input_addrs = ConvertPtrs(inputs);
|
||||
std::vector<void *> work_addrs = ConvertPtrs(workspace);
|
||||
std::vector<void *> output_addrs = ConvertPtrs(outputs);
|
||||
int flag = helper_ptr_->Process(input_addrs, output_addrs, work_addrs, stream_ptr);
|
||||
if (flag != 0) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
(void)CheckParam(kernel_node);
|
||||
input_size_ = sizeof(T);
|
||||
for (size_t idx = 0; idx < input_shape_.size(); ++idx) {
|
||||
input_size_ *= input_shape_[idx];
|
||||
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
|
||||
helper_ptr_ = std::make_unique<cukernel::BatchToSpaceHelperGpuKernel<T>>(kernel_name_);
|
||||
helper_ptr_->ResetResource();
|
||||
|
||||
std::vector<std::vector<size_t>> input_shapes;
|
||||
std::vector<std::vector<size_t>> output_shapes;
|
||||
auto input_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
|
||||
auto output_shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, 0);
|
||||
input_shapes.emplace_back(input_shape);
|
||||
output_shapes.emplace_back(output_shape);
|
||||
attr_.block_size = GetAttr<int64_t>(kernel_node, "block_size");
|
||||
attr_.crops = GetAttr<std::vector<std::vector<int64_t>>>(kernel_node, "crops");
|
||||
attr_.input_shape = input_shape;
|
||||
int flag = helper_ptr_->CheckKernelParam(&attr_);
|
||||
if (flag != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
in_ = input_shape_[0];
|
||||
ic_ = input_shape_[1];
|
||||
ih_ = input_shape_[2];
|
||||
iw_ = input_shape_[3];
|
||||
|
||||
on_ = in_ / (block_size_ * block_size_);
|
||||
oc_ = ic_;
|
||||
oh_ = ih_ * block_size_ - crops_[0][0] - crops_[0][1];
|
||||
ow_ = iw_ * block_size_ - crops_[1][0] - crops_[1][1];
|
||||
output_size_ = on_ * oc_ * oh_ * ow_ * sizeof(T);
|
||||
flag = helper_ptr_->CalMemSize(input_shapes, output_shapes);
|
||||
if (flag != 0) {
|
||||
return false;
|
||||
}
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
void ResetResource() noexcept override {
|
||||
in_ = 0;
|
||||
ic_ = 0;
|
||||
ih_ = 0;
|
||||
iw_ = 0;
|
||||
on_ = 0;
|
||||
oc_ = 0;
|
||||
oh_ = 0;
|
||||
ow_ = 0;
|
||||
kernel_name_ = "BatchToSpace";
|
||||
input_size_list_.clear();
|
||||
output_size_list_.clear();
|
||||
crops_.clear();
|
||||
input_shape_.clear();
|
||||
}
|
||||
|
||||
protected:
|
||||
void InitSizeLists() override {
|
||||
input_size_list_.push_back(input_size_);
|
||||
output_size_list_.push_back(output_size_);
|
||||
}
|
||||
|
||||
void CheckParam(const CNodePtr &kernel_node) {
|
||||
block_size_ = GetAttr<int64_t>(kernel_node, "block_size");
|
||||
if (block_size_ < 1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the 'block_size' cannot be less than 1, but got "
|
||||
<< block_size_;
|
||||
}
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
size_t output_num = common::AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
|
||||
// check input_shape
|
||||
auto input_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
|
||||
if (input_shape.size() != SHAPE_SIZE) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
if ((input_shape[0] % (block_size_ * block_size_)) != 0) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
||||
<< "', input_shape[0] should be divisible by product of block_shape, but got input_shape[0]: "
|
||||
<< input_shape[0] << ", block_shape: " << block_size_;
|
||||
}
|
||||
for (size_t idx = 0; idx < SHAPE_SIZE; ++idx) {
|
||||
if (input_shape[idx] < 1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
||||
<< "', the element of shape of input cannot be less than 1, but got "
|
||||
<< CONVERT_VECTOR_TO_STRING(input_shape);
|
||||
}
|
||||
}
|
||||
input_shape_.assign(input_shape.begin(), input_shape.end());
|
||||
|
||||
// check crops
|
||||
crops_ = (GetAttr<std::vector<std::vector<int64_t>>>(kernel_node, "crops"));
|
||||
|
||||
if (crops_.size() != CROPS_SHAPE_0) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the size of 'crops' should be " << CROPS_SHAPE_0
|
||||
<< ", but got " << crops_.size();
|
||||
}
|
||||
if (crops_[0].size() != CROPS_SHAPE_1 || crops_[1].size() != CROPS_SHAPE_1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the size of element of 'crops' should be " << CROPS_SHAPE_1
|
||||
<< ", but got the size of crops[0]: " << crops_[0].size()
|
||||
<< ", the size of crops[1]: " << crops_[1].size();
|
||||
} else {
|
||||
for (size_t idx_i = 0; idx_i < CROPS_SHAPE_0; ++idx_i) {
|
||||
for (size_t idx_j = 0; idx_j < CROPS_SHAPE_1; ++idx_j) {
|
||||
if (crops_[idx_i][idx_j] < 0) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
||||
<< "', the element of 'crops' should be greater than or equal to 0, but got crops["
|
||||
<< idx_i << "][" << idx_j << "]: " << crops_[idx_i][idx_j];
|
||||
}
|
||||
}
|
||||
auto tmp_shape = input_shape[idx_i + CROPS_SHAPE_1] * block_size_ - crops_[idx_i][0] - crops_[idx_i][1];
|
||||
if (tmp_shape <= 0) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
||||
<< "', the element of shape of output should be greater than 0, but got " << tmp_shape;
|
||||
}
|
||||
}
|
||||
}
|
||||
input_size_list_ = helper_ptr_->GetInputSizeList();
|
||||
output_size_list_ = helper_ptr_->GetOutputSizeList();
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<std::vector<int64_t>> crops_;
|
||||
std::vector<size_t> input_shape_;
|
||||
size_t block_size_;
|
||||
size_t input_size_;
|
||||
size_t output_size_;
|
||||
size_t in_;
|
||||
size_t ic_;
|
||||
size_t ih_;
|
||||
size_t iw_;
|
||||
size_t on_;
|
||||
size_t oc_;
|
||||
size_t oh_;
|
||||
size_t ow_;
|
||||
std::string kernel_name_;
|
||||
std::unique_ptr<cukernel::BatchToSpaceHelperGpuKernel<T>> helper_ptr_ = nullptr;
|
||||
cukernel::BatchToSpaceAttr attr_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -18,9 +18,10 @@
|
|||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_ARRAYS_UNIQUE_GPU_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/unique_helper.h"
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T, typename S>
|
||||
|
@ -34,32 +35,31 @@ class UniqueGpuKernelMod : public NativeGpuKernelMod {
|
|||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
T *input = GetDeviceAddress<T>(inputs, 0);
|
||||
S *input_index = GetDeviceAddress<S>(workspace, 0);
|
||||
S *sorted_index = GetDeviceAddress<S>(workspace, 1);
|
||||
T *output = GetDeviceAddress<T>(outputs, 0);
|
||||
S *index = GetDeviceAddress<S>(outputs, 1);
|
||||
stream_ptr_ = stream_ptr;
|
||||
post_output_size_ = CalUnique(input, num_elements_, input_index, sorted_index, output, index,
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
std::vector<void *> input_ptrs = ConvertPtrs(inputs);
|
||||
std::vector<void *> work_ptrs = ConvertPtrs(workspace);
|
||||
std::vector<void *> output_ptrs = ConvertPtrs(outputs);
|
||||
if (helper_ptr_->Process(input_ptrs, output_ptrs, work_ptrs, stream_ptr) != 0) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
auto kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
helper_ptr_ = std::make_unique<cukernel::UniqueHelperGpuKernel<T, S>>(kernel_name);
|
||||
helper_ptr_->ResetResource();
|
||||
std::vector<std::vector<size_t>> input_shapes;
|
||||
std::vector<std::vector<size_t>> output_shapes;
|
||||
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
for (auto x : shape) {
|
||||
num_elements_ *= x;
|
||||
}
|
||||
input_size_ = num_elements_ * sizeof(T);
|
||||
output_size_ = input_size_;
|
||||
workspace_size_ = num_elements_ * sizeof(S);
|
||||
input_shapes.emplace_back(shape);
|
||||
helper_ptr_->CalMemSize(input_shapes, output_shapes);
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -73,7 +73,7 @@ class UniqueGpuKernelMod : public NativeGpuKernelMod {
|
|||
for (size_t i = 0; i < output_num; ++i) {
|
||||
std::vector<size_t> shape = common::AnfAlgo::GetOutputInferShape(kernel_node_.lock(), i);
|
||||
if (i == 0) {
|
||||
shape[0] = post_output_size_;
|
||||
shape[0] = helper_ptr_->GetOutSize();
|
||||
}
|
||||
TypeId type_id = common::AnfAlgo::GetOutputInferDataType(kernel_node_.lock(), i);
|
||||
type_ids.emplace_back(type_id);
|
||||
|
@ -83,11 +83,6 @@ class UniqueGpuKernelMod : public NativeGpuKernelMod {
|
|||
}
|
||||
|
||||
void ResetResource() noexcept override {
|
||||
input_size_ = 0;
|
||||
output_size_ = 0;
|
||||
workspace_size_ = 0;
|
||||
num_elements_ = 1;
|
||||
post_output_size_ = 0;
|
||||
is_null_input_ = false;
|
||||
stream_ptr_ = nullptr;
|
||||
input_size_list_.clear();
|
||||
|
@ -97,21 +92,15 @@ class UniqueGpuKernelMod : public NativeGpuKernelMod {
|
|||
|
||||
protected:
|
||||
void InitSizeLists() override {
|
||||
input_size_list_.push_back(input_size_);
|
||||
output_size_list_.push_back(output_size_);
|
||||
output_size_list_.push_back(num_elements_ * sizeof(S));
|
||||
workspace_size_list_.push_back(workspace_size_);
|
||||
workspace_size_list_.push_back(workspace_size_);
|
||||
input_size_list_ = helper_ptr_->GetInputSizeList();
|
||||
output_size_list_ = helper_ptr_->GetOutputSizeList();
|
||||
workspace_size_list_ = helper_ptr_->GetWorkSizeList();
|
||||
}
|
||||
|
||||
private:
|
||||
void *stream_ptr_;
|
||||
size_t input_size_;
|
||||
size_t output_size_;
|
||||
size_t workspace_size_;
|
||||
int num_elements_;
|
||||
int post_output_size_;
|
||||
bool is_null_input_;
|
||||
std::unique_ptr<cukernel::UniqueHelperGpuKernel<T, S>> helper_ptr_ = nullptr;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -0,0 +1,161 @@
|
|||
/**
|
||||
* Copyright 2019-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_BATCHTOSPACE_HELPER_H_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_BATCHTOSPACE_HELPER_H_
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/helper_base.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/batchtospace_impl.cuh"
|
||||
|
||||
namespace mindspore {
|
||||
namespace cukernel {
|
||||
// Tensor counts and shape constraints checked by the BatchToSpace helper.
// NOTE(review): unique_helper.h declares constants with the same names in this same
// namespace; including both headers in one translation unit would redefine them - confirm.
constexpr size_t INPUT_NUM = 1;      // expected number of input shapes
constexpr size_t OUTPUT_NUM = 1;     // expected number of output shapes
constexpr size_t SHAPE_SIZE = 4;     // required number of input dimensions
constexpr size_t CROPS_SHAPE_0 = 2;  // required number of rows in `crops`
constexpr size_t CROPS_SHAPE_1 = 2;  // required number of entries in each `crops` row
|
||||
|
||||
// Attributes of the BatchToSpace operator consumed by BatchToSpaceHelperGpuKernel.
struct BatchToSpaceAttr : public GpuKernelAttrBase {
  // crops[0] is subtracted from the dimension derived from input_shape[2],
  // crops[1] from the one derived from input_shape[3].
  std::vector<std::vector<int64_t>> crops;
  // Shape of the input tensor; the helper requires exactly four dimensions.
  std::vector<size_t> input_shape;
  // Zero-initialised so that an attribute left unset is rejected by the helper's
  // `block_size < 1` check instead of being read uninitialised.
  size_t block_size{0};
};
|
||||
|
||||
template <typename T>
|
||||
class BatchToSpaceHelperGpuKernel : public GpuKernelHelperBase {
|
||||
public:
|
||||
explicit BatchToSpaceHelperGpuKernel(std::string &kernel_name) : GpuKernelHelperBase(kernel_name) {}
|
||||
virtual ~BatchToSpaceHelperGpuKernel() = default;
|
||||
int CalMemSize(const std::vector<std::vector<size_t>> &input_shapes,
|
||||
const std::vector<std::vector<size_t>> &output_shapes) override {
|
||||
int flag = CalShapesSizeInBytes<T>(input_shapes, INPUT_NUM, kernel_name_, "input_shapes", &input_size_list_);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
flag = CalShapesSizeInBytes<T>(output_shapes, OUTPUT_NUM, kernel_name_, "output_shapes", &output_size_list_);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
kernel_size_ = output_size_list_[0] / sizeof(T);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int Process(const std::vector<void *> &input_ptrs, const std::vector<void *> &output_ptrs,
|
||||
const std::vector<void *> &work_ptrs, void *cuda_stream) override {
|
||||
size_t in = attr_ptr_->input_shape[0];
|
||||
size_t ic = attr_ptr_->input_shape[1];
|
||||
size_t ih = attr_ptr_->input_shape[2];
|
||||
size_t iw = attr_ptr_->input_shape[3];
|
||||
|
||||
size_t on = in / (attr_ptr_->block_size * attr_ptr_->block_size);
|
||||
size_t oc = ic;
|
||||
size_t oh = ih * attr_ptr_->block_size - attr_ptr_->crops[0][0] - attr_ptr_->crops[0][1];
|
||||
size_t ow = iw * attr_ptr_->block_size - attr_ptr_->crops[1][0] - attr_ptr_->crops[1][1];
|
||||
|
||||
T *input_ptr = nullptr;
|
||||
T *output_ptr = nullptr;
|
||||
int flag = GetDeviceAddress<T>(input_ptrs, 0, kernel_name_, &input_ptr);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
|
||||
flag = GetDeviceAddress<T>(output_ptrs, 0, kernel_name_, &output_ptr);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
|
||||
CalBatchToSpace<T>(kernel_size_, input_ptr, in, ih, iw, ic, on, oh, ow, oc, attr_ptr_->crops[0][0],
|
||||
attr_ptr_->crops[0][1], attr_ptr_->crops[1][0], attr_ptr_->crops[1][1], attr_ptr_->block_size,
|
||||
output_ptr, reinterpret_cast<cudaStream_t>(cuda_stream));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void ResetResource() override {
|
||||
kernel_size_ = 0;
|
||||
input_size_list_.clear();
|
||||
output_size_list_.clear();
|
||||
work_size_list_.clear();
|
||||
}
|
||||
int CheckKernelParam(GpuKernelAttrBase *kernel_attr) override {
|
||||
attr_ptr_ = dynamic_cast<BatchToSpaceAttr *>(kernel_attr);
|
||||
if (attr_ptr_->block_size < 1) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the 'block_size' cannot be less than 1, but got "
|
||||
<< attr_ptr_->block_size;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// check input_shape
|
||||
if (attr_ptr_->input_shape.size() != SHAPE_SIZE) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the dimension of input should be 4, but got "
|
||||
<< attr_ptr_->input_shape.size();
|
||||
return -1;
|
||||
}
|
||||
if ((attr_ptr_->input_shape[0] % (attr_ptr_->block_size * attr_ptr_->block_size)) != 0) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_
|
||||
<< "', input_shape[0] should be divisible by product of block_shape, but got input_shape[0]: "
|
||||
<< attr_ptr_->input_shape[0] << ", block_shape: " << attr_ptr_->block_size;
|
||||
return -1;
|
||||
}
|
||||
for (size_t idx = 0; idx < SHAPE_SIZE; ++idx) {
|
||||
if (attr_ptr_->input_shape[idx] < 1) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the element of shape of input cannot be less than 1, but got "
|
||||
<< ConvertVectorToString(attr_ptr_->input_shape);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// check crops
|
||||
if (attr_ptr_->crops.size() != CROPS_SHAPE_0) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the size of 'crops' should be " << CROPS_SHAPE_0 << ", but got "
|
||||
<< attr_ptr_->crops.size();
|
||||
return -1;
|
||||
}
|
||||
if (attr_ptr_->crops[0].size() != CROPS_SHAPE_1 || attr_ptr_->crops[1].size() != CROPS_SHAPE_1) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the size of element of 'crops' should be " << CROPS_SHAPE_1
|
||||
<< ", but got the size of crops[0]: " << attr_ptr_->crops[0].size()
|
||||
<< ", the size of crops[1]: " << attr_ptr_->crops[1].size();
|
||||
return -1;
|
||||
} else {
|
||||
for (size_t idx_i = 0; idx_i < CROPS_SHAPE_0; ++idx_i) {
|
||||
for (size_t idx_j = 0; idx_j < CROPS_SHAPE_1; ++idx_j) {
|
||||
if (attr_ptr_->crops[idx_i][idx_j] < 0) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_
|
||||
<< "', the element of 'crops' should be greater than or equal to 0, but got crops[" << idx_i
|
||||
<< "][" << idx_j << "]: " << attr_ptr_->crops[idx_i][idx_j];
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
auto tmp_shape = attr_ptr_->input_shape[idx_i + CROPS_SHAPE_1] * attr_ptr_->block_size -
|
||||
attr_ptr_->crops[idx_i][0] - attr_ptr_->crops[idx_i][1];
|
||||
if (tmp_shape <= 0) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_
|
||||
<< "', the element of shape of output should be greater than 0, but got " << tmp_shape;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
BatchToSpaceAttr *attr_ptr_;
|
||||
size_t kernel_size_;
|
||||
};
|
||||
} // namespace cukernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_BATCHTOSPACE_HELPER_H_
|
|
@ -0,0 +1,89 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_COMMON_H_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_COMMON_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "mindspore/core/utils/log_adapter.h"
|
||||
namespace mindspore {
|
||||
namespace cukernel {
|
||||
// TODO: 1. Refine the error codes.
|
||||
|
||||
// Formats a shape as "(a, b, c)" for log messages; an empty vector yields "()".
// Built with std::string / std::to_string so the header needs nothing beyond <string>
// (the previous std::stringstream version relied on <sstream> being included transitively).
inline std::string ConvertVectorToString(const std::vector<size_t> &value) {
  std::string text = "(";
  const char *separator = "";
  for (const size_t element : value) {
    text += separator;
    text += std::to_string(element);
    separator = ", ";  // every element after the first is preceded by ", "
  }
  text += ")";
  return text;
}
|
||||
|
||||
template <typename T>
|
||||
int CalShapesSizeInBytes(const std::vector<std::vector<size_t>> &shapes, const size_t shape_num,
|
||||
const std::string kernel_name, const std::string param_name,
|
||||
std::vector<size_t> *shapes_size) {
|
||||
if (shape_num != shapes.size()) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name << "', the number of " << param_name << "should be equal to " << shape_num
|
||||
<< ", but got " << shapes.size();
|
||||
return -1;
|
||||
}
|
||||
size_t return_flag = 0;
|
||||
for (size_t idx = 0; idx < shape_num; ++idx) {
|
||||
size_t cur_size = sizeof(T);
|
||||
if (shapes[idx].size() == 0) {
|
||||
// 常数
|
||||
MS_LOG(WARNING) << "For '" << kernel_name << "', the shapes[" << idx << "] is ( )";
|
||||
shapes_size->emplace_back(cur_size);
|
||||
continue;
|
||||
}
|
||||
for (const auto &val : shapes[idx]) {
|
||||
cur_size *= val;
|
||||
}
|
||||
if (cur_size == 0) {
|
||||
MS_LOG(WARNING) << "For '" << kernel_name << "', the shape cannot contain zero, but got shapes[" << idx << "] is "
|
||||
<< ConvertVectorToString(shapes[idx]);
|
||||
return_flag = -1;
|
||||
}
|
||||
shapes_size->emplace_back(cur_size);
|
||||
}
|
||||
return return_flag;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline int GetDeviceAddress(const std::vector<void *> &addr_list, const size_t index, const std::string kernel_name,
|
||||
T **out_ptr) {
|
||||
if (index >= addr_list.size()) {
|
||||
MS_LOG(ERROR) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (addr_list[index] == nullptr) {
|
||||
MS_LOG(ERROR) << "The device address is empty, address index: " << index << ", op name is: " << kernel_name;
|
||||
return -1;
|
||||
}
|
||||
*out_ptr = reinterpret_cast<T *>(addr_list[index]);
|
||||
return 0;
|
||||
}
|
||||
} // namespace cukernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_COMMON_H_
|
|
@ -0,0 +1,63 @@
|
|||
/**
|
||||
* Copyright 2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_HELPER_BASE_H_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_HELPER_BASE_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "mindspore/core/utils/log_adapter.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/cuda_class_common.h"
|
||||
namespace mindspore {
|
||||
namespace cukernel {
|
||||
// Polymorphic base for per-operator attribute structs. The virtual destructor makes the type
// polymorphic, so a helper can dynamic_cast a GpuKernelAttrBase pointer to its concrete type.
struct GpuKernelAttrBase {
  virtual ~GpuKernelAttrBase() = default;
};
|
||||
|
||||
class GpuKernelHelperBase {
|
||||
public:
|
||||
explicit GpuKernelHelperBase(std::string &kernel_name) : kernel_name_(kernel_name) {}
|
||||
virtual ~GpuKernelHelperBase() {
|
||||
input_size_list_.clear();
|
||||
output_size_list_.clear();
|
||||
work_size_list_.clear();
|
||||
}
|
||||
|
||||
virtual int CalMemSize(const std::vector<std::vector<size_t>> &input_shapes,
|
||||
const std::vector<std::vector<size_t>> &output_shapes) = 0;
|
||||
|
||||
virtual int Process(const std::vector<void *> &input_ptrs, const std::vector<void *> &output_ptrs,
|
||||
const std::vector<void *> &work_ptrs, void *cuda_stream) = 0;
|
||||
|
||||
virtual void ResetResource() {
|
||||
MS_LOG(ERROR) << "kernel must override the `ResetResource()` method when dynamic shape";
|
||||
}
|
||||
|
||||
std::vector<size_t> GetInputSizeList() { return input_size_list_; }
|
||||
std::vector<size_t> GetOutputSizeList() { return output_size_list_; }
|
||||
std::vector<size_t> GetWorkSizeList() { return work_size_list_; }
|
||||
|
||||
virtual int CheckKernelParam(GpuKernelAttrBase *kernel_attr) { return 0; }
|
||||
|
||||
protected:
|
||||
std::vector<size_t> input_size_list_;
|
||||
std::vector<size_t> output_size_list_;
|
||||
std::vector<size_t> work_size_list_;
|
||||
std::string kernel_name_;
|
||||
};
|
||||
} // namespace cukernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_HELPER_BASE_H_
|
|
@ -0,0 +1,147 @@
|
|||
/**
|
||||
* Copyright 2019-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNARY_HELPER_H_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNARY_HELPER_H_
|
||||
#include <functional>
#include <map>
#include <string>
#include <vector>
#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/helper_base.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh"
|
||||
|
||||
namespace mindspore {
|
||||
namespace cukernel {
|
||||
// Identifiers for the element-wise unary operators handled by UnaryHelperGpuKernel.
// Kept as a plain (unscoped) enum: the values are streamed into log messages and used as
// std::map keys under their unqualified names.
enum UnaryOptype {
  UNARY_OP_EXP = 0,
  UNARY_OP_EXPM1,
  UNARY_OP_LOG,
  UNARY_OP_LOG1P,
  UNARY_OP_ERF,
  UNARY_OP_ERFC,
  UNARY_OP_NEG,
  UNARY_OP_RECIPROCAL,
  UNARY_OP_SQUARE,
  UNARY_OP_SQRT,
  UNARY_OP_RSQRT,
  UNARY_OP_SIN,
  UNARY_OP_COS,
  UNARY_OP_ASIN,
  UNARY_OP_ACOS,
  UNARY_OP_ATAN,
  UNARY_OP_ASINH,
  UNARY_OP_ACOSH,
  UNARY_OP_ABS,
  UNARY_OP_FLOOR,
  UNARY_OP_RINT,
  UNARY_OP_ROUND,
  UNARY_OP_SIGN,
  UNARY_OP_REAL,
  UNARY_OP_IMAG,
  UNARY_OP_CONJ,
  // Sentinel stored by ResetResource() until CalMemSize() resolves the operator name.
  UNARY_OP_INVALID_TYPE = 255
};
|
||||
|
||||
static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {
|
||||
{"Exp", UNARY_OP_EXP}, {"Expm1", UNARY_OP_EXPM1},
|
||||
{"Log", UNARY_OP_LOG}, {"Log1p", UNARY_OP_LOG1P},
|
||||
{"Erf", UNARY_OP_ERF}, {"Erfc", UNARY_OP_ERFC},
|
||||
{"Neg", UNARY_OP_NEG}, {"Reciprocal", UNARY_OP_RECIPROCAL},
|
||||
{"Square", UNARY_OP_SQUARE}, {"Sqrt", UNARY_OP_SQRT},
|
||||
{"Rsqrt", UNARY_OP_RSQRT}, {"Sin", UNARY_OP_SIN},
|
||||
{"Cos", UNARY_OP_COS}, {"Asin", UNARY_OP_ASIN},
|
||||
{"ACos", UNARY_OP_ACOS}, {"Atan", UNARY_OP_ATAN},
|
||||
{"Asinh", UNARY_OP_ASINH}, {"Acosh", UNARY_OP_ACOSH},
|
||||
{"Abs", UNARY_OP_ABS}, {"Floor", UNARY_OP_FLOOR},
|
||||
{"Rint", UNARY_OP_RINT}, {"Round", UNARY_OP_ROUND},
|
||||
{"Real", UNARY_OP_REAL}, {"Imag", UNARY_OP_IMAG},
|
||||
{"Sign", UNARY_OP_SIGN}, {"Conj", UNARY_OP_CONJ}};
|
||||
|
||||
template <typename T>
|
||||
class UnaryHelperGpuKernel : public GpuKernelHelperBase {
|
||||
public:
|
||||
explicit UnaryHelperGpuKernel(std::string &kernel_name) : GpuKernelHelperBase(kernel_name) {}
|
||||
virtual ~UnaryHelperGpuKernel() = default;
|
||||
int CalMemSize(const std::vector<std::vector<size_t>> &input_shapes,
|
||||
const std::vector<std::vector<size_t>> &output_shapes) override {
|
||||
auto iter = kUnaryOpTypeMap.find(kernel_name_);
|
||||
if (iter == kUnaryOpTypeMap.end()) {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << ", only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
|
||||
<< " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
|
||||
<< "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << kernel_name_;
|
||||
return -1;
|
||||
}
|
||||
|
||||
unary_op_type_ = iter->second;
|
||||
int flag = CalShapesSizeInBytes<T>(input_shapes, 1, kernel_name_, "input_shapes", &input_size_list_);
|
||||
output_size_list_ = input_size_list_;
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int Process(const std::vector<void *> &input_ptrs, const std::vector<void *> &output_ptrs,
|
||||
const std::vector<void *> &work_ptrs, void *cuda_stream) override {
|
||||
static std::map<UnaryOptype, std::function<void(const T *, T *, const size_t, cudaStream_t)>> func_map = {
|
||||
{UNARY_OP_EXP, Exponential<T>}, {UNARY_OP_EXPM1, Expm1<T>},
|
||||
{UNARY_OP_LOG, Logarithm<T>}, {UNARY_OP_LOG1P, Log1p<T>},
|
||||
{UNARY_OP_ERF, Erf<T>}, {UNARY_OP_ERFC, Erfc<T>},
|
||||
{UNARY_OP_NEG, Negative<T>}, {UNARY_OP_RECIPROCAL, Reciprocal<T>},
|
||||
{UNARY_OP_SQUARE, Square<T>}, {UNARY_OP_SQRT, Sqrt<T>},
|
||||
{UNARY_OP_RSQRT, Rsqrt<T>}, {UNARY_OP_SIN, Sin<T>},
|
||||
{UNARY_OP_COS, Cos<T>}, {UNARY_OP_ASIN, Asin<T>},
|
||||
{UNARY_OP_ACOS, ACos<T>}, {UNARY_OP_ATAN, Atan<T>},
|
||||
{UNARY_OP_ASINH, Asinh<T>}, {UNARY_OP_ACOSH, Acosh<T>},
|
||||
{UNARY_OP_ABS, Abs<T>}, {UNARY_OP_FLOOR, Floor<T>},
|
||||
{UNARY_OP_RINT, Rint<T>}, {UNARY_OP_ROUND, Round<T>},
|
||||
{UNARY_OP_SIGN, Sign<T>}};
|
||||
|
||||
auto iter = func_map.find(unary_op_type_);
|
||||
if (iter != func_map.end()) {
|
||||
T *input_addr;
|
||||
T *output_addr;
|
||||
int flag = GetDeviceAddress<T>(input_ptrs, 0, kernel_name_, &input_addr);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
flag = GetDeviceAddress<T>(output_ptrs, 0, kernel_name_, &output_addr);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
iter->second(input_addr, output_addr, input_size_list_[0] / sizeof(T),
|
||||
reinterpret_cast<cudaStream_t>(cuda_stream));
|
||||
} else {
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << ", only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
|
||||
<< " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
|
||||
<< "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << unary_op_type_;
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void ResetResource() override {
|
||||
unary_op_type_ = UNARY_OP_INVALID_TYPE;
|
||||
input_size_list_.clear();
|
||||
output_size_list_.clear();
|
||||
}
|
||||
|
||||
private:
|
||||
UnaryOptype unary_op_type_;
|
||||
};
|
||||
} // namespace cukernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNARY_HELPER_H_
|
|
@ -0,0 +1,105 @@
|
|||
/**
|
||||
* Copyright 2019-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNIQUE_HELPER_H_
|
||||
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNIQUE_HELPER_H_
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/helper_base.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unique_impl.cuh"
|
||||
|
||||
namespace mindspore {
|
||||
namespace cukernel {
|
||||
// Namespace-scope constants for the Unique helper.
// INPUT_NUM is passed to CalShapesSizeInBytes in CalMemSize (presumably the
// expected number of input shapes -- TODO confirm against helper_base.h).
// NOTE(review): OUTPUT_NUM, WORK_NUM, SHAPE_SIZE, CROPS_SHAPE_0 and
// CROPS_SHAPE_1 are not referenced by UniqueHelperGpuKernel in this header,
// and the class registers two output sizes and two workspace sizes -- confirm
// whether these values are intentional or left over from another helper.
constexpr size_t INPUT_NUM = 1;
|
||||
constexpr size_t OUTPUT_NUM = 1;
|
||||
constexpr size_t WORK_NUM = 0;
|
||||
constexpr size_t SHAPE_SIZE = 4;
|
||||
constexpr size_t CROPS_SHAPE_0 = 2;
|
||||
constexpr size_t CROPS_SHAPE_1 = 2;
|
||||
|
||||
template <typename T, typename S>
|
||||
class UniqueHelperGpuKernel : public GpuKernelHelperBase {
|
||||
public:
|
||||
explicit UniqueHelperGpuKernel(std::string &kernel_name) : GpuKernelHelperBase(kernel_name) {}
|
||||
virtual ~UniqueHelperGpuKernel() = default;
|
||||
int CalMemSize(const std::vector<std::vector<size_t>> &input_shapes,
|
||||
const std::vector<std::vector<size_t>> &output_shapes) override {
|
||||
int flag = CalShapesSizeInBytes<T>(input_shapes, INPUT_NUM, kernel_name_, "input_shapes", &input_size_list_);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
num_elements_ = input_size_list_[0] / sizeof(T);
|
||||
size_t workspace_size = num_elements_ * sizeof(S);
|
||||
work_size_list_.emplace_back(workspace_size);
|
||||
work_size_list_.emplace_back(workspace_size);
|
||||
output_size_list_.emplace_back(input_size_list_[0]);
|
||||
output_size_list_.emplace_back(num_elements_ * sizeof(S));
|
||||
return 0;
|
||||
}
|
||||
|
||||
int Process(const std::vector<void *> &input_ptrs, const std::vector<void *> &output_ptrs,
|
||||
const std::vector<void *> &work_ptrs, void *cuda_stream) override {
|
||||
T *t_input_ptr = nullptr;
|
||||
S *s_input_index = nullptr;
|
||||
S *s_sorted_index = nullptr;
|
||||
T *t_output_ptr = nullptr;
|
||||
S *s_output_index = nullptr;
|
||||
int flag = GetDeviceAddress<T>(input_ptrs, 0, kernel_name_, &t_input_ptr);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
|
||||
flag = GetDeviceAddress<S>(work_ptrs, 0, kernel_name_, &s_input_index);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
|
||||
flag = GetDeviceAddress<S>(work_ptrs, 1, kernel_name_, &s_sorted_index);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
flag = GetDeviceAddress<T>(output_ptrs, 0, kernel_name_, &t_output_ptr);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
|
||||
flag = GetDeviceAddress<S>(output_ptrs, 1, kernel_name_, &s_output_index);
|
||||
if (flag != 0) {
|
||||
return flag;
|
||||
}
|
||||
|
||||
post_output_size_ = CalUnique(t_input_ptr, num_elements_, s_input_index, s_sorted_index, t_output_ptr,
|
||||
s_output_index, reinterpret_cast<cudaStream_t>(cuda_stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
void ResetResource() override {
|
||||
num_elements_ = 1;
|
||||
post_output_size_ = 0;
|
||||
input_size_list_.clear();
|
||||
output_size_list_.clear();
|
||||
work_size_list_.clear();
|
||||
}
|
||||
|
||||
int GetOutSize() { return post_output_size_; }
|
||||
|
||||
private:
|
||||
int num_elements_;
|
||||
int post_output_size_;
|
||||
};
|
||||
} // namespace cukernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_CLASS_UNIQUE_HELPER_H_
|
|
@ -113,6 +113,14 @@ class NativeGpuKernelMod : public GpuKernelMod {
|
|||
return reinterpret_cast<T *>(addr_list[index]->addr);
|
||||
}
|
||||
|
||||
std::vector<void *> ConvertPtrs(const std::vector<AddressPtr> &input_ptrs) {
|
||||
std::vector<void *> out_ptrs;
|
||||
for (auto &cur_addr : input_ptrs) {
|
||||
out_ptrs.emplace_back(cur_addr->addr);
|
||||
}
|
||||
return out_ptrs;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline T *GetPossiblyNullDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t index) {
|
||||
if (index >= addr_list.size()) {
|
||||
|
|
|
@ -39,21 +39,21 @@ class UnaryOpComplexGpuKernelMod : public NativeGpuKernelMod {
|
|||
|
||||
S *output_addr = GetDeviceAddress<S>(outputs, 0);
|
||||
switch (unary_op_type_) {
|
||||
case UNARY_OP_REAL: {
|
||||
case cukernel::UNARY_OP_REAL: {
|
||||
if constexpr (!std::is_same<S, utils::Complex<float>>::value &&
|
||||
!std::is_same<S, utils::Complex<double>>::value) {
|
||||
Real(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
}
|
||||
break;
|
||||
}
|
||||
case UNARY_OP_IMAG: {
|
||||
case cukernel::UNARY_OP_IMAG: {
|
||||
if constexpr (!std::is_same<S, utils::Complex<float>>::value &&
|
||||
!std::is_same<S, utils::Complex<double>>::value) {
|
||||
Imag(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
}
|
||||
break;
|
||||
}
|
||||
case UNARY_OP_CONJ: {
|
||||
case cukernel::UNARY_OP_CONJ: {
|
||||
if constexpr (std::is_same<T, S>::value && !std::is_same<T, bool>::value) {
|
||||
Conj(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
}
|
||||
|
@ -112,8 +112,8 @@ class UnaryOpComplexGpuKernelMod : public NativeGpuKernelMod {
|
|||
private:
|
||||
void GetOpType(const CNodePtr &kernel_node) {
|
||||
std::string kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
static std::map<std::string, UnaryOptype> kComplexSupportedTypeMap = {
|
||||
{"Real", UNARY_OP_REAL}, {"Imag", UNARY_OP_IMAG}, {"Conj", UNARY_OP_CONJ}};
|
||||
static std::map<std::string, cukernel::UnaryOptype> kComplexSupportedTypeMap = {
|
||||
{"Real", cukernel::UNARY_OP_REAL}, {"Imag", cukernel::UNARY_OP_IMAG}, {"Conj", cukernel::UNARY_OP_CONJ}};
|
||||
auto iter = kComplexSupportedTypeMap.find(kernel_name);
|
||||
if (iter != kComplexSupportedTypeMap.end()) {
|
||||
unary_op_type_ = iter->second;
|
||||
|
@ -128,7 +128,7 @@ class UnaryOpComplexGpuKernelMod : public NativeGpuKernelMod {
|
|||
size_t output_size_;
|
||||
size_t workspace_size_;
|
||||
bool is_null_input_;
|
||||
UnaryOptype unary_op_type_;
|
||||
cukernel::UnaryOptype unary_op_type_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -21,58 +21,13 @@
|
|||
#include <functional>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel.h"
|
||||
#include "plugin/device/gpu/kernel/gpu_kernel_factory.h"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/unary_op_impl.cuh"
|
||||
#include "plugin/device/gpu/kernel/cuda_impl/cuda_class/unary_helper.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// Identifies the element-wise unary operation a kernel instance performs.
// Enumerators are numbered consecutively from 0; UNARY_OP_INVALID_TYPE (255)
// is the value ResetResource() assigns when no operation is selected.
enum UnaryOptype {
|
||||
UNARY_OP_EXP = 0,
|
||||
UNARY_OP_EXPM1,
|
||||
UNARY_OP_LOG,
|
||||
UNARY_OP_LOG1P,
|
||||
UNARY_OP_ERF,
|
||||
UNARY_OP_ERFC,
|
||||
UNARY_OP_NEG,
|
||||
UNARY_OP_RECIPROCAL,
|
||||
UNARY_OP_SQUARE,
|
||||
UNARY_OP_SQRT,
|
||||
UNARY_OP_RSQRT,
|
||||
UNARY_OP_SIN,
|
||||
UNARY_OP_COS,
|
||||
UNARY_OP_ASIN,
|
||||
UNARY_OP_ACOS,
|
||||
UNARY_OP_ATAN,
|
||||
UNARY_OP_ASINH,
|
||||
UNARY_OP_ACOSH,
|
||||
UNARY_OP_ABS,
|
||||
UNARY_OP_FLOOR,
|
||||
UNARY_OP_RINT,
|
||||
UNARY_OP_ROUND,
|
||||
UNARY_OP_SIGN,
|
||||
UNARY_OP_REAL,
|
||||
UNARY_OP_IMAG,
|
||||
UNARY_OP_CONJ,
|
||||
UNARY_OP_INVALID_TYPE = 255
|
||||
};
|
||||
|
||||
// Lookup table from operator (CNode) name to its UnaryOptype value.
// Init() searches this map with the kernel node's name and raises an
// exception when the name is absent.
static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {
|
||||
{"Exp", UNARY_OP_EXP}, {"Expm1", UNARY_OP_EXPM1},
|
||||
{"Log", UNARY_OP_LOG}, {"Log1p", UNARY_OP_LOG1P},
|
||||
{"Erf", UNARY_OP_ERF}, {"Erfc", UNARY_OP_ERFC},
|
||||
{"Neg", UNARY_OP_NEG}, {"Reciprocal", UNARY_OP_RECIPROCAL},
|
||||
{"Square", UNARY_OP_SQUARE}, {"Sqrt", UNARY_OP_SQRT},
|
||||
{"Rsqrt", UNARY_OP_RSQRT}, {"Sin", UNARY_OP_SIN},
|
||||
{"Cos", UNARY_OP_COS}, {"Asin", UNARY_OP_ASIN},
|
||||
{"ACos", UNARY_OP_ACOS}, {"Atan", UNARY_OP_ATAN},
|
||||
{"Asinh", UNARY_OP_ASINH}, {"Acosh", UNARY_OP_ACOSH},
|
||||
{"Abs", UNARY_OP_ABS}, {"Floor", UNARY_OP_FLOOR},
|
||||
{"Rint", UNARY_OP_RINT}, {"Round", UNARY_OP_ROUND},
|
||||
{"Real", UNARY_OP_REAL}, {"Imag", UNARY_OP_IMAG},
|
||||
{"Sign", UNARY_OP_SIGN}, {"Conj", UNARY_OP_CONJ}};
|
||||
|
||||
template <typename T>
|
||||
class UnaryOpGpuKernelMod : public NativeGpuKernelMod {
|
||||
public:
|
||||
|
@ -84,72 +39,50 @@ class UnaryOpGpuKernelMod : public NativeGpuKernelMod {
|
|||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
|
||||
static std::map<UnaryOptype, std::function<void(const T *, T *, const size_t, cudaStream_t)>> func_map = {
|
||||
{UNARY_OP_EXP, Exponential<T>}, {UNARY_OP_EXPM1, Expm1<T>},
|
||||
{UNARY_OP_LOG, Logarithm<T>}, {UNARY_OP_LOG1P, Log1p<T>},
|
||||
{UNARY_OP_ERF, Erf<T>}, {UNARY_OP_ERFC, Erfc<T>},
|
||||
{UNARY_OP_NEG, Negative<T>}, {UNARY_OP_RECIPROCAL, Reciprocal<T>},
|
||||
{UNARY_OP_SQUARE, Square<T>}, {UNARY_OP_SQRT, Sqrt<T>},
|
||||
{UNARY_OP_RSQRT, Rsqrt<T>}, {UNARY_OP_SIN, Sin<T>},
|
||||
{UNARY_OP_COS, Cos<T>}, {UNARY_OP_ASIN, Asin<T>},
|
||||
{UNARY_OP_ACOS, ACos<T>}, {UNARY_OP_ATAN, Atan<T>},
|
||||
{UNARY_OP_ASINH, Asinh<T>}, {UNARY_OP_ACOSH, Acosh<T>},
|
||||
{UNARY_OP_ABS, Abs<T>}, {UNARY_OP_FLOOR, Floor<T>},
|
||||
{UNARY_OP_RINT, Rint<T>}, {UNARY_OP_ROUND, Round<T>},
|
||||
{UNARY_OP_SIGN, Sign<T>}};
|
||||
|
||||
auto iter = func_map.find(unary_op_type_);
|
||||
if (iter != func_map.end()) {
|
||||
T *input_addr = GetDeviceAddress<T>(inputs, 0);
|
||||
T *output_addr = GetDeviceAddress<T>(outputs, 0);
|
||||
iter->second(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
|
||||
<< " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
|
||||
<< "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << unary_op_type_;
|
||||
std::vector<void *> input_addrs;
|
||||
std::vector<void *> output_addrs;
|
||||
std::vector<void *> work_addrs;
|
||||
for (size_t idx = 0; idx < inputs.size(); ++idx) {
|
||||
void *cur_ptr = reinterpret_cast<void *>(GetDeviceAddress<T>(inputs, idx));
|
||||
input_addrs.emplace_back(cur_ptr);
|
||||
}
|
||||
for (size_t idx = 0; idx < outputs.size(); ++idx) {
|
||||
void *cur_ptr = reinterpret_cast<void *>(GetDeviceAddress<T>(outputs, idx));
|
||||
output_addrs.emplace_back(cur_ptr);
|
||||
}
|
||||
int flag = helper_ptr_->Process(input_addrs, output_addrs, work_addrs, stream_ptr);
|
||||
if (flag != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
std::string kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
auto iter = kUnaryOpTypeMap.find(kernel_name);
|
||||
if (iter == kUnaryOpTypeMap.end()) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << ", only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
|
||||
<< " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
|
||||
<< "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << kernel_name;
|
||||
}
|
||||
unary_op_type_ = iter->second;
|
||||
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
size_t output_num = common::AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
std::string kernel_name = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
helper_ptr_ = std::make_unique<cukernel::UnaryHelperGpuKernel<T>>(kernel_name);
|
||||
helper_ptr_->ResetResource();
|
||||
std::vector<std::vector<size_t>> input_shapes;
|
||||
std::vector<std::vector<size_t>> output_shapes;
|
||||
auto input_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
|
||||
auto output_shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, 0);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
input_size_list_.emplace_back(0);
|
||||
output_size_list_.emplace_back(0);
|
||||
return true;
|
||||
}
|
||||
for (size_t i = 0; i < input_shape.size(); i++) {
|
||||
input_size_ *= input_shape[i];
|
||||
input_shapes.emplace_back(input_shape);
|
||||
output_shapes.emplace_back(output_shape);
|
||||
int flag = helper_ptr_->CalMemSize(input_shapes, output_shapes);
|
||||
if (flag != 0) {
|
||||
return false;
|
||||
}
|
||||
output_size_ = input_size_;
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
||||
void ResetResource() noexcept override {
|
||||
unary_op_type_ = UNARY_OP_INVALID_TYPE;
|
||||
input_size_ = sizeof(T);
|
||||
output_size_ = sizeof(T);
|
||||
workspace_size_ = 0;
|
||||
is_null_input_ = false;
|
||||
input_size_list_.clear();
|
||||
output_size_list_.clear();
|
||||
workspace_size_list_.clear();
|
||||
|
@ -157,15 +90,13 @@ class UnaryOpGpuKernelMod : public NativeGpuKernelMod {
|
|||
|
||||
protected:
|
||||
void InitSizeLists() override {
|
||||
input_size_list_.push_back(input_size_);
|
||||
output_size_list_.push_back(output_size_);
|
||||
input_size_list_ = helper_ptr_->GetInputSizeList();
|
||||
output_size_list_ = helper_ptr_->GetOutputSizeList();
|
||||
workspace_size_list_ = helper_ptr_->GetWorkSizeList();
|
||||
}
|
||||
|
||||
private:
|
||||
UnaryOptype unary_op_type_;
|
||||
size_t input_size_;
|
||||
size_t output_size_;
|
||||
size_t workspace_size_;
|
||||
std::unique_ptr<cukernel::UnaryHelperGpuKernel<T>> helper_ptr_ = nullptr;
|
||||
bool is_null_input_;
|
||||
};
|
||||
} // namespace kernel
|
||||
|
|
Loading…
Reference in New Issue