forked from mindspore-Ecosystem/mindspore
!28300 optimizes the kernel error description of GPU about tile,topk,transpose etc.
Merge pull request !28300 from chentangyu/code_cty_fix_last_master_I4D1Q8
This commit is contained in:
commit
ba7ab2ec76
|
@ -96,12 +96,10 @@ class ScatterNdGpuFwdKernel : public GpuKernel {
|
|||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 2) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 2, but got " << input_num;
|
||||
return false;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
return false;
|
||||
}
|
||||
|
||||
input_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
|
|
|
@ -59,25 +59,26 @@ class TileGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but Tile needs 1 input.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but Tile has 1 output.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape_) || CHECK_NULL_INPUT(output_shape_);
|
||||
is_null_input_ =
|
||||
CHECK_SHAPE_NULL(input_shape_, kernel_name, "input") || CHECK_SHAPE_NULL(output_shape_, kernel_name, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'TileGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (output_shape_.size() < 1) {
|
||||
MS_LOG(EXCEPTION) << "For 'TileGpuKernel', the rank of output cannot be less than 1, but got "
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be less than 1, but got "
|
||||
<< output_shape_.size();
|
||||
}
|
||||
input_size_ = 1;
|
||||
|
@ -87,8 +88,8 @@ class TileGpuKernel : public GpuKernel {
|
|||
|
||||
output_size_ = 1;
|
||||
if (output_shape_.size() > TILE_MAX_DIMENSION) {
|
||||
MS_LOG(EXCEPTION) << "Output is " << output_shape_.size() << "-D, but Tile supports up to " << TILE_MAX_DIMENSION
|
||||
<< "-D.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be greater than "
|
||||
<< TILE_MAX_DIMENSION << ", but got " << output_shape_.size();
|
||||
}
|
||||
shape_size_ = output_shape_.size();
|
||||
for (size_t i = 0; i < output_shape_.size(); i++) {
|
||||
|
|
|
@ -83,12 +83,13 @@ class TopKGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shapes) || CHECK_NULL_INPUT(output_shapes);
|
||||
is_null_input_ =
|
||||
CHECK_SHAPE_NULL(input_shapes, kernel_name, "input") || CHECK_SHAPE_NULL(output_shapes, kernel_name, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'TopkGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -74,28 +74,26 @@ class TransposeGpuFwdKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but transpose needs 1 input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but transpose needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'TransposeGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
shape_size_ = input_shape.size();
|
||||
if (shape_size_ > TRANSPOSE_MAX_DIMENSION) {
|
||||
MS_LOG(EXCEPTION) << "Input is " << shape_size_ << "-D, but transpose supports max " << TRANSPOSE_MAX_DIMENSION
|
||||
<< "-D inputs.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be greater than "
|
||||
<< TRANSPOSE_MAX_DIMENSION << ", but got " << shape_size_;
|
||||
}
|
||||
|
||||
input_size_ = 1;
|
||||
|
|
|
@ -50,11 +50,11 @@ class UniqueGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
std::vector<size_t> shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'UniqueGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_UNPACK_GPU_KERNEL_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
|
@ -55,10 +56,9 @@ class UnpackGpuFwdKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
if (!CheckParam(kernel_node)) {
|
||||
return false;
|
||||
}
|
||||
(void)CheckParam(kernel_node);
|
||||
axis_ = static_cast<int32_t>(GetAttr<int64_t>(kernel_node, "axis"));
|
||||
if (axis_ < 0) {
|
||||
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
|
@ -73,9 +73,8 @@ class UnpackGpuFwdKernel : public GpuKernel {
|
|||
for (size_t i = 0; i < output_num_; i++) {
|
||||
size_t _size = 1;
|
||||
auto _shape = AnfAlgo::GetOutputDeviceShape(kernel_node, i);
|
||||
is_null_input_ = CHECK_NULL_INPUT(_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(_shape, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'UnpackGpuKernel', output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -87,9 +86,8 @@ class UnpackGpuFwdKernel : public GpuKernel {
|
|||
workspace_size_list_.push_back(sizeof(T *) * output_num_);
|
||||
|
||||
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'UnpackGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -108,13 +106,11 @@ class UnpackGpuFwdKernel : public GpuKernel {
|
|||
void InitSizeLists() override {}
|
||||
|
||||
private:
|
||||
bool CheckParam(const CNodePtr &kernel_node) {
|
||||
void CheckParam(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(ERROR) << "input number is " << input_num << ", but UnpackGpuFwdKernel needs 1 input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
int axis_;
|
||||
bool is_null_input_;
|
||||
|
|
|
@ -54,14 +54,15 @@ class UnsortedSegmentMaxGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
auto segment_ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
|
||||
auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ =
|
||||
CHECK_NULL_INPUT(input_shapes) || CHECK_NULL_INPUT(segment_ids_shapes) || CHECK_NULL_INPUT(output_shapes);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shapes, kernel_name, "input") ||
|
||||
CHECK_SHAPE_NULL(segment_ids_shapes, kernel_name, "segment_ids") ||
|
||||
CHECK_SHAPE_NULL(output_shapes, kernel_name, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'UnsortedSegmentMaxGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -73,9 +74,8 @@ class UnsortedSegmentMaxGpuKernel : public GpuKernel {
|
|||
MS_LOG(INFO) << "UnsortedSegmentMax Kernel Input count is 2";
|
||||
}
|
||||
if (output_shapes.size() < 1) {
|
||||
MS_LOG(EXCEPTION)
|
||||
<< "For UnsortedSegmentMax, output shape incorrect rank. Expect Rank at least rank 1, got Rank: "
|
||||
<< output_shapes.size() << ".";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be less than 1, but got "
|
||||
<< output_shapes.size();
|
||||
}
|
||||
num_segments_ = output_shapes[0];
|
||||
input_size_ = 1;
|
||||
|
|
|
@ -49,13 +49,14 @@ class UnsortedSegmentMinGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
auto segment_ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
|
||||
auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ =
|
||||
CHECK_NULL_INPUT(input_shapes) || CHECK_NULL_INPUT(segment_ids_shapes) || CHECK_NULL_INPUT(output_shapes);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shapes, kernel_name, "input") ||
|
||||
CHECK_SHAPE_NULL(segment_ids_shapes, kernel_name, "segment_ids") ||
|
||||
CHECK_SHAPE_NULL(output_shapes, kernel_name, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'UnsortedSegmentMinGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -67,9 +68,8 @@ class UnsortedSegmentMinGpuKernel : public GpuKernel {
|
|||
MS_LOG(INFO) << "UnsortedSegmentMin Kernel Input count is 2";
|
||||
}
|
||||
if (output_shapes.size() < 1) {
|
||||
MS_LOG(EXCEPTION)
|
||||
<< "For UnsortedSegmentMin, output shape incorrect rank. Expect Rank at least rank 1, got Rank: "
|
||||
<< output_shapes.size() << ".";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be less than 1, but got "
|
||||
<< output_shapes.size();
|
||||
}
|
||||
num_segments_ = output_shapes[0];
|
||||
input_size_ = 1;
|
||||
|
@ -96,7 +96,6 @@ class UnsortedSegmentMinGpuKernel : public GpuKernel {
|
|||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
||||
void ResetResource() noexcept override {
|
||||
num_segments_ = 1;
|
||||
inner_size_ = 1;
|
||||
|
|
|
@ -52,13 +52,15 @@ class UnsortedSegmentSumGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
auto input_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
auto ids_shapes = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
|
||||
auto output_shapes = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shapes) || CHECK_NULL_INPUT(ids_shapes) || CHECK_NULL_INPUT(output_shapes);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shapes, kernel_name, "input") ||
|
||||
CHECK_SHAPE_NULL(ids_shapes, kernel_name, "segment_ids") ||
|
||||
CHECK_SHAPE_NULL(output_shapes, kernel_name, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'UnsortedSegmentSumGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -51,12 +51,12 @@ class ZerosLikeGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
|
||||
std::vector<size_t> input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'ZeroslikeGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -55,7 +55,7 @@ class CustomAOTGpuKernel : public GpuKernel {
|
|||
if (!handle_) {
|
||||
handle_ = dlopen(file_path_.c_str(), RTLD_LAZY | RTLD_LOCAL);
|
||||
if (!handle_) {
|
||||
MS_LOG(ERROR) << "Open Error: " << dlerror();
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', open should be successful, but error, " << dlerror();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ class CustomAOTGpuKernel : public GpuKernel {
|
|||
reinterpret_cast<std::add_pointer<int(int, void **, int *, int64_t **, const char **, void *, void *)>::type>(
|
||||
dlsym(handle_, func_name_.c_str()));
|
||||
if (auto error_info = dlerror(); error_info != nullptr) {
|
||||
MS_LOG(ERROR) << error_info;
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', error info: " << error_info;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -79,7 +79,8 @@ class CustomAOTGpuKernel : public GpuKernel {
|
|||
ret = aot_func_(nparam, ¶ms[0], &ndims_[0], &shapes_[0], &type_pointer_list_[0], stream_ptr, nullptr);
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
MS_LOG(ERROR) << "CustomAOT operator failed when running user defined file " << file_path_ << "! "
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', operator failed when running user defined file " << file_path_
|
||||
<< "! "
|
||||
<< "Error message is " << e.what();
|
||||
return false;
|
||||
}
|
||||
|
@ -88,14 +89,15 @@ class CustomAOTGpuKernel : public GpuKernel {
|
|||
case 0:
|
||||
break;
|
||||
case 1:
|
||||
MS_LOG(ERROR) << "Number of parameters passed to AOT kernel is " << nparam
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the number of parameters passed to AOT kernel is " << nparam
|
||||
<< ", inconsistent with what the user wants";
|
||||
return false;
|
||||
case 2:
|
||||
MS_LOG(ERROR) << "Type of parameters passed to AOT kernel is inconsistent with what the user wants";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_
|
||||
<< "', type of parameters passed to AOT kernel is inconsistent with what the user wants";
|
||||
return false;
|
||||
default:
|
||||
MS_LOG(ERROR) << "Error occurred when running AOT kernel, "
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', error occurred when running AOT kernel, "
|
||||
<< "error id is " << ret;
|
||||
return false;
|
||||
}
|
||||
|
@ -104,27 +106,25 @@ class CustomAOTGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
const auto &exec_info = AnfAlgo::GetNodeAttr<std::string>(kernel_node, "func_name");
|
||||
if (auto pos = exec_info.find(":"); pos != std::string::npos) {
|
||||
auto path = exec_info.substr(0, pos);
|
||||
auto real_path = FileUtils::GetRealPath(path.c_str());
|
||||
if (!real_path.has_value()) {
|
||||
MS_LOG(ERROR) << "Invalid file path, " << path << " does not exist.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the file path should be exist, but got " << path;
|
||||
}
|
||||
file_path_ = real_path.value();
|
||||
func_name_ = exec_info.substr(pos + 1);
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Wrong execute info:" << exec_info;
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', Wrong execute info:" << exec_info;
|
||||
}
|
||||
|
||||
num_input_ = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
auto input_type_list = AnfAlgo::GetAllInputDeviceTypes(kernel_node);
|
||||
if (num_input_ != input_type_list.size()) {
|
||||
MS_LOG(ERROR) << "Input shapes'size is " << num_input_ << ", while input types' size is "
|
||||
<< input_type_list.size();
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be " << input_type_list.size()
|
||||
<< ", but got " << num_input_;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < num_input_; i++) {
|
||||
|
@ -141,9 +141,8 @@ class CustomAOTGpuKernel : public GpuKernel {
|
|||
auto output_type_list = AnfAlgo::GetAllOutputDeviceTypes(kernel_node);
|
||||
|
||||
if (num_output_ != output_type_list.size()) {
|
||||
MS_LOG(ERROR) << "Output shapes'size is " << num_output_ << ", while output types' size is "
|
||||
<< output_type_list.size();
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be " << output_type_list.size()
|
||||
<< ", but got " << num_output_;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < num_output_; i++) {
|
||||
|
|
|
@ -61,12 +61,13 @@ bool DatasetInitKernel::Launch(const std::vector<AddressPtr> &, const std::vecto
|
|||
size_t len = total_bytes_ * buffer_q_capacity_;
|
||||
|
||||
if (!device::gpu::GPUMemoryAllocator::GetInstance().AllocBufferQueueMem(len, &addr)) {
|
||||
MS_LOG(EXCEPTION) << "Memory not enough: failed to allocate GPU buffer queue memory[" << len << "].";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', memory not enough: failed to allocate GPU buffer queue memory["
|
||||
<< len << "].";
|
||||
}
|
||||
|
||||
auto status = GpuBufferMgr::GetInstance().Create(0, queue_name_, addr, shapes_, buffer_q_capacity_);
|
||||
if (status) {
|
||||
MS_LOG(EXCEPTION) << "Init Dataset Failed. len: " << len << ", status:" << status;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', init Dataset Failed. len: " << len << ", status:" << status;
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
|
@ -121,7 +121,7 @@ bool DatasetIteratorKernel::ReadDevice(void **addr, size_t *len) {
|
|||
#ifdef ENABLE_DUMP_IR
|
||||
mindspore::RDR::TriggerAll();
|
||||
#endif
|
||||
MS_LOG(EXCEPTION) << "Get data timeout";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', get data timeout";
|
||||
}
|
||||
}
|
||||
#ifndef ENABLE_SECURITY
|
||||
|
@ -130,7 +130,7 @@ bool DatasetIteratorKernel::ReadDevice(void **addr, size_t *len) {
|
|||
profiling_op_->RecordData(queue_size, start_time_stamp, end_time_stamp);
|
||||
}
|
||||
#endif
|
||||
MS_LOG(ERROR) << "Get data failed, errcode " << ret;
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', get data failed, errcode " << ret;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -141,7 +141,7 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
|
|||
if (handle_ == HandleMgr::INVALID_HANDLE) {
|
||||
handle_ = GpuBufferMgr::GetInstance().Open(0, queue_name_, output_size_list_);
|
||||
if (handle_ == HandleMgr::INVALID_HANDLE) {
|
||||
MS_LOG(EXCEPTION) << "Gpu Queue(" << queue_name_ << ") Open Failed";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', gpu Queue(" << queue_name_ << ") Open Failed";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -151,7 +151,7 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
|
|||
return false;
|
||||
}
|
||||
if (total_bytes_ != len) {
|
||||
MS_LOG(ERROR) << "Dataset front error. read: " << len << ", expect: " << total_bytes_ << ", ";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', dataset front error, read: " << len << ", expect: " << total_bytes_;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
GetNextProfiling::GetNextProfiling(const std::string &path) : profiling_path_(path) {}
|
||||
GetNextProfiling::GetNextProfiling(const std::string &path) : profiling_path_(path), kernel_name_("GetNextProfiling") {}
|
||||
|
||||
void GetNextProfiling::GetDeviceId() {
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
|
@ -44,13 +44,13 @@ void GetNextProfiling::Init() {
|
|||
void GetNextProfiling::SaveProfilingData() {
|
||||
std::ofstream handle(file_name_, std::ios::trunc);
|
||||
if (!handle.is_open()) {
|
||||
MS_LOG(ERROR) << "Open get-next profiling file failed.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', open get-next profiling file failed.";
|
||||
return;
|
||||
}
|
||||
for (uint32_t index = 0; index < queue_size_.size(); index++) {
|
||||
if (index > time_stamp_.size() - 1) {
|
||||
handle.close();
|
||||
MS_LOG(EXCEPTION) << "index exceeds time_stamp_ size.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', index exceeds time_stamp_ size.";
|
||||
}
|
||||
handle << Name() << " " << time_stamp_[index].first << " " << time_stamp_[index].second << " " << queue_size_[index]
|
||||
<< std::endl;
|
||||
|
@ -62,7 +62,7 @@ void GetNextProfiling::SaveProfilingData() {
|
|||
|
||||
void GetNextProfiling::ChangeFileMode() {
|
||||
if (chmod(common::SafeCStr(file_name_), S_IRUSR | S_IWUSR) == -1) {
|
||||
MS_LOG(ERROR) << "Modify file:" << file_name_ << " to rw fail.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', modify file:" << file_name_ << " to rw fail.";
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -44,6 +44,7 @@ class GetNextProfiling : public ProfilingOp {
|
|||
std::vector<std::pair<uint64_t, uint64_t>> time_stamp_; // First value of std::pair is the start time stamp,
|
||||
// Second value of std::pair is the stop time stamp
|
||||
std::string device_id_;
|
||||
std::string kernel_name_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -90,6 +90,7 @@ class PrintGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
if (AnfAlgo::HasNodeAttr("string_pos", kernel_node)) {
|
||||
|
@ -105,9 +106,8 @@ class PrintGpuKernel : public GpuKernel {
|
|||
input_flag_ = SetInputFlag(&string_pos_, input_tensor_num);
|
||||
for (size_t i = 0; i < input_tensor_num; i++) {
|
||||
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, i);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'PrintGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -183,7 +183,7 @@ class PrintGpuKernel : public GpuKernel {
|
|||
input_device_data->push_back(GetDeviceAddress<double>(inputs, i));
|
||||
break;
|
||||
default:
|
||||
MS_LOG(EXCEPTION) << "TypeId: " << type_id << " is not supported in Print.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the typeid cannot be " << type_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -200,7 +200,7 @@ class PrintGpuKernel : public GpuKernel {
|
|||
}
|
||||
for (size_t i = 0; i < string_pos->size(); i++) {
|
||||
if ((*string_pos)[i] < 0) {
|
||||
MS_LOG(EXCEPTION) << "string_pos cannot be a negative value";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', string_pos cannot be a negative value";
|
||||
}
|
||||
auto index = IntToSize((*string_pos)[i]);
|
||||
res[index] = -1;
|
||||
|
|
|
@ -65,22 +65,21 @@ class AddNGpuFwdKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
num_input_ = GetAttr<int64_t>(kernel_node, "n");
|
||||
if (num_input_ != input_num) {
|
||||
MS_LOG(ERROR) << "Input number is " << num_input_ << " in attr, but got " << input_num << "input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be " << num_input_ << ", but got "
|
||||
<< input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but cudnnAddTensor needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "AddNGpuFwdKernel input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -49,20 +49,18 @@ class AssignAddGpuFwdKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 2) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but cudnnAddTensor needs 2 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 2, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but cudnnAddTensor needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "AssignAddGpuFwdKernel input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -57,13 +57,15 @@ class BroadcastComplexOpGpuKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
GetOpType(kernel_node);
|
||||
auto shape1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
auto shape2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
|
||||
auto shape3 = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
need_broadcast_ = AnfAlgo::IsTensorBroadcast(shape1, shape2);
|
||||
if (need_broadcast_ && shape1.size() > MAX_DIMS) {
|
||||
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than " << MAX_DIMS;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than " << MAX_DIMS
|
||||
<< ", but got " << shape1.size();
|
||||
}
|
||||
|
||||
lhs_shape_.resize(MAX_DIMS, 1);
|
||||
|
@ -128,7 +130,9 @@ class BroadcastComplexOpGpuKernel : public GpuKernel {
|
|||
return;
|
||||
}
|
||||
|
||||
MS_LOG(EXCEPTION) << "operation " << kernel_name << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
||||
<< ", only support these types: RealDiv, Mul, Sub, Add, Div or Complex currently, but got "
|
||||
<< kernel_name;
|
||||
}
|
||||
|
||||
BroadcastOpType op_type_;
|
||||
|
|
|
@ -70,20 +70,22 @@ class BroadcastOpGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
GetOpType(kernel_node);
|
||||
auto shape1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
auto shape2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
|
||||
auto shape3 = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape1) || CHECK_NULL_INPUT(shape2) || CHECK_NULL_INPUT(shape3);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape1, kernel_name_, "input") ||
|
||||
CHECK_SHAPE_NULL(shape2, kernel_name_, "input") ||
|
||||
CHECK_SHAPE_NULL(shape3, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'BroadcastGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
need_broadcast_ = AnfAlgo::IsTensorBroadcast(shape1, shape2);
|
||||
if (need_broadcast_ && shape1.size() > MAX_DIMS) {
|
||||
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than: " << MAX_DIMS << ", actual size is "
|
||||
<< shape1.size();
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than " << MAX_DIMS
|
||||
<< ", but got " << shape1.size();
|
||||
}
|
||||
|
||||
lhs_shape_.resize(MAX_DIMS, 1);
|
||||
|
@ -94,7 +96,8 @@ class BroadcastOpGpuKernel : public GpuKernel {
|
|||
if (i < MAX_DIMS) {
|
||||
output_shape_[i] = shape3[i];
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Output index: " << i << " should be less than " << MAX_DIMS;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the index of output should be less than " << MAX_DIMS
|
||||
<< ", but got " << i;
|
||||
}
|
||||
}
|
||||
output_num_ *= shape3[i];
|
||||
|
@ -106,7 +109,8 @@ class BroadcastOpGpuKernel : public GpuKernel {
|
|||
lhs_shape_[j + lhs_offset] = shape1[j];
|
||||
} else {
|
||||
auto index = j + lhs_offset;
|
||||
MS_LOG(EXCEPTION) << "Invalid input1 index: " << index;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the index of input cannot be " << index << ", but got "
|
||||
<< index;
|
||||
}
|
||||
}
|
||||
input1_num_ *= shape1[j];
|
||||
|
@ -118,7 +122,8 @@ class BroadcastOpGpuKernel : public GpuKernel {
|
|||
rhs_shape_[k + rhs_offset] = shape2[k];
|
||||
} else {
|
||||
auto index = k + rhs_offset;
|
||||
MS_LOG(EXCEPTION) << "Invalid input2 index: " << index;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the index of input cannot be " << index << ", but got "
|
||||
<< index;
|
||||
}
|
||||
}
|
||||
input2_num_ *= shape2[k];
|
||||
|
@ -201,7 +206,10 @@ class BroadcastOpGpuKernel : public GpuKernel {
|
|||
return;
|
||||
}
|
||||
|
||||
MS_LOG(EXCEPTION) << "operation " << kernel_name << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
||||
<< ", only support these types: Maximum, Minimum, Pow, RealDiv, Mul, Sub, Add, Div, DivNoNan, "
|
||||
"Mod, FloorDiv, AbsGrad, FloorMod, Atan2, TruncateDiv or TruncateMod currently, but got "
|
||||
<< kernel_name;
|
||||
}
|
||||
|
||||
BroadcastOpType op_type_;
|
||||
|
|
|
@ -69,20 +69,23 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
GetOpType(kernel_node);
|
||||
auto shape1 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto shape2 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
auto shape3 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape1) || CHECK_NULL_INPUT(shape2) || CHECK_NULL_INPUT(shape3);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape1, kernel_name_, "input_1") ||
|
||||
CHECK_SHAPE_NULL(shape2, kernel_name_, "input_2") ||
|
||||
CHECK_SHAPE_NULL(shape3, kernel_name_, "input_3");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'BroadcastGradGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
need_broadcast_ = AnfAlgo::IsTensorBroadcast(shape1, shape2);
|
||||
if (need_broadcast_ && shape1.size() > kMaxShapeSize) {
|
||||
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than " << kMaxShapeSize;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than "
|
||||
<< kMaxShapeSize << ", but got " << shape1.size();
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < shape3.size(); i++) {
|
||||
|
@ -98,8 +101,8 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
|
|||
x1_shape_[i + x1_offset] = shape1[i];
|
||||
} else {
|
||||
auto index = i + x1_offset;
|
||||
MS_LOG(EXCEPTION) << "For 'BroadcastOpGrad', the dimension of input cannot be greater than " << kMaxShapeSize
|
||||
<< ", but got " << (index + 1);
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than "
|
||||
<< kMaxShapeSize << ", but got " << (index + 1);
|
||||
}
|
||||
}
|
||||
input1_num_ *= shape1[i];
|
||||
|
@ -111,7 +114,8 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
|
|||
x2_shape_[i + x2_offset] = shape2[i];
|
||||
} else {
|
||||
auto index = i + x2_offset;
|
||||
MS_LOG(EXCEPTION) << "Invalid input2 index: " << index;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than "
|
||||
<< kMaxShapeSize << ", but got " << (index + 1);
|
||||
}
|
||||
}
|
||||
input2_num_ *= shape2[i];
|
||||
|
@ -162,7 +166,8 @@ class BroadcastOpGradGpuKernel : public GpuKernel {
|
|||
|
||||
auto iter = kBroadcastTypeMap.find(kernel_name);
|
||||
if (iter == kBroadcastTypeMap.end()) {
|
||||
MS_LOG(EXCEPTION) << "operation " << kernel_name << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_
|
||||
<< ", only support these types: MaximumGrad or MinimumGrad currently, but got " << kernel_name;
|
||||
} else {
|
||||
op_type_ = iter->second;
|
||||
}
|
||||
|
|
|
@ -66,14 +66,14 @@ class CastAllGpuFwdKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
num_input_ = GetAttr<size_t>(kernel_node, "n");
|
||||
size_ = std::make_unique<size_t[]>(num_input_);
|
||||
for (size_t i = 0; i < num_input_; i++) {
|
||||
auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'CastAllGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <cublas_v2.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/eye_impl.cuh"
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/matrix_split_impl.cuh"
|
||||
|
@ -62,6 +63,7 @@ class CholeskyGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
lower_ = static_cast<bool>(GetAttr<bool>(kernel_node, kLower));
|
||||
split_dim_ = static_cast<int>(GetAttr<int64_t>(kernel_node, kSplitDim));
|
||||
|
@ -77,11 +79,11 @@ class CholeskyGpuKernel : public GpuKernel {
|
|||
|
||||
auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kInputIndex);
|
||||
|
||||
is_null_input_ = CHECK_NULL_INPUT(in_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(in_shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(EXCEPTION) << "For 'CholeskyGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (split_dim_ == 0) {
|
||||
return InitNoSplitDim(in_shape);
|
||||
}
|
||||
|
@ -103,11 +105,11 @@ class CholeskyGpuKernel : public GpuKernel {
|
|||
cho_row_ = in_shape.at(kDim1);
|
||||
cho_col_ = in_shape.at(kDim2);
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Input Only support Rank 2 OR 3";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the dimension of input only should be 2 or 3";
|
||||
return false;
|
||||
}
|
||||
if (cho_row_ != cho_col_) {
|
||||
MS_LOG(ERROR) << "Cholesky need square matrix as input.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
|
||||
return false;
|
||||
}
|
||||
// set matrix row or col to be lead dimension
|
||||
|
@ -121,13 +123,14 @@ class CholeskyGpuKernel : public GpuKernel {
|
|||
|
||||
bool InitSplitDim(const std::vector<size_t> &in_shape) {
|
||||
if (in_shape.size() != kCholeskyNormalShape) {
|
||||
MS_LOG(ERROR) << "Cholesky Split Matrix Need Input Rank as 2.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the dimension of input should be " << kCholeskyNormalShape
|
||||
<< ", but got " << in_shape.size();
|
||||
return false;
|
||||
}
|
||||
cho_row_ = in_shape.at(kDim0);
|
||||
cho_col_ = in_shape.at(kDim1);
|
||||
if (cho_row_ != cho_col_) {
|
||||
MS_LOG(ERROR) << "Cholesky Split Matrix Need Square Matrix as Input.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -206,7 +209,7 @@ class CholeskyGpuKernel : public GpuKernel {
|
|||
kernel_node_, cusolverDnDpotrfBatched(handle_, uplo_, m_, d_array_addr, lda_, d_info_array_addr, batch_),
|
||||
"cusolver cholesky batched Fail");
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "cholesky factorization do not support other data type but only float or double, right now.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
|
||||
}
|
||||
size_t output_elements = outputs.at(kDim0)->size / unit_size_;
|
||||
// copy results from written input's matrix to output's matrix by up or lower flag.
|
||||
|
@ -243,7 +246,7 @@ class CholeskyGpuKernel : public GpuKernel {
|
|||
kernel_node_, cusolverDnDpotrfBatched(handle_, uplo_, m_, d_array_addr, lda_, d_info_array_addr, batch_),
|
||||
"cusolver cholesky batched Fail");
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "cholesky factorization do not support other data type but only float or double, right now.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
|
||||
}
|
||||
|
||||
TriangleMatrixCopy(d_batch_input_addr, output_addr, uplo_, outputs[0]->size / sizeof(T), ldb_, m_,
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <cublas_v2.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/triangle_matrix_copy_impl.cuh"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
|
@ -41,7 +42,7 @@ class CholeskySolveGpuKernel : public GpuKernel {
|
|||
public:
|
||||
using pointer = T *;
|
||||
|
||||
CholeskySolveGpuKernel() = default;
|
||||
CholeskySolveGpuKernel() : is_null_input_(false) {}
|
||||
~CholeskySolveGpuKernel() = default;
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
|
@ -49,6 +50,9 @@ class CholeskySolveGpuKernel : public GpuKernel {
|
|||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||
"cusolverDnSetStream failed");
|
||||
auto input_a_addr = GetDeviceAddress<T>(inputs, kDim0);
|
||||
|
@ -81,7 +85,7 @@ class CholeskySolveGpuKernel : public GpuKernel {
|
|||
d_b_array_addr, ldb_, d_info_array_addr, batch_),
|
||||
"cusolver cholesky solve batched Fail");
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "cholesky solve do not support other data type but only float or double, right now.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
|
||||
}
|
||||
size_t output_elements = outputs.at(kDim0)->size / unit_size_;
|
||||
// copy results from written input's matrix to output's matrix.
|
||||
|
@ -90,6 +94,7 @@ class CholeskySolveGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
lower_ = static_cast<bool>(GetAttr<bool>(kernel_node, kLower));
|
||||
// gpu input is col major default, so need to change row major.
|
||||
|
@ -103,10 +108,14 @@ class CholeskySolveGpuKernel : public GpuKernel {
|
|||
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
|
||||
auto in_a_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kDim0);
|
||||
auto in_b_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, kDim1);
|
||||
if (CHECK_NULL_INPUT(in_a_shape) || CHECK_NULL_INPUT(in_b_shape)) {
|
||||
MS_LOG(EXCEPTION) << "For 'CholeskySolveGpuKernel', input is null";
|
||||
is_null_input_ =
|
||||
CHECK_SHAPE_NULL(in_a_shape, kernel_name_, "input_a") || CHECK_SHAPE_NULL(in_b_shape, kernel_name_, "input_b");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
return InitDim(in_a_shape, in_b_shape);
|
||||
(void)InitDim(in_a_shape, in_b_shape);
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
@ -127,7 +136,7 @@ class CholeskySolveGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
private:
|
||||
bool InitDim(const std::vector<size_t> &in_a_shape, const std::vector<size_t> &in_b_shape) {
|
||||
void InitDim(const std::vector<size_t> &in_a_shape, const std::vector<size_t> &in_b_shape) {
|
||||
if (in_a_shape.size() == kCholeskyDefaultShape) {
|
||||
batch_ = 1;
|
||||
cho_row_ = in_a_shape.at(kDim0);
|
||||
|
@ -141,17 +150,14 @@ class CholeskySolveGpuKernel : public GpuKernel {
|
|||
cho_row_ = in_a_shape.at(kDim1);
|
||||
cho_col_ = in_a_shape.at(kDim2);
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Input Only support Rank 2 OR 3";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input only should be 2 or 3";
|
||||
}
|
||||
if (cho_row_ != cho_col_) {
|
||||
MS_LOG(ERROR) << "Cholesky need square matrix as input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
|
||||
}
|
||||
size_t b_row = in_b_shape.size() == kCholeskyBatchedShape ? in_b_shape.at(kDim1) : in_b_shape.at(kDim0);
|
||||
if (cho_row_ != b_row) {
|
||||
MS_LOG(ERROR) << "Cholesky right hand matrix is not equal to left matrix.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', right hand matrix should be equal to left matrix";
|
||||
}
|
||||
m_ = SizeToInt(in_a_shape.at(kDim1));
|
||||
lda_ = m_;
|
||||
|
@ -159,7 +165,6 @@ class CholeskySolveGpuKernel : public GpuKernel {
|
|||
h_a_array_.resize(batch_);
|
||||
h_b_array_.resize(batch_);
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
size_t cho_row_{0};
|
||||
size_t cho_col_{0};
|
||||
|
@ -177,6 +182,7 @@ class CholeskySolveGpuKernel : public GpuKernel {
|
|||
std::vector<size_t> input_size_list_;
|
||||
std::vector<size_t> output_size_list_;
|
||||
std::vector<size_t> workspace_size_list_;
|
||||
bool is_null_input_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <cublas_v2.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/eye_impl.cuh"
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/matrix_split_impl.cuh"
|
||||
|
@ -67,13 +68,13 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
|
||||
blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
|
||||
auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(in_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(in_shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'CholeskyTrsmSolveGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -84,12 +85,10 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
|
|||
}
|
||||
} else {
|
||||
if (in_shape.size() != 2) {
|
||||
MS_LOG(ERROR) << "CholeskyTrsm Split Matrix Need Input Rank as 2.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input only should be 2";
|
||||
}
|
||||
if (in_shape[0] != in_shape[1]) {
|
||||
MS_LOG(ERROR) << "CholeskyTrsm Split Matrix Need Square Matrix as Input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
|
||||
}
|
||||
InitDimOthers(kernel_node, in_shape);
|
||||
}
|
||||
|
@ -201,17 +200,19 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
|
|||
if (in_shape.size() == 2) {
|
||||
batch_ = 1;
|
||||
if (in_shape[0] != in_shape[1]) {
|
||||
MS_LOG(ERROR) << "CholeskyTrsm shape0: " << in_shape[0] << ", is not equal to shape1: " << in_shape[1];
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', shape0 should be equal to " << in_shape[1] << ", but got "
|
||||
<< in_shape[0];
|
||||
return false;
|
||||
}
|
||||
} else if (in_shape.size() == 3) {
|
||||
batch_ = SizeToInt(in_shape[0]);
|
||||
if (in_shape[1] != in_shape[2]) {
|
||||
MS_LOG(ERROR) << "CholeskyTrsm shape1: " << in_shape[1] << ", is not equal to shape2: " << in_shape[2];
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', shape1 should be equal to " << in_shape[2] << ", but got "
|
||||
<< in_shape[1];
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Input Only support Rank 2 OR 3";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the dimension of input only should be 2 or 3";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -55,15 +55,15 @@ class CumProdGpuKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but CumProdGpuKernel needs 1.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
input_size_0_ = sizeof(T);
|
||||
shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape_);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape_, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'CumProdGpuKernel', input is null.";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -72,7 +72,8 @@ class CumProdGpuKernel : public GpuKernel {
|
|||
reverse_ = GetAttr<bool>(kernel_node, "reverse");
|
||||
int input_dim_length = SizeToInt(shape_.size());
|
||||
if (axis_ >= input_dim_length) {
|
||||
MS_LOG(EXCEPTION) << "Axis is: " << axis_ << " out of bounds.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of 'axis' should be less than " << input_dim_length
|
||||
<< ", but got " << axis_;
|
||||
}
|
||||
while (axis_ < 0) {
|
||||
axis_ += input_dim_length;
|
||||
|
|
|
@ -55,15 +55,15 @@ class CumSumGpuKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but CumSumGpuKernel needs 1.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
input_size_0_ = sizeof(T);
|
||||
shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape_);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape_, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'CumSumGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -72,7 +72,8 @@ class CumSumGpuKernel : public GpuKernel {
|
|||
reverse_ = GetAttr<bool>(kernel_node, "reverse");
|
||||
int input_dim_length = SizeToInt(shape_.size());
|
||||
if (axis_ >= input_dim_length) {
|
||||
MS_LOG(EXCEPTION) << "Axis is: " << axis_ << " out of bounds.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the value of 'axis' should be less than " << input_dim_length
|
||||
<< ", but got " << axis_;
|
||||
}
|
||||
while (axis_ < 0) {
|
||||
axis_ += input_dim_length;
|
||||
|
|
|
@ -47,7 +47,8 @@ class DetTriangleGpuKernel : public GpuKernel {
|
|||
|
||||
if (!CheckTriangle(input_addr, fill_mode_, matrix_n_, outputs[0]->size / sizeof(T),
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr))) {
|
||||
MS_LOG(ERROR) << "The elements in the upper half of the matrix should be all 0, fill mode is: " << fill_mode_;
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_
|
||||
<< "', the elements in the upper half of the matrix should be all 0, fill mode is: " << fill_mode_;
|
||||
return false;
|
||||
}
|
||||
DetTriangle(input_addr, output_addr, matrix_n_, outputs[0]->size / sizeof(T),
|
||||
|
@ -56,21 +57,20 @@ class DetTriangleGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but DetTriangle needs 1 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but DetTriangle needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape) || CHECK_NULL_INPUT(output_shape);
|
||||
is_null_input_ =
|
||||
CHECK_SHAPE_NULL(input_shape, kernel_name_, "input") || CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'DeterminantTriangleGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -79,8 +79,8 @@ class DetTriangleGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
if (input_shape.size() < 2) {
|
||||
MS_LOG(ERROR) << "The input should have rank at least 2.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be less than 2, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
|
||||
matrix_n_ = input_shape[input_shape.size() - 1];
|
||||
|
@ -89,12 +89,11 @@ class DetTriangleGpuKernel : public GpuKernel {
|
|||
output_size_ *= output_shape[i];
|
||||
}
|
||||
if (matrix_n_ == 0 || output_size_ != input_size_ / matrix_n_ / matrix_n_) {
|
||||
MS_LOG(ERROR) << "The output shape is wrong.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the shape of output should be "
|
||||
<< (input_size_ / matrix_n_ / matrix_n_) << ", but got " << output_size_;
|
||||
}
|
||||
if (input_shape[input_shape.size() - 2] != input_shape[input_shape.size() - 1]) {
|
||||
MS_LOG(ERROR) << "The matrix should be in shape of square.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
|
||||
}
|
||||
auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(prim);
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include <cusolverDn.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <complex>
|
||||
#include <algorithm>
|
||||
#include <type_traits>
|
||||
|
@ -52,13 +53,14 @@ struct Complex_traits<Complex<T>> {
|
|||
template <typename T>
|
||||
class EighcGpuKernel : public GpuKernel {
|
||||
public:
|
||||
EighcGpuKernel() = default;
|
||||
EighcGpuKernel() : is_null_input_(false) {}
|
||||
~EighcGpuKernel() = default;
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
auto A_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
|
@ -70,13 +72,14 @@ class EighcGpuKernel : public GpuKernel {
|
|||
jobz_ = CUSOLVER_EIG_MODE_NOVECTOR;
|
||||
}
|
||||
cusolver_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
|
||||
bool is_null_input = CHECK_NULL_INPUT(A_shape);
|
||||
if (is_null_input) {
|
||||
MS_LOG(EXCEPTION) << "For 'EighValue GpuKernel', input is null";
|
||||
is_null_input_ = CHECK_SHAPE_NULL(A_shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (A_shape.size() != kShape2dDims || A_shape[1] != A_shape[1]) {
|
||||
MS_LOG(EXCEPTION) << "wrong array shape, A should be a square matrix, but got [" << A_shape[0] << " X "
|
||||
<< A_shape[1] << "]";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the shape of input should be square matrix, but got ["
|
||||
<< A_shape[0] << " X " << A_shape[1] << "]";
|
||||
}
|
||||
m_ = A_shape[0];
|
||||
InitSizeLists();
|
||||
|
@ -85,6 +88,9 @@ class EighcGpuKernel : public GpuKernel {
|
|||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
CHECK_CUBLAS_RET_WITH_ERROR(cublasSetStream(blas_handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||
"cublasSetStream failed");
|
||||
CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(cusolver_handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||
|
@ -136,7 +142,7 @@ class EighcGpuKernel : public GpuKernel {
|
|||
}
|
||||
d_work = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(sizeof(T) * lwork);
|
||||
if (!d_work) {
|
||||
MS_LOG(EXCEPTION) << "GPU memory alloca failed.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', GPU memory alloca failed.";
|
||||
}
|
||||
if constexpr (std::is_same_v<T, Complex<float>>) {
|
||||
cusolverDnCheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(w_v_addr), lda_, w_w_addr,
|
||||
|
@ -198,6 +204,7 @@ class EighcGpuKernel : public GpuKernel {
|
|||
std::vector<size_t> output_size_list_{};
|
||||
std::vector<size_t> workspace_size_list_{};
|
||||
using D = typename Complex_traits<T>::value_type;
|
||||
bool is_null_input_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -39,13 +39,14 @@ constexpr char LOWER[] = "lower";
|
|||
template <typename T>
|
||||
class EighGpuKernel : public GpuKernel {
|
||||
public:
|
||||
EighGpuKernel() = default;
|
||||
EighGpuKernel() : is_null_input_(false) {}
|
||||
~EighGpuKernel() = default;
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
auto A_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
compute_eigen_vectors_ = static_cast<bool>(GetAttr<bool>(kernel_node, C_EIEH_VECTOR));
|
||||
|
@ -56,13 +57,14 @@ class EighGpuKernel : public GpuKernel {
|
|||
jobz_ = CUSOLVER_EIG_MODE_NOVECTOR;
|
||||
}
|
||||
cusolver_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
|
||||
bool is_null_input = CHECK_NULL_INPUT(A_shape);
|
||||
if (is_null_input) {
|
||||
MS_LOG(EXCEPTION) << "For 'EighValue GpuKernel', input is null";
|
||||
is_null_input_ = CHECK_SHAPE_NULL(A_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (A_shape.size() != kShape2dDims || A_shape[0] != A_shape[1]) {
|
||||
MS_LOG(EXCEPTION) << "wrong array shape, A should be a square matrix, but got [" << A_shape[0] << " X "
|
||||
<< A_shape[1] << "]";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the shape of input should be square matrix, but got ["
|
||||
<< A_shape[0] << " X " << A_shape[1] << "]";
|
||||
}
|
||||
m_ = A_shape[0];
|
||||
InitSizeLists();
|
||||
|
@ -71,6 +73,9 @@ class EighGpuKernel : public GpuKernel {
|
|||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(cusolver_handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||
"cusolverDnSetStream failed");
|
||||
// matrix A, input or output(eigenvector)
|
||||
|
@ -152,6 +157,7 @@ class EighGpuKernel : public GpuKernel {
|
|||
cusolverEigMode_t jobz_ = CUSOLVER_EIG_MODE_NOVECTOR;
|
||||
bool compute_eigen_vectors_{false};
|
||||
bool lower_{true};
|
||||
bool is_null_input_;
|
||||
std::vector<T *> h_array_{};
|
||||
std::vector<size_t> input_size_list_{};
|
||||
std::vector<size_t> output_size_list_{};
|
||||
|
|
|
@ -49,23 +49,21 @@ class EqualCountGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 2) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but equalcount needs 2 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 2, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but equalcount needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
|
||||
output_size_ = sizeof(T);
|
||||
input_size_ = sizeof(T);
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'EqualcountGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -77,13 +77,11 @@ class FloatStatusGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
if (!CheckParam(kernel_node)) {
|
||||
return false;
|
||||
}
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
(void)CheckParam(kernel_node);
|
||||
auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'FloatStatusGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -91,10 +89,10 @@ class FloatStatusGpuKernel : public GpuKernel {
|
|||
for (size_t x : shape) {
|
||||
input_size_ = input_size_ * x;
|
||||
}
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto iter = kOpTypeMap.find(kernel_name);
|
||||
if (iter == kOpTypeMap.end()) {
|
||||
MS_LOG(EXCEPTION) << "FloatStatus kernel " << kernel_name << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << ", only support these types: FloatStatus, IsInf, IsNan, IsFinite "
|
||||
<< "currently, but got " << kernel_name;
|
||||
}
|
||||
kernel_name_ = iter->second;
|
||||
|
||||
|
@ -114,18 +112,16 @@ class FloatStatusGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
private:
|
||||
bool CheckParam(const CNodePtr &kernel_node) {
|
||||
void CheckParam(const CNodePtr &kernel_node) {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but FloatStatusGpuKernel needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but FloatStatusGpuKernel needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<size_t> input_size_list_;
|
||||
|
|
|
@ -52,20 +52,18 @@ class IdentityGpuKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but identity needs 1 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but identity needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "IdentityGpuKernel input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -62,18 +62,20 @@ class IndexAddGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 3) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but index add needs 3 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
std::vector<size_t> dst_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
std::vector<size_t> index_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
std::vector<size_t> src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
|
||||
is_null_input_ = CHECK_NULL_INPUT(dst_shape) || CHECK_NULL_INPUT(index_shape) || CHECK_NULL_INPUT(src_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(dst_shape, kernel_name, "x") ||
|
||||
CHECK_SHAPE_NULL(index_shape, kernel_name, "indices") ||
|
||||
CHECK_SHAPE_NULL(src_shape, kernel_name, "y");
|
||||
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'IndexAddGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -91,8 +93,8 @@ class IndexAddGpuKernel : public GpuKernel {
|
|||
inner_size_ *= src_shape[i];
|
||||
}
|
||||
if (axis < 0 || axis >= SizeToInt(src_shape.size()) || axis >= SizeToInt(dst_shape.size())) {
|
||||
MS_LOG(EXCEPTION) << "Init axis size failed, actual src axis size is " << src_axis_size_
|
||||
<< ", actual dst axis size is " << dst_axis_size_;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the size of 'axis' cannot be greater than or equal to "
|
||||
<< SizeToInt(src_shape.size()) << " or " << SizeToInt(dst_shape.size()) << ", but got " << axis;
|
||||
}
|
||||
src_axis_size_ = src_shape[axis];
|
||||
dst_axis_size_ = dst_shape[axis];
|
||||
|
|
|
@ -49,37 +49,34 @@ class LinSpaceGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 3) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but DynamicLinSpace needs 3 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but DynamicLinSpace needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
auto input_1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
auto input_2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
|
||||
auto value_count = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_1) || CHECK_NULL_INPUT(input_2) || CHECK_NULL_INPUT(value_count);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_1, kernel_name, "start") ||
|
||||
CHECK_SHAPE_NULL(input_2, kernel_name, "stop") ||
|
||||
CHECK_SHAPE_NULL(value_count, kernel_name, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'LinspaceGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
// error checking input data
|
||||
if ((input_1.size() != 0) || (input_2.size() != 0)) {
|
||||
MS_LOG(ERROR) << "For LinShape "
|
||||
<< "both start and end must be 0-D Tensors. Got " << input_1.size() << " and " << input_2.size()
|
||||
<< ".";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', both start and end should be 0-D Tensors, but got dimension "
|
||||
<< "of start: " << input_1.size() << " and dimension of end: " << input_2.size();
|
||||
}
|
||||
|
||||
if (value_count.size() != 1) {
|
||||
MS_LOG(ERROR) << "For LinShape, output shape incorrect rank. Expect Rank: 1, got Rank: " << value_count.size()
|
||||
<< ".";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output should be 1, but got "
|
||||
<< value_count.size();
|
||||
}
|
||||
value_count_ = value_count[0];
|
||||
InitSizeLists();
|
||||
|
|
|
@ -49,11 +49,11 @@ class LogicalNotGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'LogicalGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <cublas_v2.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <type_traits>
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
|
@ -39,7 +40,7 @@ constexpr size_t kLuNormalShape = 2;
|
|||
template <typename T>
|
||||
class LUGpuKernel : public GpuKernel {
|
||||
public:
|
||||
LUGpuKernel() = default;
|
||||
LUGpuKernel() : is_null_input_(false) {}
|
||||
~LUGpuKernel() = default;
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
|
||||
|
@ -47,6 +48,9 @@ class LUGpuKernel : public GpuKernel {
|
|||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnSetStream(handle_, reinterpret_cast<cudaStream_t>(stream_ptr)),
|
||||
"cusolverDnSetStream failed");
|
||||
auto input_addr = GetDeviceAddress<T>(inputs, kDim0);
|
||||
|
@ -65,7 +69,7 @@ class LUGpuKernel : public GpuKernel {
|
|||
"cusolver query lu work size fail");
|
||||
|
||||
if (cudaMalloc(reinterpret_cast<void **>(&d_work_), unit_size_ * lwork_) != cudaSuccess) {
|
||||
MS_LOG(EXCEPTION) << "cusolver malloc work size fail";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', cusolver malloc work size fail";
|
||||
}
|
||||
|
||||
CHECK_CUSOLVER_RET_WITH_EXCEPT(
|
||||
|
@ -79,7 +83,7 @@ class LUGpuKernel : public GpuKernel {
|
|||
// 5. malloc device working space of getrf
|
||||
|
||||
if (cudaMalloc(reinterpret_cast<void **>(&d_work_), unit_size_ * lwork_) != cudaSuccess) {
|
||||
MS_LOG(EXCEPTION) << "cusolver malloc work size fail";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', cusolver malloc work size fail";
|
||||
}
|
||||
|
||||
// 6. solve to lu factorization according to cuSolver api, outputs have been written to input's matrix.
|
||||
|
@ -87,7 +91,7 @@ class LUGpuKernel : public GpuKernel {
|
|||
kernel_node_, cusolverDnDgetrf(handle_, m_, m_, input_addr, lda_, d_work_, piv_output_addr, info_output_addr),
|
||||
"cusolver lu fail");
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "cholesky factorization do not support other data type but only float or double, right now.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type only should be float or double, right now.";
|
||||
}
|
||||
// 7. copy results from written input's matrix to output's matrix.
|
||||
// if (cudaMemcpy(output_addr, input_addr, lda_ * m_ * unit_size_, cudaMemcpyDeviceToDevice) != cudaSuccess) {
|
||||
|
@ -101,14 +105,16 @@ class LUGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
// 1. get CuSolver Dense matrix handler
|
||||
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCusolverDnHandle();
|
||||
auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
// 2. check input shape not null
|
||||
bool is_null_input = CHECK_NULL_INPUT(in_shape);
|
||||
if (is_null_input) {
|
||||
MS_LOG(EXCEPTION) << "For 'PureCholeskyGpuKernel', input is null";
|
||||
is_null_input_ = CHECK_SHAPE_NULL(in_shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
// 3. calculate input size
|
||||
if (!InitInputSize(in_shape)) {
|
||||
|
@ -126,11 +132,11 @@ class LUGpuKernel : public GpuKernel {
|
|||
lu_row_ = in_shape.at(kDim0);
|
||||
lu_col_ = in_shape.at(kDim1);
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Input Only support Rank 1 OR 2";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the dimension of input only should be 1 or 2";
|
||||
return false;
|
||||
}
|
||||
if (lu_row_ != lu_col_) {
|
||||
MS_LOG(ERROR) << "Cholesky need square matrix as input.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', the shape of input should be square matrix";
|
||||
return false;
|
||||
}
|
||||
// set matrix row or col to be lead dimension
|
||||
|
@ -170,6 +176,7 @@ class LUGpuKernel : public GpuKernel {
|
|||
std::vector<size_t> input_size_list_{};
|
||||
std::vector<size_t> output_size_list_{};
|
||||
std::vector<size_t> workspace_size_list_{};
|
||||
bool is_null_input_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -91,13 +91,14 @@ class MatMulGpuKernel : public GpuKernel {
|
|||
"cublasGemmStridedBatchedEx failed. Possible reasons: the GPU is occupied by other processes.");
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
MS_LOG(EXCEPTION) << "Encountered an exception: " << e.what() << " when invoke cublas "
|
||||
<< (batch_ == 1 ? "cublasGemmEx" : "cublasGemmStridedBatchedEx");
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', encountered an exception: " << e.what() << " when invoke "
|
||||
<< "cublas " << (batch_ == 1 ? "cublasGemmEx" : "cublasGemmStridedBatchedEx");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
|
||||
dtype_a_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
|
||||
|
@ -105,7 +106,7 @@ class MatMulGpuKernel : public GpuKernel {
|
|||
dtype_c_ = GetCudaDataType(TypeIdLabel(AnfAlgo::GetOutputDeviceDataType(kernel_node, 0)));
|
||||
auto node_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
if (dtype_a_ != dtype_b_ || dtype_a_ != dtype_c_) {
|
||||
MS_LOG(EXCEPTION) << "input and output types are not the same in " << node_name;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', input and output types are not the same in " << node_name;
|
||||
}
|
||||
if (dtype_a_ == CUDA_R_16F && dtype_b_ == CUDA_R_16F && dtype_c_ == CUDA_R_16F) {
|
||||
MS_LOG(INFO) << "input and output type is float16, allow to use Tensor Core operations if possible";
|
||||
|
@ -113,15 +114,16 @@ class MatMulGpuKernel : public GpuKernel {
|
|||
}
|
||||
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
auto input1_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(output_shape) || CHECK_NULL_INPUT(input1_shape);
|
||||
is_null_input_ =
|
||||
CHECK_SHAPE_NULL(input1_shape, kernel_name_, "input") || CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'MatmulGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
auto dims = output_shape.size();
|
||||
if (dims < 2) {
|
||||
MS_LOG(EXCEPTION) << "Output dims " << dims << " not support.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of output cannot be less than 2, but got "
|
||||
<< dims;
|
||||
}
|
||||
|
||||
m_ = output_shape[dims - 2];
|
||||
|
@ -139,7 +141,7 @@ class MatMulGpuKernel : public GpuKernel {
|
|||
} else if (!transpose && input1_shape.size() > (dims - 1)) {
|
||||
k_ = input1_shape[dims - 1];
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Init k_ via input1_shape failed.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', init k_ via input1_shape failed.";
|
||||
}
|
||||
|
||||
transpose = GetAttr<bool>(kernel_node, "transpose_x2");
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <cublas_v2.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
|
@ -90,28 +91,30 @@ class MatrixInverseGpuKernel : public GpuKernel {
|
|||
reinterpret_cast<double **>(inv_batch_addr), len, info_addr, batchsize),
|
||||
"cublas trsm batched Fail");
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "The data type entered must be float or double.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the data type entered must be float or double.";
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'MatrixInverseGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (input_shape.size() < 2) {
|
||||
MS_LOG(EXCEPTION) << "The dim entered needs to be greater than 2, but " << input_shape.size() << " was taken";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be less than 2, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
size_t last_index = input_shape.size() - 1;
|
||||
if (input_shape[last_index] != input_shape[last_index - 1]) {
|
||||
MS_LOG(EXCEPTION) << "The last two dimensions of the input matrix should be equal!";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the last two dimensions of the input matrix should be equal, "
|
||||
<< "but got one: " << input_shape[last_index] << ", another: " << input_shape[last_index - 1];
|
||||
}
|
||||
size_ = input_shape[last_index];
|
||||
for (size_t i = 0; i < last_index - 1; i++) {
|
||||
|
|
|
@ -59,7 +59,7 @@ class MultinomialGpuKernel : public GpuKernel {
|
|||
T *probs_addr = GetDeviceAddress<T>(inputs, 0);
|
||||
int64_t *num_sample_addr = GetDeviceAddress<int64_t>(inputs, 1);
|
||||
if (distributions_ == 0) {
|
||||
MS_LOG(ERROR) << "Divide by zero. the distributions_ is 0.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', divide by zero. the distributions_ is 0.";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -83,22 +83,20 @@ class MultinomialGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 2) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but multinomial needs 2 input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but multinomial needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
auto input_shape_0 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape_0) || CHECK_NULL_INPUT(output_shape);
|
||||
is_null_input_ =
|
||||
CHECK_SHAPE_NULL(input_shape_0, kernel_name_, "input") || CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'MultinomialGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -65,24 +65,22 @@ class NMSWithMaskGpuFwdKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
iou_value_ = GetAttr<float>(kernel_node, "iou_threshold");
|
||||
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but NMSWithMask needs 1 input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 3) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but NMSWithMask needs 3 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 3, but got " << output_num;
|
||||
}
|
||||
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'NMSWithMaskGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -116,7 +116,7 @@ class RandomOpGpuKernel : public GpuKernel {
|
|||
inputs[2]->size / sizeof(T), output_addr, outputs[0]->size / sizeof(T),
|
||||
reinterpret_cast<cudaStream_t>(stream_ptr));
|
||||
if (!ret) {
|
||||
MS_LOG(ERROR) << "For UniformInt op, `minval` should be strictly less than `maxval`";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << "', `minval` should be strictly less than `maxval`";
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
@ -144,7 +144,8 @@ class RandomOpGpuKernel : public GpuKernel {
|
|||
break;
|
||||
}
|
||||
default: {
|
||||
MS_LOG(EXCEPTION) << "Random operation " << random_op_type_ << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: StandardNormal, CudnnUniformReal, "
|
||||
<< "UniformInt, UniformReal currently, but got " << random_op_type_;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
@ -154,29 +155,27 @@ class RandomOpGpuKernel : public GpuKernel {
|
|||
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto iter = kRandomOpTypeMap.find(kernel_name);
|
||||
if (iter == kRandomOpTypeMap.end()) {
|
||||
MS_LOG(EXCEPTION) << "Random operation " << kernel_name << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << ", only support these types: StandardNormal, CudnnUniformReal, "
|
||||
<< "UniformInt, UniformReal currently, but got " << kernel_name;
|
||||
} else {
|
||||
random_op_type_ = iter->second;
|
||||
}
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if ((random_op_type_ == RANDOM_OP_NORMAL || random_op_type_ == RANDOM_OP_UNIFORM_REAL) && input_num != 1) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but random op needs 1 input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
if (random_op_type_ == RANDOM_OP_UNIFORM_INT && input_num != 3) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but random op needs 3 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but random op needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
auto input_shape_0 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape_0) || CHECK_NULL_INPUT(output_shape);
|
||||
is_null_input_ =
|
||||
CHECK_SHAPE_NULL(input_shape_0, kernel_name, "input") || CHECK_SHAPE_NULL(output_shape, kernel_name, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'RandomOpGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -51,11 +51,12 @@ class SquareSumAllGpuFwdKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(ERROR) << "SquareSumAllGpuFwdKernel input is null";
|
||||
return false;
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
for (size_t i = 0; i < input_shape.size(); i++) {
|
||||
input_size_ *= input_shape[i];
|
||||
|
|
|
@ -56,18 +56,21 @@ class SquaredDifferenceOpGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto input_shape1 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
auto input_shape2 = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 1);
|
||||
auto output_shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape1) || CHECK_NULL_INPUT(input_shape2) || CHECK_NULL_INPUT(output_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape1, kernel_name, "input") ||
|
||||
CHECK_SHAPE_NULL(input_shape2, kernel_name, "input") ||
|
||||
CHECK_SHAPE_NULL(output_shape, kernel_name, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'SquaredDifferenceGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
need_broadcast_ = IsBroadcast(input_shape1, input_shape2);
|
||||
if (need_broadcast_ && output_shape.size() > MAX_DIMS) {
|
||||
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than " << MAX_DIMS;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of output cannot be greater than " << MAX_DIMS
|
||||
<< ", but got " << output_shape.size();
|
||||
}
|
||||
|
||||
lhs_shape_.resize(MAX_DIMS, 1);
|
||||
|
@ -86,7 +89,8 @@ class SquaredDifferenceOpGpuKernel : public GpuKernel {
|
|||
lhs_shape_[j + lhs_offset] = input_shape1[j];
|
||||
} else {
|
||||
auto index = j + lhs_offset;
|
||||
MS_LOG(EXCEPTION) << "Invalid input1 index: " << index;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the index of input cannot be " << index << ", but got "
|
||||
<< index;
|
||||
}
|
||||
}
|
||||
input1_num_ *= input_shape1[j];
|
||||
|
@ -98,7 +102,8 @@ class SquaredDifferenceOpGpuKernel : public GpuKernel {
|
|||
rhs_shape_[k + rhs_offset] = input_shape2[k];
|
||||
} else {
|
||||
auto index = k + rhs_offset;
|
||||
MS_LOG(EXCEPTION) << "Invalid input2 index: " << index;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the index of input cannot be " << index << ", but got "
|
||||
<< index;
|
||||
}
|
||||
}
|
||||
input2_num_ *= input_shape2[k];
|
||||
|
|
|
@ -118,28 +118,30 @@ class TrsmGpuKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
blas_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
|
||||
auto A_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto b_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
is_null_input_ = CHECK_NULL_INPUT(A_shape) || CHECK_NULL_INPUT(b_shape);
|
||||
is_null_input_ =
|
||||
CHECK_SHAPE_NULL(A_shape, kernel_name, "input_A") || CHECK_SHAPE_NULL(b_shape, kernel_name, "input_b");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'TrsmGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (A_shape[kDim0] != A_shape[kDim1]) {
|
||||
MS_LOG(EXCEPTION) << "wrong array shape, A should be a squre matrix, but got [" << A_shape[kDim0] << " X "
|
||||
<< A_shape[kDim1] << "]";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the shape of input should be square matrix, but got ["
|
||||
<< A_shape[kDim0] << " X " << A_shape[kDim1] << "]";
|
||||
}
|
||||
m_ = A_shape[kDim0];
|
||||
|
||||
if (b_shape.size() != kAVectorxDimNum && b_shape.size() != kAMatrixDimNum) {
|
||||
MS_LOG(EXCEPTION) << "wrong array shape, b should be 1D or 2D, but got [" << b_shape.size() << "] dimensions";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 1 or 2, but got "
|
||||
<< b_shape.size();
|
||||
}
|
||||
if (b_shape[kDim0] != m_) {
|
||||
MS_LOG(EXCEPTION) << "wrong array shape, b should match the shape of A, excepted [" << m_ << "] but got ["
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the shape of input should be [" << m_ << "], but got ["
|
||||
<< b_shape[kDim0] << "]";
|
||||
}
|
||||
if (b_shape.size() == kAVectorxDimNum || (b_shape.size() == kAMatrixDimNum && b_shape[kDim1] == 1)) {
|
||||
|
@ -158,7 +160,7 @@ class TrsmGpuKernel : public GpuKernel {
|
|||
} else if (trans == "T") {
|
||||
trans_ = CUBLAS_OP_N;
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "trans should be in [N, T], but got [" << trans << "]";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', trans should be in [N, T], but got [" << trans << "]";
|
||||
}
|
||||
|
||||
bool lower = AnfAlgo::GetNodeAttr<bool>(kernel_node, "lower");
|
||||
|
|
|
@ -64,29 +64,28 @@ class UnaryOpComplexGpuKernel : public GpuKernel {
|
|||
break;
|
||||
}
|
||||
default: {
|
||||
MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: Real, Imag, Conj currently, "
|
||||
<< "but got " << unary_op_type_;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
GetOpType(kernel_node);
|
||||
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but unary op needs 1 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 3, but got " << output_num;
|
||||
}
|
||||
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "UnaryOpGpuKernel input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -123,8 +122,8 @@ class UnaryOpComplexGpuKernel : public GpuKernel {
|
|||
unary_op_type_ = iter->second;
|
||||
return;
|
||||
}
|
||||
|
||||
MS_LOG(EXCEPTION) << "operation " << kernel_name << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: Real, Imag, Conj currently, but got "
|
||||
<< kernel_name;
|
||||
}
|
||||
|
||||
private:
|
||||
|
|
|
@ -184,7 +184,9 @@ class UnaryOpGpuKernel : public GpuKernel {
|
|||
break;
|
||||
}
|
||||
default: {
|
||||
MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
|
||||
<< " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
|
||||
<< "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << unary_op_type_;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
@ -193,23 +195,22 @@ class UnaryOpGpuKernel : public GpuKernel {
|
|||
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto iter = kUnaryOpTypeMap.find(kernel_name);
|
||||
if (iter == kUnaryOpTypeMap.end()) {
|
||||
MS_LOG(EXCEPTION) << "Unary operation " << kernel_name << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << ", only support these types: Exp, Expm1, Log, Log1p, Erf, Erfc,"
|
||||
<< " Neg, Reciprocal, Square, Sqrt, Rsqrt, Sin, Cos, Asin, ACos, Atan, Asinh, Acosh, Abs, "
|
||||
<< "Floor, Rint, Round, Real, Imag, Sign, Conj currently, but got " << kernel_name;
|
||||
}
|
||||
unary_op_type_ = iter->second;
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but unary op needs 1 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
auto input_shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'UnaryOpGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -112,7 +112,9 @@ class UnaryGradOpGpuKernel : public GpuKernel {
|
|||
break;
|
||||
}
|
||||
default: {
|
||||
MS_LOG(EXCEPTION) << "Unary grad operation " << unary_grad_op_type_ << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: SqrtGrad, RsqrtGrad, AsinGrad, "
|
||||
<< "ACosGrad, AtanGrad, AsinhGrad, AcoshGrad, ReciprocalGrad currently, but got "
|
||||
<< unary_grad_op_type_;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
@ -121,23 +123,22 @@ class UnaryGradOpGpuKernel : public GpuKernel {
|
|||
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto iter = kUnaryGradOpTypeMap.find(kernel_name);
|
||||
if (iter == kUnaryGradOpTypeMap.end()) {
|
||||
MS_LOG(EXCEPTION) << "Unary grad operation " << kernel_name << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << ", only support these types: SqrtGrad, RsqrtGrad, AsinGrad, "
|
||||
<< "ACosGrad, AtanGrad, AsinhGrad, AcoshGrad, ReciprocalGrad currently, but got "
|
||||
<< kernel_name;
|
||||
}
|
||||
unary_grad_op_type_ = iter->second;
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 2) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but unary grad op needs 2 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 2, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but unary grad op needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'UnaryOpGradGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -145,9 +146,8 @@ class UnaryGradOpGpuKernel : public GpuKernel {
|
|||
input_size_ *= input_shape[i];
|
||||
}
|
||||
auto dx_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
is_null_input_ = CHECK_NULL_INPUT(dx_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(dx_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "UnaryGradOpGpuKernel input 1 is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -155,7 +155,8 @@ class UnaryGradOpGpuKernel : public GpuKernel {
|
|||
dx_size_ *= dx_shape[i];
|
||||
}
|
||||
if (input_size_ != dx_size_) {
|
||||
MS_LOG(WARNING) << "UnaryGradOpGpuKernel inputs should be same, but got " << input_size_ << " and " << dx_size_;
|
||||
MS_LOG(WARNING) << "For '" << kernel_name << "', both inputs should be equal, but got " << input_size_ << " and "
|
||||
<< dx_size_;
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <cublas_v2.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
#include "backend/kernel_compiler/gpu/kernel_constants.h"
|
||||
|
@ -98,7 +99,8 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
|
|||
CUDA_R_32F, algo_),
|
||||
"cublasSgemm Call Fail");
|
||||
} catch (const std::exception &e) {
|
||||
MS_LOG(EXCEPTION) << "Encountered an exception: " << e.what() << "when invoke cubals cublasGemmStridedBatchedEx";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", encountered an exception: " << e.what()
|
||||
<< " when invoke cubals cublasGemmStridedBatchedEx";
|
||||
}
|
||||
|
||||
auto r_input_addr = workspace1_addr;
|
||||
|
@ -147,9 +149,7 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
|
|||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_node_ = kernel_node;
|
||||
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
|
||||
if (!SetProperty(kernel_node)) {
|
||||
return false;
|
||||
}
|
||||
(void)SetProperty(kernel_node);
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -188,22 +188,20 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
private:
|
||||
bool SetProperty(const CNodePtr &kernel_node) {
|
||||
void SetProperty(const CNodePtr &kernel_node) {
|
||||
auto matrix_a_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto gradient_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
auto matrix_g_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
|
||||
is_null_input_ =
|
||||
CHECK_NULL_INPUT(matrix_a_shape) || CHECK_NULL_INPUT(gradient_shape) || CHECK_NULL_INPUT(matrix_g_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(matrix_a_shape, kernel_name_, "matrix_a") ||
|
||||
CHECK_SHAPE_NULL(gradient_shape, kernel_name_, "gradient") ||
|
||||
CHECK_SHAPE_NULL(matrix_g_shape, kernel_name_, "matrix_g");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'UpdateThorGradientGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
return;
|
||||
}
|
||||
|
||||
split_dim = LongToSize(GetAttr<int64_t>(kernel_node, "split_dim"));
|
||||
if (split_dim == 0) {
|
||||
MS_LOG(ERROR) << "Divide by zero, split_dim can not be zero.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", divide by zero, split_dim cannot be 0, but got " << split_dim;
|
||||
}
|
||||
gradient_size.batch_h = gradient_shape[0] / split_dim;
|
||||
gradient_size.batch_w = gradient_shape[1] / split_dim;
|
||||
|
@ -244,7 +242,6 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
|
|||
gradient_size.ori_w = gradient_shape[1];
|
||||
gradient_size.ori_h = gradient_shape[0];
|
||||
gradient_size.dtype = GetCudaDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 1)));
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t split_dim;
|
||||
|
|
|
@ -71,13 +71,15 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
|
|||
break;
|
||||
}
|
||||
default: {
|
||||
MS_LOG(EXCEPTION) << "Kernel type " << nccl_kernel_type_ << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: AllReduce, AllGather, Broadcast, "
|
||||
<< "ReduceScatter currently, but got " << nccl_kernel_type_;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
nccl_data_type_ = nccl_dtype(AnfAlgo::GetInputDeviceDataType(kernel_node, 0));
|
||||
|
@ -87,9 +89,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
|
|||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
for (size_t i = 0; i < input_num; ++i) {
|
||||
auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, i);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'NcclCollectiveGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -103,9 +104,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
|
|||
}
|
||||
for (size_t i = 0; i < output_num; ++i) {
|
||||
auto shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, i);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'NcclCollectiveGpuKernel', output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -201,7 +201,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
|
|||
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto iter = kNcclTypeMap.find(kernel_name);
|
||||
if (iter == kNcclTypeMap.end()) {
|
||||
MS_LOG(EXCEPTION) << "Kernel " << kernel_name << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: AllReduce, AllGather, Broadcast, "
|
||||
<< "ReduceScatter currently, but got " << kernel_name;
|
||||
} else {
|
||||
nccl_kernel_type_ = iter->second;
|
||||
}
|
||||
|
@ -220,7 +221,8 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
|
|||
} else if (type == "prod") {
|
||||
nccl_reduce_type_ = ncclProd;
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Nccl reduce type " << type << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: sum, max, min, prod currently, "
|
||||
<< "but got " << type;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -52,13 +52,15 @@ class NcclP2PGpuKernel : public NcclGpuKernel {
|
|||
break;
|
||||
}
|
||||
default: {
|
||||
MS_LOG(EXCEPTION) << "Kernel type " << nccl_kernel_type_ << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: AllToAllv, NeighborExchange "
|
||||
<< "currently, but got " << nccl_kernel_type_;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
InferCommType(kernel_node);
|
||||
|
@ -73,9 +75,8 @@ class NcclP2PGpuKernel : public NcclGpuKernel {
|
|||
}
|
||||
for (size_t i = 0; i < input_num; ++i) {
|
||||
auto shape = AnfAlgo::GetInputRealDeviceShapeIfExist(kernel_node, i);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'NcclP2PGpuKernel', input shape is null ";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -88,9 +89,8 @@ class NcclP2PGpuKernel : public NcclGpuKernel {
|
|||
}
|
||||
for (size_t i = 0; i < output_num; ++i) {
|
||||
auto shape = AnfAlgo::GetOutputRealDeviceShapeIfExist(kernel_node, i);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'NcclP2PGpuKernel', output shape is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -157,10 +157,12 @@ class NcclP2PGpuKernel : public NcclGpuKernel {
|
|||
|
||||
// send_rank_id and recv rank_id size needs to be equal to input_list size
|
||||
if (send_rank_ids.size() != input_size_list_.size()) {
|
||||
MS_LOG(ERROR) << "Trying to use AlltoAllv, but send_rank_ids vector size not equals to input_list size.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", trying to use AlltoAllv, the size of send_rank_ids vector "
|
||||
<< "should be " << input_size_list_.size() << ", but got " << send_rank_ids.size();
|
||||
}
|
||||
if (recv_rank_ids.size() != output_size_list_.size()) {
|
||||
MS_LOG(ERROR) << "Trying to use AlltoAllv, but recv_rank_ids vector size not equals to output_list size.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", trying to use AlltoAllv, the size of recv_rank_ids vector "
|
||||
<< "should be " << output_size_list_.size() << ", but got " << recv_rank_ids.size();
|
||||
}
|
||||
|
||||
// This implementation refers to NVIDIA NCCL 2.11 doc.
|
||||
|
@ -182,7 +184,8 @@ class NcclP2PGpuKernel : public NcclGpuKernel {
|
|||
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto iter = kNcclTypeMap.find(kernel_name);
|
||||
if (iter == kNcclTypeMap.end()) {
|
||||
MS_LOG(EXCEPTION) << "Kernel " << kernel_name << " is not supported.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << ", only support these types: AllToAllv, NeighborExchange "
|
||||
<< "currently, but got " << kernel_name;
|
||||
} else {
|
||||
nccl_kernel_type_ = iter->second;
|
||||
}
|
||||
|
|
|
@ -46,21 +46,20 @@ class NcclRecvGpuKernel : public NcclGpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but NCCL receive needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
src_rank_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "src_rank"));
|
||||
group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
|
||||
nccl_data_type_ = nccl_dtype(AnfAlgo::GetOutputDeviceDataType(kernel_node, 0));
|
||||
|
||||
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(output_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(output_shape, kernel_name, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'NcclRecvGpuKernel', output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -46,12 +46,12 @@ class NcclSendGpuKernel : public NcclGpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but NCCL send needs 1 input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_num;
|
||||
}
|
||||
|
||||
dest_rank_ = static_cast<int>(GetAttr<int64_t>(kernel_node, "dest_rank"));
|
||||
|
@ -60,9 +60,8 @@ class NcclSendGpuKernel : public NcclGpuKernel {
|
|||
MS_LOG(INFO) << "NcclSend dest rank is " << dest_rank_ << ", group name is " << group_name_;
|
||||
|
||||
auto input_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'NcclSendGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -41,6 +41,9 @@ class SyncBatchNormGpuKernel : public NcclGpuKernel {
|
|||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
T *x = GetDeviceAddress<T>(inputs, 0);
|
||||
S *scale = GetDeviceAddress<S>(inputs, 1);
|
||||
S *bias = GetDeviceAddress<S>(inputs, 2);
|
||||
|
@ -78,6 +81,7 @@ class SyncBatchNormGpuKernel : public NcclGpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto root_rank = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kAttrRootRank);
|
||||
if (root_rank) {
|
||||
root_ = static_cast<int>(GetValue<int64_t>(root_rank));
|
||||
|
@ -86,24 +90,22 @@ class SyncBatchNormGpuKernel : public NcclGpuKernel {
|
|||
group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 5) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but SyncBatchNorm needs 5 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 5, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 5) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but SyncBatchNorm needs 5 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 5, but got " << output_num;
|
||||
}
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
if (CHECK_NULL_INPUT(input_shape)) {
|
||||
MS_LOG(WARNING) << "SyncBatchNorm input is null";
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
auto input_shape_dims = input_shape.size();
|
||||
if (input_shape_dims != 4 && input_shape_dims != 2) {
|
||||
MS_LOG(EXCEPTION) << "Tensor shape is " << input_shape.size()
|
||||
<< ", SyncBatchNormGpuKernel input should be 2D or 4D";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input only should be 2 or 4, but got "
|
||||
<< input_shape_dims;
|
||||
}
|
||||
input_size_ = 1;
|
||||
for (auto dim : input_shape) {
|
||||
|
@ -175,6 +177,7 @@ class SyncBatchNormGpuKernel : public NcclGpuKernel {
|
|||
input_size_list_.clear();
|
||||
output_size_list_.clear();
|
||||
workspace_size_list_.clear();
|
||||
is_null_input_ = false;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
@ -233,6 +236,7 @@ class SyncBatchNormGpuKernel : public NcclGpuKernel {
|
|||
string group_name_;
|
||||
int root_;
|
||||
cudaStream_t comm_stream_;
|
||||
bool is_null_input_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -41,6 +41,9 @@ class SyncBatchNormGradGpuKernel : public NcclGpuKernel {
|
|||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
|
||||
if (is_null_input_) {
|
||||
return true;
|
||||
}
|
||||
T *dy = GetDeviceAddress<T>(inputs, 0);
|
||||
T *x_input = GetDeviceAddress<T>(inputs, 1);
|
||||
S *scale = GetDeviceAddress<S>(inputs, 2);
|
||||
|
@ -65,6 +68,7 @@ class SyncBatchNormGradGpuKernel : public NcclGpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto root_rank = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kAttrRootRank);
|
||||
if (root_rank) {
|
||||
root_ = static_cast<int>(GetValue<int64_t>(root_rank));
|
||||
|
@ -73,24 +77,22 @@ class SyncBatchNormGradGpuKernel : public NcclGpuKernel {
|
|||
group_name_ = GetAttr<std::string>(kernel_node, kAttrGroup);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 5) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but SyncBatchNormGrad needs 5 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 5, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 3) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but SyncBatchNormGrad needs 5 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 3, but got " << output_num;
|
||||
}
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
if (CHECK_NULL_INPUT(input_shape)) {
|
||||
MS_LOG(WARNING) << "SyncBatchNormGrad input is null";
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
auto input_shape_dims = input_shape.size();
|
||||
if (input_shape_dims != 4 && input_shape_dims != 2) {
|
||||
MS_LOG(EXCEPTION) << "Tensor shape is " << input_shape.size()
|
||||
<< ", SyncBatchNormGpuGrad input should be 2D or 4D";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input only should be 2 or 4, but got "
|
||||
<< input_shape_dims;
|
||||
}
|
||||
input_size_ = 1;
|
||||
for (auto dim : input_shape) {
|
||||
|
@ -155,6 +157,7 @@ class SyncBatchNormGradGpuKernel : public NcclGpuKernel {
|
|||
input_size_list_.clear();
|
||||
output_size_list_.clear();
|
||||
workspace_size_list_.clear();
|
||||
is_null_input_ = false;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
@ -201,6 +204,7 @@ class SyncBatchNormGradGpuKernel : public NcclGpuKernel {
|
|||
string group_name_;
|
||||
int root_;
|
||||
cudaStream_t comm_stream_;
|
||||
bool is_null_input_;
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_OTHER_ASSIGN_GPU_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
|
||||
|
@ -52,15 +53,13 @@ class AssignGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
if (!CheckParam(kernel_node)) {
|
||||
return false;
|
||||
}
|
||||
(void)CheckParam(kernel_node);
|
||||
auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'AssignGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -80,19 +79,16 @@ class AssignGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
private:
|
||||
bool CheckParam(const CNodePtr &kernel_node) {
|
||||
void CheckParam(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 2) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but AssignGpuKernel needs 2 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but AssignGpuKernel needs 1 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<size_t> input_size_list_;
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#define MINDSPORE_CCSRC_KERNEL_GPU_OTHER_BOUNDINGBOX_DECODE_GPU_KERNEL_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/boundingbox_decode_impl.cuh"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
|
@ -46,7 +47,8 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
|
|||
T *bboxes_addr = GetDeviceAddress<T>(outputs, 0);
|
||||
|
||||
if (inputs[0]->size != inputs[1]->size) {
|
||||
MS_LOG(ERROR) << "Rois box size must equal with deltas box size -" << inputs[1]->size << ", but got"
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_
|
||||
<< "', rois box size must equal with deltas box size: " << inputs[1]->size << ", but got "
|
||||
<< inputs[0]->size;
|
||||
return false;
|
||||
}
|
||||
|
@ -54,7 +56,7 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
|
|||
const size_t coordinate = 4;
|
||||
const size_t block_size = inputs[0]->size / sizeof(T);
|
||||
if ((block_size % coordinate) != 0) {
|
||||
MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << ", the size of the box should be a multiple of 4.";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -65,11 +67,11 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 2) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but BoundingBoxDecode needs 2 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
|
||||
}
|
||||
rois_size_ = sizeof(T);
|
||||
deltas_size_ = sizeof(T);
|
||||
|
@ -78,9 +80,10 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
|
|||
auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto labels_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(logits_shape) || CHECK_NULL_INPUT(labels_shape) || CHECK_NULL_INPUT(output_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(logits_shape, kernel_name_, "anchor_box") ||
|
||||
CHECK_SHAPE_NULL(labels_shape, kernel_name_, "deltas") ||
|
||||
CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'BoundingBoxDecodeGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -111,7 +114,7 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
|
|||
means_.emplace_back(mean);
|
||||
}
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Attribute means type is invalid.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute means type is invalid.";
|
||||
}
|
||||
|
||||
auto stds = prim->GetAttr("stds");
|
||||
|
@ -124,7 +127,7 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
|
|||
stds_.emplace_back(std);
|
||||
}
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Attribute stds type is invalid.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute stds type is invalid.";
|
||||
}
|
||||
|
||||
std::vector<int64_t> max_shape_me = GetAttr<std::vector<int64_t>>(kernel_node, "max_shape");
|
||||
|
@ -133,11 +136,13 @@ class BoundingBoxDecodeGpuKernel : public GpuKernel {
|
|||
wh_ratio_clip_ = GetAttr<float>(kernel_node, "wh_ratio_clip");
|
||||
|
||||
if (means_.size() < coordinate_size || stds_.size() < coordinate_size) {
|
||||
MS_LOG(EXCEPTION) << "The size of means or stds is less than 4.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the both size of means or stds cannot be less than 4, but got"
|
||||
<< " the size of means: " << means_.size() << ", the size of stds: " << stds_.size();
|
||||
}
|
||||
|
||||
if (max_shape_.size() < 2) {
|
||||
MS_LOG(EXCEPTION) << "The size of max_shape is less than 2.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the size of max_shape cannot be less than 2, but got "
|
||||
<< max_shape_.size();
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#define MINDSPORE_CCSRC_KERNEL_GPU_OTHER_BOUNDINGBOX_ENCODE_GPU_KERNEL_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/boundingbox_encode_impl.cuh"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
|
@ -28,7 +29,6 @@ template <typename T>
|
|||
class BoundingBoxEncodeGpuKernel : public GpuKernel {
|
||||
public:
|
||||
BoundingBoxEncodeGpuKernel() : anchor_size_(0), groundtruth_size_(0), deltas_size_(0), is_null_input_(false) {}
|
||||
|
||||
~BoundingBoxEncodeGpuKernel() override = default;
|
||||
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
|
@ -45,7 +45,8 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
|
|||
T *deltas_addr = GetDeviceAddress<T>(outputs, 0);
|
||||
|
||||
if (inputs[0]->size != inputs[1]->size) {
|
||||
MS_LOG(ERROR) << "Anchor box size must equal with groundtruth box size -" << inputs[1]->size << ", but got"
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_
|
||||
<< "', anchor box size must equal with groundtruth box size: " << inputs[1]->size << ", but got "
|
||||
<< inputs[0]->size;
|
||||
return false;
|
||||
}
|
||||
|
@ -53,7 +54,7 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
|
|||
const size_t coordinate = 4;
|
||||
const size_t block_size = inputs[0]->size / sizeof(T);
|
||||
if ((block_size % coordinate) != 0) {
|
||||
MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << ", the size of the box should be a multiple of 4.";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -64,11 +65,11 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 2) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but BoundingBoxEncode needs 2 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
|
||||
}
|
||||
anchor_size_ = sizeof(T);
|
||||
groundtruth_size_ = sizeof(T);
|
||||
|
@ -77,9 +78,10 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
|
|||
auto logits_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto labels_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(logits_shape) || CHECK_NULL_INPUT(labels_shape) || CHECK_NULL_INPUT(output_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(logits_shape, kernel_name_, "anchor_box") ||
|
||||
CHECK_SHAPE_NULL(labels_shape, kernel_name_, "groundtruth_box") ||
|
||||
CHECK_SHAPE_NULL(output_shape, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'BoundingBoxEncodeGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -110,7 +112,7 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
|
|||
means_.emplace_back(mean);
|
||||
}
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Attribute means type is invalid.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute means type is invalid.";
|
||||
}
|
||||
auto stds = prim->GetAttr("stds");
|
||||
MS_EXCEPTION_IF_NULL(stds);
|
||||
|
@ -122,11 +124,12 @@ class BoundingBoxEncodeGpuKernel : public GpuKernel {
|
|||
stds_.emplace_back(std);
|
||||
}
|
||||
} else {
|
||||
MS_LOG(EXCEPTION) << "Attribute stds type is invalid.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute stds type is invalid.";
|
||||
}
|
||||
|
||||
if (means_.size() < coordinate_size || stds_.size() < coordinate_size) {
|
||||
MS_LOG(EXCEPTION) << "The size of means or stds is less than 4.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the both size of means or stds cannot be less than 4, but got"
|
||||
<< " the size of means: " << means_.size() << ", the size of stds: " << stds_.size();
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#define MINDSPORE_CCSRC_KERNEL_GPU_OTHER_CHECK_VALID_GPU_KERNEL_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "backend/kernel_compiler/gpu/cuda_impl/check_valid_impl.cuh"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
|
||||
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
|
||||
|
@ -28,7 +29,6 @@ template <typename T, typename S>
|
|||
class CheckValidGpuKernel : public GpuKernel {
|
||||
public:
|
||||
CheckValidGpuKernel() : anchor_boxes_size_(0), img_metas_size_(0), valid_size_(0), is_null_input_(false) {}
|
||||
|
||||
~CheckValidGpuKernel() override = default;
|
||||
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
|
@ -48,7 +48,7 @@ class CheckValidGpuKernel : public GpuKernel {
|
|||
const size_t coordinate = 4;
|
||||
const size_t block_size = inputs[0]->size / sizeof(T);
|
||||
if ((block_size % coordinate) != 0) {
|
||||
MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << ", the size of the box should be a multiple of 4.";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -58,11 +58,11 @@ class CheckValidGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 2) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but CheckValid needs 2 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
|
||||
}
|
||||
anchor_boxes_size_ = sizeof(T);
|
||||
img_metas_size_ = sizeof(T);
|
||||
|
@ -71,10 +71,10 @@ class CheckValidGpuKernel : public GpuKernel {
|
|||
auto anchor_boxes_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto img_metas_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
auto valid_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ =
|
||||
CHECK_NULL_INPUT(anchor_boxes_shape) || CHECK_NULL_INPUT(img_metas_shape) || CHECK_NULL_INPUT(valid_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(anchor_boxes_shape, kernel_name_, "bboxes") ||
|
||||
CHECK_SHAPE_NULL(img_metas_shape, kernel_name_, "img_metas") ||
|
||||
CHECK_SHAPE_NULL(valid_shape, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'CheckValidGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -62,18 +62,17 @@ class GpuConvertToDynamicShapeGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_count = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_count != 1) {
|
||||
MS_LOG(ERROR) << input_count << "inputs were provided, but GpuConvertToDynamicShapeGpuKernel expects 1.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 1, but got " << input_count;
|
||||
}
|
||||
|
||||
input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape_);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape_, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'GpuConvertToDynamicShapeGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -29,7 +29,6 @@ template <typename T>
|
|||
class IOUGpuKernel : public GpuKernel {
|
||||
public:
|
||||
IOUGpuKernel() : gt_boxes_size_(0), anchor_boxes_size_(0), iou_size_(0), mode_(0), is_null_input_(false) {}
|
||||
|
||||
~IOUGpuKernel() override = default;
|
||||
|
||||
const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
|
||||
|
@ -50,7 +49,7 @@ class IOUGpuKernel : public GpuKernel {
|
|||
const size_t block_size_0 = inputs[0]->size / sizeof(T);
|
||||
const size_t block_size_1 = inputs[1]->size / sizeof(T);
|
||||
if ((block_size_0 % coordinate) != 0 || (block_size_1 % coordinate) != 0) {
|
||||
MS_LOG(ERROR) << "The size of the box must be a multiple of 4.";
|
||||
MS_LOG(ERROR) << "For '" << kernel_name_ << ", the size of the box should be a multiple of 4.";
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -65,8 +64,7 @@ class IOUGpuKernel : public GpuKernel {
|
|||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 2) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but IOU needs 2 inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 2, but got " << input_num;
|
||||
}
|
||||
gt_boxes_size_ = sizeof(T);
|
||||
anchor_boxes_size_ = sizeof(T);
|
||||
|
@ -75,10 +73,10 @@ class IOUGpuKernel : public GpuKernel {
|
|||
auto gt_boxes_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
auto anchor_boxes_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
|
||||
auto iou_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ =
|
||||
CHECK_NULL_INPUT(gt_boxes_shape) || CHECK_NULL_INPUT(anchor_boxes_shape) || CHECK_NULL_INPUT(iou_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(gt_boxes_shape, kernel_name_, "anchor_boxes") ||
|
||||
CHECK_SHAPE_NULL(anchor_boxes_shape, kernel_name_, "gt_boxes") ||
|
||||
CHECK_SHAPE_NULL(iou_shape, kernel_name_, "output");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'IOUGpuKernel', input or output is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
@ -103,8 +101,7 @@ class IOUGpuKernel : public GpuKernel {
|
|||
} else if (mode == "iof") {
|
||||
mode_ = 1;
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Mode only support 'iou' or 'iof'.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', mode only support 'iou' or 'iof'.";
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
|
@ -66,26 +66,25 @@ class BatchNormFold2GpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
InitResource();
|
||||
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != INPUT_NUM) {
|
||||
MS_LOG(ERROR) << "Argument number is " << input_num << ", but BatchNormFold2GpuKernel needs " << INPUT_NUM
|
||||
<< " inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be " << INPUT_NUM << ", but got "
|
||||
<< input_num;
|
||||
}
|
||||
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "BatchNormFold2GpuKernel input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (input_shape.size() != 4) {
|
||||
MS_LOG(ERROR) << "BatchNormFold2GpuKernel input shape needs (N,C,H,W).";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
batch_size_ = input_shape[0];
|
||||
channel_ = input_shape[1];
|
||||
|
|
|
@ -94,26 +94,25 @@ class BatchNormFold2GradGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
InitResource();
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != INPUT_NUM) {
|
||||
MS_LOG(ERROR) << "Argument number is " << input_num << ", but BatchNormFold2GradGpuKernel needs " << INPUT_NUM
|
||||
<< " inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be " << INPUT_NUM << ", but got "
|
||||
<< input_num;
|
||||
}
|
||||
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "BatchNormFold2GradGpuKernel input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (input_shape.size() != 4) {
|
||||
MS_LOG(ERROR) << "BatchNormFold2GradGpuKernel input shape needs (N,C,H,W).";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
batch_size_ = input_shape[0];
|
||||
channel_ = input_shape[1];
|
||||
|
|
|
@ -99,18 +99,17 @@ class BatchNormFoldGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
InitResource();
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 4) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << " but BatchNormFold GpuKernel OP needs 4 input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 4) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but BatchNormFold GpuKernel OP needs 4 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 4, but got " << output_num;
|
||||
}
|
||||
|
||||
auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
|
||||
|
@ -122,16 +121,14 @@ class BatchNormFoldGpuKernel : public GpuKernel {
|
|||
freeze_bn_ = static_cast<int>(GetValue<int64_t>(prim->GetAttr("freeze_bn")));
|
||||
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'BatchNormFoldGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (input_shape.size() != 4) {
|
||||
MS_LOG(ERROR) << "Input shape is " << input_shape.size()
|
||||
<< ", but BatchNormFold GpuKernel OP needs 4DTensor input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
CheckTensorSize({input_shape});
|
||||
batch_ = input_shape[0];
|
||||
|
|
|
@ -78,18 +78,17 @@ class BatchNormFoldGradGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != INPUT_NUM) {
|
||||
MS_LOG(ERROR) << "Input number is " << input_num << ", but BatchNormFoldGrad GpuKernel OP needs " << INPUT_NUM
|
||||
<< " inputs.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be " << INPUT_NUM << ", but got "
|
||||
<< input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(ERROR) << "Output number is " << output_num << ", but BatchNormFoldGrad GpuKernel OP needs 4 output.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
|
||||
auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
|
||||
|
@ -99,16 +98,14 @@ class BatchNormFoldGradGpuKernel : public GpuKernel {
|
|||
freeze_bn_ = static_cast<int>(GetValue<int64_t>(prim->GetAttr("freeze_bn")));
|
||||
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'BatchNormFoldGradGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (input_shape.size() != 4) {
|
||||
MS_LOG(ERROR) << "Input shape is " << input_shape.size()
|
||||
<< ", but BatchNormFoldGrad GpuKernel OP needs 4DTensor input.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
batch_ = input_shape[0];
|
||||
channel_ = input_shape[1];
|
||||
|
|
|
@ -49,24 +49,23 @@ class CorrectionMulGpuKernel : public GpuKernel {
|
|||
return true;
|
||||
}
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
InitResource();
|
||||
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 3) {
|
||||
MS_LOG(ERROR) << "Argument number is " << input_num << ", but CorrectionMulGpuKernel needs 3.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'CorrectionMulGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (input_shape.size() != 4) {
|
||||
MS_LOG(ERROR) << "CorrectionMulGpuKernel input shape needs (N,C,H,W).";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
batch_size_ = input_shape[0];
|
||||
channel_ = input_shape[1];
|
||||
|
|
|
@ -55,24 +55,23 @@ class CorrectionMulGradGpuKernel : public GpuKernel {
|
|||
}
|
||||
|
||||
bool Init(const CNodePtr &kernel_node) override {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
InitResource();
|
||||
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 4) {
|
||||
MS_LOG(ERROR) << "Argument number is " << input_num << ", but CorrectionMulGradGpuKernel needs 4.";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 4, but got " << input_num;
|
||||
}
|
||||
|
||||
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
|
||||
is_null_input_ = CHECK_NULL_INPUT(input_shape);
|
||||
is_null_input_ = CHECK_SHAPE_NULL(input_shape, kernel_name, "input");
|
||||
if (is_null_input_) {
|
||||
MS_LOG(WARNING) << "For 'CorrectionMulGradGpuKernel', input is null";
|
||||
InitSizeLists();
|
||||
return true;
|
||||
}
|
||||
if (input_shape.size() != 4) {
|
||||
MS_LOG(ERROR) << "CorrectionMulGradGpuKernel input shape needs (N,C,H,W).";
|
||||
return false;
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the dimension of input should be 4, but got "
|
||||
<< input_shape.size();
|
||||
}
|
||||
batch_size_ = input_shape[0];
|
||||
channel_ = input_shape[1];
|
||||
|
|
|
@ -45,17 +45,16 @@ const std::vector<size_t> &FakeLearnedScaleQuantPerChannelGpuKernel::GetWorkspac
|
|||
}
|
||||
|
||||
bool FakeLearnedScaleQuantPerChannelGpuKernel::Init(const CNodePtr &kernel_node) {
|
||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||
kernel_node_ = kernel_node;
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 3) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num
|
||||
<< ", but FakeLearnedScaleQuantPerChannel GpuKernel OP needs 3 Input.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of inputs should be 3, but got " << input_num;
|
||||
}
|
||||
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num
|
||||
<< ", but FakeLearnedScaleQuantPerChannel GpuKernel OP needs 1 output.";
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name << "', the number of outputs should be 1, but got " << output_num;
|
||||
}
|
||||
|
||||
quant_delay_ = static_cast<int>(GetValue<int64_t>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("quant_delay")));
|
||||
|
|
Loading…
Reference in New Issue