!7384 OpenCL convolution kernel segfaults at run time when has_bias=false
Merge pull request !7384 from 王东旭/fix_conv_no_bias_bug
commit fa37f8fde3
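Summary of the fix: a convolution node carries either {input, weight} or {input, weight, bias} as input tensors. The old code unconditionally initialized the bias buffer and bound it as a kernel argument, dereferencing a missing third tensor when no bias exists. The MR derives has_bias_ from the tensor count once in Init() and gates every bias-related step on it. A minimal sketch of that pattern (ConvKernelSketch is a hypothetical stand-in, not the MindSpore class; has_bias_, in_tensors_, and InitBias come from the diff below):

#include <vector>

struct ConvKernelSketch {
  std::vector<void *> in_tensors_;
  bool has_bias_ = false;
  void Init() { has_bias_ = in_tensors_.size() == 3; }
  void InitBuffer() {
    if (has_bias_) {
      // InitBias(); // reads the third tensor; previously ran unconditionally -> segfault
    }
  }
};

int main() {
  ConvKernelSketch k;
  k.in_tensors_ = {nullptr, nullptr};  // no bias tensor
  k.Init();
  k.InitBuffer();                      // bias path safely skipped
  return 0;
}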
@@ -76,7 +76,9 @@ int ConcatOpenCLKernel::Init() {
   auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
   MS_LOG(DEBUG) << " concat at axis=: " << param->axis_;
-  param->axis_ = (param->axis_ == -1) ? (in_tensors_[0]->shape().size() - 1) : param->axis_;
+  if (param->axis_ < 0) {
+    param->axis_ += in_tensors_.front()->shape().size();
+  }
   if (param->axis_ < 0 || param->axis_ > 3) {
     MS_LOG(ERROR) << " only support axis >= 0 and axis <= 3 ";
     return RET_ERROR;
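The concat hunk above generalizes the axis handling from the -1 special case to any negative axis. A minimal illustration (rank value is an assumption for the example):

#include <cstdio>

int main() {
  const int rank = 4;  // in_tensors_.front()->shape().size() in the kernel
  int axis = -1;       // ConcatParameter::axis_ as delivered by the converter
  if (axis < 0) {
    axis += rank;      // -1 -> 3, matching the new normalization
  }
  printf("normalized axis = %d\n", axis);
  return 0;
}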
@@ -65,6 +65,7 @@ int ConvolutionOpenCLKernel::Init() {
   CO_SLICES_ = UP_DIV(CO_, C4NUM);
   KH_ = param->kernel_h_;
   KW_ = param->kernel_w_;
+  has_bias_ = in_tensors_.size() == 3;
 
   // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
   TILES_X_ = UP_DIV(OW_, 4);
@@ -243,7 +244,9 @@ int ConvolutionOpenCLKernel::InitBias() {
 
 int ConvolutionOpenCLKernel::InitBuffer() {
   InitWeight();
-  InitBias();
+  if (has_bias_) {
+    InitBias();
+  }
   return RET_OK;
 }
 
@@ -298,7 +301,9 @@ int ConvolutionOpenCLKernel::Run() {
     cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_};
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-    ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    if (has_bias_) {
+      ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    }
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
     ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
   } else {
@@ -306,7 +311,9 @@ int ConvolutionOpenCLKernel::Run() {
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
     ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
-    ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    if (has_bias_) {
+      ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    }
     if (op_format_ == Format_NC4HW4) {
       cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
       cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
@@ -372,10 +379,14 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
 
   code +=
     "__kernel void Convolution(__read_only image2d_t input,\n"
-    " __write_only image2d_t output,\n"
-    " __global FLT4 *weight,\n"
-    " __global FLT4 *bias)"
-    "{\n";
+    " __write_only image2d_t output,\n";
+  if (has_bias_) {
+    code +=
+      " __global FLT4 *weight,\n"
+      " __global FLT4 *bias) {\n";
+  } else {
+    code += " __global FLT4 *weight) {\n";
+  }
 
   code += " int n_oh = get_global_id(0); // [0, N*OH)\n";
   if (batch_size_ == 1) {
@@ -426,17 +437,20 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
     " }\n"
     " }\n"
     " }\n\n";
-  code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n";
+  if (has_bias_) {
+    code += " out0_c4 = out0_c4 + bias[co_slice];\n";
+  }
+
   if (param->act_type_ == ActType_Relu) {
-    code += " out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n";
+    code += " out0_c4 = max(out0_c4, (FLT4)(0.0f));\n";
   } else if (param->act_type_ == ActType_Relu6) {
-    code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
+    code += " out0_c4 = clamp(out0_c4, (FLT4)(0.0f), (FLT4)(6.0f));\n";
   }
   if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
-    code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4_bias);// NHWC4: NH WC\n}";
+    code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out0_c4);// NHWC4: NH WC\n}";
   } else {
-    code += " WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4_bias);\n}";
+    code += " WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out0_c4);\n}";
   }
   return code;
 }
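The codegen and Run() changes must stay in lockstep: the host-side SetKernelArg sequence has to match the generated kernel signature argument for argument, which is why both sides are gated on the same has_bias_ flag. A standalone sketch of the signature branch introduced above (GenSignature is a hypothetical free function mirroring the member logic):

#include <iostream>
#include <string>

std::string GenSignature(bool has_bias) {
  std::string code =
    "__kernel void Convolution(__read_only image2d_t input,\n"
    " __write_only image2d_t output,\n";
  if (has_bias) {
    code +=
      " __global FLT4 *weight,\n"
      " __global FLT4 *bias) {\n";
  } else {
    code += " __global FLT4 *weight) {\n";  // bias parameter omitted entirely
  }
  return code;
}

int main() {
  std::cout << GenSignature(true) << "\n" << GenSignature(false);
  return 0;
}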
@@ -460,8 +474,11 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
     "\n"
     "__kernel void Convolution(__read_only image2d_t input,\n"
     " __write_only image2d_t output,\n"
-    " __global FLT4 *weight,\n"
-    " __global FLT4 *bias,\n"
+    " __global FLT4 *weight,\n";
+  if (has_bias_) {
+    code += " __global FLT4 *bias,\n";
+  }
+  code +=
     " const int4 input_shape,\n"
     " const int4 output_shape)\n"
     "{\n";
@@ -578,7 +595,10 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
     " }\n"
     " }\n\n";
 
-  code += " out0 = out0 + bias[co_slice];\n";
+  if (has_bias_) {
+    code += " out0 = out0 + bias[co_slice];\n";
+  }
+
   if (param->act_type_ == ActType_Relu) {
     code += " out0 = max(out0, (FLT4)(0.0f));\n";
   } else if (param->act_type_ == ActType_Relu6) {
@@ -591,7 +611,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
     " if (last_is_double)"
     " {\n";
   }
-  code += " out1 = out1 + bias[co_slice];\n";
+  if (has_bias_) {
+    code += " out1 = out1 + bias[co_slice];\n";
+  }
   if (param->act_type_ == ActType_Relu) {
     code += " out1 = max(out1, (FLT4)(0.0f));\n";
   } else if (param->act_type_ == ActType_Relu6) {
@@ -788,8 +810,11 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
     "};\n"
     "\n"
     "__kernel void Winograd36To4x4(__read_only image2d_t input,\n"
-    " __write_only image2d_t output,\n"
-    " __global FLT4 *bias,\n"
+    " __write_only image2d_t output,\n";
+  if (has_bias_) {
+    code += " __global FLT4 *bias,\n";
+  }
+  code +=
     " int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n"
     " int4 output_shape) // N H W CO_SLICES\n"
     "{\n"
@@ -824,9 +849,10 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
     " for (int y = 0; y < 6; y++)\n"
     " {\n"
     " acc += AtM_row[y] * At[x * 6 + y];\n"
-    " }\n"
-    " acc += bias[slice];\n"
-    "\n";
+    " }\n";
+  if (has_bias_) {
+    code += " acc += bias[slice];\n";
+  }
 
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
   if (param->act_type_ == ActType_Relu) {
@@ -35,44 +35,15 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
 
   int Init() override;
   int Run() override;
-  int InitBuffer();
   int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
 
  private:
-  bool use_fp16_ = false;
-
-  int batch_size_{};
-  int CI_{};
-  int IH_{};
-  int IW_{};
-  int CO_{};
-  int OH_{};
-  int OW_{};
-  int CI_SLICES_{};
-  int CO_SLICES_{};
-  int KH_{};
-  int KW_{};
-  void *packed_weight_ = nullptr;
-  void *packed_bias_ = nullptr;
-
-  bool use_winograd_ = false;
-  int TILES_X_{};
-  int TILES_Y_{};
-  int TILES_XY_{};
-  void *winograd_mem0_ = nullptr;
-  void *winograd_mem1_ = nullptr;
-
-  cl::Kernel kernel_4x4to36_;
-  cl::Kernel kernel_conv_;
-  cl::Kernel kernel_36to4x4_;
-
+  int InitBuffer();
   int InitWeight();
   int InitBias();
   int GenerateWinogradWeight();
 
   std::string CodeGenConvolutionNHWC4();
   std::string CodeGenConvolutionNC4HW4();
 
   std::string CodeGenWinograd4x4To36();
   std::string CodeGenWinogradConvolution();
   std::string CodeGenWinograd36To4x4();
@@ -110,6 +81,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
       param->pad_r_,
       param->dilation_h_,
       param->dilation_w_,
+      has_bias_,
       use_fp16_,
       op_format_,
       param->act_type_};
@@ -119,6 +91,34 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
     }
     return code_id;
   }
 
+  bool use_fp16_ = false;
+
+  int batch_size_{};
+  int CI_{};
+  int IH_{};
+  int IW_{};
+  int CO_{};
+  int OH_{};
+  int OW_{};
+  int CI_SLICES_{};
+  int CO_SLICES_{};
+  int KH_{};
+  int KW_{};
+  void *packed_weight_ = nullptr;
+  void *packed_bias_ = nullptr;
+  bool has_bias_ = false;
+
+  bool use_winograd_ = false;
+  int TILES_X_{};
+  int TILES_Y_{};
+  int TILES_XY_{};
+  void *winograd_mem0_ = nullptr;
+  void *winograd_mem1_ = nullptr;
+
+  cl::Kernel kernel_4x4to36_;
+  cl::Kernel kernel_conv_;
+  cl::Kernel kernel_36to4x4_;
+
 };
 } // namespace mindspore::kernel
@@ -34,134 +34,71 @@ namespace mindspore::kernel {
 int ToFormatOpenCLKernel::Init() {
   auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
   out_mem_type_ = parameter->out_mem_type;
-  std::string program_name = "to_format";
-  std::map<schema::Format, std::string> format_str{
-    {schema::Format::Format_NCHW, "NCHW"}, {schema::Format::Format_NHWC, "NHWC"},
-    {schema::Format::Format_NC4HW4, "NC4HW4"}, {schema::Format::Format_NC4, "NHWC4"},
-    {schema::Format::Format_NC, "NHWC"}, {schema::Format::Format_NHWC4, "NHWC4"}};
-  std::string kernel_name =
-    "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()];
-  std::map<TypeId, std::string> dtype_str{
-    {kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}, {kNumberTypeInt8, "Int8"}};
-  if (out_mem_type_ == OpenCLMemType::IMG) {
-    kernel_name += "_IMG_" + dtype_str[in_tensors_[0]->data_type()];
+  std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}};
+  std::string kernel_name;
+  if (parameter->out_mem_type == OpenCLMemType::IMG) {
+    kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_[0]->data_type()];
   } else {
-    kernel_name += "_BUF_" + dtype_str[out_tensors_[0]->data_type()];
+    kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_[0]->data_type()];
   }
 
   this->set_name(kernel_name);
 
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
+  std::string program_name = "to_format";
   std::set<std::string> build_options;
   std::string source = to_format_source;
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
-  InitNHWCShape();
-
+  InitNHWC();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
 
-int ToFormatOpenCLKernel::InitNHWCShape() {
-  std::vector<int> shapex = out_tensors_[0]->shape();
-  size_t n, h, w, c;
-  if (shapex.size() == 2) {
-    n = shapex[0];
-    h = 1;
-    w = 1;
-    c = shapex[1];
-    nhwc_shape_ = {n, h, w, c};
-    return RET_OK;
-  }
-  if (shapex.size() == 3) {
-    n = 1;
-    h = 1;
-    w = 1;
-    c = 1;
-    nhwc_shape_ = {n, h, w, c};
-    return RET_OK;
-  }
-  if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4HW4 ||
-      out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4 ||
-      out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC) {
-    n = shapex[0];
-    h = shapex[1];
-    w = shapex[2];
-    c = shapex[3];
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NCHW) {
-    n = shapex[0];
-    h = shapex[2];
-    w = shapex[3];
-    c = shapex[1];
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4 ||
-             out_tensors_[0]->GetFormat() == schema::Format::Format_NC) {
-    n = shapex[0];
-    h = 1;
-    w = 1;
-    c = shapex[1];
-  } else {
-    n = shapex[0];
-    h = shapex[1];
-    w = shapex[2];
-    c = shapex[3];
-  }
-  nhwc_shape_ = {n, h, w, c};
-  return RET_OK;
-}
-
-int ToFormatOpenCLKernel::ReSize() { return RET_OK; }
-
-int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
-  std::vector<size_t> vec = {nhwc_shape_[0] * nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)};
-  *global_size = std::move(vec);
-  return RET_OK;
-}
-int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
-                                       std::vector<size_t> *local_size) {
-  return RET_OK;
-}
+int ToFormatOpenCLKernel::InitNHWC() {
+  std::vector<int> out_shape = out_tensors_[0]->shape();
+  if (out_shape.size() == 1) {
+    N_ = out_shape[0];
+    H_ = 1;
+    W_ = 1;
+    C_ = 1;
+  } else if (out_shape.size() == 2) {
+    N_ = out_shape[0];
+    H_ = 1;
+    W_ = 1;
+    C_ = out_shape[1];
+  } else if (out_shape.size() == 3) {
+    N_ = out_shape[0];
+    H_ = 1;
+    W_ = out_shape[1];
+    C_ = out_shape[2];
+  } else if (out_shape.size() == 4) {
+    N_ = out_shape[0];
+    H_ = out_shape[1];
+    W_ = out_shape[2];
+    C_ = out_shape[3];
+  }
+  return RET_OK;
+}
 
 int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
-  size_t im_dst_x, im_dst_y;
-  if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4HW4) {
-    int c = nhwc_shape_[3];
-    int h = nhwc_shape_[1];
-    int w = nhwc_shape_[2];
-    im_dst_y = nhwc_shape_[0] * h * UP_DIV(c, C4NUM);
-    im_dst_x = w;
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) {
-    int h = nhwc_shape_[0] * nhwc_shape_[1];
-    int w = nhwc_shape_[2];
-    int c = nhwc_shape_[3];
-    im_dst_x = w * UP_DIV(c, C4NUM);
-    im_dst_y = h;
-  } else if (out_tensors_[0]->GetFormat() == schema::Format::Format_NC4) {
-    int c = nhwc_shape_[3];
-    im_dst_x = UP_DIV(c, C4NUM);
-    im_dst_y = 1;
-  } else {
-    MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat();
-    return RET_ERROR;
-  }
-  img_size->clear();
-  auto enable_fp16_ = ocl_runtime_->GetFp16Enable();
-  size_t img_dtype = CL_FLOAT;
-  if (enable_fp16_) {
-    img_dtype = CL_HALF_FLOAT;
-  }
-  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
-  *img_size = vec;
+  size_t img_height = N_ * H_;
+  size_t img_width = W_ * UP_DIV(C_, C4NUM);
+  size_t img_dtype = ocl_runtime_->GetFp16Enable() ? CL_HALF_FLOAT : CL_FLOAT;
+  *img_size = {img_width, img_height, img_dtype};
   return RET_OK;
 }
 
 int ToFormatOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  std::vector<size_t> local = {};
-  std::vector<size_t> global;
-  GetGlobalSize(0, &global);
-
-  cl_int4 shape{(cl_int)nhwc_shape_[0], (cl_int)nhwc_shape_[1], (cl_int)nhwc_shape_[2], (cl_int)nhwc_shape_[3]};
+  std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)};
+  std::vector<size_t> local = {16, 8, 1};
+  cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_};
   cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
 
   auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG;
   auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF;
   ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type);
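The rewritten GetImageSize() and Run() both derive their geometry from the normalized N_/H_/W_/C_ fields: in NHWC4 layout, channels are packed four at a time into FLT4 texels, so a (N, H, W, C) tensor maps to an image of width W * ceil(C/4) and height N * H. A minimal sketch with illustrative values (UP_DIV reimplemented here to match MindSpore's ceiling-division macro):

#include <cstdio>

constexpr size_t UP_DIV(size_t x, size_t y) { return (x + y - 1) / y; }

int main() {
  const size_t N = 1, H = 224, W = 224, C = 3;  // e.g. an RGB input tensor
  const size_t img_width = W * UP_DIV(C, 4);    // 224 * 1 = 224 texels
  const size_t img_height = N * H;              // 224 rows
  printf("image size = %zu x %zu texels\n", img_width, img_height);
  return 0;
}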
@@ -31,16 +31,18 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
   ~ToFormatOpenCLKernel() override{};
 
   int Init() override;
-  int ReSize() override;
+  int ReSize() override { return RET_OK; };
   int Run() override;
   int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
-  int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override;
-  int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;
-  int InitNHWCShape();
 
  private:
+  int InitNHWC();
+
   cl::Kernel kernel_;
-  std::vector<size_t> nhwc_shape_;
+  size_t N_{1};
+  size_t H_{1};
+  size_t W_{1};
+  size_t C_{1};
 };
 } // namespace mindspore::kernel
@@ -237,33 +237,33 @@ void PrintTensor(lite::Tensor *tensor, int num, const std::string &out_file) {
   }
   auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
   auto runtime = runtime_wrapper.GetInstance();
-  runtime->SyncCommandQueue();
-
   auto allocator = runtime->GetAllocator();
   auto origin_data = tensor->data_c();
-  allocator->MapBuffer(origin_data, CL_MAP_READ | CL_MAP_WRITE, nullptr, true);
-  tensor->SetData(origin_data);
+  runtime->SyncCommandQueue();
+  allocator->MapBuffer(origin_data, CL_MAP_READ, nullptr, true);
 
-  auto Batch = tensor->Batch();
-  auto Height = tensor->shape().size() == 4 ? tensor->Height() : 1;
-  auto Width = tensor->shape().size() == 4 ? tensor->Width() : 1;
-  auto SLICES = UP_DIV(tensor->Channel(), C4NUM);
+  auto shape = tensor->shape();
+  auto N = shape.size() > 0 ? shape[0] : 1;
+  auto H = shape.size() > 1 ? shape[1] : 1;
+  auto W = shape.size() > 2 ? shape[2] : 1;
+  auto C = shape.size() > 3 ? shape[3] : 1;
+  auto SLICES = UP_DIV(C, C4NUM);
+  auto ElementsC4Num = N * H * W * UP_ROUND(C, C4NUM);
   auto alignment = runtime->GetImagePitchAlignment();
-  auto dtype_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half4) : sizeof(cl_float4);
-  auto row_pitch = (Width * SLICES + alignment - 1) / alignment * alignment * dtype_size;
-  auto row_size = Width * SLICES * dtype_size;
-  std::vector<char> data(tensor->Size());
-  for (int i = 0; i < Batch * Height; ++i) {
+  auto FLT4_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half4) : sizeof(cl_float4);
+  auto row_pitch = (W * SLICES + alignment - 1) / alignment * alignment * FLT4_size;
+  auto row_size = W * SLICES * FLT4_size;
+  std::vector<char> data(N * H * row_size);
+  for (int i = 0; i < N * H; ++i) {
     memcpy(static_cast<char *>(data.data()) + i * row_size, static_cast<char *>(origin_data) + i * row_pitch, row_size);
   }
 
   std::cout << "shape=(";
-  for (auto x : tensor->shape()) {
+  for (auto x : shape) {
     printf("%3d,", x);
   }
   printf("): ");
 
-  for (size_t i = 0; i < num && i < tensor->ElementsNum(); ++i) {
+  for (size_t i = 0; i < num && i < ElementsC4Num; ++i) {
     if (tensor->data_type() == kNumberTypeFloat16)
       printf("%zu %6.3f | ", i, (reinterpret_cast<float16_t *>(data.data()))[i]);
     else
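The row-by-row memcpy above exists because OpenCL image rows are allocated with a padded pitch: each of the N*H rows occupies row_pitch bytes in the mapped buffer, but only row_size bytes are payload. A small sketch with illustrative values (alignment and shape chosen for the example, not taken from any device):

#include <cstdio>

int main() {
  const size_t alignment = 64;   // runtime->GetImagePitchAlignment() on some device
  const size_t W = 224, SLICES = 1;  // W * UP_DIV(C, 4) texels per row
  const size_t FLT4_size = 16;       // sizeof(cl_float4)
  const size_t row_pitch = (W * SLICES + alignment - 1) / alignment * alignment * FLT4_size;
  const size_t row_size = W * SLICES * FLT4_size;
  printf("row_pitch = %zu bytes, payload row_size = %zu bytes\n", row_pitch, row_size);
  return 0;  // prints 4096 vs 3584: the 512-byte difference is pitch padding
}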
@@ -272,7 +272,7 @@ void PrintTensor(lite::Tensor *tensor, int num, const std::string &out_file) {
   printf("\n");
 
   if (!out_file.empty()) {
-    Write2File(data.data(), out_file, tensor->Size());
+    Write2File(data.data(), out_file, data.size());
   }
   allocator->UnmapBuffer(origin_data);
 }