diff --git a/cmake/external_libs/opencl.cmake b/cmake/external_libs/opencl.cmake
index 4c35d7310b7..999dc16e58f 100644
--- a/cmake/external_libs/opencl.cmake
+++ b/cmake/external_libs/opencl.cmake
@@ -22,11 +22,11 @@ function(gene_opencl BASEPATH)
     if(NOT EXISTS ${CL_SRC_DIR})
         return()
     endif()
-    file(GLOB_RECURSE CL_LIST ${CL_SRC_DIR}/*.cl)
+    file(GLOB_RECURSE CL_LIST ${CL_SRC_DIR}/*.cl ${CL_SRC_DIR}/int8/*.cl)
    foreach(file_path ${CL_LIST})
        file(REMOVE ${file_path}.inc)
        string(REGEX REPLACE ".+/(.+)\\..*" "\\1" kernel_name "${file_path}")
-        set(inc_file_ex "${kernel_name}.cl.inc")
+        set(inc_file_ex "${file_path}.inc")
        execute_process(
            COMMAND bash -c "sed 's/\\\\/\\\\\\\\/g' "
            COMMAND bash -c "sed 's/\\\"/\\\\\\\"/g' "
diff --git a/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc
index 52fa90196b6..318b00affd3 100644
--- a/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc
+++ b/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc
@@ -143,7 +143,7 @@ int OpenCLAllocator::GetImgDtypeSize(const ImageSize &img_size) {
     dtype_size = sizeof(cl_float);
   } else if (img_size.dtype == CL_HALF_FLOAT) {
     dtype_size = sizeof(cl_half);
-  } else if (img_size.dtype == CL_UNSIGNED_INT8) {
+  } else if (img_size.dtype == CL_SIGNED_INT8) {
     dtype_size = sizeof(cl_uchar);
   } else {
     MS_LOG(ERROR) << "Unsupported dtype " << img_size.dtype;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt b/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt
index 15308a928b5..03095f0a090 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt
+++ b/mindspore/lite/src/runtime/kernel/opencl/CMakeLists.txt
@@ -1,7 +1,8 @@
 if(${SUPPORT_GPU} STREQUAL opencl)
     file(GLOB_RECURSE OPENCL_KERNEL_SRC
             ${CMAKE_CURRENT_SOURCE_DIR}/*.cc
-            ${CMAKE_CURRENT_SOURCE_DIR}/kernel/*.cc)
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernel/*.cc
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernel/int8/*.cc)
     add_library(opencl_kernel_mid OBJECT ${OPENCL_KERNEL_SRC})
     add_dependencies(opencl_kernel_mid fbs_src)
 endif()
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/int8/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/int8/arithmetic.cl
new file mode 100644
index 00000000000..1abe4a9004f
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/int8/arithmetic.cl
@@ -0,0 +1,19 @@
+__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+
+__kernel void ElementAddInt8(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape, float act_min, float act_max,
+                             const float4 scale, const char4 zero_point) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  char4 a = convert_char4(read_imagei(input_a, smp_none, (int2)(X, Y)));
+  char4 b = convert_char4(read_imagei(input_b, smp_none, (int2)(X, Y)));
+
+  float4 real_a = convert_float4(a - zero_point.x) * scale.x;
+  float4 real_b = convert_float4(b - zero_point.y) * scale.y;
+  int4 result = convert_int4(round((real_a + real_b) / scale.z)) + zero_point.z;
+  result = clamp(result, (int)(act_min), (int)(act_max));
+  write_imagei(output, (int2)(X, Y), result);
+}
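The ElementAddInt8 kernel above dequantizes both int8 inputs to float, adds them, then requantizes into the output's quantized domain. A minimal scalar sketch of the same arithmetic, with assumed scales and zero points (the real values come from the tensors' quant_params(), wired up in arithmetic_int8.cc below):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Scalar reference for ElementAddInt8. The scale and zero-point constants
    // here are assumptions for illustration only.
    int8_t AddInt8(int8_t a, int8_t b) {
      const float scale_a = 0.5f, scale_b = 0.25f, scale_out = 0.4f;
      const int zp_a = 3, zp_b = -2, zp_out = 0;
      float real_a = (a - zp_a) * scale_a;  // dequantize input_a
      float real_b = (b - zp_b) * scale_b;  // dequantize input_b
      int result = static_cast<int>(std::round((real_a + real_b) / scale_out)) + zp_out;
      return static_cast<int8_t>(std::clamp(result, -128, 127));  // act_min/act_max default to the int8 range
    }

    int main() {
      std::printf("%d\n", AddInt8(10, 20));  // (3.5 + 5.5) / 0.4 = 22.5 -> 23
      return 0;
    }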
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
index 39e4e61e435..dff7bc7c29a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
@@ -1,8 +1,8 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-#define BUF_to_IMG(src_dtype, dst_dtype, SRC_FLT, DST_FLT, WRITE_IMAGE_OUT) \
-  __kernel void BUF_to_IMG_##src_dtype##_##dst_dtype(__global float4 *src_data, __write_only image2d_t dst_data, \
+#define BUF_to_IMG(src_dtype, dst_dtype, SRC_TYPE, DST_TYPE, WRITE_IMAGE_OUT) \
+  __kernel void BUF_to_IMG_##src_dtype##_##dst_dtype(__global SRC_TYPE##4 *src_data, __write_only image2d_t dst_data, \
     int4 size, int4 shape) { \
     int X = get_global_id(0); \
     int Y = get_global_id(1); \
@@ -10,21 +10,21 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP
     if (X >= size.x || Y >= size.y || Z >= size.z) { \
       return; \
     } \
-    DST_FLT##4 data = (DST_FLT##4)(0.f); \
+    DST_TYPE##4 data = (DST_TYPE##4)(0.f); \
     int offset = (X * shape.z + Y) * shape.w + Z * 4; \
-    __global SRC_FLT *src_addr = (__global SRC_FLT *)src_data; \
+    __global SRC_TYPE *src_addr = (__global SRC_TYPE *)src_data; \
     src_addr += offset; \
     if ((Z + 1) * 4 <= shape.w) { \
-      data = convert_##DST_FLT##4(((__global SRC_FLT##4 *)src_addr)[0]); \
+      data = convert_##DST_TYPE##4(((__global SRC_TYPE##4 *)src_addr)[0]); \
     } else { \
       if ((shape.w - Z * 4) >= 1) { \
-        data.x = (DST_FLT)src_addr[0]; \
+        data.x = (DST_TYPE)src_addr[0]; \
       } \
       if ((shape.w - Z * 4) >= 2) { \
-        data.y = (DST_FLT)src_addr[1]; \
+        data.y = (DST_TYPE)src_addr[1]; \
       } \
       if ((shape.w - Z * 4) >= 3) { \
-        data.z = (DST_FLT)src_addr[2]; \
+        data.z = (DST_TYPE)src_addr[2]; \
       } \
     } \
     if (size.y * size.z <= MAX_IMAGE2D_WIDTH) \
@@ -33,15 +33,16 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP
       WRITE_IMAGE_OUT(dst_data, (int2)(Z, X * size.y + Y), data); \
   }
 
-// BUF_to_IMG(src_dtype, dst_dtype, SRC_FLT, DST_FLT, WRITE_IMAGE_OUT)
+// BUF_to_IMG(src_dtype, dst_dtype, SRC_TYPE, DST_TYPE, WRITE_IMAGE_OUT)
 BUF_to_IMG(float32, float32, float, float, write_imagef);
 BUF_to_IMG(float32, float16, float, half, write_imageh);
 BUF_to_IMG(float16, float16, half, half, write_imageh);
 BUF_to_IMG(int32, int32, float, float, write_imagef);
 BUF_to_IMG(uint32, uint32, float, float, write_imagef);
+BUF_to_IMG(int8, int8, char, int, write_imagei);
 
-#define IMG_to_BUF(src_dtype, dst_dtype, SRC_FLT, DST_FLT, READ_IMAGE_IN) \
-  __kernel void IMG_to_BUF_##src_dtype##_##dst_dtype(__read_only image2d_t src_data, __global DST_FLT##4 * dst_data, \
+#define IMG_to_BUF(src_dtype, dst_dtype, SRC_TYPE, DST_TYPE, READ_IMAGE_IN) \
+  __kernel void IMG_to_BUF_##src_dtype##_##dst_dtype(__read_only image2d_t src_data, __global DST_TYPE##4 * dst_data, \
     int4 size, int4 shape) { \
     int X = get_global_id(0); \
     int Y = get_global_id(1); \
@@ -49,16 +50,16 @@ BUF_to_IMG(uint32, uint32, float, float, write_imagef);
     if (X >= size.x || Y >= size.y || Z >= size.z) { \
       return; \
     } \
-    DST_FLT##4 data; \
+    DST_TYPE##4 data; \
     if (size.y * size.z <= MAX_IMAGE2D_WIDTH) \
-      data = convert_##DST_FLT##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); \
+      data = convert_##DST_TYPE##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); \
     else \
-      data = convert_##DST_FLT##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Z, X * size.y + Y))); \
+      data = convert_##DST_TYPE##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Z, X * size.y + Y))); \
     int offset = (X * shape.z + Y) * shape.w + Z * 4; \
-    __global DST_FLT *dst_addr = (__global DST_FLT *)dst_data; \
+    __global DST_TYPE *dst_addr = (__global DST_TYPE *)dst_data; \
     dst_addr += offset; \
     if ((Z + 1) * 4 <= shape.w) { \
-      ((__global DST_FLT##4 *)dst_addr)[0] = data; \
+      ((__global DST_TYPE##4 *)dst_addr)[0] = data; \
     } else { \
       if (shape.w - Z * 4 >= 1) { \
         dst_addr[0] = data.x; \
@@ -72,9 +73,10 @@ BUF_to_IMG(uint32, uint32, float, float, write_imagef);
     } \
   }
 
-// IMG_to_BUF(src_dtype, dst_dtype, SRC_FLT, DST_FLT, READ_IMAGE_IN)
+// IMG_to_BUF(src_dtype, dst_dtype, SRC_TYPE, DST_TYPE, READ_IMAGE_IN)
 IMG_to_BUF(float32, float32, float, float, read_imagef);
 IMG_to_BUF(float16, float32, half, float, read_imageh);
 IMG_to_BUF(float16, float16, half, half, read_imageh);
 IMG_to_BUF(int32, int32, float, float, read_imagef);
 IMG_to_BUF(uint32, uint32, float, float, read_imagef);
+IMG_to_BUF(int8, int8, char, char, read_imagei);
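The int8 instantiations above depend on the backing image being created with CL_SIGNED_INT8 channels, so read_imagei/write_imagei carry the data as int4 (hence the convert_char4 conversions instead of the float paths). A host-side sketch of that format using the plain OpenCL API; the helper name and arguments are placeholders, not MindSpore API:

    #include <CL/cl.h>

    // Hypothetical helper mirroring what OpenCLAllocator::Malloc sets up for an
    // ImageSize whose dtype is CL_SIGNED_INT8.
    cl_mem CreateInt8Image(cl_context ctx, size_t width, size_t height, cl_int *err) {
      cl_image_format fmt{};
      fmt.image_channel_order = CL_RGBA;             // 4 channels, matching the C4 layout
      fmt.image_channel_data_type = CL_SIGNED_INT8;  // kernels then use read_imagei/write_imagei
      cl_image_desc desc{};
      desc.image_type = CL_MEM_OBJECT_TYPE_IMAGE2D;
      desc.image_width = width;    // e.g. W * Slice for an NHWC4 tensor
      desc.image_height = height;  // e.g. N * H
      return clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, nullptr, err);
    }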
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.cc
new file mode 100644
index 00000000000..1959ff1ab31
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.cc
@@ -0,0 +1,234 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.h"
+#include <string>
+#include "nnacl/fp32/common_func_fp32.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/opencl/utils.h"
+#ifndef PROGRAM_WITH_IL
+#include "src/runtime/kernel/opencl/cl/int8/arithmetic.cl.inc"
+#endif
+
+using mindspore::kernel::KERNEL_ARCH::kGPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::lite::opencl::ImageSize;
+using mindspore::lite::opencl::MemType;
+using mindspore::schema::ActivationType_NO_ACTIVATION;
+using mindspore::schema::ActivationType_RELU;
+using mindspore::schema::ActivationType_RELU6;
+using mindspore::schema::EltwiseMode_MAXIMUM;
+using mindspore::schema::EltwiseMode_PROD;
+using mindspore::schema::EltwiseMode_SUM;
+using mindspore::schema::PrimitiveType_AddFusion;
+using mindspore::schema::PrimitiveType_BiasAdd;
+using mindspore::schema::PrimitiveType_DivFusion;
+using mindspore::schema::PrimitiveType_Eltwise;
+using mindspore::schema::PrimitiveType_MulFusion;
+using mindspore::schema::PrimitiveType_SubFusion;
+
+namespace mindspore::kernel {
+
+int ArithmeticInt8OpenCLKernel::CheckSpecs() {
+  for (auto &tensor : in_tensors_) {
+    if (tensor->data_type() != kNumberTypeInt8) {
+      MS_LOG(ERROR) << "ArithmeticInt8OpenCLKernel only supports int8 input";
+      return RET_ERROR;
+    }
+  }
+  for (auto &tensor : out_tensors_) {
+    if (tensor->data_type() != kNumberTypeInt8) {
+      MS_LOG(ERROR) << "ArithmeticInt8OpenCLKernel only supports int8 output";
+      return RET_ERROR;
+    }
+  }
+
+  if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
+    MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
+    return RET_ERROR;
+  }
+  auto *param = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
+  if (!IsArithmetic(Type())) {
+    MS_LOG(ERROR) << "UnSupported Operator: " << schema::EnumNamePrimitiveType(Type());
+    return RET_ERROR;
+  }
+  if (Type() == schema::PrimitiveType_Eltwise) {
+    auto mode = param->eltwise_mode_;
+    if (mode != EltwiseMode_PROD && mode != EltwiseMode_SUM && mode != EltwiseMode_MAXIMUM) {
+      MS_LOG(ERROR) << "Eltwise mode not support, mode:" << mode;
+      return RET_ERROR;
+    }
+  }
+  if (!(param->activation_type_ == ActivationType_NO_ACTIVATION || param->activation_type_ == ActivationType_RELU ||
+        param->activation_type_ == ActivationType_RELU6)) {
+    MS_LOG(ERROR) << "Unsupported activation type " << param->activation_type_;
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+void ArithmeticInt8OpenCLKernel::SetGlobalLocal() {
+  if (element_flag_) {
+    global_size_ = {out_shape_.width, out_shape_.height};
+  } else {
+    global_size_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N};
+  }
+  AlignGlobalLocal(global_size_, {});
+}
+
+int ArithmeticInt8OpenCLKernel::InitWeights() {
+  auto allocator = ocl_runtime_->GetAllocator();
+  auto fp16_enable = ocl_runtime_->GetFp16Enable();
+  for (int i = 0; i < 2; ++i) {
+    const auto &in_tensor = in_tensors_.at(i);
+    GpuTensorInfo in_shape = GpuTensorInfo(in_tensor);
+    if (in_tensor->IsConst()) {
+      std::vector<char> weight(in_shape.Image2DSize, 0);
+      bool src_is_fp16 = in_tensor->data_type() == kNumberTypeFloat16;
+      PackNHWCToNHWC4(in_tensor->data_c(), weight.data(), src_is_fp16, fp16_enable, in_shape);
+      size_t dtype = fp16_enable ? CL_HALF_FLOAT : CL_FLOAT;
+      ImageSize img_size{in_shape.width, in_shape.height, dtype};
+      auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
+      weight_ptrs_.push_back(weight_ptr_);
+    } else {
+      weight_ptrs_.push_back(nullptr);
+    }
+  }
+  return RET_OK;
+}
+
+void ArithmeticInt8OpenCLKernel::SetConstArgs() {
+  int arg_idx = 3;
+  if (!element_flag_) {
+    cl_int4 in0_shape = {static_cast<int>(in0_shape_.N), static_cast<int>(in0_shape_.H),
+                         static_cast<int>(in0_shape_.W), static_cast<int>(in0_shape_.Slice)};
+    cl_int4 in1_shape = {static_cast<int>(in1_shape_.N), static_cast<int>(in1_shape_.H),
+                         static_cast<int>(in1_shape_.W), static_cast<int>(in1_shape_.Slice)};
+    cl_int4 out_shape = {static_cast<int>(out_shape_.N), static_cast<int>(out_shape_.H),
+                         static_cast<int>(out_shape_.W), static_cast<int>(out_shape_.Slice)};
+    int broadcastC_flag = 0;  // do not need broadcast in C4
+    if (in0_shape_.C == 1 && in1_shape_.C != 1) {
+      broadcastC_flag = 1;  // BroadCast C4 in input0
+    } else if (in0_shape_.C != 1 && in1_shape_.C == 1) {
+      broadcastC_flag = 2;  // BroadCast C4 in input1
+    }
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in0_shape);
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in1_shape);
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape);
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag);
+  } else {
+    cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])};
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
+  }
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
+
+  // set the quantization parameters.
+  auto input0_quant_param = in_tensors_[0]->quant_params().front();
+  auto input1_quant_param = in_tensors_[1]->quant_params().front();
+  auto output_quant_param = out_tensors_[0]->quant_params().front();
+  cl_float4 scale = {static_cast<float>(input0_quant_param.scale), static_cast<float>(input1_quant_param.scale),
+                     static_cast<float>(output_quant_param.scale), 0.0};
+  cl_char4 zero_point = {static_cast<cl_char>(input0_quant_param.zeroPoint),
+                         static_cast<cl_char>(input1_quant_param.zeroPoint),
+                         static_cast<cl_char>(output_quant_param.zeroPoint), 0};
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);       // scale
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, zero_point);  // zero_point
+}
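SetConstArgs() starts at arg_idx 3 because Run() binds arguments 0 to 2 (the two inputs and the output); the remaining order must match the ElementAddInt8 signature exactly. A hypothetical compile-time spelling of that contract, not part of this change:

    // Argument order of ElementAddInt8 (see cl/int8/arithmetic.cl above):
    //   0 input_a, 1 input_b, 2 output                              -- bound in Run()
    //   3 output_shape, 4 act_min, 5 act_max, 6 scale, 7 zero_point -- bound here
    enum ElementAddInt8Arg { kInputA = 0, kInputB, kOutput, kOutputShape, kActMin, kActMax, kScale, kZeroPoint };
    static_assert(kOutputShape == 3, "SetConstArgs() must start at arg_idx 3");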
"BroadcastNHWC4" : "Element"; + switch (Type()) { + case PrimitiveType_MulFusion: + kernel_name_ += "MulInt8"; + break; + case PrimitiveType_AddFusion: + kernel_name_ += "AddInt8"; + break; + case PrimitiveType_SubFusion: + kernel_name_ += "SubInt8"; + break; + case PrimitiveType_DivFusion: + kernel_name_ += "DivInt8"; + break; + case PrimitiveType_Eltwise: { + auto mode = param->eltwise_mode_; + if (mode == EltwiseMode_PROD) { + kernel_name_ += "MulInt8"; + } else if (mode == EltwiseMode_SUM) { + kernel_name_ += "AddInt8"; + } else if (mode == EltwiseMode_MAXIMUM) { + kernel_name_ += "MaximumInt8"; + } + break; + } + default: + kernel_name_ += schema::EnumNamePrimitiveType(Type()); + } + + if (param->activation_type_ == ActivationType_RELU) { + activation_min_ = 0.f; + } else if (param->activation_type_ == ActivationType_RELU6) { + activation_min_ = 0.f; + activation_max_ = 6.f; + } + + std::string program_name = "Arithmetic"; + std::string source = arithmetic_source; + ocl_runtime_->LoadSource(program_name, source); + int error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_); +#endif + if (error_code != RET_OK) { + return error_code; + } + + SetGlobalLocal(); + // BiasAdd InitWeight will be called in opencl_subgraph prepare + if (Type() != PrimitiveType_BiasAdd) { + InitWeights(); + } + SetConstArgs(); + MS_LOG(DEBUG) << kernel_name_ << " Init Done!"; + return RET_OK; +} + +int ArithmeticInt8OpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + auto input_0_ptr = weight_ptrs_[0] == nullptr ? in_tensors_[0]->data_c() : weight_ptrs_[0]; + auto input_1_ptr = weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : weight_ptrs_[1]; + int arg_idx = 0; + + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); + return RET_OK; +} + +REG_KERNEL(kGPU, kNumberTypeInt8, PrimitiveType_AddFusion, OpenCLKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.h new file mode 100644 index 00000000000..0ecd98b9a90 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.h @@ -0,0 +1,54 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.h
new file mode 100644
index 00000000000..0ecd98b9a90
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_INT8_ARITHMETIC_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_INT8_ARITHMETIC_INT8_H_
+
+#include <vector>
+#include <string>
+#include <set>
+#include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h"
+#include "src/runtime/kernel/opencl/opencl_kernel.h"
+
+namespace mindspore::kernel {
+
+extern std::set<schema::PrimitiveType> SupportedOpenCLArithmetics;
+
+class ArithmeticInt8OpenCLKernel : public OpenCLKernel {
+ public:
+  using OpenCLKernel::OpenCLKernel;
+  ~ArithmeticInt8OpenCLKernel() override = default;
+
+  int Run() override;
+  int Prepare() override;
+  int CheckSpecs() override;
+  int InitWeights() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
+
+ private:
+  bool element_flag_{true};
+  float activation_min_{-128};
+  float activation_max_{127};
+  GpuTensorInfo in0_shape_;
+  GpuTensorInfo in1_shape_;
+  GpuTensorInfo out_shape_;
+  std::vector<void *> weight_ptrs_;
+  std::string kernel_name_;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_INT8_ARITHMETIC_INT8_H_
"BUF_to_IMG_" : "IMG_to_BUF_"; diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc index fee1040b4d9..7f9013cac79 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc @@ -84,7 +84,7 @@ int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) { } case kNumberTypeInt8: case kNumberTypeUInt8: { - img_dtype = CL_UNSIGNED_INT8; + img_dtype = CL_SIGNED_INT8; break; } default: { @@ -138,8 +138,10 @@ void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) { printf("%d %7d | ", i, reinterpret_cast(data.data())[i]); } else if (tensor->data_type() == kNumberTypeFloat16) { printf("%d %7.3f | ", i, reinterpret_cast(data.data())[i]); - } else { + } else if (tensor->data_type() == kNumberTypeFloat32) { printf("%d %7.3f | ", i, reinterpret_cast(data.data())[i]); + } else if (tensor->data_type() == kNumberTypeInt8) { + printf("%d %7d | ", i, static_cast(reinterpret_cast(data.data())[i])); } } printf("\n"); @@ -409,7 +411,7 @@ int OpenCLKernel::CheckSpecs() { } if (in_tensors_.size() > 0) { if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16 && - in_tensors_[0]->data_type() != kNumberTypeInt32) { + in_tensors_[0]->data_type() != kNumberTypeInt32 && in_tensors_[0]->data_type() != kNumberTypeInt8) { MS_LOG(WARNING) << "Unsupported data type: " << in_tensors_[0]->data_type(); return RET_ERROR; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index eaa6b9364d9..f4d9192cacd 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -49,16 +49,16 @@ void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) { auto *H = dst + 1; auto *W = dst + 2; auto *C = dst + 3; - if (src_num == 1) { + if (src_num == 1) { // 1 1 1 C *C = src[0]; - } else if (src_num == 2) { + } else if (src_num == 2) { // N 1 1 C *N = src[0]; *C = src[1]; - } else if (src_num == 3) { + } else if (src_num == 3) { // N 1 W C *N = src[0]; *W = src[1]; *C = src[2]; - } else if (src_num == 4) { + } else if (src_num == 4) { // N H W C *N = src[0]; *H = src[1]; *W = src[2]; diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc index 67c662555e9..04bbe7d922c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc @@ -128,6 +128,9 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector &in_tensors, MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!"; return RET_ERROR; } + for (const auto ¶m : in_tensor->quant_params()) { + new_tensor->AddQuantParam(param); + } out_tensors->emplace_back(new_tensor); KernelKey desc{kGPU, kNumberTypeFloat32, PRIM_TO_FORMAT}; diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc index ea5aa210315..9b62540fa9d 100644 --- a/mindspore/lite/src/scheduler.cc +++ b/mindspore/lite/src/scheduler.cc @@ -352,6 +352,7 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector &in // support more data type like int32 kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type}; if (context_->IsGpuFloat16Enabled()) gpu_desc.data_type = kNumberTypeFloat16; + if (in_tensors.front()->data_type() == kNumberTypeInt8) 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
index 67c662555e9..04bbe7d922c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
@@ -128,6 +128,9 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
       MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!";
       return RET_ERROR;
     }
+    for (const auto &param : in_tensor->quant_params()) {
+      new_tensor->AddQuantParam(param);
+    }
     out_tensors->emplace_back(new_tensor);
 
     KernelKey desc{kGPU, kNumberTypeFloat32, PRIM_TO_FORMAT};
diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc
index ea5aa210315..9b62540fa9d 100644
--- a/mindspore/lite/src/scheduler.cc
+++ b/mindspore/lite/src/scheduler.cc
@@ -352,6 +352,7 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in_tensors,
     // support more data type like int32
     kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type};
     if (context_->IsGpuFloat16Enabled()) gpu_desc.data_type = kNumberTypeFloat16;
+    if (in_tensors.front()->data_type() == kNumberTypeInt8) gpu_desc.data_type = kNumberTypeInt8;
     auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter);
     if (kernel != nullptr) {
       MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " << node->name_;
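The scheduler change layers three defaults for the GPU kernel key: fp32 by default, fp16 when the context enables it, and int8 whenever the first input tensor is int8. Condensed into a standalone sketch (a hypothetical helper, not scheduler API):

    TypeId SelectGpuDataType(bool gpu_fp16_enabled, TypeId first_input_dtype) {
      TypeId data_type = kNumberTypeFloat32;                 // default GPU kernel key
      if (gpu_fp16_enabled) data_type = kNumberTypeFloat16;  // fp16 mode overrides
      if (first_input_dtype == kNumberTypeInt8) data_type = kNumberTypeInt8;  // int8 input wins
      return data_type;
    }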
diff --git a/mindspore/lite/src/sub_graph_kernel.cc b/mindspore/lite/src/sub_graph_kernel.cc
index 0e95b3025ad..eda97d010bb 100644
--- a/mindspore/lite/src/sub_graph_kernel.cc
+++ b/mindspore/lite/src/sub_graph_kernel.cc
@@ -171,6 +171,14 @@ int CpuSubGraph::Run(const KernelCallBack &before, const KernelCallBack &after)
     out_tensor->set_ref_count(out_tensor->ref_count() + 1);
   }
 #endif
+#ifdef SUPPORT_GPU
+  // In heterogeneous CPU/GPU scenarios, call MutableData to trigger MapBuffer and synchronize the data.
+  if (context_->IsGpuEnabled()) {
+    for (auto tensor : this->in_tensors()) {
+      tensor->MutableData();
+    }
+  }
+#endif
 
   for (auto *kernel : nodes_) {
     MS_ASSERT(nullptr != kernel);
diff --git a/mindspore/lite/test/models_gpu_fp32.cfg b/mindspore/lite/test/models_gpu_fp32.cfg
index 4d0071453cc..10ad3583030 100644
--- a/mindspore/lite/test/models_gpu_fp32.cfg
+++ b/mindspore/lite/test/models_gpu_fp32.cfg
@@ -41,3 +41,4 @@ Q_new_detect.tflite
 Q_object_scene.tflite
 Q_pose.tflite
 matmul.pb
+add_uint8.tflite;2
diff --git a/mindspore/lite/test/models_with_multiple_inputs.cfg b/mindspore/lite/test/models_with_multiple_inputs.cfg
index 5429a09610c..abf36020d47 100644
--- a/mindspore/lite/test/models_with_multiple_inputs.cfg
+++ b/mindspore/lite/test/models_with_multiple_inputs.cfg
@@ -27,3 +27,4 @@ ml_tts_encoder_control_flow.pb;4;1:1,22:1:1
 hiai_cv_labelDetectorModel_v3.tflite;2
 ml_tts_vocoder.pb;66
 ml_tacotron_decoder_step_stf.tflite;9;1,80:1,256:1,1024:1,1024:1,1024:1,1024:1,8:1,1,256:1
+add_uint8.tflite;2
diff --git a/mindspore/lite/test/run_benchmark_nets.sh b/mindspore/lite/test/run_benchmark_nets.sh
index 7a86e01d842..7922f8d4d73 100755
--- a/mindspore/lite/test/run_benchmark_nets.sh
+++ b/mindspore/lite/test/run_benchmark_nets.sh
@@ -1892,16 +1892,29 @@ function Run_gpu() {
     adb -s ${device_id} shell < adb_cmd.txt
 
-    # Run gpu tflite converted models:
+    # Run gpu fp32 converted models:
     while read line; do
-        model_name=${line}
+        model_name=${line%%;*}
        if [[ $model_name == \#* ]]; then
            continue
        fi
+        model_name=`echo ${line} | awk -F ';' '{print $1}'`
+        input_num=`echo ${line} | awk -F ';' '{print $2}'`
+        input_files=""
+        data_path="/data/local/tmp/input_output/"
+        output_file=${data_path}'output/'${model_name}'.ms.out'
+        if [[ ${input_num} == "" ]]; then
+            input_files=/data/local/tmp/input_output/input/${model_name}.ms.bin
+        else
+            for i in $(seq 1 $input_num)
+            do
+                input_files=$input_files${data_path}'input/'$model_name'.ms.bin_'$i','
+            done
+        fi
        echo ${model_name} >> "${run_gpu_log_file}"
        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out' >> "${run_gpu_log_file}"
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out' >> adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}"
        if [ $? = 0 ]; then
            run_result='arm64_gpu: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_result_file}