!14491 mindspore lite gpu support int8

From: @yeyunpeng2020
Reviewed-by: @ddwsky, @HilbertDavid
Signed-off-by: @ddwsky

commit adf4655aba
@@ -22,11 +22,11 @@ function(gene_opencl BASEPATH)
     if(NOT EXISTS ${CL_SRC_DIR})
         return()
     endif()
-    file(GLOB_RECURSE CL_LIST ${CL_SRC_DIR}/*.cl)
+    file(GLOB_RECURSE CL_LIST ${CL_SRC_DIR}/*.cl ${CL_SRC_DIR}/int8/*.cl)
     foreach(file_path ${CL_LIST})
        file(REMOVE ${file_path}.inc)
        string(REGEX REPLACE ".+/(.+)\\..*" "\\1" kernel_name "${file_path}")
-        set(inc_file_ex "${kernel_name}.cl.inc")
+        set(inc_file_ex "${file_path}.inc")
        execute_process(
            COMMAND bash -c "sed 's/\\\\/\\\\\\\\/g' "
            COMMAND bash -c "sed 's/\\\"/\\\\\\\"/g' "
@@ -143,7 +143,7 @@ int OpenCLAllocator::GetImgDtypeSize(const ImageSize &img_size) {
     dtype_size = sizeof(cl_float);
   } else if (img_size.dtype == CL_HALF_FLOAT) {
     dtype_size = sizeof(cl_half);
-  } else if (img_size.dtype == CL_UNSIGNED_INT8) {
+  } else if (img_size.dtype == CL_SIGNED_INT8) {
     dtype_size = sizeof(cl_uchar);
   } else {
     MS_LOG(ERROR) << "Unsupported dtype " << img_size.dtype;
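The allocator change above swaps the 8-bit image channel type from CL_UNSIGNED_INT8 to CL_SIGNED_INT8: the new int8 kernels read quantized values with read_imagei, which requires a signed integer channel format, and the per-element size stays one byte either way. A minimal standalone sketch of the mapping (the helper name is hypothetical; the real logic lives in OpenCLAllocator::GetImgDtypeSize):

#include <CL/cl.h>
#include <cstddef>
#include <stdexcept>

// Byte size of one image channel for each supported channel type. Switching
// int8 images to CL_SIGNED_INT8 changes only the signedness, not the size.
size_t ImgDtypeSize(cl_channel_type dtype) {
  switch (dtype) {
    case CL_FLOAT:       return sizeof(cl_float);  // fp32 image
    case CL_HALF_FLOAT:  return sizeof(cl_half);   // fp16 image
    case CL_SIGNED_INT8: return sizeof(cl_char);   // quantized int8 image
    default: throw std::invalid_argument("unsupported image dtype");
  }
}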
@@ -1,7 +1,8 @@
 if(${SUPPORT_GPU} STREQUAL opencl)
     file(GLOB_RECURSE OPENCL_KERNEL_SRC
             ${CMAKE_CURRENT_SOURCE_DIR}/*.cc
-            ${CMAKE_CURRENT_SOURCE_DIR}/kernel/*.cc)
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernel/*.cc
+            ${CMAKE_CURRENT_SOURCE_DIR}/kernel/int8/*.cc)
     add_library(opencl_kernel_mid OBJECT ${OPENCL_KERNEL_SRC})
     add_dependencies(opencl_kernel_mid fbs_src)
 endif()
@@ -0,0 +1,19 @@
+__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+
+__kernel void ElementAddInt8(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape, float act_min, float act_max,
+                             const float4 scale, const char4 zero_point) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  char4 a = convert_char4(read_imagei(input_a, smp_none, (int2)(X, Y)));
+  char4 b = convert_char4(read_imagei(input_b, smp_none, (int2)(X, Y)));
+
+  float4 real_a = convert_float4(a - zero_point.x) * scale.x;
+  float4 real_b = convert_float4(b - zero_point.y) * scale.y;
+  int4 result = convert_int4(round((real_a + real_b) / scale.z)) + zero_point.z;
+  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
+  write_imagei(output, (int2)(X, Y), result);
+}
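For reference, ElementAddInt8 above implements the usual affine-quantization arithmetic, real = scale * (q - zero_point), followed by requantization against the output scale. A scalar C++ sketch of the same computation (function and parameter names are illustrative, not part of the commit):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar model of the kernel: dequantize both operands, add in real space,
// requantize with the output scale/zero-point, then clamp to the activation
// range (the host side defaults to [-128, 127] when no activation is fused).
int8_t AddInt8(int8_t a, int8_t b, const float scale[3], const int8_t zp[3],
               float act_min, float act_max) {
  float real_a = (a - zp[0]) * scale[0];  // scale[0]/zp[0]: input 0
  float real_b = (b - zp[1]) * scale[1];  // scale[1]/zp[1]: input 1
  int q = static_cast<int>(std::round((real_a + real_b) / scale[2])) + zp[2];
  q = std::clamp(q, static_cast<int>(act_min), static_cast<int>(act_max));
  return static_cast<int8_t>(q);
}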
@@ -1,8 +1,8 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-#define BUF_to_IMG(src_dtype, dst_dtype, SRC_FLT, DST_FLT, WRITE_IMAGE_OUT) \
-  __kernel void BUF_to_IMG_##src_dtype##_##dst_dtype(__global float4 *src_data, __write_only image2d_t dst_data, \
+#define BUF_to_IMG(src_dtype, dst_dtype, SRC_TYPE, DST_TYPE, WRITE_IMAGE_OUT) \
+  __kernel void BUF_to_IMG_##src_dtype##_##dst_dtype(__global SRC_TYPE##4 *src_data, __write_only image2d_t dst_data, \
                                                      int4 size, int4 shape) { \
     int X = get_global_id(0); \
     int Y = get_global_id(1); \
@@ -10,21 +10,21 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP
     if (X >= size.x || Y >= size.y || Z >= size.z) { \
       return; \
     } \
-    DST_FLT##4 data = (DST_FLT##4)(0.f); \
+    DST_TYPE##4 data = (DST_TYPE##4)(0.f); \
     int offset = (X * shape.z + Y) * shape.w + Z * 4; \
-    __global SRC_FLT *src_addr = (__global SRC_FLT *)src_data; \
+    __global SRC_TYPE *src_addr = (__global SRC_TYPE *)src_data; \
     src_addr += offset; \
     if ((Z + 1) * 4 <= shape.w) { \
-      data = convert_##DST_FLT##4(((__global SRC_FLT##4 *)src_addr)[0]); \
+      data = convert_##DST_TYPE##4(((__global SRC_TYPE##4 *)src_addr)[0]); \
     } else { \
       if ((shape.w - Z * 4) >= 1) { \
-        data.x = (DST_FLT)src_addr[0]; \
+        data.x = (DST_TYPE)src_addr[0]; \
       } \
       if ((shape.w - Z * 4) >= 2) { \
-        data.y = (DST_FLT)src_addr[1]; \
+        data.y = (DST_TYPE)src_addr[1]; \
       } \
       if ((shape.w - Z * 4) >= 3) { \
-        data.z = (DST_FLT)src_addr[2]; \
+        data.z = (DST_TYPE)src_addr[2]; \
       } \
     } \
     if (size.y * size.z <= MAX_IMAGE2D_WIDTH) \
@@ -33,15 +33,16 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP
       WRITE_IMAGE_OUT(dst_data, (int2)(Z, X * size.y + Y), data); \
   }
 
-// BUF_to_IMG(src_dtype, dst_dtype, SRC_FLT, DST_FLT, WRITE_IMAGE_OUT)
+// BUF_to_IMG(src_dtype, dst_dtype, SRC_TYPE, DST_TYPE, WRITE_IMAGE_OUT)
 BUF_to_IMG(float32, float32, float, float, write_imagef);
 BUF_to_IMG(float32, float16, float, half, write_imageh);
 BUF_to_IMG(float16, float16, half, half, write_imageh);
 BUF_to_IMG(int32, int32, float, float, write_imagef);
 BUF_to_IMG(uint32, uint32, float, float, write_imagef);
+BUF_to_IMG(int8, int8, char, int, write_imagei);
 
-#define IMG_to_BUF(src_dtype, dst_dtype, SRC_FLT, DST_FLT, READ_IMAGE_IN) \
-  __kernel void IMG_to_BUF_##src_dtype##_##dst_dtype(__read_only image2d_t src_data, __global DST_FLT##4 * dst_data, \
+#define IMG_to_BUF(src_dtype, dst_dtype, SRC_TYPE, DST_TYPE, READ_IMAGE_IN) \
+  __kernel void IMG_to_BUF_##src_dtype##_##dst_dtype(__read_only image2d_t src_data, __global DST_TYPE##4 * dst_data, \
                                                      int4 size, int4 shape) { \
     int X = get_global_id(0); \
     int Y = get_global_id(1); \
@@ -49,16 +50,16 @@ BUF_to_IMG(uint32, uint32, float, float, write_imagef);
     if (X >= size.x || Y >= size.y || Z >= size.z) { \
       return; \
     } \
-    DST_FLT##4 data; \
+    DST_TYPE##4 data; \
     if (size.y * size.z <= MAX_IMAGE2D_WIDTH) \
-      data = convert_##DST_FLT##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); \
+      data = convert_##DST_TYPE##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); \
     else \
-      data = convert_##DST_FLT##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Z, X * size.y + Y))); \
+      data = convert_##DST_TYPE##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Z, X * size.y + Y))); \
     int offset = (X * shape.z + Y) * shape.w + Z * 4; \
-    __global DST_FLT *dst_addr = (__global DST_FLT *)dst_data; \
+    __global DST_TYPE *dst_addr = (__global DST_TYPE *)dst_data; \
     dst_addr += offset; \
     if ((Z + 1) * 4 <= shape.w) { \
-      ((__global DST_FLT##4 *)dst_addr)[0] = data; \
+      ((__global DST_TYPE##4 *)dst_addr)[0] = data; \
     } else { \
       if (shape.w - Z * 4 >= 1) { \
         dst_addr[0] = data.x; \
@@ -72,9 +73,10 @@ BUF_to_IMG(uint32, uint32, float, float, write_imagef);
       } \
   }
 
-// IMG_to_BUF(src_dtype, dst_dtype, SRC_FLT, DST_FLT, READ_IMAGE_IN)
+// IMG_to_BUF(src_dtype, dst_dtype, SRC_TYPE, DST_TYPE, READ_IMAGE_IN)
 IMG_to_BUF(float32, float32, float, float, read_imagef);
 IMG_to_BUF(float16, float32, half, float, read_imageh);
 IMG_to_BUF(float16, float16, half, half, read_imageh);
 IMG_to_BUF(int32, int32, float, float, read_imagef);
 IMG_to_BUF(uint32, uint32, float, float, read_imagef);
+IMG_to_BUF(int8, int8, char, char, read_imagei);
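The SRC_FLT/DST_FLT macro parameters are renamed to SRC_TYPE/DST_TYPE because they can now expand to integer types: the new int8 instantiations produce kernels named BUF_to_IMG_int8_int8 and IMG_to_BUF_int8_int8 that move char data through write_imagei/read_imagei. The index math is unchanged; a C++ sketch of the buffer-offset and texel-coordinate mapping used by both macros (names and the width limit are illustrative):

#include <utility>

constexpr int kMaxImage2DWidth = 16384;  // assumed; queried from the device in practice

// X indexes N*H rows, Y indexes W, Z indexes groups of 4 channels (slices).
// Flat NHWC buffer offset of the first channel in the (X, Y, Z) group.
int BufferOffset(int X, int Y, int Z, int shape_w, int shape_c) {
  return (X * shape_w + Y) * shape_c + Z * 4;
}

// Texel holding those 4 channels: the wide layout packs W and the slices
// along the image width; if that exceeds the device limit, the tall layout
// flattens (X, Y) down the image height instead.
std::pair<int, int> TexelCoord(int X, int Y, int Z, int size_y, int size_z) {
  if (size_y * size_z <= kMaxImage2DWidth) return {Y * size_z + Z, X};
  return {Z, X * size_y + Y};
}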
@@ -0,0 +1,234 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/opencl/kernel/int8/arithmetic_int8.h"
+#include <vector>
+#include "nnacl/fp32/common_func_fp32.h"
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/opencl/utils.h"
+#ifndef PROGRAM_WITH_IL
+#include "src/runtime/kernel/opencl/cl/int8/arithmetic.cl.inc"
+#endif
+
+using mindspore::kernel::KERNEL_ARCH::kGPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_OK;
+using mindspore::lite::opencl::ImageSize;
+using mindspore::lite::opencl::MemType;
+using mindspore::schema::ActivationType_NO_ACTIVATION;
+using mindspore::schema::ActivationType_RELU;
+using mindspore::schema::ActivationType_RELU6;
+using mindspore::schema::EltwiseMode_MAXIMUM;
+using mindspore::schema::EltwiseMode_PROD;
+using mindspore::schema::EltwiseMode_SUM;
+using mindspore::schema::PrimitiveType_BiasAdd;
+using mindspore::schema::PrimitiveType_Eltwise;
+
+namespace mindspore::kernel {
+
+int ArithmeticInt8OpenCLKernel::CheckSpecs() {
+  for (auto &tensor : in_tensors_) {
+    if (tensor->data_type() != kNumberTypeInt8) {
+      MS_LOG(ERROR) << "ArithmeticInt8OpenCLKernel only support int8 input";
+      return RET_ERROR;
+    }
+  }
+  for (auto &tensor : out_tensors_) {
+    if (tensor->data_type() != kNumberTypeInt8) {
+      MS_LOG(ERROR) << "ArithmeticInt8OpenCLKernel only support int8 output";
+      return RET_ERROR;
+    }
+  }
+
+  if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
+    MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
+    return RET_ERROR;
+  }
+  auto *param = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
+  if (!IsArithmetic(Type())) {
+    MS_LOG(ERROR) << "UnSupported Operator: " << schema::EnumNamePrimitiveType(Type());
+    return RET_ERROR;
+  }
+  if (Type() == schema::PrimitiveType_Eltwise) {
+    auto mode = param->eltwise_mode_;
+    if (mode != EltwiseMode_PROD && mode != EltwiseMode_SUM && mode != EltwiseMode_MAXIMUM) {
+      MS_LOG(ERROR) << "Eltwise mode not support, mode:" << mode;
+      return RET_ERROR;
+    }
+  }
+  if (!(param->activation_type_ == ActivationType_NO_ACTIVATION || param->activation_type_ == ActivationType_RELU ||
+        param->activation_type_ == ActivationType_RELU6)) {
+    MS_LOG(ERROR) << "Unsupported activation type " << param->activation_type_;
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+void ArithmeticInt8OpenCLKernel::SetGlobalLocal() {
+  if (element_flag_) {
+    global_size_ = {out_shape_.width, out_shape_.height};
+  } else {
+    global_size_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N};
+  }
+  AlignGlobalLocal(global_size_, {});
+}
+
+int ArithmeticInt8OpenCLKernel::InitWeights() {
+  auto allocator = ocl_runtime_->GetAllocator();
+  auto fp16_enable = ocl_runtime_->GetFp16Enable();
+  for (int i = 0; i < 2; ++i) {
+    const auto &in_tensor = in_tensors_.at(i);
+    GpuTensorInfo in_shape = GpuTensorInfo(in_tensor);
+    if (in_tensor->IsConst()) {
+      std::vector<char> weight(in_shape.Image2DSize, 0);
+      bool src_is_fp16 = in_tensor->data_type() == kNumberTypeFloat16;
+      PackNHWCToNHWC4(in_tensor->data_c(), weight.data(), src_is_fp16, fp16_enable, in_shape);
+      size_t dtype = fp16_enable ? CL_HALF_FLOAT : CL_FLOAT;
+      ImageSize img_size{in_shape.width, in_shape.height, dtype};
+      auto weight_ptr_ = allocator->Malloc(img_size, weight.data());
+      weight_ptrs_.push_back(weight_ptr_);
+    } else {
+      weight_ptrs_.push_back(nullptr);
+    }
+  }
+  return RET_OK;
+}
+
+void ArithmeticInt8OpenCLKernel::SetConstArgs() {
+  int arg_idx = 3;
+  if (!element_flag_) {
+    cl_int4 in0_shape = {static_cast<int>(in0_shape_.N), static_cast<int>(in0_shape_.H), static_cast<int>(in0_shape_.W),
+                         static_cast<int>(in0_shape_.Slice)};
+    cl_int4 in1_shape = {static_cast<int>(in1_shape_.N), static_cast<int>(in1_shape_.H), static_cast<int>(in1_shape_.W),
+                         static_cast<int>(in1_shape_.Slice)};
+    cl_int4 out_shape = {static_cast<int>(out_shape_.N), static_cast<int>(out_shape_.H), static_cast<int>(out_shape_.W),
+                         static_cast<int>(out_shape_.Slice)};
+    int broadcastC_flag = 0;  // do not need broadcast in C4
+    if (in0_shape_.C == 1 && in1_shape_.C != 1) {
+      broadcastC_flag = 1;  // BroadCast C4 in input0
+    } else if (in0_shape_.C != 1 && in1_shape_.C == 1) {
+      broadcastC_flag = 2;  // BroadCast C4 in input1
+    }
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in0_shape);
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in1_shape);
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape);
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag);
+  } else {
+    cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])};
+    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
+  }
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
+
+  // set quantization parameter.
+  auto input0_quant_param = in_tensors_[0]->quant_params().front();
+  auto input1_quant_param = in_tensors_[1]->quant_params().front();
+  auto output_quant_param = out_tensors_[0]->quant_params().front();
+  cl_float4 scale = {static_cast<float>(input0_quant_param.scale), static_cast<float>(input1_quant_param.scale),
+                     static_cast<float>(output_quant_param.scale), 0.0};
+  cl_char4 zero_point = {static_cast<int8_t>(input0_quant_param.zeroPoint),
+                         static_cast<int8_t>(input1_quant_param.zeroPoint),
+                         static_cast<int8_t>(output_quant_param.zeroPoint), 0};
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale);       // scale
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, zero_point);  // zero_point
+}
+
+int ArithmeticInt8OpenCLKernel::Prepare() {
+#ifdef PROGRAM_WITH_IL
+  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name_);
+#else
+
+  in0_shape_ = GpuTensorInfo(in_tensors_[0]);
+  in1_shape_ = GpuTensorInfo(in_tensors_[1]);
+  out_shape_ = GpuTensorInfo(out_tensors_[0]);
+
+  auto *param = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
+  if (Type() == PrimitiveType_BiasAdd) {
+    const_cast<ArithmeticParameter *>(param)->broadcasting_ = true;
+  }
+  element_flag_ = !param->broadcasting_;
+  kernel_name_ = param->broadcasting_ ? "BroadcastNHWC4" : "Element";
+  switch (Type()) {
+    case PrimitiveType_MulFusion:
+      kernel_name_ += "MulInt8";
+      break;
+    case PrimitiveType_AddFusion:
+      kernel_name_ += "AddInt8";
+      break;
+    case PrimitiveType_SubFusion:
+      kernel_name_ += "SubInt8";
+      break;
+    case PrimitiveType_DivFusion:
+      kernel_name_ += "DivInt8";
+      break;
+    case PrimitiveType_Eltwise: {
+      auto mode = param->eltwise_mode_;
+      if (mode == EltwiseMode_PROD) {
+        kernel_name_ += "MulInt8";
+      } else if (mode == EltwiseMode_SUM) {
+        kernel_name_ += "AddInt8";
+      } else if (mode == EltwiseMode_MAXIMUM) {
+        kernel_name_ += "MaximumInt8";
+      }
+      break;
+    }
+    default:
+      kernel_name_ += schema::EnumNamePrimitiveType(Type());
+  }
+
+  if (param->activation_type_ == ActivationType_RELU) {
+    activation_min_ = 0.f;
+  } else if (param->activation_type_ == ActivationType_RELU6) {
+    activation_min_ = 0.f;
+    activation_max_ = 6.f;
+  }
+
+  std::string program_name = "Arithmetic";
+  std::string source = arithmetic_source;
+  ocl_runtime_->LoadSource(program_name, source);
+  int error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_);
+#endif
+  if (error_code != RET_OK) {
+    return error_code;
+  }
+
+  SetGlobalLocal();
+  // BiasAdd InitWeight will be called in opencl_subgraph prepare
+  if (Type() != PrimitiveType_BiasAdd) {
+    InitWeights();
+  }
+  SetConstArgs();
+  MS_LOG(DEBUG) << kernel_name_ << " Init Done!";
+  return RET_OK;
+}
+
+int ArithmeticInt8OpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->name() << " Running!";
+  auto input_0_ptr = weight_ptrs_[0] == nullptr ? in_tensors_[0]->data_c() : weight_ptrs_[0];
+  auto input_1_ptr = weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : weight_ptrs_[1];
+  int arg_idx = 0;
+
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr);
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
+  return RET_OK;
+}
+
+REG_KERNEL(kGPU, kNumberTypeInt8, PrimitiveType_AddFusion, OpenCLKernelCreator<ArithmeticInt8OpenCLKernel>)
+}  // namespace mindspore::kernel
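SetConstArgs above packs the per-tensor quantization parameters of both inputs and the output into one cl_float4/cl_char4 pair, so the kernel receives them as two vector arguments with the fourth lane as padding. A small host-side sketch of that packing (the struct and function are illustrative, not part of the commit):

#include <CL/cl.h>
#include <cstdint>

// Mirrors the scale/zero_point arguments consumed by ElementAddInt8:
// lanes 0/1 describe the inputs, lane 2 the output, lane 3 is unused.
struct QuantArgs {
  cl_float4 scale;
  cl_char4 zero_point;
};

QuantArgs PackQuantArgs(float s0, float s1, float s_out,
                        int8_t z0, int8_t z1, int8_t z_out) {
  QuantArgs args{};
  args.scale.s[0] = s0;
  args.scale.s[1] = s1;
  args.scale.s[2] = s_out;
  args.scale.s[3] = 0.0f;  // padding
  args.zero_point.s[0] = z0;
  args.zero_point.s[1] = z1;
  args.zero_point.s[2] = z_out;
  args.zero_point.s[3] = 0;  // padding
  return args;
}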
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_
+
+#include <vector>
+#include <set>
+#include <string>
+#include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h"
+#include "src/runtime/kernel/opencl/opencl_kernel.h"
+
+namespace mindspore::kernel {
+
+extern std::set<schema::PrimitiveType> SupportedOpenCLArithmetics;
+
+class ArithmeticInt8OpenCLKernel : public OpenCLKernel {
+ public:
+  using OpenCLKernel::OpenCLKernel;
+  ~ArithmeticInt8OpenCLKernel() override = default;
+
+  int Run() override;
+  int Prepare() override;
+  int CheckSpecs() override;
+  int InitWeights() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
+
+ private:
+  bool element_flag_{true};
+  float activation_min_{-128};
+  float activation_max_{127};
+  GpuTensorInfo in0_shape_;
+  GpuTensorInfo in1_shape_;
+  GpuTensorInfo out_shape_;
+  std::vector<void *> weight_ptrs_;
+  std::string kernel_name_;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_
@@ -37,7 +37,8 @@ int ToFormatOpenCLKernel::CheckSpecs() {
     return RET_ERROR;
   }
   auto data_type = in_tensors_.front()->data_type();
-  if (data_type != kNumberTypeFloat32 && data_type != kNumberTypeFloat16 && data_type != kNumberTypeInt32) {
+  if (data_type != kNumberTypeFloat32 && data_type != kNumberTypeFloat16 && data_type != kNumberTypeInt32 &&
+      data_type != kNumberTypeInt8) {
     MS_LOG(ERROR) << "Unsupported data type " << data_type;
     return RET_ERROR;
   }
@@ -64,10 +65,10 @@ void ToFormatOpenCLKernel::SetGlobalLocal() {
 }
 
 int ToFormatOpenCLKernel::Prepare() {
-  static std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float32"},
-                                                 {kNumberTypeFloat16, "float16"},
-                                                 {kNumberTypeInt32, "int32"},
-                                                 {kNumberTypeUInt32, "uint32"}};
+  static std::map<TypeId, std::string> dtype_str{
+    {kNumberTypeFloat32, "float32"}, {kNumberTypeFloat16, "float16"}, {kNumberTypeInt32, "int32"},
+    {kNumberTypeUInt32, "uint32"}, {kNumberTypeInt8, "int8"},
+  };
   auto in_tensor = in_tensors_.front();
   auto out_tensor = out_tensors_.front();
   std::string kernel_name = out_mem_type_ == MemType::IMG ? "BUF_to_IMG_" : "IMG_to_BUF_";
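With "int8" added to dtype_str, Prepare can assemble conversion-kernel names that match the int8 instantiations of the BUF_to_IMG/IMG_to_BUF macros above. A sketch of the naming scheme (the suffix construction is an assumption inferred from the ##src_dtype##_##dst_dtype macro pattern, not shown in this hunk):

#include <string>

// e.g. ToFormatKernelName(false, "int8", "int8") == "IMG_to_BUF_int8_int8"
std::string ToFormatKernelName(bool out_is_image, const std::string &in_dtype,
                               const std::string &out_dtype) {
  std::string name = out_is_image ? "BUF_to_IMG_" : "IMG_to_BUF_";
  return name + in_dtype + "_" + out_dtype;
}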
@@ -84,7 +84,7 @@ int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
     }
     case kNumberTypeInt8:
     case kNumberTypeUInt8: {
-      img_dtype = CL_UNSIGNED_INT8;
+      img_dtype = CL_SIGNED_INT8;
       break;
     }
     default: {
@@ -138,8 +138,10 @@ void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
       printf("%d %7d | ", i, reinterpret_cast<int32_t *>(data.data())[i]);
     } else if (tensor->data_type() == kNumberTypeFloat16) {
       printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
-    } else {
+    } else if (tensor->data_type() == kNumberTypeFloat32) {
       printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
+    } else if (tensor->data_type() == kNumberTypeInt8) {
+      printf("%d %7d | ", i, static_cast<int>(reinterpret_cast<int8_t *>(data.data())[i]));
     }
   }
   printf("\n");
@@ -409,7 +411,7 @@ int OpenCLKernel::CheckSpecs() {
   }
   if (in_tensors_.size() > 0) {
     if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16 &&
-        in_tensors_[0]->data_type() != kNumberTypeInt32) {
+        in_tensors_[0]->data_type() != kNumberTypeInt32 && in_tensors_[0]->data_type() != kNumberTypeInt8) {
       MS_LOG(WARNING) << "Unsupported data type: " << in_tensors_[0]->data_type();
       return RET_ERROR;
     }
@@ -49,16 +49,16 @@ void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
   auto *H = dst + 1;
   auto *W = dst + 2;
   auto *C = dst + 3;
-  if (src_num == 1) {
+  if (src_num == 1) {  // 1 1 1 C
     *C = src[0];
-  } else if (src_num == 2) {
+  } else if (src_num == 2) {  // N 1 1 C
     *N = src[0];
     *C = src[1];
-  } else if (src_num == 3) {
+  } else if (src_num == 3) {  // N 1 W C
     *N = src[0];
     *W = src[1];
     *C = src[2];
-  } else if (src_num == 4) {
+  } else if (src_num == 4) {  // N H W C
     *N = src[0];
     *H = src[1];
     *W = src[2];
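The new comments in Broadcast2GpuShape document how a 1-4 element source shape is mapped into fixed NHWC slots. A standalone C++ model of that mapping (assuming, as the surrounding code suggests, that unset dimensions default to 1):

#include <array>

// src_num == 1 -> C; 2 -> N,C; 3 -> N,W,C; 4 -> N,H,W,C.
std::array<int, 4> BroadcastToNHWC(const int *src, int src_num) {
  std::array<int, 4> nhwc = {1, 1, 1, 1};  // {N, H, W, C}
  if (src_num == 1) {
    nhwc[3] = src[0];
  } else if (src_num == 2) {
    nhwc[0] = src[0];
    nhwc[3] = src[1];
  } else if (src_num == 3) {
    nhwc[0] = src[0];
    nhwc[2] = src[1];
    nhwc[3] = src[2];
  } else if (src_num == 4) {
    nhwc = {src[0], src[1], src[2], src[3]};
  }
  return nhwc;
}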
@@ -128,6 +128,9 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
       MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!";
       return RET_ERROR;
     }
+    for (const auto &param : in_tensor->quant_params()) {
+      new_tensor->AddQuantParam(param);
+    }
 
     out_tensors->emplace_back(new_tensor);
     KernelKey desc{kGPU, kNumberTypeFloat32, PRIM_TO_FORMAT};
@@ -352,6 +352,7 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in
   // support more data type like int32
   kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type};
   if (context_->IsGpuFloat16Enabled()) gpu_desc.data_type = kNumberTypeFloat16;
+  if (in_tensors.front()->data_type() == kNumberTypeInt8) gpu_desc.data_type = kNumberTypeInt8;
   auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter);
   if (kernel != nullptr) {
     MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " << node->name_;
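The scheduler change lets an int8 first input override the GPU kernel key, so the registry can find kernels registered under kNumberTypeInt8 (such as the ArithmeticInt8OpenCLKernel registered above). A sketch of the resulting selection order (the enum is a stand-in for MindSpore's TypeId):

// Stand-in for mindspore::TypeId; only the values used here.
enum TypeId { kNumberTypeFloat32, kNumberTypeFloat16, kNumberTypeInt8 };

// fp32 by default, fp16 when enabled, and an int8 input wins over both.
TypeId SelectGpuKernelDataType(bool fp16_enabled, TypeId first_input_type) {
  TypeId data_type = kNumberTypeFloat32;
  if (fp16_enabled) data_type = kNumberTypeFloat16;
  if (first_input_type == kNumberTypeInt8) data_type = kNumberTypeInt8;
  return data_type;
}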
@@ -171,6 +171,14 @@ int CpuSubGraph::Run(const KernelCallBack &before, const KernelCallBack &after)
     out_tensor->set_ref_count(out_tensor->ref_count() + 1);
   }
 #endif
+#ifdef SUPPORT_GPU
+  // In heterogeneous scenarios of CPU and GPU, call MutableData to MapBuffer(synchronize data).
+  if (context_->IsGpuEnabled()) {
+    for (auto tensor : this->in_tensors()) {
+      tensor->MutableData();
+    }
+  }
+#endif
 
   for (auto *kernel : nodes_) {
     MS_ASSERT(nullptr != kernel);
@@ -41,3 +41,4 @@ Q_new_detect.tflite
 Q_object_scene.tflite
 Q_pose.tflite
 matmul.pb
+add_uint8.tflite;2
@@ -27,3 +27,4 @@ ml_tts_encoder_control_flow.pb;4;1:1,22:1:1
 hiai_cv_labelDetectorModel_v3.tflite;2
 ml_tts_vocoder.pb;66
 ml_tacotron_decoder_step_stf.tflite;9;1,80:1,256:1,1024:1,1024:1,1024:1,1024:1,8:1,1,256:1
+add_uint8.tflite;2
@@ -1892,16 +1892,29 @@ function Run_gpu() {
 
     adb -s ${device_id} shell < adb_cmd.txt
 
-    # Run gpu tflite converted models:
+    # Run gpu fp32 converted models:
     while read line; do
-        model_name=${line}
+        model_name=${line%%;*}
        if [[ $model_name == \#* ]]; then
            continue
        fi
+        model_name=`echo ${line} | awk -F ';' '{print $1}'`
+        input_num=`echo ${line} | awk -F ';' '{print $2}'`
+        input_files=""
+        data_path="/data/local/tmp/input_output/"
+        output_file=${data_path}'output/'${model_name}'.ms.out'
+        if [[ ${input_num} == "" ]]; then
+            input_files=/data/local/tmp/input_output/input/${model_name}.ms.bin
+        else
+            for i in $(seq 1 $input_num)
+            do
+                input_files=$input_files${data_path}'input/'$model_name'.ms.bin_'$i','
+            done
+        fi
        echo ${model_name} >> "${run_gpu_log_file}"
        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out' >> "${run_gpu_log_file}"
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out' >> adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt
        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}"
        if [ $? = 0 ]; then
            run_result='arm64_gpu: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_result_file}