!5680 add gpu benchmark

Merge pull request !5680 from wandongdong/master
mindspore-ci-bot 2020-09-03 21:51:26 +08:00 committed by Gitee
commit 0aaaf29158
9 changed files with 61 additions and 13 deletions

View File

@@ -27,11 +27,6 @@
 #include "src/ir/tensor.h"
 #include "include/errorcode.h"
-#ifdef ENABLE_FP16
-using FLOAT_t = float16_t;
-#else
-using FLOAT_t = float;
-#endif
 // using mindspore::kernel::AddressPtr;
 namespace mindspore::kernel {
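
Note: the deleted block is the compile-time FLOAT_t alias toggled by ENABLE_FP16. The rest of this PR handles fp16 at runtime instead, sizing buffers per kernel from an enable_fp16_ flag, so the alias appears to be dead code. A minimal sketch of that replacement pattern (names are illustrative, not the project's):

    #include <cstddef>
    #include <cstdint>

    // Runtime element-size selection replacing the compile-time FLOAT_t alias;
    // int16_t acts as a 2-byte stand-in when float16_t is unavailable on the host.
    size_t ElementSize(bool enable_fp16) {
      return enable_fp16 ? sizeof(int16_t) : sizeof(float);
    }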

View File

@@ -55,10 +55,20 @@ __kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *d
 result[1] = (FLT4)(0.0f);
 result[2] = (FLT4)(0.0f);
 result[3] = (FLT4)(0.0f);
-FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
-FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
-FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
-FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
+bool over_size = W * C.y > 65535;
+FLT4 x0, x1, x2, x3;
+if (over_size) {
+x0 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X));
+x1 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 1));
+x2 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 2));
+x3 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 3));
+} else {
+x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
+x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
+x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
+x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
+}
 result[0].x = x0.x;
 result[0].y = x1.x;
 result[0].z = x2.x;
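
Note: the new over_size branch guards the x coordinate against a 2D image width limit; 65535 is hard-coded for the target GPUs, and when W * C.y exceeds it the data is evidently laid out along the other image axis, hence the swapped read coordinates. The real bound can be queried per device; a sketch (assumes an already-selected cl_device_id, which is not shown in this diff):

    #include <CL/cl.h>
    #include <cstdio>

    // Query the device's actual 2D image width limit instead of assuming 65535.
    void PrintImage2dMaxWidth(cl_device_id device) {
      size_t max_width = 0;
      clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH,
                      sizeof(max_width), &max_width, nullptr);
      printf("CL_DEVICE_IMAGE2D_MAX_WIDTH = %zu\n", max_width);
    }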

View File

@@ -44,7 +44,7 @@ void ActivationOpenClKernel::InitBuffer() {
 alpha_buff_ = allocator->MapBuffer(alpha_buff_, CL_MAP_WRITE, nullptr, true);
 memset(alpha_buff_, 0x00, fp_size);
 if (enable_fp16_) {
-auto fp16 = (float16_t)alpha_;
+auto fp16 = (int16_t)alpha_;
 memcpy(alpha_buff_, &fp16, fp_size);
 } else {
 memcpy(alpha_buff_, &alpha_, fp_size);
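
Note: this edit changes semantics, not just a type name. (float16_t)alpha_ converted the value to half precision, whereas (int16_t)alpha_ truncates the float to an integer, so the two bytes copied into alpha_buff_ are no longer a half-float encoding of alpha_ (for alpha_ = 0.25 they would encode 0). The sizes still agree, which is why the copy itself is safe. If the goal is to drop the float16_t dependency while keeping the value, a manual encoding works; a sketch handling normals and zeros only (FloatToHalfBits is illustrative, not part of the codebase):

    #include <cstdint>
    #include <cstring>

    // Encode a float as IEEE 754 binary16 bits without float16_t support.
    // Denormals flush to zero; NaN payloads and rounding ties are ignored.
    uint16_t FloatToHalfBits(float f) {
      uint32_t x;
      std::memcpy(&x, &f, sizeof(x));
      uint16_t sign = (x >> 16) & 0x8000;
      int32_t exp = ((x >> 23) & 0xFF) - 127 + 15;  // rebias 8-bit exponent to 5-bit
      uint32_t mant = x & 0x7FFFFF;
      if (exp >= 31) return sign | 0x7C00;  // overflow (and inf/NaN) -> infinity
      if (exp <= 0) return sign;            // underflow -> signed zero
      return sign | (uint16_t)(exp << 10) | (uint16_t)(mant >> 13);
    }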

View File

@@ -72,7 +72,7 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
 int div_ci = UP_DIV(ci, C4NUM);
 int div_co = UP_DIV(co, C4NUM);
 auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
-auto data_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
+auto data_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);
 // IHWO to OHWI4(I)4(O)(converter format is IHWO)
 // init padWeight_(buffer mem)
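
Note: unlike the activation change above, this swap (and the matching ones in the matmul kernel and the test below) only feeds a sizeof, and sizeof(int16_t) == sizeof(float16_t) == 2, so buffer sizes and layouts are unchanged; it just removes the float16_t dependency from builds without fp16 support. The assumption is checkable at compile time:

    #include <cstdint>

    // int16_t is exactly 16 bits, so it sizes fp16 buffers correctly.
    static_assert(sizeof(int16_t) == 2, "fp16 stand-in must be 2 bytes");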

View File

@@ -75,7 +75,7 @@ int MatMulOpenCLKernel::ReSize() { return RET_OK; }
 void MatMulOpenCLKernel::PadWeight() {
 auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
-size_t dtype_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
+size_t dtype_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);
 padWeight_ = allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size);
 padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
 memset(padWeight_, 0x00, sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size);

View File

@@ -27,6 +27,7 @@ using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Reshape;
+using mindspore::schema::PrimitiveType_Squeeze;
 namespace mindspore::kernel {
@@ -142,4 +143,6 @@ kernel::LiteKernel *OpenCLReshapeKernelCreator(const std::vector<lite::tensor::T
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reshape, OpenCLReshapeKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reshape, OpenCLReshapeKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Squeeze, OpenCLReshapeKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Squeeze, OpenCLReshapeKernelCreator)
 } // namespace mindspore::kernel
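
Note: Squeeze is registered against the existing Reshape kernel creator, presumably because squeezing only drops size-1 dimensions and leaves the element order in memory untouched, so the reshape kernel can serve both ops. A sketch of the shape-level equivalence (SqueezeShape is illustrative, not the project's code):

    #include <vector>

    // Squeeze as a pure shape transform: same elements, same order.
    std::vector<int> SqueezeShape(const std::vector<int> &shape) {
      std::vector<int> out;
      for (int d : shape) {
        if (d != 1) out.push_back(d);  // drop size-1 dimensions
      }
      return out.empty() ? std::vector<int>{1} : out;  // all-ones shape collapses to scalar-like
    }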

View File

@@ -0,0 +1,3 @@
+mobilenet_v1_1.0_224.tflite
+mobilenet_v2_1.0_224.tflite
+resnet.tflite

View File

@@ -310,6 +310,42 @@ function Run_arm64() {
 fi
 #sleep 1
 done < ${models_tflite_awaretraining_config}
+# Run gpu tflite converted models:
+while read line; do
+model_name=${line}
+if [[ $model_name == \#* ]]; then
+continue
+fi
+echo ${model_name} >> "${run_benchmark_log_file}"
+echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
+echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> adb_run_cmd.txt
+adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+if [ $? = 0 ]; then
+run_result='arm64_gpu: '${model_name}' pass'
+echo ${run_result} >> ${run_benchmark_result_file}
+else
+run_result='arm64_gpu: '${model_name}' failed'
+echo ${run_result} >> ${run_benchmark_result_file}
+return 1
+fi
+# run benchmark test without calib data
+#echo ${model_name}
+echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> "${run_benchmark_log_file}"
+echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> adb_run_cmd.txt
+adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+if [ $? = 0 ]; then
+run_result='arm64_gpu: '${model_name}' pass'
+echo ${run_result} >> ${run_benchmark_result_file}
+else
+run_result='arm64_gpu: '${model_name}' failed'
+echo ${run_result} >> ${run_benchmark_result_file}
+return 1
+fi
+#sleep 1
+done < ${models_tflite_gpu_config}
 }
 # Print start msg before run testcase
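
Note: each model listed in models_tflite_gpu.cfg is benchmarked twice on the device: first with --inDataPath and --calibDataPath so the outputs can be checked against the expected results, then without calibration data as a plain timing run (warmUpLoopCount=1, loopCount=2); a nonzero adb exit status marks the model as failed and aborts Run_arm64 via return 1.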
@@ -397,6 +433,7 @@ models_tflite_posttraining_config=${basepath}/models_tflite_posttraining.cfg
 models_onnx_config=${basepath}/models_onnx.cfg
 models_fp16_config=${basepath}/models_fp16.cfg
 models_mindspore_config=${basepath}/models_mindspore.cfg
+models_tflite_gpu_config=${basepath}/models_tflite_gpu.cfg
 Convert_status=0
 rm -rf ${basepath}/ms_models

View File

@@ -36,7 +36,7 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
 size_t dtype_size = sizeof(float);
 if (enable_fp16) {
 ocl_runtime->SetFp16Enable(true);
-dtype_size = sizeof(float16_t);
+dtype_size = sizeof(int16_t);
 }
 auto allocator = ocl_runtime->GetAllocator();
 int ci = shape[0];