!5680 add gpu benchmark
Merge pull request !5680 from wandongdong/master
This commit is contained in:
commit
0aaaf29158
|
@ -27,11 +27,6 @@
|
|||
#include "src/ir/tensor.h"
|
||||
#include "include/errorcode.h"
|
||||
|
||||
#ifdef ENABLE_FP16
|
||||
using FLOAT_t = float16_t;
|
||||
#else
|
||||
using FLOAT_t = float;
|
||||
#endif
|
||||
|
||||
// using mindspore::kernel::AddressPtr;
|
||||
namespace mindspore::kernel {
|
||||
|
|
|
@ -55,10 +55,20 @@ __kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *d
|
|||
result[1] = (FLT4)(0.0f);
|
||||
result[2] = (FLT4)(0.0f);
|
||||
result[3] = (FLT4)(0.0f);
|
||||
FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
|
||||
FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
|
||||
FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
|
||||
FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
|
||||
bool over_size = W * C.y > 65535;
|
||||
FLT4 x0, x1, x2, x3;
|
||||
if (over_size) {
|
||||
x0 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X));
|
||||
x1 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 1));
|
||||
x2 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 2));
|
||||
x3 = READ_IMAGE(src_data, smp_zero, (int2)(C, 4 * X + 3));
|
||||
} else {
|
||||
x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W));
|
||||
x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W));
|
||||
x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W));
|
||||
x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W));
|
||||
}
|
||||
|
||||
result[0].x = x0.x;
|
||||
result[0].y = x1.x;
|
||||
result[0].z = x2.x;
|
||||
|
|
|
@ -44,7 +44,7 @@ void ActivationOpenClKernel::InitBuffer() {
|
|||
alpha_buff_ = allocator->MapBuffer(alpha_buff_, CL_MAP_WRITE, nullptr, true);
|
||||
memset(alpha_buff_, 0x00, fp_size);
|
||||
if (enable_fp16_) {
|
||||
auto fp16 = (float16_t)alpha_;
|
||||
auto fp16 = (int16_t)alpha_;
|
||||
memcpy(alpha_buff_, &fp16, fp_size);
|
||||
} else {
|
||||
memcpy(alpha_buff_, &alpha_, fp_size);
|
||||
|
|
|
@ -72,7 +72,7 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
|
|||
int div_ci = UP_DIV(ci, C4NUM);
|
||||
int div_co = UP_DIV(co, C4NUM);
|
||||
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
|
||||
auto data_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
|
||||
auto data_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);
|
||||
|
||||
// IHWO to OHWI4(I)4(O)(converter format is IHWO)
|
||||
// init padWeight_(buffer mem)
|
||||
|
|
|
@ -75,7 +75,7 @@ int MatMulOpenCLKernel::ReSize() { return RET_OK; }
|
|||
void MatMulOpenCLKernel::PadWeight() {
|
||||
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
|
||||
|
||||
size_t dtype_size = enable_fp16_ ? sizeof(float16_t) : sizeof(float);
|
||||
size_t dtype_size = enable_fp16_ ? sizeof(int16_t) : sizeof(float);
|
||||
padWeight_ = allocator->Malloc(sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size);
|
||||
padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
|
||||
memset(padWeight_, 0x00, sizeCI.s[1] * sizeCO.s[1] * C4NUM * C4NUM * dtype_size);
|
||||
|
|
|
@ -27,6 +27,7 @@ using mindspore::lite::KernelRegistrar;
|
|||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
using mindspore::schema::PrimitiveType_Reshape;
|
||||
using mindspore::schema::PrimitiveType_Squeeze;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
|
||||
|
@ -142,4 +143,6 @@ kernel::LiteKernel *OpenCLReshapeKernelCreator(const std::vector<lite::tensor::T
|
|||
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reshape, OpenCLReshapeKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reshape, OpenCLReshapeKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Squeeze, OpenCLReshapeKernelCreator)
|
||||
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Squeeze, OpenCLReshapeKernelCreator)
|
||||
} // namespace mindspore::kernel
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
mobilenet_v1_1.0_224.tflite
|
||||
mobilenet_v2_1.0_224.tflite
|
||||
resnet.tflite
|
|
@ -310,6 +310,42 @@ function Run_arm64() {
|
|||
fi
|
||||
#sleep 1
|
||||
done < ${models_tflite_awaretraining_config}
|
||||
|
||||
# Run gpu tflite converted models:
|
||||
while read line; do
|
||||
model_name=${line}
|
||||
if [[ $model_name == \#* ]]; then
|
||||
continue
|
||||
fi
|
||||
echo ${model_name} >> "${run_benchmark_log_file}"
|
||||
echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
|
||||
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> "${run_benchmark_log_file}"
|
||||
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1' >> adb_run_cmd.txt
|
||||
adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
|
||||
if [ $? = 0 ]; then
|
||||
run_result='arm64_gpu: '${model_name}' pass'
|
||||
echo ${run_result} >> ${run_benchmark_result_file}
|
||||
else
|
||||
run_result='arm64_gpu: '${model_name}' failed'
|
||||
echo ${run_result} >> ${run_benchmark_result_file}
|
||||
return 1
|
||||
fi
|
||||
# run benchmark test without calib data
|
||||
#echo ${model_name}
|
||||
echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
|
||||
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> "${run_benchmark_log_file}"
|
||||
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2' >> adb_run_cmd.txt
|
||||
adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
|
||||
if [ $? = 0 ]; then
|
||||
run_result='arm64_gpu: '${model_name}' pass'
|
||||
echo ${run_result} >> ${run_benchmark_result_file}
|
||||
else
|
||||
run_result='arm64_gpu: '${model_name}' failed'
|
||||
echo ${run_result} >> ${run_benchmark_result_file}
|
||||
return 1
|
||||
fi
|
||||
#sleep 1
|
||||
done < ${models_tflite_gpu_config}
|
||||
}
|
||||
|
||||
# Print start msg before run testcase
|
||||
|
@ -397,6 +433,7 @@ models_tflite_posttraining_config=${basepath}/models_tflite_posttraining.cfg
|
|||
models_onnx_config=${basepath}/models_onnx.cfg
|
||||
models_fp16_config=${basepath}/models_fp16.cfg
|
||||
models_mindspore_config=${basepath}/models_mindspore.cfg
|
||||
models_tflite_gpu_config=${basepath}/models_tflite_gpu.cfg
|
||||
Convert_status=0
|
||||
|
||||
rm -rf ${basepath}/ms_models
|
||||
|
|
|
@ -36,7 +36,7 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
|
|||
size_t dtype_size = sizeof(float);
|
||||
if (enable_fp16) {
|
||||
ocl_runtime->SetFp16Enable(true);
|
||||
dtype_size = sizeof(float16_t);
|
||||
dtype_size = sizeof(int16_t);
|
||||
}
|
||||
auto allocator = ocl_runtime->GetAllocator();
|
||||
int ci = shape[0];
|
||||
|
|
Loading…
Reference in New Issue