fix fp16 bug and add gpu fp16 model to ci

parent 15244de50a
commit 2c6cfce70e
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
+#include <cstring>
 #include <string>
 #include <algorithm>
 #include <set>
 #include "src/kernel_registry.h"
@@ -69,7 +69,7 @@ int ConvolutionOpenCLKernel::Init() {
   TILES_X_ = UP_DIV(OW_, 4);
   TILES_Y_ = UP_DIV(OH_, 4);
   TILES_XY_ = TILES_X_ * TILES_Y_;
-  use_winograd_ = UseWinograd4x4To6x6();
+  use_winograd_ = UseWinograd4x4To6x6() && use_fp16_;
 
   // build kernel
   if (use_winograd_) {
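The hunk above restricts the Winograd path to fp16 (UseWinograd4x4To6x6() is now only honored when use_fp16_ is set) and relies on UP_DIV for the tile counts. UP_DIV is the usual ceiling-division helper in the MindSpore Lite sources; a minimal standalone sketch of the tiling arithmetic, with hypothetical output sizes:

    #include <cstdio>

    // Ceiling division, as defined in the MindSpore Lite sources.
    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))

    int main() {
      int OW = 14, OH = 9;          // hypothetical output width/height
      int tiles_x = UP_DIV(OW, 4);  // 4: each F(4x4, 3x3) Winograd tile covers 4 output columns
      int tiles_y = UP_DIV(OH, 4);  // 3
      printf("tiles: %d x %d = %d\n", tiles_x, tiles_y, tiles_x * tiles_y);
      return 0;
    }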
@@ -134,7 +134,14 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() {
   size_t up_co_size = C4NUM * CO4 * dtype_size;
   memset(bias_data_, 0, up_co_size);
   auto ori_bias = in_tensors_.at(kBiasIndex)->MutableData();
-  memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * dtype_size);
+  if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat32) {
+    float16_t *bias_ptr = static_cast<float16_t*>(bias_data_);
+    for (size_t i = 0; i < in_tensors_.at(kBiasIndex)->ElementsNum(); ++i) {
+      bias_ptr[i] = static_cast<float16_t>(static_cast<float*>(ori_bias)[i]);
+    }
+  } else {
+    memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * dtype_size);
+  }
   allocator->UnmapBuffer(bias_data_);
 } else {
   MS_ASSERT(in_tensors_.size() == kInputSize1);
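This is the fp16 bug from the commit title: the old code memcpy'd the fp32 bias bytes straight into a buffer that the fp16 kernel reads as float16_t, which both reinterprets every value and copies the wrong number of bytes. The fix converts element by element. A minimal standalone sketch of the difference, assuming AArch64 fp16 support (float16_t as provided by <arm_fp16.h>):

    #include <cstdio>
    #include <cstring>
    #include <vector>
    #include <arm_fp16.h>  // provides float16_t on AArch64

    int main() {
      std::vector<float> src = {0.5f, -1.25f, 3.0f};
      std::vector<float16_t> dst(src.size());

      // Wrong: copies raw fp32 bytes into the fp16 buffer; every value is
      // reinterpreted garbage, and only half of the source bytes fit at all.
      // memcpy(dst.data(), src.data(), src.size() * sizeof(float16_t));

      // Right: convert element by element, as the hunk above now does for the bias.
      for (size_t i = 0; i < src.size(); ++i) {
        dst[i] = static_cast<float16_t>(src[i]);
      }
      printf("%f\n", static_cast<float>(dst[1]));  // -1.250000
      return 0;
    }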
@@ -56,6 +56,19 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors
   }
   for (size_t i = 0; i < in_tensors.size(); ++i) {
+    if (in_tensors.at(i)->shape().size() <= 1) {
+      if (mem_type == OpenCLMemType::IMG) {
+        for (auto &iv : in_kernels[i]) {
+          auto tensors = iv->in_tensors();
+          tensors.emplace_back(in_tensors.at(i));
+          iv->set_in_tensors(tensors);
+        }
+      } else {
+        for (auto &iv : in_kernels[i]) {
+          auto tensors = iv->out_tensors();
+          tensors.emplace_back(in_tensors.at(i));
+          iv->set_out_tensors(tensors);
+        }
+      }
+      continue;
+    }
     OpenCLKernel *cur_opencl_op = reinterpret_cast<OpenCLKernel *>(in_kernels[i][0]);
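The new branch skips inserting a layout-conversion op for tensors of rank <= 1 (scalars and flat vectors carry no NHWC/NC4HW4 layout to convert) and instead attaches the tensor directly to the consuming kernels' input lists (IMG path) or the producing kernels' output lists. Note the copy-modify-write-back pattern around set_in_tensors; a sketch against a hypothetical mock of the kernel interface:

    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for the kernel interface used in the hunk: the
    // getter returns the tensor list by value, so the list must be copied,
    // appended to, and written back through the setter.
    struct Tensor { int id; };
    struct MockKernel {
      std::vector<Tensor *> in_tensors_;
      std::vector<Tensor *> in_tensors() const { return in_tensors_; }
      void set_in_tensors(const std::vector<Tensor *> &t) { in_tensors_ = t; }
    };

    int main() {
      Tensor scalar{42};  // a rank-<=1 tensor that needs no layout conversion
      MockKernel kernel;
      auto tensors = kernel.in_tensors();  // copy
      tensors.emplace_back(&scalar);       // append the tensor directly
      kernel.set_in_tensors(tensors);      // write back
      printf("inputs: %zu\n", kernel.in_tensors().size());
      return 0;
    }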
@@ -0,0 +1,2 @@
+mobilenet_v1_1.0_224.tflite
+mobilenet_v2_1.0_224.tflite
@@ -1,6 +1,7 @@
 mobilenet_v1_1.0_224.tflite
+mobilenet_v2_1.0_224.tflite
 resnet.tflite
 squeezenet.tflite
 mtk_AADB_HADB_MBV2_model_fp32.tflite
 hiai_cn_recognize_modify_padv2.tflite
 hiai_cv_focusShootOCRModel_08.tflite
@@ -479,6 +479,42 @@ function Run_arm64() {
         fi
     done < ${models_tflite_gpu_config}
 
+    # Run GPU fp16 converted models:
+    while read line; do
+        model_name=${line}
+        if [[ $model_name == \#* ]]; then
+            continue
+        fi
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --fp16Priority=true --accuracyThreshold=5' >> "${run_benchmark_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --inDataPath=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --calibDataPath=/data/local/tmp/input_output/output/'${model_name}'.ms.out --warmUpLoopCount=1 --loopCount=1 --fp16Priority=true --accuracyThreshold=5' >> adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+        if [ $? = 0 ]; then
+            run_result='arm64_gpu_fp16: '${model_name}' pass'
+            echo ${run_result} >> ${run_benchmark_result_file}
+        else
+            run_result='arm64_gpu_fp16: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
+            return 1
+        fi
+        # run benchmark test without calib data
+        echo ${model_name} >> "${run_benchmark_log_file}"
+        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2 --fp16Priority=true --accuracyThreshold=5' >> "${run_benchmark_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelPath='${model_name}'.ms --warmUpLoopCount=1 --loopCount=2 --fp16Priority=true --accuracyThreshold=5' >> adb_run_cmd.txt
+        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_benchmark_log_file}"
+        if [ $? = 0 ]; then
+            run_result='arm64_gpu_fp16: '${model_name}' pass'
+            echo ${run_result} >> ${run_benchmark_result_file}
+        else
+            run_result='arm64_gpu_fp16: '${model_name}' failed'
+            echo ${run_result} >> ${run_benchmark_result_file}
+            return 1
+        fi
+        #sleep 1
+    done < ${models_fp16_gpu_config}
+
     # Run mindir converted models:
     while read line; do
         model_name=${line}
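The new CI loop exercises the fp16 path (--fp16Priority=true) and loosens the tolerance to --accuracyThreshold=5, so that fp16 rounding against the fp32 calibration outputs does not fail the run. As a rough illustration of what such a percentage threshold means (not the benchmark tool's actual implementation), a mean-relative-error check might look like:

    #include <cmath>
    #include <cstdio>

    // Sketch of a mean-relative-error check against calibration data with a
    // percentage threshold (here 5, matching --accuracyThreshold=5). This
    // illustrates the idea only; it is not the benchmark's real logic.
    bool CheckAccuracy(const float *out, const float *calib, int n, float threshold_percent) {
      double err_sum = 0.0;
      for (int i = 0; i < n; ++i) {
        double denom = std::fabs(calib[i]) > 1e-5 ? std::fabs(calib[i]) : 1.0;
        err_sum += std::fabs(out[i] - calib[i]) / denom;
      }
      double mean_err_percent = err_sum / n * 100.0;
      return mean_err_percent <= threshold_percent;
    }

    int main() {
      float out[] = {1.02f, 1.98f};
      float calib[] = {1.0f, 2.0f};
      printf("%s\n", CheckAccuracy(out, calib, 2, 5.0f) ? "pass" : "fail");  // pass
      return 0;
    }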
@@ -574,6 +610,7 @@ models_onnx_config=${basepath}/models_onnx.cfg
 models_fp16_config=${basepath}/models_fp16.cfg
 models_mindspore_config=${basepath}/models_mindspore.cfg
 models_tflite_gpu_config=${basepath}/models_tflite_gpu.cfg
+models_fp16_gpu_config=${basepath}/models_fp16_gpu.cfg
 
 ms_models_path=${basepath}/ms_models
 