This commit is contained in:
lzk 2021-04-23 02:37:34 -07:00
parent 552800e03a
commit 8437320b45
4 changed files with 144 additions and 28 deletions

View File

@ -0,0 +1,55 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"
    .text
    .align 5
    .global Float16ToFloat32
#ifndef __APPLE__
    .type Float16ToFloat32, %function
#endif

// void Float16ToFloat32(const float16_t *input, float *output, int number);
//
// Widens number half-precision values read from input into single-precision
// values written to output. A count that is zero or negative converts nothing.
//
// ABI: AAPCS32, leaf function, no stack usage.
//   r0: input  (post-incremented, 2 bytes per element)
//   r1: output (post-incremented, 4 bytes per element)
//   r2: number (signed count of elements still to convert)
// Clobbers: r0-r2, q0-q3, q8, q9, condition flags.
// Only caller-saved NEON registers are used: d8-d15 (q4-q7) are callee-saved
// under AAPCS32 and must not be written without being saved first.
Float16ToFloat32:
    cmp r2, #0
    ble LoopEnd                     // nothing to do for number <= 0
    cmp r2, #16
    bge Loop16
    cmp r2, #8
    bge Loop8
    b Loop

// 16 elements per iteration; invariant on entry: r2 >= 16.
Loop16:
    vld1.16 {q0, q1}, [r0]!         // load 16 halves into d0-d3
    vcvt.f32.f16 q2, d0
    vcvt.f32.f16 q3, d1
    vcvt.f32.f16 q8, d2
    vst1.32 {q2, q3}, [r1]!         // first 8 floats
    vcvt.f32.f16 q9, d3
    subs r2, r2, #16                // NEON stores leave the flags untouched
    vst1.32 {q8, q9}, [r1]!         // last 8 floats
    beq LoopEnd
    cmp r2, #16
    bge Loop16
    cmp r2, #8
    bge Loop8
    b Loop

// 8 elements per iteration; invariant on entry: r2 >= 8.
Loop8:
    vld1.16 {q0}, [r0]!             // load 8 halves into d0-d1
    vcvt.f32.f16 q1, d0
    vcvt.f32.f16 q2, d1
    vst1.32 {q1, q2}, [r1]!
    subs r2, r2, #8
    beq LoopEnd
    cmp r2, #8
    bge Loop8
    b Loop

// Scalar tail, one element per iteration; invariant on entry: r2 >= 1.
// Single-lane NEON load/store is used so the tail needs no more than the
// half-precision conversion support the vector loops already require.
Loop:
    vld1.16 {d0[0]}, [r0]!          // half lands in the low 16 bits of s0
    vcvtb.f32.f16 s0, s0            // convert the bottom half of s0
    vst1.32 {d0[0]}, [r1]!          // d0[0] as a 32-bit lane is s0
    subs r2, r2, #1
    bgt Loop
LoopEnd:
    bx lr
#endif

View File

@ -0,0 +1,55 @@
#ifdef ENABLE_ARM32
#include "nnacl/assembly_global.h"
    .text
    .align 5
    .global Float32ToFloat16
#ifndef __APPLE__
    .type Float32ToFloat16, %function
#endif

// void Float32ToFloat16(const float *input, float16_t *output, int number);
//
// Narrows number single-precision values read from input into half-precision
// values written to output. A count that is zero or negative converts nothing.
//
// ABI: AAPCS32, leaf function, no stack usage.
//   r0: input  (post-incremented, 4 bytes per element)
//   r1: output (post-incremented, 2 bytes per element)
//   r2: number (signed count of elements still to convert)
// Clobbers: r0-r2, q0-q3, condition flags (all caller-saved under AAPCS32).
Float32ToFloat16:
    cmp r2, #0
    ble LoopEnd                     // nothing to do for number <= 0
    cmp r2, #16
    bge Loop16
    cmp r2, #8
    bge Loop8
    b Loop

// 16 elements per iteration; invariant on entry: r2 >= 16.
// Each narrowed result is packed into a d register whose previous contents
// have already been consumed, so d0-d3 end up holding all 16 halves.
Loop16:
    vld1.32 {q0, q1}, [r0]!         // floats 0-7
    vcvt.f16.f32 d0, q0             // halves 0-3
    vcvt.f16.f32 d1, q1             // halves 4-7 (q0 already converted)
    vld1.32 {q2, q3}, [r0]!         // floats 8-15
    vcvt.f16.f32 d2, q2             // halves 8-11 (q1 already converted)
    vcvt.f16.f32 d3, q3             // halves 12-15
    vst1.16 {q0, q1}, [r1]!
    subs r2, r2, #16
    beq LoopEnd
    cmp r2, #16
    bge Loop16
    cmp r2, #8
    bge Loop8
    b Loop

// 8 elements per iteration; invariant on entry: r2 >= 8.
Loop8:
    vld1.32 {q0, q1}, [r0]!
    vcvt.f16.f32 d0, q0
    vcvt.f16.f32 d1, q1
    vst1.16 {q0}, [r1]!
    subs r2, r2, #8
    beq LoopEnd
    cmp r2, #8
    bge Loop8
    b Loop

// Scalar tail, one element per iteration; invariant on entry: r2 >= 1.
// Single-lane NEON load/store is used so the tail needs no more than the
// half-precision conversion support the vector loops already require.
Loop:
    vld1.32 {d0[0]}, [r0]!          // d0[0] as a 32-bit lane is s0
    vcvtb.f16.f32 s0, s0            // result goes to the bottom half of s0
    vst1.16 {d0[0]}, [r1]!          // store only that bottom half
    subs r2, r2, #1
    bgt Loop
LoopEnd:
    bx lr
#endif

View File

@ -46,6 +46,7 @@ inline void Float16ToInt64(const float16_t *input, int64_t *output, int number)
} }
} }
#ifdef ENABLE_ARM64
inline void Float32ToFloat16(const float *input, float16_t *output, int number) { inline void Float32ToFloat16(const float *input, float16_t *output, int number) {
for (int i = 0; i < number; ++i) { for (int i = 0; i < number; ++i) {
output[i] = (float16_t)input[i]; output[i] = (float16_t)input[i];
@ -57,6 +58,11 @@ inline void Float16ToFloat32(const float16_t *input, float *output, int number)
output[i] = (float)input[i]; output[i] = (float)input[i];
} }
} }
#else
void Float32ToFloat16(const float *input, float16_t *output, int number);
void Float16ToFloat32(const float16_t *input, float *output, int number);
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -44,49 +44,49 @@ void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel,
#ifdef ENABLE_ARM82_A32
// Accumulates eight rows of four fp16 values from src_ptr into dst_ptr:
// for each of the 8 rows, dst_row = src_row + dst_row (vadd.f16 on one d register).
//
// src_step / dst_step are added directly to the address registers after every
// row access, so they are byte strides between consecutive rows.
//
// r0 walks the source rows, r1 walks the destination rows being read and r2
// walks the destination rows being written; every row is loaded through r1
// before it is stored through r2. The scratch registers are r0-r2, listed in
// the clobber list so the compiler keeps the operands out of them. q4-q7 are
// also listed there because the code writes d8-d14.
//
// The "memory" clobber is required: the asm reads and writes the buffers
// behind src_ptr and dst_ptr, which the operand list alone does not convey,
// so without it the compiler may cache or reorder accesses to those buffers
// across this statement.
void DeconvWgMergeFp16A32Fun(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_step, size_t dst_step) {
  asm volatile(
    "mov r0, %[src_ptr]\n"
    "mov r1, %[dst_ptr]\n"
    "mov r2, r1\n"

    // rows 0-1
    "vld1.16 {d0}, [r0], %[src_step]\n"
    "vld1.16 {d2}, [r1], %[dst_step]\n"
    "vld1.16 {d4}, [r0], %[src_step]\n"
    "vld1.16 {d6}, [r1], %[dst_step]\n"
    "vadd.f16 d0, d0, d2\n"
    "vld1.16 {d8}, [r0], %[src_step]\n"
    "vadd.f16 d4, d4, d6\n"
    "vst1.16 {d0}, [r2], %[dst_step]\n"
    "vst1.16 {d4}, [r2], %[dst_step]\n"

    // rows 2-3 (row 2 source was prefetched into d8 above)
    "vld1.16 {d10}, [r1], %[dst_step]\n"
    "vld1.16 {d12}, [r0], %[src_step]\n"
    "vadd.f16 d8, d8, d10\n"
    "vld1.16 {d14}, [r1], %[dst_step]\n"
    "vadd.f16 d12, d12, d14\n"
    "vld1.16 {d0}, [r0], %[src_step]\n"
    "vst1.16 {d8}, [r2], %[dst_step]\n"
    "vst1.16 {d12}, [r2], %[dst_step]\n"

    // rows 4-5 (row 4 source was prefetched into d0 above)
    "vld1.16 {d2}, [r1], %[dst_step]\n"
    "vld1.16 {d4}, [r0], %[src_step]\n"
    "vld1.16 {d6}, [r1], %[dst_step]\n"
    "vadd.f16 d0, d0, d2\n"
    "vadd.f16 d4, d4, d6\n"
    "vst1.16 {d0}, [r2], %[dst_step]\n"
    "vst1.16 {d4}, [r2], %[dst_step]\n"

    // rows 6-7
    "vld1.16 {d8}, [r0], %[src_step]\n"
    "vld1.16 {d10}, [r1], %[dst_step]\n"
    "vld1.16 {d12}, [r0], %[src_step]\n"
    "vld1.16 {d14}, [r1], %[dst_step]\n"
    "vadd.f16 d8, d8, d10\n"
    "vadd.f16 d12, d12, d14\n"
    "vst1.16 {d8}, [r2], %[dst_step]\n"
    "vst1.16 {d12}, [r2], %[dst_step]\n"
    :
    : [ src_ptr ] "r"(src_ptr), [ dst_ptr ] "r"(dst_ptr), [ src_step ] "r"(src_step), [ dst_step ] "r"(dst_step)
    : "r0", "r1", "r2", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory");
}
#endif