forked from mindspore-Ecosystem/mindspore
!4552 add fast cast algorithm
Merge pull request !4552 from lixian/master
This commit is contained in:
commit
880ee3cd36
|
@ -0,0 +1,54 @@
|
|||
#ifdef __aarch64__

    .text
    .align 5
    .global Float16ToFloat32
#ifndef __APPLE__
    .type Float16ToFloat32, %function
#endif

// void Float16ToFloat32(const float16_t *input, float *output, int number);
//
// Widens `number` half-precision values read from `input` into single-precision
// values written to `output`. Nothing is read or written when number <= 0,
// matching the C loop `for (i = 0; i < number; ++i)`.
//
// In:      x0 = input  (advanced by 2 bytes per element)
//          x1 = output (advanced by 4 bytes per element)
//          w2 = number (32-bit signed element count)
// Out:     none
// Clobber: x0, x1, x2, v0-v7, v16-v31, condition flags
//
// Only caller-saved registers are used (v8-v15 and x19-x29 are never touched),
// so no register has to be spilled and the stack is not used.
Float16ToFloat32:
    // `number` is a C int: AAPCS64 leaves bits [63:32] of x2 unspecified,
    // so the count is only ever read and updated through w2.
    cmp w2, #0
    ble LoopEnd                 // nothing to do for an empty or negative count
    cmp w2, #64
    blt Loop                    // fewer than 64 elements: scalar path only

Loop64:
    // Invariant: w2 >= 64. Convert 64 elements per iteration:
    // 128 bytes of fp16 in, 256 bytes of fp32 out.
    ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
    // fcvtl widens the low four halfwords, fcvtl2 the high four.
    fcvtl v16.4s, v0.4h
    fcvtl2 v17.4s, v0.8h
    fcvtl v18.4s, v1.4h
    fcvtl2 v19.4s, v1.8h
    fcvtl v20.4s, v2.4h
    fcvtl2 v21.4s, v2.8h
    fcvtl v22.4s, v3.4h
    fcvtl2 v23.4s, v3.8h
    fcvtl v24.4s, v4.4h
    fcvtl2 v25.4s, v4.8h
    fcvtl v26.4s, v5.4h
    fcvtl2 v27.4s, v5.8h
    fcvtl v28.4s, v6.4h
    fcvtl2 v29.4s, v6.8h
    fcvtl v30.4s, v7.4h
    fcvtl2 v31.4s, v7.8h
    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64
    st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x1], #64
    st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x1], #64
    st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x1], #64
    subs w2, w2, #64
    ble LoopEnd                 // count was an exact multiple of 64
    cmp w2, #64
    bge Loop64                  // at least one more full 64-element batch

Loop:
    // Invariant: 1 <= w2 <= 63 on entry. One element per iteration.
    ldr h0, [x0], #2
    fcvt s0, h0
    str s0, [x1], #4
    subs w2, w2, #1
    bgt Loop

LoopEnd:
    ret
#endif
|
|
@ -0,0 +1,54 @@
|
|||
#ifdef __aarch64__

    .text
    .align 5
    .global Float32ToFloat16
#ifndef __APPLE__
    .type Float32ToFloat16, %function
#endif

// void Float32ToFloat16(const float *input, float16_t *output, int number);
//
// Narrows `number` single-precision values read from `input` into
// half-precision values written to `output`. Nothing is read or written when
// number <= 0, matching the C loop `for (i = 0; i < number; ++i)`.
//
// In:      x0 = input  (advanced by 4 bytes per element)
//          x1 = output (advanced by 2 bytes per element)
//          w2 = number (32-bit signed element count)
// Out:     none
// Clobber: x0, x1, x2, v0-v7, v16-v31, condition flags
//
// Only caller-saved registers are used (v8-v15 and x19-x29 are never touched),
// so no register has to be spilled and the stack is not used.
Float32ToFloat16:
    // `number` is a C int: AAPCS64 leaves bits [63:32] of x2 unspecified,
    // so the count is only ever read and updated through w2.
    cmp w2, #0
    ble LoopEnd                 // nothing to do for an empty or negative count
    cmp w2, #64
    blt Loop                    // fewer than 64 elements: scalar path only

Loop64:
    // Invariant: w2 >= 64. Convert 64 elements per iteration:
    // 256 bytes of fp32 in, 128 bytes of fp16 out.
    ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
    ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
    ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
    ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64
    // fcvtn fills the low four halfwords of the destination, fcvtn2 the high
    // four, so each pair packs eight results into one 128-bit register.
    fcvtn v0.4h, v16.4s
    fcvtn2 v0.8h, v17.4s
    fcvtn v1.4h, v18.4s
    fcvtn2 v1.8h, v19.4s
    fcvtn v2.4h, v20.4s
    fcvtn2 v2.8h, v21.4s
    fcvtn v3.4h, v22.4s
    fcvtn2 v3.8h, v23.4s
    fcvtn v4.4h, v24.4s
    fcvtn2 v4.8h, v25.4s
    fcvtn v5.4h, v26.4s
    fcvtn2 v5.8h, v27.4s
    fcvtn v6.4h, v28.4s
    fcvtn2 v6.8h, v29.4s
    fcvtn v7.4h, v30.4s
    fcvtn2 v7.8h, v31.4s
    st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
    st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
    subs w2, w2, #64
    ble LoopEnd                 // count was an exact multiple of 64
    cmp w2, #64
    bge Loop64                  // at least one more full 64-element batch

Loop:
    // Invariant: 1 <= w2 <= 63 on entry. One element per iteration.
    ldr s0, [x0], #4
    fcvt h0, s0
    str h0, [x1], #2
    subs w2, w2, #1
    bgt Loop

LoopEnd:
    ret
#endif
|
|
@ -14,7 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
#include "nnacl/fp16/cast_fp16.h"
|
||||
|
||||
#ifndef ENABLE_ARM64
|
||||
void Float32ToFloat16(const float *input, float16_t *output, int number) {
|
||||
for (int i = 0; i < number; ++i) {
|
||||
output[i] = (float16_t)input[i];
|
||||
|
@ -26,3 +26,4 @@ void Float16ToFloat32(const float16_t *input, float *output, int number) {
|
|||
output[i] = (float)input[i];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue