diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/Float16Tofloat32.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/Float16Tofloat32.S
new file mode 100644
index 00000000000..243476ba939
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/Float16Tofloat32.S
@@ -0,0 +1,55 @@
+#ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
+    .text
+    .align 5
+    .global Float16ToFloat32
+#ifndef __APPLE__
+    .type Float16ToFloat32, %function
+#endif
+
+// void Float16ToFloat32(const float16_t *input, float *output, int number);
+// r0: input, r1: output, r2: number
+Float16ToFloat32:
+    cmp r2, #0
+    beq LoopEnd
+    cmp r2, #16
+    bge Loop16
+    cmp r2, #8
+    bge Loop8
+    b Loop
+    Loop16:
+        vld1.16 {q0, q1}, [r0]!
+        vcvt.f32.f16 q3, d0
+        vcvt.f32.f16 q4, d1
+        vcvt.f32.f16 q5, d2
+        vst1.32 {q3, q4}, [r1]!
+        vcvt.f32.f16 q6, d3
+        subs r2, r2, #16
+        vst1.32 {q5, q6}, [r1]!
+        beq LoopEnd
+        cmp r2, #16
+        bge Loop16
+        cmp r2, #8
+        bge Loop8
+        b Loop
+    Loop8:
+        vld1.16 {q0}, [r0]!
+        vcvt.f32.f16 q1, d0
+        vcvt.f32.f16 q2, d1
+        vst1.32 {q1, q2}, [r1]!
+        subs r2, r2, #8
+        beq LoopEnd
+        cmp r2, #8
+        bge Loop8
+        b Loop
+    Loop:
+        vldr.16 s0, [r0]
+        vcvtb.f32.f16 s0, s0
+        vstr.32 s0, [r1]
+        add r0, r0, #2
+        add r1, r1, #4
+        subs r2, r2, #1
+        bgt Loop
+    LoopEnd:
+        mov pc, lr
+#endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/Float32ToFloat16.S b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/Float32ToFloat16.S
new file mode 100644
index 00000000000..7f1ef037e41
--- /dev/null
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/arm82_aarch32_fp16/Float32ToFloat16.S
@@ -0,0 +1,55 @@
+#ifdef ENABLE_ARM32
+#include "nnacl/assembly_global.h"
+    .text
+    .align 5
+    .global Float32ToFloat16
+#ifndef __APPLE__
+    .type Float32ToFloat16, %function
+#endif
+
+// void Float32ToFloat16(const float *input, float16_t *output, int number);
+// r0: input, r1: output, r2: number
+Float32ToFloat16:
+    cmp r2, #0
+    beq LoopEnd
+    cmp r2, #16
+    bge Loop16
+    cmp r2, #8
+    bge Loop8
+    b Loop
+    Loop16:
+        vld1.32 {q0, q1}, [r0]!
+        vcvt.f16.f32 d0, q0
+        vcvt.f16.f32 d1, q1
+        vld1.32 {q2, q3}, [r0]!
+        vcvt.f16.f32 d2, q2
+        vcvt.f16.f32 d3, q3
+        vst1.16 {q0, q1}, [r1]!
+        subs r2, r2, #16
+        beq LoopEnd
+        cmp r2, #16
+        bge Loop16
+        cmp r2, #8
+        bge Loop8
+        b Loop
+    Loop8:
+        vld1.32 {q0, q1}, [r0]!
+        vcvt.f16.f32 d0, q0
+        vcvt.f16.f32 d1, q1
+        vst1.16 {q0}, [r1]!
+        subs r2, r2, #8
+        beq LoopEnd
+        cmp r2, #8
+        bge Loop8
+        b Loop
+    Loop:
+        vldr s0, [r0]
+        vcvtb.f16.f32 s0, s0
+        vstr.16 s0, [r1]
+        add r0, r0, #4
+        add r1, r1, #2
+        subs r2, r2, #1
+        bgt Loop
+    LoopEnd:
+        mov pc, lr
+#endif
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/cast_fp16.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/cast_fp16.h
index da818ed4f8a..62608d002bb 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/cast_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/cast_fp16.h
@@ -46,6 +46,7 @@ inline void Float16ToInt64(const float16_t *input, int64_t *output, int number)
   }
 }
 
+#ifdef ENABLE_ARM64
 inline void Float32ToFloat16(const float *input, float16_t *output, int number) {
   for (int i = 0; i < number; ++i) {
     output[i] = (float16_t)input[i];
@@ -57,6 +58,11 @@ inline void Float16ToFloat32(const float16_t *input, float *output, int number)
     output[i] = (float)input[i];
   }
 }
+#else
+void Float32ToFloat16(const float *input, float16_t *output, int number);
+
+void Float16ToFloat32(const float16_t *input, float *output, int number);
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c
index c1a97e0c954..3f327b6fe39 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/deconv_winograd_fp16.c
@@ -44,49 +44,49 @@ void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel,
 #ifdef ENABLE_ARM82_A32
 void DeconvWgMergeFp16A32Fun(const float16_t *src_ptr, float16_t *dst_ptr, size_t src_step, size_t dst_step) {
   asm volatile(
-    "mov r7, %[src_ptr]\n"
-    "mov r8, %[dst_ptr]\n"
-    "mov r10, r8\n"
+    "mov r0, %[src_ptr]\n"
+    "mov r1, %[dst_ptr]\n"
+    "mov r2, r1\n"
 
-    "vld1.16 {d0}, [r7], %[src_step]\n"
-    "vld1.16 {d2}, [r8], %[dst_step]\n"
-    "vld1.16 {d4}, [r7], %[src_step]\n"
-    "vld1.16 {d6}, [r8], %[dst_step]\n"
+    "vld1.16 {d0}, [r0], %[src_step]\n"
+    "vld1.16 {d2}, [r1], %[dst_step]\n"
+    "vld1.16 {d4}, [r0], %[src_step]\n"
+    "vld1.16 {d6}, [r1], %[dst_step]\n"
     "vadd.f16 d0, d0, d2\n"
-    "vld1.16 {d8}, [r7], %[src_step]\n"
+    "vld1.16 {d8}, [r0], %[src_step]\n"
     "vadd.f16 d4, d4, d6\n"
 
-    "vst1.16 {d0}, [r10], %[dst_step]\n"
-    "vst1.16 {d4}, [r10], %[dst_step]\n"
+    "vst1.16 {d0}, [r2], %[dst_step]\n"
+    "vst1.16 {d4}, [r2], %[dst_step]\n"
 
-    "vld1.16 {d10}, [r8], %[dst_step]\n"
-    "vld1.16 {d12}, [r7], %[src_step]\n"
+    "vld1.16 {d10}, [r1], %[dst_step]\n"
+    "vld1.16 {d12}, [r0], %[src_step]\n"
     "vadd.f16 d8, d8, d10\n"
-    "vld1.16 {d14}, [r8], %[dst_step]\n"
+    "vld1.16 {d14}, [r1], %[dst_step]\n"
     "vadd.f16 d12, d12, d14\n"
 
-    "vld1.16 {d0}, [r7], %[src_step]\n"
-    "vst1.16 {d8}, [r10], %[dst_step]\n"
-    "vst1.16 {d12}, [r10], %[dst_step]\n"
+    "vld1.16 {d0}, [r0], %[src_step]\n"
+    "vst1.16 {d8}, [r2], %[dst_step]\n"
+    "vst1.16 {d12}, [r2], %[dst_step]\n"
 
-    "vld1.16 {d2}, [r8], %[dst_step]\n"
-    "vld1.16 {d4}, [r7], %[src_step]\n"
-    "vld1.16 {d6}, [r8], %[dst_step]\n"
+    "vld1.16 {d2}, [r1], %[dst_step]\n"
+    "vld1.16 {d4}, [r0], %[src_step]\n"
+    "vld1.16 {d6}, [r1], %[dst_step]\n"
     "vadd.f16 d0, d0, d2\n"
     "vadd.f16 d4, d4, d6\n"
 
-    "vst1.16 {d0}, [r10], %[dst_step]\n"
-    "vst1.16 {d4}, [r10], %[dst_step]\n"
+    "vst1.16 {d0}, [r2], %[dst_step]\n"
+    "vst1.16 {d4}, [r2], %[dst_step]\n"
-    "vld1.16 {d8}, [r7], %[src_step]\n"
-    "vld1.16 {d10}, [r8], %[dst_step]\n"
-    "vld1.16 {d12}, [r7], %[src_step]\n"
-    "vld1.16 {d14}, [r8], %[dst_step]\n"
+    "vld1.16 {d8}, [r0], %[src_step]\n"
+    "vld1.16 {d10}, [r1], %[dst_step]\n"
+    "vld1.16 {d12}, [r0], %[src_step]\n"
+    "vld1.16 {d14}, [r1], %[dst_step]\n"
     "vadd.f16 d8, d8, d10\n"
     "vadd.f16 d12, d12, d14\n"
-    "vst1.16 {d8}, [r10], %[dst_step]\n"
-    "vst1.16 {d12}, [r10], %[dst_step]\n"
+    "vst1.16 {d8}, [r2], %[dst_step]\n"
+    "vst1.16 {d12}, [r2], %[dst_step]\n"
     :
     : [ src_ptr ] "r"(src_ptr), [ dst_ptr ] "r"(dst_ptr), [ src_step ] "r"(src_step), [ dst_step ] "r"(dst_step)
-    : "r7", "r8", "r10", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+    : "r0", "r1", "r2", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
 }
 #endif
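For reference, the new aarch32 routines stand in for the scalar loops that cast_fp16.h keeps for the ENABLE_ARM64 build, so callers only see the two prototypes declared above. Below is a minimal, hypothetical test harness (not part of this patch; the count, values, and printout are illustrative). It assumes an aarch32 toolchain with ARMv8.2-A FP16 support (e.g. -march=armv8.2-a+fp16) and ENABLE_ARM32 defined, and uses 27 elements so the 16-wide, 8-wide, and scalar tails of each routine are all exercised:

#include <arm_neon.h>  // provides float16_t on ARM toolchains with FP16 support
#include <stdio.h>

// Prototypes as declared in nnacl/fp16/cast_fp16.h for the non-ARM64 build.
void Float32ToFloat16(const float *input, float16_t *output, int number);
void Float16ToFloat32(const float16_t *input, float *output, int number);

int main(void) {
  enum { kCount = 27 };  // 16 + 8 + 3: hits Loop16, Loop8, and the scalar Loop
  float src[kCount], back[kCount];
  float16_t half[kCount];
  for (int i = 0; i < kCount; ++i) {
    src[i] = 0.5f * (float)i;  // values exactly representable in fp16
  }
  Float32ToFloat16(src, half, kCount);
  Float16ToFloat32(half, back, kCount);
  for (int i = 0; i < kCount; ++i) {
    printf("%g -> %g\n", src[i], back[i]);  // round trip should be lossless for these values
  }
  return 0;
}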