!4552 add fast cast algorithm

Merge pull request !4552 from lixian/master
2020-08-17 00:11:40 +08:00 · 2020-08-17 00:11:40 +08:00 · 880ee3cd36
parent eaeb3fe7ee 0cb2b89a4c
commit 880ee3cd36
3 changed files with 110 additions and 1 deletions
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float16ToFloat32.S
@ -0,0 +1,54 @@
+#ifdef __aarch64__
+
+.text
+.align 5
+.global Float16ToFloat32
+#ifndef __APPLE__
+.type Float16ToFloat32, %function
+#endif
+
+// void Float16ToFloat32(const float16_t *input, float *output, int number);
+// x0: input, x1: output, x2: number
+Float16ToFloat32:
+    // Widens `number` fp16 values read from x0 into fp32 values stored at x1; a count <= 0 converts nothing.
+    // Touches only v0-v7/v16-v31 (caller-saved in AAPCS64) and w2: the upper half of x2 is unspecified for an int.
+    cmp w2, #0
+    ble LoopEnd                                     // guard: the scalar loop below always runs at least once
+    cmp w2, #64
+    blt Loop                                        // fewer than 64 elements: scalar path only
+    Loop64:                                         // 64 elements (128 input bytes, 256 output bytes) per pass
+        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        fcvtl v16.4s, v0.4h                         // fcvtl widens the low four halfwords of the source,
+        fcvtl2 v17.4s, v0.8h                        // fcvtl2 widens the high four
+        fcvtl v18.4s, v1.4h
+        fcvtl2 v19.4s, v1.8h
+        fcvtl v20.4s, v2.4h
+        fcvtl2 v21.4s, v2.8h
+        fcvtl v22.4s, v3.4h
+        fcvtl2 v23.4s, v3.8h
+        fcvtl v24.4s, v4.4h
+        fcvtl2 v25.4s, v4.8h
+        fcvtl v26.4s, v5.4h
+        fcvtl2 v27.4s, v5.8h
+        fcvtl v28.4s, v6.4h
+        fcvtl2 v29.4s, v6.8h
+        fcvtl v30.4s, v7.4h
+        fcvtl2 v31.4s, v7.8h
+        st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64
+        st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x1], #64
+        st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x1], #64
+        st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x1], #64
+        subs w2, w2, #64
+        ble LoopEnd                                 // count was an exact multiple of 64: done
+        cmp w2, #64
+        bge Loop64                                  // otherwise fall through with 1..63 elements left
+    Loop:                                           // scalar tail: one element per pass
+        ldr h0, [x0], #2
+        fcvt s0, h0
+        str s0, [x1], #4
+        subs w2, w2, #1
+        bgt Loop
+    LoopEnd:
+        ret
+#endif
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/assembly/opt/Float32ToFloat16.S
@ -0,0 +1,54 @@
+#ifdef __aarch64__
+
+.text
+.align 5
+.global Float32ToFloat16
+#ifndef __APPLE__
+.type Float32ToFloat16, %function
+#endif
+
+// void Float32ToFloat16(const float *input, float16_t *output, int number);
+// x0: input, x1: output, x2: number
+Float32ToFloat16:
+    // Narrows `number` fp32 values read from x0 into fp16 values stored at x1; a count <= 0 converts nothing.
+    // Touches only v0-v7/v16-v31 (caller-saved in AAPCS64) and w2: the upper half of x2 is unspecified for an int.
+    cmp w2, #0
+    ble LoopEnd                                     // guard: the scalar loop below always runs at least once
+    cmp w2, #64
+    blt Loop                                        // fewer than 64 elements: scalar path only
+    Loop64:                                         // 64 elements (256 input bytes, 128 output bytes) per pass
+        ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
+        ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
+        ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
+        ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64
+        fcvtn v0.4h, v16.4s                         // fcvtn fills the low four halfwords of the destination,
+        fcvtn2 v0.8h, v17.4s                        // fcvtn2 fills the high four and keeps the low half
+        fcvtn v1.4h, v18.4s
+        fcvtn2 v1.8h, v19.4s
+        fcvtn v2.4h, v20.4s
+        fcvtn2 v2.8h, v21.4s
+        fcvtn v3.4h, v22.4s
+        fcvtn2 v3.8h, v23.4s
+        fcvtn v4.4h, v24.4s
+        fcvtn2 v4.8h, v25.4s
+        fcvtn v5.4h, v26.4s
+        fcvtn2 v5.8h, v27.4s
+        fcvtn v6.4h, v28.4s
+        fcvtn2 v6.8h, v29.4s
+        fcvtn v7.4h, v30.4s
+        fcvtn2 v7.8h, v31.4s
+        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+        st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
+        subs w2, w2, #64
+        ble LoopEnd                                 // count was an exact multiple of 64: done
+        cmp w2, #64
+        bge Loop64                                  // otherwise fall through with 1..63 elements left
+    Loop:                                           // scalar tail: one element per pass
+        ldr s0, [x0], #4
+        fcvt h0, s0
+        str h0, [x1], #2
+        subs w2, w2, #1
+        bgt Loop
+    LoopEnd:
+        ret
+#endif
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/cast_fp16.c
@ -14,7 +14,7 @@
 * limitations under the License.
 */
 #include "nnacl/fp16/cast_fp16.h"
-
+#ifndef ENABLE_ARM64
 void Float32ToFloat16(const float *input, float16_t *output, int number) {
  for (int i = 0; i < number; ++i) {
    output[i] = (float16_t)input[i];
@ -26,3 +26,4 @@ void Float16ToFloat32(const float16_t *input, float *output, int number) {
    output[i] = (float)input[i];
  }
 }
+#endif