!21099 fit cross compile

Merge pull request !21099 from zhaodezan/master
2021-07-31 08:09:41 +00:00 · 2021-07-31 08:09:41 +00:00 · b63fd669f6
parent 707307cb32 e97ac52b8d
commit b63fd669f6
5 changed files with 47 additions and 40 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/fp16/MatmulBaseFp16Neon.S
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/assembly/fp16/MatmulBaseFp16Neon.S
@@ -51,21 +51,21 @@ LoopRow16:
        mov x19, x5 // reload depth

        ld1 {v16.8h}, [x12], #16
-        mov v17.8h, v16.8h
-        mov v18.8h, v16.8h
-        mov v19.8h, v16.8h
-        mov v20.8h, v16.8h
-        mov v21.8h, v16.8h
-        mov v22.8h, v16.8h
-        mov v23.8h, v16.8h
-        mov v24.8h, v16.8h
-        mov v25.8h, v16.8h
-        mov v26.8h, v16.8h
-        mov v27.8h, v16.8h
-        mov v28.8h, v16.8h
-        mov v29.8h, v16.8h
-        mov v30.8h, v16.8h
-        mov v31.8h, v16.8h
+        mov v17.16b, v16.16b
+        mov v18.16b, v16.16b
+        mov v19.16b, v16.16b
+        mov v20.16b, v16.16b
+        mov v21.16b, v16.16b
+        mov v22.16b, v16.16b
+        mov v23.16b, v16.16b
+        mov v24.16b, v16.16b
+        mov v25.16b, v16.16b
+        mov v26.16b, v16.16b
+        mov v27.16b, v16.16b
+        mov v28.16b, v16.16b
+        mov v29.16b, v16.16b
+        mov v30.16b, v16.16b
+        mov v31.16b, v16.16b

        cmp x19, #4
        blt LoopDepth16One
@@ -242,13 +242,13 @@ LoopRow8:
        mov x19, x5 // reload depth

        ld1 {v16.8h}, [x12], #16
-        mov v17.8h, v16.8h
-        mov v18.8h, v16.8h
-        mov v19.8h, v16.8h
-        mov v20.8h, v16.8h
-        mov v21.8h, v16.8h
-        mov v22.8h, v16.8h
-        mov v23.8h, v16.8h
+        mov v17.16b, v16.16b
+        mov v18.16b, v16.16b
+        mov v19.16b, v16.16b
+        mov v20.16b, v16.16b
+        mov v21.16b, v16.16b
+        mov v22.16b, v16.16b
+        mov v23.16b, v16.16b

        cmp x19, #4
        blt LoopDepth8One
@@ -356,9 +356,9 @@ LoopRow4:
        mov x10, x0 // reload lhs ptr
        mov x19, x5 // reload depth
        ld1 {v16.8h}, [x12], #16
-        mov v17.8h, v16.8h
-        mov v18.8h, v16.8h
-        mov v19.8h, v16.8h
+        mov v17.16b, v16.16b
+        mov v18.16b, v16.16b
+        mov v19.16b, v16.16b
        cmp x19, #4
        blt LoopDepth4One
    LoopDepth4:
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_fp16.c
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/arithmetic_fp16.c
@@ -801,8 +801,8 @@ int ElementLogicalAndFp16(const float16_t *input0, const float16_t *input1, floa
  uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  uint16x8_t zeros = vdupq_n_u16(0);
  for (; index <= element_size - 8; index += C8NUM) {
-    uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0 + index)), mask);
-    uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1 + index)), mask);
+    uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + index)), mask);
+    uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + index)), mask);
    float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
    vst1q_f16(output + index, vout);
  }
@@ -828,8 +828,8 @@ int ElementOptLogicalAndFp16(const float16_t *input0, const float16_t *input1, f
 #ifdef ENABLE_NEON
    for (; index <= element_size - 8; index += C8NUM) {
      float16x8_t vin1_ = vld1q_f16(input1 + index);
-      uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_opt), mask);
-      uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_), mask);
+      uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_opt), mask);
+      uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_), mask);
      float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
      vst1q_f16(output + index, vout);
    }
@@ -841,8 +841,8 @@ int ElementOptLogicalAndFp16(const float16_t *input0, const float16_t *input1, f
 #ifdef ENABLE_NEON
    for (; index <= element_size - 8; index += C8NUM) {
      float16x8_t vin0_ = vld1q_f16(input0 + index);
-      uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_), mask);
-      uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_opt), mask);
+      uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_), mask);
+      uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_opt), mask);
      float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
      vst1q_f16(output + index, vout);
    }
@@ -862,8 +862,8 @@ int ElementLogicalOrFp16(const float16_t *input0, const float16_t *input1, float
  uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  uint16x8_t zeros = vdupq_n_u16(0);
  for (; index <= element_size - 8; index += C8NUM) {
-    uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0 + index)), mask);
-    uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1 + index)), mask);
+    uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + index)), mask);
+    uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + index)), mask);
    float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
    vst1q_f16(output + index, vout);
  }
@@ -889,8 +889,8 @@ int ElementOptLogicalOrFp16(const float16_t *input0, const float16_t *input1, fl
 #ifdef ENABLE_NEON
    for (; index <= element_size - 8; index += C8NUM) {
      float16x8_t vin1_ = vld1q_f16(input1 + index);
-      uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_opt), mask);
-      uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_), mask);
+      uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_opt), mask);
+      uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_), mask);
      float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
      vst1q_f16(output + index, vout);
    }
@@ -902,8 +902,8 @@ int ElementOptLogicalOrFp16(const float16_t *input0, const float16_t *input1, fl
 #ifdef ENABLE_NEON
    for (; index <= element_size - 8; index += C8NUM) {
      float16x8_t vin0_ = vld1q_f16(input0 + index);
-      uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_), mask);
-      uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_opt), mask);
+      uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_), mask);
+      uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_opt), mask);
      float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
      vst1q_f16(output + index, vout);
    }
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/exp_fp16.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/exp_fp16.h
@@ -35,7 +35,8 @@ static inline void single_exp_fp16(float16_t src, float16_t *dst) {
  int int_exp = (integer + 127) << 23;
  const float decimal_exp =
    1.0f + decimal * (1.0f + decimal * (0.5f + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
-  *dst = (float16_t)(*((float *)&int_exp) * decimal_exp);
+  float *tmp = (float *)(&int_exp);
+  *dst = (float16_t)(*(tmp)*decimal_exp);
 }

 #ifdef __cplusplus
--- a/mindspore/lite/tools/benchmark/benchmark.cc
+++ b/mindspore/lite/tools/benchmark/benchmark.cc
@@ -667,7 +667,10 @@ int Benchmark::InitPerfProfilingCallbackParameter() {
                         const CallBackParam &call_param) {
    struct PerfResult res;
    ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
-    read(perf_fd, &res, sizeof(struct PerfResult));
+    if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
+      MS_LOG(ERROR) << "Failed to read perf_fd";
+      return false;
+    }

    if (after_inputs.empty()) {
      MS_LOG(INFO) << "The num of after inputs is empty";
--- a/mindspore/lite/tools/benchmark/benchmark_unified_api.cc
+++ b/mindspore/lite/tools/benchmark/benchmark_unified_api.cc
@@ -652,7 +652,10 @@ int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    struct PerfResult res;
    ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
-    read(perf_fd, &res, sizeof(struct PerfResult));
+    if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
+      MS_LOG(ERROR) << "Failed to read perf_fd";
+      return false;
+    }

    if (after_inputs.empty()) {
      MS_LOG(INFO) << "The num of after inputs is empty";