!21099 Adapt sources for cross-compilation

Merge pull request !21099 from zhaodezan/master
This commit is contained in:
i-robot 2021-07-31 08:09:41 +00:00 committed by Gitee
commit b63fd669f6
5 changed files with 47 additions and 40 deletions

View File

@ -51,21 +51,21 @@ LoopRow16:
mov x19, x5 // reload depth
ld1 {v16.8h}, [x12], #16
mov v17.8h, v16.8h
mov v18.8h, v16.8h
mov v19.8h, v16.8h
mov v20.8h, v16.8h
mov v21.8h, v16.8h
mov v22.8h, v16.8h
mov v23.8h, v16.8h
mov v24.8h, v16.8h
mov v25.8h, v16.8h
mov v26.8h, v16.8h
mov v27.8h, v16.8h
mov v28.8h, v16.8h
mov v29.8h, v16.8h
mov v30.8h, v16.8h
mov v31.8h, v16.8h
mov v17.16b, v16.16b
mov v18.16b, v16.16b
mov v19.16b, v16.16b
mov v20.16b, v16.16b
mov v21.16b, v16.16b
mov v22.16b, v16.16b
mov v23.16b, v16.16b
mov v24.16b, v16.16b
mov v25.16b, v16.16b
mov v26.16b, v16.16b
mov v27.16b, v16.16b
mov v28.16b, v16.16b
mov v29.16b, v16.16b
mov v30.16b, v16.16b
mov v31.16b, v16.16b
cmp x19, #4
blt LoopDepth16One
@ -242,13 +242,13 @@ LoopRow8:
mov x19, x5 // reload depth
ld1 {v16.8h}, [x12], #16
mov v17.8h, v16.8h
mov v18.8h, v16.8h
mov v19.8h, v16.8h
mov v20.8h, v16.8h
mov v21.8h, v16.8h
mov v22.8h, v16.8h
mov v23.8h, v16.8h
mov v17.16b, v16.16b
mov v18.16b, v16.16b
mov v19.16b, v16.16b
mov v20.16b, v16.16b
mov v21.16b, v16.16b
mov v22.16b, v16.16b
mov v23.16b, v16.16b
cmp x19, #4
blt LoopDepth8One
@ -356,9 +356,9 @@ LoopRow4:
mov x10, x0 // reload lhs ptr
mov x19, x5 // reload depth
ld1 {v16.8h}, [x12], #16
mov v17.8h, v16.8h
mov v18.8h, v16.8h
mov v19.8h, v16.8h
mov v17.16b, v16.16b
mov v18.16b, v16.16b
mov v19.16b, v16.16b
cmp x19, #4
blt LoopDepth4One
LoopDepth4:

View File

@ -801,8 +801,8 @@ int ElementLogicalAndFp16(const float16_t *input0, const float16_t *input1, floa
uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
uint16x8_t zeros = vdupq_n_u16(0);
for (; index <= element_size - 8; index += C8NUM) {
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0 + index)), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1 + index)), mask);
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + index)), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + index)), mask);
float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
vst1q_f16(output + index, vout);
}
@ -828,8 +828,8 @@ int ElementOptLogicalAndFp16(const float16_t *input0, const float16_t *input1, f
#ifdef ENABLE_NEON
for (; index <= element_size - 8; index += C8NUM) {
float16x8_t vin1_ = vld1q_f16(input1 + index);
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_opt), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_), mask);
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_opt), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_), mask);
float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
vst1q_f16(output + index, vout);
}
@ -841,8 +841,8 @@ int ElementOptLogicalAndFp16(const float16_t *input0, const float16_t *input1, f
#ifdef ENABLE_NEON
for (; index <= element_size - 8; index += C8NUM) {
float16x8_t vin0_ = vld1q_f16(input0 + index);
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_opt), mask);
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_opt), mask);
float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
vst1q_f16(output + index, vout);
}
@ -862,8 +862,8 @@ int ElementLogicalOrFp16(const float16_t *input0, const float16_t *input1, float
uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
uint16x8_t zeros = vdupq_n_u16(0);
for (; index <= element_size - 8; index += C8NUM) {
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0 + index)), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1 + index)), mask);
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + index)), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + index)), mask);
float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
vst1q_f16(output + index, vout);
}
@ -889,8 +889,8 @@ int ElementOptLogicalOrFp16(const float16_t *input0, const float16_t *input1, fl
#ifdef ENABLE_NEON
for (; index <= element_size - 8; index += C8NUM) {
float16x8_t vin1_ = vld1q_f16(input1 + index);
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_opt), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_), mask);
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_opt), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_), mask);
float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
vst1q_f16(output + index, vout);
}
@ -902,8 +902,8 @@ int ElementOptLogicalOrFp16(const float16_t *input0, const float16_t *input1, fl
#ifdef ENABLE_NEON
for (; index <= element_size - 8; index += C8NUM) {
float16x8_t vin0_ = vld1q_f16(input0 + index);
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_opt), mask);
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_), mask);
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_opt), mask);
float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
vst1q_f16(output + index, vout);
}

View File

@ -35,7 +35,8 @@ static inline void single_exp_fp16(float16_t src, float16_t *dst) {
int int_exp = (integer + 127) << 23;
const float decimal_exp =
1.0f + decimal * (1.0f + decimal * (0.5f + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
*dst = (float16_t)(*((float *)&int_exp) * decimal_exp);
float *tmp = (float *)(&int_exp);
*dst = (float16_t)(*(tmp)*decimal_exp);
}
#ifdef __cplusplus

View File

@ -667,7 +667,10 @@ int Benchmark::InitPerfProfilingCallbackParameter() {
const CallBackParam &call_param) {
struct PerfResult res;
ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
read(perf_fd, &res, sizeof(struct PerfResult));
if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
MS_LOG(ERROR) << "Failed to read perf_fd";
return false;
}
if (after_inputs.empty()) {
MS_LOG(INFO) << "The num of after inputs is empty";

View File

@ -652,7 +652,10 @@ int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
struct PerfResult res;
ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
read(perf_fd, &res, sizeof(struct PerfResult));
if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
MS_LOG(ERROR) << "Failed to read perf_fd";
return false;
}
if (after_inputs.empty()) {
MS_LOG(INFO) << "The num of after inputs is empty";