forked from mindspore-Ecosystem/mindspore
!21099 fit cross compile
Merge pull request !21099 from zhaodezan/master
This commit is contained in:
commit
b63fd669f6
|
@ -51,21 +51,21 @@ LoopRow16:
|
|||
mov x19, x5 // reload depth
|
||||
|
||||
ld1 {v16.8h}, [x12], #16
|
||||
mov v17.8h, v16.8h
|
||||
mov v18.8h, v16.8h
|
||||
mov v19.8h, v16.8h
|
||||
mov v20.8h, v16.8h
|
||||
mov v21.8h, v16.8h
|
||||
mov v22.8h, v16.8h
|
||||
mov v23.8h, v16.8h
|
||||
mov v24.8h, v16.8h
|
||||
mov v25.8h, v16.8h
|
||||
mov v26.8h, v16.8h
|
||||
mov v27.8h, v16.8h
|
||||
mov v28.8h, v16.8h
|
||||
mov v29.8h, v16.8h
|
||||
mov v30.8h, v16.8h
|
||||
mov v31.8h, v16.8h
|
||||
mov v17.16b, v16.16b
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v16.16b
|
||||
mov v20.16b, v16.16b
|
||||
mov v21.16b, v16.16b
|
||||
mov v22.16b, v16.16b
|
||||
mov v23.16b, v16.16b
|
||||
mov v24.16b, v16.16b
|
||||
mov v25.16b, v16.16b
|
||||
mov v26.16b, v16.16b
|
||||
mov v27.16b, v16.16b
|
||||
mov v28.16b, v16.16b
|
||||
mov v29.16b, v16.16b
|
||||
mov v30.16b, v16.16b
|
||||
mov v31.16b, v16.16b
|
||||
|
||||
cmp x19, #4
|
||||
blt LoopDepth16One
|
||||
|
@ -242,13 +242,13 @@ LoopRow8:
|
|||
mov x19, x5 // reload depth
|
||||
|
||||
ld1 {v16.8h}, [x12], #16
|
||||
mov v17.8h, v16.8h
|
||||
mov v18.8h, v16.8h
|
||||
mov v19.8h, v16.8h
|
||||
mov v20.8h, v16.8h
|
||||
mov v21.8h, v16.8h
|
||||
mov v22.8h, v16.8h
|
||||
mov v23.8h, v16.8h
|
||||
mov v17.16b, v16.16b
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v16.16b
|
||||
mov v20.16b, v16.16b
|
||||
mov v21.16b, v16.16b
|
||||
mov v22.16b, v16.16b
|
||||
mov v23.16b, v16.16b
|
||||
|
||||
cmp x19, #4
|
||||
blt LoopDepth8One
|
||||
|
@ -356,9 +356,9 @@ LoopRow4:
|
|||
mov x10, x0 // reload lhs ptr
|
||||
mov x19, x5 // reload depth
|
||||
ld1 {v16.8h}, [x12], #16
|
||||
mov v17.8h, v16.8h
|
||||
mov v18.8h, v16.8h
|
||||
mov v19.8h, v16.8h
|
||||
mov v17.16b, v16.16b
|
||||
mov v18.16b, v16.16b
|
||||
mov v19.16b, v16.16b
|
||||
cmp x19, #4
|
||||
blt LoopDepth4One
|
||||
LoopDepth4:
|
||||
|
|
|
@ -801,8 +801,8 @@ int ElementLogicalAndFp16(const float16_t *input0, const float16_t *input1, floa
|
|||
uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
|
||||
uint16x8_t zeros = vdupq_n_u16(0);
|
||||
for (; index <= element_size - 8; index += C8NUM) {
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0 + index)), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1 + index)), mask);
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + index)), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + index)), mask);
|
||||
float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
|
||||
vst1q_f16(output + index, vout);
|
||||
}
|
||||
|
@ -828,8 +828,8 @@ int ElementOptLogicalAndFp16(const float16_t *input0, const float16_t *input1, f
|
|||
#ifdef ENABLE_NEON
|
||||
for (; index <= element_size - 8; index += C8NUM) {
|
||||
float16x8_t vin1_ = vld1q_f16(input1 + index);
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_opt), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_), mask);
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_opt), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_), mask);
|
||||
float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
|
||||
vst1q_f16(output + index, vout);
|
||||
}
|
||||
|
@ -841,8 +841,8 @@ int ElementOptLogicalAndFp16(const float16_t *input0, const float16_t *input1, f
|
|||
#ifdef ENABLE_NEON
|
||||
for (; index <= element_size - 8; index += C8NUM) {
|
||||
float16x8_t vin0_ = vld1q_f16(input0 + index);
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_opt), mask);
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_opt), mask);
|
||||
float16x8_t vout = vbslq_f16(vceqq_u16(vandq_u16(vin0, vin1), zeros), vfalse, vtrue);
|
||||
vst1q_f16(output + index, vout);
|
||||
}
|
||||
|
@ -862,8 +862,8 @@ int ElementLogicalOrFp16(const float16_t *input0, const float16_t *input1, float
|
|||
uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
|
||||
uint16x8_t zeros = vdupq_n_u16(0);
|
||||
for (; index <= element_size - 8; index += C8NUM) {
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input0 + index)), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vld1q_f16(input1 + index)), mask);
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + index)), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + index)), mask);
|
||||
float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
|
||||
vst1q_f16(output + index, vout);
|
||||
}
|
||||
|
@ -889,8 +889,8 @@ int ElementOptLogicalOrFp16(const float16_t *input0, const float16_t *input1, fl
|
|||
#ifdef ENABLE_NEON
|
||||
for (; index <= element_size - 8; index += C8NUM) {
|
||||
float16x8_t vin1_ = vld1q_f16(input1 + index);
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_opt), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_), mask);
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_opt), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_), mask);
|
||||
float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
|
||||
vst1q_f16(output + index, vout);
|
||||
}
|
||||
|
@ -902,8 +902,8 @@ int ElementOptLogicalOrFp16(const float16_t *input0, const float16_t *input1, fl
|
|||
#ifdef ENABLE_NEON
|
||||
for (; index <= element_size - 8; index += C8NUM) {
|
||||
float16x8_t vin0_ = vld1q_f16(input0 + index);
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_s16_f16(vin0_), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_s16_f16(vin1_opt), mask);
|
||||
uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_), mask);
|
||||
uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_opt), mask);
|
||||
float16x8_t vout = vbslq_f16(vceqq_u16(vorrq_u16(vin0, vin1), zeros), vfalse, vtrue);
|
||||
vst1q_f16(output + index, vout);
|
||||
}
|
||||
|
|
|
@ -35,7 +35,8 @@ static inline void single_exp_fp16(float16_t src, float16_t *dst) {
|
|||
int int_exp = (integer + 127) << 23;
|
||||
const float decimal_exp =
|
||||
1.0f + decimal * (1.0f + decimal * (0.5f + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
|
||||
*dst = (float16_t)(*((float *)&int_exp) * decimal_exp);
|
||||
float *tmp = (float *)(&int_exp);
|
||||
*dst = (float16_t)(*(tmp)*decimal_exp);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -667,7 +667,10 @@ int Benchmark::InitPerfProfilingCallbackParameter() {
|
|||
const CallBackParam &call_param) {
|
||||
struct PerfResult res;
|
||||
ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
|
||||
read(perf_fd, &res, sizeof(struct PerfResult));
|
||||
if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
|
||||
MS_LOG(ERROR) << "Failed to read perf_fd";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (after_inputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of after inputs is empty";
|
||||
|
|
|
@ -652,7 +652,10 @@ int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
|
|||
const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
|
||||
struct PerfResult res;
|
||||
ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
|
||||
read(perf_fd, &res, sizeof(struct PerfResult));
|
||||
if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
|
||||
MS_LOG(ERROR) << "Failed to read perf_fd";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (after_inputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of after inputs is empty";
|
||||
|
|
Loading…
Reference in New Issue