forked from mindspore-Ecosystem/mindspore

commit e31d2636df

!9485 [MSLITE] Support GEMV for ARM v7a

From: @zhanyuan1
Reviewed-by: @zhang_xue_tong, @zhanghaibo5
Signed-off-by: @zhang_xue_tong

@@ -0,0 +1,185 @@
#ifdef __arm__
#ifndef __aarch64__

.text
.align 5
.global MatVecMulFp32
#ifndef __APPLE__
.type MatVecMulFp32, %function
#endif

// void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
// r0: a
// r1: b
// r2: c
// r3: bias
// r4: act_type
// r5: depth
// r6: col
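//
// Contract, as inferred from the stride arithmetic below: b holds col rows
// of depth floats each (a pre-transposed weight matrix), bias may be null,
// and c[i] = act(dot(a, b + i * depth) + bias[i]) for i in [0, col).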

MatVecMulFp32:
    // r4-r11 are callee-saved per AAPCS (https://static.docs.arm.com/ihi0042/i/aapcs32.pdf);
    // q4-q7 would also need saving, but this kernel only touches q0-q3 and q8-q15
    push {r0-r8, r10, r11, lr}
    add sp, sp, #48           // sp now points at the caller's stack arguments

    ldr r4, [sp]              // act_type
    ldr r5, [sp, #4]          // depth
    ldr r6, [sp, #8]          // col

    mov r10, #4
    mul r10, r10, r5          // stride = depth * sizeof(float), one row of b
    mov r11, #4
    mul r11, r11, r10         // stride x 4, to step over four rows of b

    cmp r6, #4
    blt Col1Loop              // fewer than four columns left: scalar column loop

Col4Loop:
    mov r7, r0                // reload a (vector) ptr
    mov r9, r1                // reload b (matrix) ptr
    mov r8, r5                // reload depth value

    veor q9, q9, q9           // per-column accumulators
    veor q10, q10, q10
    veor q11, q11, q11
    veor q12, q12, q12
    veor q15, q15, q15        // final {sum0, sum1, sum2, sum3}

    cmp r8, #4
    blt Col4Depth1

Col4Depth4:
    vld1.f32 {q8}, [r7]!      // four depths of a
    add lr, r9, r10
    vld1.f32 {q0}, [r9]!      // the same four depths of b rows 0..3
    vld1.f32 {q1}, [lr], r10
    vld1.f32 {q2}, [lr], r10
    vld1.f32 {q3}, [lr]

    vmla.f32 q9, q8, q0
    vmla.f32 q10, q8, q1
    vmla.f32 q11, q8, q2
    vmla.f32 q12, q8, q3
    sub r8, r8, #4
    cmp r8, #4
    bge Col4Depth4

    // horizontal reduction: fold each 4-lane accumulator into one float,
    // leaving q15 = {sum0, sum1, sum2, sum3}
    vpadd.f32 d26, d18, d20
    vpadd.f32 d27, d19, d21
    vpadd.f32 d28, d22, d24
    vpadd.f32 d29, d23, d25
    vadd.f32 d30, d26, d27
    vadd.f32 d31, d28, d29
    cmp r8, #0
    beq Col4End

Col4Depth1:
    vld1.f32 {d0[0]}, [r7]!   // one depth of a
    add lr, r9, r10
    vld1.f32 {d2[0]}, [r9]!   // matching element of each of the four b rows
    vld1.f32 {d2[1]}, [lr], r10
    vld1.f32 {d3[0]}, [lr], r10
    vld1.f32 {d3[1]}, [lr]

    vmla.f32 q15, q1, d0[0]
    subs r8, r8, #1
    bne Col4Depth1

Col4End:
    cmp r3, #0                // bias may be null
    beq Col4Activation
    vld1.f32 {q13}, [r3]!
    vadd.f32 q15, q15, q13

Col4Activation:
    cmp r4, #3                // act_type: 3 = ReLU6, 1 = ReLU
    beq Col4Relu6
    cmp r4, #1
    beq Col4Relu
    b Col4Write

Col4Relu6:
    vmov.i32 q12, #6
    vcvt.f32.s32 q12, q12
    vmin.f32 q15, q15, q12    // clamp to 6, then fall through to the ReLU floor

Col4Relu:
    veor q13, q13, q13
    vmax.f32 q15, q15, q13    // clamp to 0

Col4Write:
    vst1.f32 {q15}, [r2]!
    subs r6, r6, #4
    beq End
    add r1, r1, r11           // advance b by four rows
    cmp r6, #4
    bge Col4Loop

Col1Loop:
    mov r7, r0                // reload a (vector) ptr
    mov r9, r1                // reload b (matrix) ptr
    mov r8, r5                // reload depth value
    veor q10, q10, q10
    veor q13, q13, q13
    veor q15, q15, q15

    cmp r8, #4
    blt Col1Depth1

Col1Depth4:
    vld1.f32 {q0}, [r7]!
    vld1.f32 {q1}, [r9]!

    vmla.f32 q10, q1, q0
    sub r8, r8, #4
    cmp r8, #4
    bge Col1Depth4

    // d30[0] = sum of q10's four lanes (the upper lanes are ignored)
    vpadd.f32 d24, d20, d22
    vpadd.f32 d25, d21, d23
    vadd.f32 d30, d24, d25
    cmp r8, #0
    beq Col1End

Col1Depth1:
    vld1.f32 {d0[0]}, [r7]!
    vld1.f32 {d2[0]}, [r9]!

    vmla.f32 d30, d2, d0[0]
    subs r8, r8, #1
    bne Col1Depth1

Col1End:
    cmp r3, #0
    beq Col1Activation
    vld1.f32 {d28[0]}, [r3]!
    vadd.f32 d30, d30, d28

Col1Activation:
    cmp r4, #3
    beq Col1Relu6
    cmp r4, #1
    beq Col1Relu
    b Col1Write

Col1Relu6:
    vmov.i32 d26, #6
    vcvt.f32.s32 d26, d26
    vmin.f32 d30, d30, d26

Col1Relu:
    veor d24, d24, d24
    vmax.f32 d30, d30, d24

Col1Write:
    vst1.f32 {d30[0]}, [r2]!
    subs r6, r6, #1
    beq End
    add r1, r1, r10           // advance b by one row
    b Col1Loop

End:
    sub sp, sp, #48           // undo the earlier adjustment before restoring
    pop {r0-r8, r10, r11, pc}
#endif
#endif
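For readers who don't speak NEON, here is a scalar C sketch of the contract the kernel above implements. The function name is illustrative, not part of the library; the layout of b and the act_type values (1 for ReLU, 3 for ReLU6) follow the constants the assembly tests.

#include <stddef.h>

// Scalar model of MatVecMulFp32: b is assumed pre-transposed, holding
// col rows of depth floats, matching the assembly's row stride of
// depth * sizeof(float).
static void MatVecMulFp32Ref(const float *a, const float *b, float *c,
                             const float *bias, int act_type, int depth, int col) {
  for (int i = 0; i < col; ++i) {
    float acc = 0.0f;
    for (int d = 0; d < depth; ++d) {
      acc += a[d] * b[i * depth + d];          // dot(a, row i of b)
    }
    if (bias != NULL) {
      acc += bias[i];
    }
    if (act_type == 3 && acc > 6.0f) acc = 6.0f;                    // ReLU6 clamp
    if ((act_type == 1 || act_type == 3) && acc < 0.0f) acc = 0.0f; // ReLU floor
    c[i] = acc;
  }
}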

@@ -1,12 +1,12 @@
 #ifdef __aarch64__
 .text
 .align 5
-.global MatVecMulFp32Neon64
+.global MatVecMulFp32
 #ifndef __APPLE__
-.type MatVecMulFp32Neon64, %function
+.type MatVecMulFp32, %function
 #endif

-// void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
+// void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col)
 // x0: a
 // x1: b
 // x2: c
@@ -15,7 +15,7 @@
 // w5: depth
 // w6: col

-MatVecMulFp32Neon64:
+MatVecMulFp32:
     sub sp, sp, #128
     st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
     st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
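Only the symbol name changes in these two aarch64 hunks: the kernel keeps its body, including the prologue that spills v8-v15 (whose low halves are callee-saved under AAPCS64), and is now exported as MatVecMulFp32 so both ARM builds share one entry-point name.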

@@ -682,8 +682,8 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT
 }

 void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col) {
-#ifdef ENABLE_ARM64
-  MatVecMulFp32Neon64(a, b, c, bias, (int)act_type, depth, col);
+#ifdef ENABLE_ARM
+  MatVecMulFp32(a, b, c, bias, (int)act_type, depth, col);
 #endif
 }
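A minimal call sketch for this entry point (values invented for illustration; the ActType enum member spelling is assumed, with ReLU equal to 1 as the assembly checks):

float a[3] = {1.0f, 2.0f, 3.0f};     // depth = 3
float b[2 * 3] = {1.0f, 0.0f, 0.0f,  // row 0 of b
                  0.0f, 1.0f, 0.0f}; // row 1 of b, so col = 2
float bias[2] = {0.5f, -0.5f};
float c[2];
MatVecMul(a, b, c, bias, (ActType)1 /* ReLU; enum name assumed */, 3, 2);
// c is {1.5f, 1.5f}: each dot product selects one element of a, plus bias; ReLU is a no-op here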

@@ -36,12 +36,14 @@ void RowMajor2Row12Major(const float *src_ptr, float *dst_ptr, int row, int col)
 void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
 void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
 void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
+#ifdef ENABLE_ARM
+void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
+#endif
 #ifdef ENABLE_ARM64
 void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                        int col, size_t stride, size_t writeNhwc, size_t WriteWino);
 void MatmulFloatNeon64Opt(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                           int col, size_t stride, size_t write_mode);
-void MatVecMulFp32Neon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
 #elif ENABLE_ARM32
 void MatmulFloatNeon32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                        int col, int stride, size_t writeNhwc, size_t WriteWino);

@@ -58,7 +58,7 @@ int FullconnectionCPUKernel::ReSize() {
   thread_count_ = MSMIN(thread_count_, UP_DIV(fc_param_->col_8_, 8));
   thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_8_, 8), thread_count_);

-#ifdef ENABLE_ARM64
+#ifdef ENABLE_ARM
   if (fc_param_->row_ == 1) {
     is_vector_input_ = true;
   } else {
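Widening this guard from ENABLE_ARM64 to ENABLE_ARM is what routes single-row (GEMV) inputs through the vector path on ARM v7a builds as well.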

@@ -76,19 +76,15 @@ int FullconnectionCPUKernel::ReSize() {
   }

 #if defined(ENABLE_ARM32) || defined(ENABLE_X86_64_SSE)
-  a_pack_ptr_ = reinterpret_cast<float *>(malloc(fc_param_->row_4_ * fc_param_->deep_ * sizeof(float)));
-  if (a_pack_ptr_ == nullptr) {
-    return RET_MEMORY_FAILED;
-  }
-  memset(a_pack_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_ * sizeof(float));
+  int row_tmp = is_vector_input_ ? 1 : fc_param_->row_4_;
 #else
   int row_tmp = is_vector_input_ ? 1 : fc_param_->row_12_;
+#endif
   a_pack_ptr_ = reinterpret_cast<float *>(malloc(row_tmp * fc_param_->deep_ * sizeof(float)));
   if (a_pack_ptr_ == nullptr) {
     return RET_MEMORY_FAILED;
   }
   memset(a_pack_ptr_, 0, row_tmp * fc_param_->deep_ * sizeof(float));
-#endif

   int col_tmp = is_vector_input_ ? fc_param_->col_ : fc_param_->col_8_;
   b_pack_ptr_ = reinterpret_cast<float *>(malloc(col_tmp * fc_param_->deep_ * sizeof(float)));
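Net effect of this hunk: with a single-row input, the A pack buffer collapses to one row instead of the row_4_/row_12_ padded shape, and B keeps its exact column count instead of rounding up to col_8_, since the GEMV kernel walks unpadded rows of length deep_.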

@@ -66,9 +66,11 @@ int MatmulCPUKernel::MallocMatrixABuffer() {
   }
   params_->batch = batch;
   params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
-#ifdef ENABLE_ARM64
+#ifdef ENABLE_ARM
   if (params_->a_init_shape_ && params_->row_ == 1) {
     is_vector_a_ = true;
+  } else {
+    is_vector_a_ = false;
   }
 #endif
   params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];

@@ -76,18 +78,10 @@ int MatmulCPUKernel::MallocMatrixABuffer() {
   params_->row_12_ = UP_ROUND(params_->row_, C12NUM);
+
 #if defined(ENABLE_ARM32) || defined(ENABLE_X86_64_SSE)
-  if (params_->a_const_) {
-    a_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * params_->row_4_ * params_->deep_ * sizeof(float)));
-  } else {
-    a_pack_ptr_ = reinterpret_cast<float *>(
-      context_->allocator->Malloc(params_->batch * params_->row_4_ * params_->deep_ * sizeof(float)));
-  }
-  if (a_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
+  int row_tmp = is_vector_a_ ? 1 : params_->row_4_;
 #else
   int row_tmp = is_vector_a_ ? 1 : params_->row_12_;
 #endif
   if (params_->a_const_) {
     a_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * row_tmp * params_->deep_ * sizeof(float)));
   } else {

@@ -98,7 +92,7 @@ int MatmulCPUKernel::MallocMatrixABuffer() {
     FreeTmpBuffer();
     return RET_MEMORY_FAILED;
   }
 #endif

   return RET_OK;
 }