643 lines
12 KiB
ArmAsm
643 lines
12 KiB
ArmAsm
/*
|
|
* Author: Anton Blanchard <anton@au.ibm.com>
|
|
* Copyright 2015 IBM Corporation.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*/
|
|
#include <asm/ppc_asm.h>
|
|
#include <asm/export.h>
|
|
#include <asm/ppc-opcode.h>
|
|
|
|
/*
 * off8/off16/off24: index registers that the long compare loop loads with
 * the constants 8, 16 and 24 and then uses as the index operand of LD.
 * off16 is also the lvx index in the VMX paths. Note that r6 (off8) is
 * used as a plain scratch register in other parts of this file.
 */
#define off8 r6
|
|
#define off16 r7
|
|
#define off24 r8
|
|
|
|
/*
 * rA-rH hold the values being compared. rA-rC are r9-r11; rD-rH are
 * r27-r31, which the long loop stores below r1 before using them and
 * reloads at .Ltail/.Lout.
 */
#define rA r9
|
|
#define rB r10
|
|
#define rC r11
|
|
#define rD r27
|
|
#define rE r28
|
|
#define rF r29
|
|
#define rG r30
|
|
#define rH r31
|
|
|
|
/*
 * Endian-dependent instruction selection.
 *
 * On little endian LH/LW/LD expand to the byte-reversed indexed loads
 * (lhbrx/lwbrx/ldbrx); on big endian they expand to the plain indexed loads
 * (lhzx/lwzx/ldx). LVS selects lvsr or lvsl, and VPERM swaps its two source
 * vectors on little endian.
 *
 * NOTE(review): the intent appears to be that the first byte in memory ends
 * up most significant in the register, so that cmpld orders two buffers the
 * same way a byte-wise compare would - confirm against the Power ISA.
 */
#ifdef __LITTLE_ENDIAN__
|
|
#define LH lhbrx
|
|
#define LW lwbrx
|
|
#define LD ldbrx
|
|
#define LVS lvsr
|
|
/* Little endian: _VRA and _VRB are passed to vperm in swapped order. */
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
|
|
vperm _VRT,_VRB,_VRA,_VRC
|
|
#else
|
|
#define LH lhzx
|
|
#define LW lwzx
|
|
#define LD ldx
|
|
#define LVS lvsl
|
|
/* Big endian: operands are passed to vperm in the order given. */
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
|
|
vperm _VRT,_VRA,_VRB,_VRC
|
|
#endif
|
|
|
|
/*
 * Remaining length in bytes at or above which the VMX compare paths are
 * tried (see the checks at .Llong and .Ldiffoffset_align_s1_8bytes).
 */
#define VMX_THRESH 4096
|
|
/*
 * ENTER_VMX_OPS: call enter_vmx_ops while preserving r3, r4 and r5.
 *
 * The macro copies LR to r0 and saves it at 16(r1), stores r3-r5 into the
 * R31/R30/R29 slots of the frame it is about to create, pushes a
 * STACKFRAMESIZE frame with stdu and calls enter_vmx_ops. After the call,
 * cr1 is set by comparing the return value in r3 with 0; then LR and r3-r5
 * are reloaded and the frame is popped. The users in this file branch on
 * cr1 "eq" to their non-VMX fallback.
 *
 * Clobbers r0 and cr1, plus whatever enter_vmx_ops itself clobbers (its
 * definition is not in this file).
 */
#define ENTER_VMX_OPS \
|
|
mflr r0; \
|
|
std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
|
|
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
|
|
std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
|
|
std r0,16(r1); \
|
|
stdu r1,-STACKFRAMESIZE(r1); \
|
|
bl enter_vmx_ops; \
|
|
cmpwi cr1,r3,0; \
|
|
ld r0,STACKFRAMESIZE+16(r1); \
|
|
ld r3,STK_REG(R31)(r1); \
|
|
ld r4,STK_REG(R30)(r1); \
|
|
ld r5,STK_REG(R29)(r1); \
|
|
addi r1,r1,STACKFRAMESIZE; \
|
|
mtlr r0
|
|
|
|
/*
 * EXIT_VMX_OPS: call exit_vmx_ops while preserving r3, r4 and r5.
 *
 * Same save/call/restore sequence as ENTER_VMX_OPS (LR saved at 16(r1),
 * r3-r5 stored in the R31/R30/R29 slots of a temporary STACKFRAMESIZE
 * frame), except that the return value of exit_vmx_ops is ignored and no
 * condition register field is written by the macro itself.
 *
 * Clobbers r0, plus whatever exit_vmx_ops itself clobbers (its definition
 * is not in this file). One caller in this file saves cr0 around the macro,
 * so condition register fields should not be assumed to survive it.
 */
#define EXIT_VMX_OPS \
|
|
mflr r0; \
|
|
std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
|
|
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
|
|
std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
|
|
std r0,16(r1); \
|
|
stdu r1,-STACKFRAMESIZE(r1); \
|
|
bl exit_vmx_ops; \
|
|
ld r0,STACKFRAMESIZE+16(r1); \
|
|
ld r3,STK_REG(R31)(r1); \
|
|
ld r4,STK_REG(R30)(r1); \
|
|
ld r5,STK_REG(R29)(r1); \
|
|
addi r1,r1,STACKFRAMESIZE; \
|
|
mtlr r0
|
|
|
|
/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                 ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                ^
 *                              _vaddr
 *
 * _vmask is the mask generated by LVS.
 * _v1st_qw is the 1st aligned QW of the current addr, which is already
 *   loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, loaded by this
 *   macro.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 *
 * The macro uses off16 as the lvx index register, so off16 must already
 * contain 16 (the VMX paths execute "li off16,16" before using it).
 */
|
|
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
|
|
lvx _v2nd_qw,_vaddr,off16; \
|
|
VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
|
|
|
|
/*
 * memcmp - compare two byte buffers.
 *
 * In:  r3 = s1, r4 = s2, r5 = number of bytes to compare.
 * Out: r3 = 0 when no difference is found. On a mismatch the byte loop
 *      (.Lshort) returns the difference of the first differing bytes,
 *      while the doubleword and VMX paths return 1 or -1.
 *
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset to the 8 bytes boundary. The handlers
 *    are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets to the 8 bytes boundary. The handlers
 *    are named like .Ldiffoffset_xxxx
 */
|
|
_GLOBAL_TOC(memcmp)
|
|
/* cr1: is the length zero? */
cmpdi cr1,r5,0
|
|
|
|
/*
 * cr0 records whether s1 and s2 share the same offset within an 8-byte
 * doubleword (eq) or not (ne). It is consumed by the bne at .Lno_short;
 * no instruction on the way there writes cr0.
 */
|
|
xor r6,r3,r4
|
|
andi. r6,r6,7
|
|
|
|
/*
 * Lengths of 7 bytes or less always use the byte loop at .Lshort,
 * regardless of alignment.
 */
|
|
cmpdi cr6,r5,7
|
|
|
|
beq cr1,.Lzero
|
|
bgt cr6,.Lno_short
|
|
|
|
/*
 * Byte-at-a-time compare of r5 bytes, unrolled four times.
 *
 * CTR is loaded with the byte count. After each equal byte, bdz/bdnz
 * decrement CTR and leave through .Lzero once it reaches zero. On the first
 * difference rC holds (s1 byte - s2 byte), both loaded zero-extended by
 * lbz, and .Lnon_zero returns it.
 */
.Lshort:
|
|
mtctr r5
|
|
1: lbz rA,0(r3)
|
|
lbz rB,0(r4)
|
|
subf. rC,rB,rA
|
|
bne .Lnon_zero
|
|
bdz .Lzero
|
|
|
|
lbz rA,1(r3)
|
|
lbz rB,1(r4)
|
|
subf. rC,rB,rA
|
|
bne .Lnon_zero
|
|
bdz .Lzero
|
|
|
|
lbz rA,2(r3)
|
|
lbz rB,2(r4)
|
|
subf. rC,rB,rA
|
|
bne .Lnon_zero
|
|
bdz .Lzero
|
|
|
|
lbz rA,3(r3)
|
|
lbz rB,3(r4)
|
|
subf. rC,rB,rA
|
|
bne .Lnon_zero
|
|
|
|
/* Four bytes done: advance both pointers for the next round. */
addi r3,r3,4
|
|
addi r4,r4,4
|
|
|
|
/* Falls through to .Lzero when this was the last byte. */
bdnz 1b
|
|
|
|
/* Common "buffers are equal" exit: return 0. */
.Lzero:
|
|
li r3,0
|
|
blr
|
|
|
|
/*
 * More than 7 bytes to compare. Touch the start of both buffers with dcbt,
 * then dispatch on cr0, which still holds the "same offset within a
 * doubleword" result computed at function entry.
 */
.Lno_short:
|
|
dcbt 0,r3
|
|
dcbt 0,r4
|
|
bne .Ldiffoffset_8bytes_make_align_start
|
|
|
|
|
|
.Lsameoffset_8bytes_make_align_start:
|
|
/* Compare the bytes that precede the first 8-byte boundary so that the
 * rest of the comparison can run on 8-byte aligned addresses.
 * cr0 "eq" from this andi. means r3 is already aligned.
 */
|
|
andi. r6,r3,7
|
|
|
|
/* Compare the first doubleword, which is not 8 bytes aligned:
 * load the doubleword at (src & ~7UL) from each buffer and shift both
 * left by r6 = (src & 7) * 8 bits before the comparison, which pushes out
 * the bytes that precede the start of the buffers.
 * rlwinm has no dot, so cr0 from the andi. above is what beq tests.
 */
|
|
rlwinm r6,r3,3,26,28
|
|
beq .Lsameoffset_8bytes_aligned
|
|
clrrdi r3,r3,3
|
|
clrrdi r4,r4,3
|
|
LD rA,0,r3
|
|
LD rB,0,r4
|
|
sld rA,rA,r6
|
|
sld rB,rB,r6
|
|
cmpld cr0,rA,rB
|
|
/* r6 back to a byte offset; srwi leaves cr0 untouched. */
srwi r6,r6,3
|
|
bne cr0,.LcmpAB_lightweight
|
|
/* r6 = 8 - offset = bytes just compared; subtract them from the length. */
subfic r6,r6,8
|
|
subf. r5,r6,r5
|
|
addi r3,r3,8
|
|
addi r4,r4,8
|
|
beq .Lzero
|
|
|
|
.Lsameoffset_8bytes_aligned:
|
|
/* Now we are aligned to 8 bytes.
 * Use the .Llong loop if 32 or more bytes are left to compare.
 */
|
|
cmpdi cr6,r5,31
|
|
bgt cr6,.Llong
|
|
|
|
/*
 * Shared tail: several other paths in this file branch here with the
 * remaining length in r5.
 */
.Lcmp_lt32bytes:
|
|
/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
|
|
cmpdi cr5,r5,7
|
|
/* r0 = number of whole doublewords in r5 */
srdi r0,r5,3
|
|
ble cr5,.Lcmp_rest_lt8bytes
|
|
|
|
/* handle 8 ~ 31 bytes: r5 keeps only the leftover byte count (r5 & 7) */
|
|
clrldi r5,r5,61
|
|
mtctr r0
|
|
2:
|
|
LD rA,0,r3
|
|
LD rB,0,r4
|
|
cmpld cr0,rA,rB
|
|
addi r3,r3,8
|
|
addi r4,r4,8
|
|
bne cr0,.LcmpAB_lightweight
|
|
bdnz 2b
|
|
|
|
/* All whole doublewords matched; done unless 1-7 bytes are left. */
cmpwi r5,0
|
|
beq .Lzero
|
|
|
|
.Lcmp_rest_lt8bytes:
|
|
/*
 * Here we have less than 8 bytes to compare. At least s1 is aligned to
 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
 * page boundary, otherwise we might read past the end of the buffer and
 * trigger a page fault. We use 4K as the conservative minimum page
 * size. If we detect that case we go to the byte-by-byte loop.
 *
 * Otherwise the next double word is loaded from s1 and s2, and shifted
 * right to compare the appropriate bits.
 */
|
|
clrldi r6,r4,(64-12) // r6 = r4 & 0xfff
|
|
cmpdi r6,0xff8
|
|
bgt .Lshort
|
|
|
|
/* r6 = (8 - r5) * 8: number of bits beyond the remaining length */
subfic r6,r5,8
|
|
slwi r6,r6,3
|
|
LD rA,0,r3
|
|
LD rB,0,r4
|
|
/* Shift the bytes past the end of the buffers out of both values. */
srd rA,rA,r6
|
|
srd rB,rB,r6
|
|
cmpld cr0,rA,rB
|
|
bne cr0,.LcmpAB_lightweight
|
|
b .Lzero
|
|
|
|
/* Exit for the byte loop: return the byte difference left in rC. */
.Lnon_zero:
|
|
mr r3,rC
|
|
blr
|
|
|
|
/*
 * Long compare: 32 or more bytes remain and at least s1 is 8-byte aligned.
 *
 * When CONFIG_ALTIVEC is set, the feature section keyed on
 * CPU_FTR_ARCH_207S sends lengths of VMX_THRESH bytes or more to the VMX
 * path. Everything else uses the scalar loop below.
 */
.Llong:
|
|
#ifdef CONFIG_ALTIVEC
|
|
BEGIN_FTR_SECTION
|
|
/* Try to use vmx loop if length is equal or greater than 4K */
|
|
cmpldi cr6,r5,VMX_THRESH
|
|
bge cr6,.Lsameoffset_vmx_cmp
|
|
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
|
|
|
|
.Llong_novmx_cmp:
|
|
#endif
|
|
/* At least s1 addr is aligned with 8 bytes */
|
|
li off8,8
|
|
li off16,16
|
|
li off24,24
|
|
|
|
/*
 * rD-rH live in r27-r31: store them at negative offsets from r1 (no
 * frame is allocated). They are reloaded at .Ltail and .Lout.
 */
std r31,-8(r1)
|
|
std r30,-16(r1)
|
|
std r29,-24(r1)
|
|
std r28,-32(r1)
|
|
std r27,-40(r1)
|
|
|
|
/* CTR = number of 32-byte chunks; r5 = leftover bytes (r5 & 31). */
srdi r0,r5,5
|
|
mtctr r0
|
|
andi. r5,r5,31
|
|
|
|
/*
 * Software-pipelined compare. Each 32-byte chunk is loaded as four
 * doubleword pairs A/B, C/D, E/F, G/H whose compare results live in
 * cr0, cr1, cr6 and cr7. Loads for the next chunk are interleaved with
 * the compares and branches of the previous one, so the order of the
 * instructions below matters.
 */
LD rA,0,r3
|
|
LD rB,0,r4
|
|
|
|
LD rC,off8,r3
|
|
LD rD,off8,r4
|
|
|
|
LD rE,off16,r3
|
|
LD rF,off16,r4
|
|
|
|
LD rG,off24,r3
|
|
LD rH,off24,r4
|
|
cmpld cr0,rA,rB
|
|
|
|
addi r3,r3,32
|
|
addi r4,r4,32
|
|
|
|
/* Only one chunk in total: finish its compares at .Lfirst32. */
bdz .Lfirst32
|
|
|
|
/* Second chunk: load it while comparing C/D, E/F, G/H of the first. */
LD rA,0,r3
|
|
LD rB,0,r4
|
|
cmpld cr1,rC,rD
|
|
|
|
LD rC,off8,r3
|
|
LD rD,off8,r4
|
|
cmpld cr6,rE,rF
|
|
|
|
LD rE,off16,r3
|
|
LD rF,off16,r4
|
|
cmpld cr7,rG,rH
|
|
bne cr0,.LcmpAB
|
|
|
|
LD rG,off24,r3
|
|
LD rH,off24,r4
|
|
cmpld cr0,rA,rB
|
|
bne cr1,.LcmpCD
|
|
|
|
addi r3,r3,32
|
|
addi r4,r4,32
|
|
|
|
/* Exactly two chunks: drain the pending compares at .Lsecond32. */
bdz .Lsecond32
|
|
|
|
.balign 16
|
|
|
|
/*
 * Steady state: on entry cr6/cr7 hold the E/F and G/H results of the
 * chunk before last and cr0 the A/B result of the last loaded chunk.
 */
1: LD rA,0,r3
|
|
LD rB,0,r4
|
|
cmpld cr1,rC,rD
|
|
bne cr6,.LcmpEF
|
|
|
|
LD rC,off8,r3
|
|
LD rD,off8,r4
|
|
cmpld cr6,rE,rF
|
|
bne cr7,.LcmpGH
|
|
|
|
LD rE,off16,r3
|
|
LD rF,off16,r4
|
|
cmpld cr7,rG,rH
|
|
bne cr0,.LcmpAB
|
|
|
|
LD rG,off24,r3
|
|
LD rH,off24,r4
|
|
cmpld cr0,rA,rB
|
|
bne cr1,.LcmpCD
|
|
|
|
addi r3,r3,32
|
|
addi r4,r4,32
|
|
|
|
bdnz 1b
|
|
|
|
/*
 * Drain the pipeline: check the results still pending from the previous
 * chunk, then compare and check what is left of the last loaded chunk.
 */
.Lsecond32:
|
|
cmpld cr1,rC,rD
|
|
bne cr6,.LcmpEF
|
|
|
|
cmpld cr6,rE,rF
|
|
bne cr7,.LcmpGH
|
|
|
|
cmpld cr7,rG,rH
|
|
bne cr0,.LcmpAB
|
|
|
|
bne cr1,.LcmpCD
|
|
bne cr6,.LcmpEF
|
|
bne cr7,.LcmpGH
|
|
|
|
/*
 * All 32-byte chunks matched: restore r27-r31, then compare the
 * leftover r5 bytes (if any) with the byte loop.
 */
.Ltail:
|
|
ld r31,-8(r1)
|
|
ld r30,-16(r1)
|
|
ld r29,-24(r1)
|
|
ld r28,-32(r1)
|
|
ld r27,-40(r1)
|
|
|
|
cmpdi r5,0
|
|
beq .Lzero
|
|
b .Lshort
|
|
|
|
/* Single-chunk case: A/B is already in cr0; compare the other three. */
.Lfirst32:
|
|
cmpld cr1,rC,rD
|
|
cmpld cr6,rE,rF
|
|
cmpld cr7,rG,rH
|
|
|
|
bne cr0,.LcmpAB
|
|
bne cr1,.LcmpCD
|
|
bne cr6,.LcmpEF
|
|
bne cr7,.LcmpGH
|
|
|
|
b .Ltail
|
|
|
|
/*
 * Mismatch exits for the long loop. Each handler is reached with "not
 * equal" recorded in its condition register field by cmpld (an unsigned
 * compare) and returns 1 when the s1 doubleword was greater, -1 otherwise.
 * All of them leave through .Lout, which restores r27-r31.
 */
.LcmpAB:
|
|
li r3,1
|
|
bgt cr0,.Lout
|
|
li r3,-1
|
|
b .Lout
|
|
|
|
.LcmpCD:
|
|
li r3,1
|
|
bgt cr1,.Lout
|
|
li r3,-1
|
|
b .Lout
|
|
|
|
.LcmpEF:
|
|
li r3,1
|
|
bgt cr6,.Lout
|
|
li r3,-1
|
|
b .Lout
|
|
|
|
.LcmpGH:
|
|
li r3,1
|
|
bgt cr7,.Lout
|
|
li r3,-1
|
|
|
|
/* Reload the registers saved at .Llong_novmx_cmp and return r3. */
.Lout:
|
|
ld r31,-8(r1)
|
|
ld r30,-16(r1)
|
|
ld r29,-24(r1)
|
|
ld r28,-32(r1)
|
|
ld r27,-40(r1)
|
|
blr
|
|
|
|
/*
 * Mismatch exit for paths that never saved r27-r31: returns 1 when cr0
 * says "greater than", -1 otherwise, without touching the stack.
 */
.LcmpAB_lightweight: /* skip NV GPRS restore */
|
|
li r3,1
|
|
bgtlr
|
|
li r3,-1
|
|
blr
|
|
|
|
#ifdef CONFIG_ALTIVEC
|
|
.Lsameoffset_vmx_cmp:
|
|
/* Entered when the src/dst addrs have the same offset to the 8 bytes
 * align boundary.
 *
 * There is an optimization based on the following fact: memcmp()
 * tends to fail early, within the first 32 bytes.
 * Before applying VMX instructions, which lead to a 32x128bits
 * VMX regs load/restore penalty, we compare the first 32 bytes
 * so that we can catch the ~80% fail cases.
 */
|
|
|
|
/* Pre-check: four doublewords, r5 reduced by 8 for each one that matches. */
li r0,4
|
|
mtctr r0
|
|
.Lsameoffset_prechk_32B_loop:
|
|
LD rA,0,r3
|
|
LD rB,0,r4
|
|
cmpld cr0,rA,rB
|
|
addi r3,r3,8
|
|
addi r4,r4,8
|
|
bne cr0,.LcmpAB_lightweight
|
|
addi r5,r5,-8
|
|
bdnz .Lsameoffset_prechk_32B_loop
|
|
|
|
/*
 * cr1 "eq" means enter_vmx_ops returned 0: fall back to the scalar
 * long loop with the updated r3/r4/r5.
 */
ENTER_VMX_OPS
|
|
beq cr1,.Llong_novmx_cmp
|
|
|
|
3:
|
|
/* need to check whether r4 has the same offset as r3
 * to the 16 bytes boundary.
 */
|
|
xor r0,r3,r4
|
|
andi. r0,r0,0xf
|
|
bne .Ldiffoffset_vmx_cmp_start
|
|
|
|
/* len is no less than 4KB. Need to align to 16 bytes further:
 * if bit 3 of r3 is set, compare one more doubleword first.
 */
|
|
andi. rA,r3,8
|
|
LD rA,0,r3
|
|
beq 4f
|
|
LD rB,0,r4
|
|
cmpld cr0,rA,rB
|
|
addi r3,r3,8
|
|
addi r4,r4,8
|
|
addi r5,r5,-8
|
|
|
|
beq cr0,4f
|
|
/* Save and restore cr0 around EXIT_VMX_OPS. r5 is free to carry it:
 * the length is not used again on this exit path, and the macro
 * preserves r5 across its call.
 */
|
|
mfocrf r5,128
|
|
EXIT_VMX_OPS
|
|
mtocrf 128,r5
|
|
b .LcmpAB_lightweight
|
|
|
|
4:
|
|
/* compare 32 bytes for each loop; r5 keeps the leftover (r5 & 31) */
|
|
srdi r0,r5,5
|
|
mtctr r0
|
|
clrldi r5,r5,59
|
|
li off16,16
|
|
|
|
.balign 16
|
|
5:
|
|
/*
 * Two aligned 16-byte vector compares per iteration.
 * NOTE(review): VCMPEQUD_RC is defined outside this file; presumably it
 * is the record form of vcmpequd, so that "bnl cr6" is taken when the
 * two vectors are not entirely equal - confirm.
 */
lvx v0,0,r3
|
|
lvx v1,0,r4
|
|
VCMPEQUD_RC(v0,v0,v1)
|
|
bnl cr6,7f
|
|
lvx v0,off16,r3
|
|
lvx v1,off16,r4
|
|
VCMPEQUD_RC(v0,v0,v1)
|
|
bnl cr6,6f
|
|
addi r3,r3,32
|
|
addi r4,r4,32
|
|
bdnz 5b
|
|
|
|
/* All 32-byte chunks matched: leave VMX and compare the leftover. */
EXIT_VMX_OPS
|
|
cmpdi r5,0
|
|
beq .Lzero
|
|
b .Lcmp_lt32bytes
|
|
|
|
6:
|
|
/* The difference is in the second 16 bytes of the chunk. */
addi r3,r3,16
|
|
addi r4,r4,16
|
|
|
|
7:
|
|
/* diff the last 16 bytes: redo them as two doubleword compares to
 * find out which buffer is greater.
 */
|
|
EXIT_VMX_OPS
|
|
LD rA,0,r3
|
|
LD rB,0,r4
|
|
cmpld cr0,rA,rB
|
|
li off8,8
|
|
bne cr0,.LcmpAB_lightweight
|
|
|
|
LD rA,off8,r3
|
|
LD rB,off8,r4
|
|
cmpld cr0,rA,rB
|
|
bne cr0,.LcmpAB_lightweight
|
|
b .Lzero
|
|
#endif
|
|
|
|
.Ldiffoffset_8bytes_make_align_start:
	/*
	 * s1 (r3) and s2 (r4) have different offsets within an 8-byte
	 * doubleword, and r5 is greater than 7 (this label is only reached
	 * through .Lno_short). Bring s1 up to an 8-byte boundary so that the
	 * doubleword loops only perform unaligned loads from s2.
	 *
	 * On entry cr0 still holds the "offsets differ" result computed at
	 * function entry, and rlwinm without a dot does not update it, so
	 * the alignment of s1 has to be tested explicitly. Without the
	 * andi. the beq below could never be taken and an already aligned
	 * s1 would run the head compare with a zero shift count.
	 */
	andi.	r6,r3,7
	rlwinm	r6,r3,3,26,28	/* r6 = (s1 & 7) * 8, bit offset of s1 */
	beq	.Ldiffoffset_align_s1_8bytes

	/* now try to align s1 with 8 bytes */
	clrrdi	r3,r3,3		/* round s1 down to its doubleword */
	LD	rA,0,r3
	LD	rB,0,r4		/* unaligned load */
	sld	rA,rA,r6	/* drop the bytes that precede s1 ... */
	srd	rA,rA,r6	/* ... keeping the 8 - offset head bytes */
	srd	rB,rB,r6	/* keep the same number of leading s2 bytes */
	cmpld	cr0,rA,rB
	srwi	r6,r6,3		/* r6 = byte offset of s1; cr0 untouched */
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8		/* r6 = number of head bytes just compared */
	subf.	r5,r6,r5	/* that many fewer bytes remain */
	addi	r3,r3,8		/* s1 now sits on the next doubleword */
	add	r4,r4,r6	/* advance s2 by the same byte count */

	beq	.Lzero		/* nothing left to compare */
|
|
|
|
/*
 * Dispatch for the different-offset case once s1 is 8-byte aligned:
 * VMX for lengths of VMX_THRESH or more (CONFIG_ALTIVEC, inside the
 * CPU_FTR_ARCH_207S feature section), .Lcmp_lt32bytes for 31 bytes or
 * less, the scalar 32-byte loop otherwise.
 */
.Ldiffoffset_align_s1_8bytes:
|
|
/* now s1 is aligned with 8 bytes. */
|
|
#ifdef CONFIG_ALTIVEC
|
|
BEGIN_FTR_SECTION
|
|
/* only do vmx ops when the size is equal to or greater than 4K bytes */
|
|
cmpdi cr5,r5,VMX_THRESH
|
|
bge cr5,.Ldiffoffset_vmx_cmp
|
|
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
|
|
|
|
/* Also the fallback target when ENTER_VMX_OPS reports failure. */
.Ldiffoffset_novmx_cmp:
|
|
#endif
|
|
|
|
|
|
cmpdi cr5,r5,31
|
|
ble cr5,.Lcmp_lt32bytes
|
|
|
|
/*
 * With CONFIG_ALTIVEC, enter the scalar loop directly at
 * .Llong_novmx_cmp so the VMX threshold check at .Llong is skipped.
 */
#ifdef CONFIG_ALTIVEC
|
|
b .Llong_novmx_cmp
|
|
#else
|
|
b .Llong
|
|
#endif
|
|
|
|
#ifdef CONFIG_ALTIVEC
|
|
.Ldiffoffset_vmx_cmp:
|
|
/* Perform a 32 bytes pre-check (four doublewords) before
 * enabling VMX operations. r5 is reduced by 8 for each doubleword
 * that matches.
 */
|
|
li r0,4
|
|
mtctr r0
|
|
.Ldiffoffset_prechk_32B_loop:
|
|
LD rA,0,r3
|
|
LD rB,0,r4
|
|
cmpld cr0,rA,rB
|
|
addi r3,r3,8
|
|
addi r4,r4,8
|
|
bne cr0,.LcmpAB_lightweight
|
|
addi r5,r5,-8
|
|
bdnz .Ldiffoffset_prechk_32B_loop
|
|
|
|
/* cr1 "eq" means enter_vmx_ops returned 0: use the scalar code. */
ENTER_VMX_OPS
|
|
beq cr1,.Ldiffoffset_novmx_cmp
|
|
|
|
/*
 * Also entered from the same-offset VMX path when r3 and r4 differ in
 * their offset to the 16 bytes boundary.
 */
.Ldiffoffset_vmx_cmp_start:
|
|
/* Firstly try to align r3 with 16 bytes */
|
|
andi. r6,r3,0xf
|
|
li off16,16
|
|
beq .Ldiffoffset_vmx_s1_16bytes_align
|
|
|
|
/*
 * r3 is not 16-byte aligned: build 16 bytes starting at r3 (v9) and
 * at r4 (v10) from two aligned quadwords each, and compare them.
 */
LVS v3,0,r3
|
|
LVS v4,0,r4
|
|
|
|
lvx v5,0,r3
|
|
lvx v6,0,r4
|
|
LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
|
|
LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
|
|
|
|
/*
 * NOTE(review): VCMPEQUB_RC is defined outside this file; presumably it
 * is the record form of vcmpequb, so that "bnl cr6" is taken when the
 * two vectors are not entirely equal - confirm.
 */
VCMPEQUB_RC(v7,v9,v10)
|
|
bnl cr6,.Ldiffoffset_vmx_diff_found
|
|
|
|
/*
 * Advance both pointers by r6 = 16 - (r3 & 0xf) bytes, which puts r3
 * on a 16-byte boundary, and reduce the length by the same amount.
 */
subfic r6,r6,16
|
|
subf r5,r6,r5
|
|
add r3,r3,r6
|
|
add r4,r4,r6
|
|
|
|
.Ldiffoffset_vmx_s1_16bytes_align:
|
|
/* now s1 is aligned with 16 bytes */
|
|
lvx v6,0,r4
|
|
LVS v4,0,r4
|
|
srdi r6,r5,5 /* loop for 32 bytes each */
|
|
clrldi r5,r5,59
|
|
mtctr r6
|
|
|
|
.balign 16
|
|
.Ldiffoffset_vmx_32bytesloop:
|
|
/* The first qw of r4 was saved in v6. Each half of the loop loads the
 * next aligned qw of r4 into v8, permutes v6/v8 into v10, compares it
 * with the aligned qw of r3 in v9, and copies v8 into v6 (vor) so it
 * becomes the "first qw" of the following 16 bytes.
 */
|
|
lvx v9,0,r3
|
|
LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
|
|
VCMPEQUB_RC(v7,v9,v10)
|
|
vor v6,v8,v8
|
|
bnl cr6,.Ldiffoffset_vmx_diff_found
|
|
|
|
addi r3,r3,16
|
|
addi r4,r4,16
|
|
|
|
lvx v9,0,r3
|
|
LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
|
|
VCMPEQUB_RC(v7,v9,v10)
|
|
vor v6,v8,v8
|
|
bnl cr6,.Ldiffoffset_vmx_diff_found
|
|
|
|
addi r3,r3,16
|
|
addi r4,r4,16
|
|
|
|
bdnz .Ldiffoffset_vmx_32bytesloop
|
|
|
|
/* All 32-byte chunks matched: leave VMX and compare the leftover. */
EXIT_VMX_OPS
|
|
|
|
cmpdi r5,0
|
|
beq .Lzero
|
|
b .Lcmp_lt32bytes
|
|
|
|
/*
 * r3/r4 point at the 16 bytes in which the vector compare saw a
 * difference; redo them with the doubleword code to get the ordering.
 */
.Ldiffoffset_vmx_diff_found:
|
|
EXIT_VMX_OPS
|
|
/* anyway, the diff will appear in next 16 bytes */
|
|
li r5,16
|
|
b .Lcmp_lt32bytes
|
|
|
|
#endif
|
|
EXPORT_SYMBOL(memcmp)
|