crypto: powerpc - Factor out the core CRC vpmsum algorithm
The core nuts and bolts of the crc32c vpmsum algorithm will also work for a number of other CRC algorithms with different polynomials. Factor out the function into a new asm file. To handle multiple users of the function, a user simply provides constants, defines the name of their CRC function, and then #includes the core algorithm file. Cc: Anton Blanchard <anton@samba.org> Signed-off-by: Daniel Axtens <dja@axtens.net> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
2e6d603e51
commit
de696a2643
|
@ -0,0 +1,726 @@
|
|||
/*
|
||||
* Core of the accelerated CRC algorithm.
|
||||
* In your file, define the constants and CRC_FUNCTION_NAME
|
||||
* Then include this file.
|
||||
*
|
||||
* Calculate the checksum of data that is 16 byte aligned and a multiple of
|
||||
* 16 bytes.
|
||||
*
|
||||
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
|
||||
* chunks in order to mask the latency of the vpmsum instructions. If we
|
||||
* have more than 32 kB of data to checksum we repeat this step multiple
|
||||
* times, passing in the previous 1024 bits.
|
||||
*
|
||||
* The next step is to reduce the 1024 bits to 64 bits. This step adds
|
||||
* 32 bits of 0s to the end - this matches what a CRC does. We just
|
||||
* calculate constants that land the data in this 32 bits.
|
||||
*
|
||||
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
|
||||
* for n = CRC using POWER8 instructions. We use x = 32.
|
||||
*
|
||||
* http://en.wikipedia.org/wiki/Barrett_reduction
|
||||
*
|
||||
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <asm/ppc_asm.h>
|
||||
#include <asm/ppc-opcode.h>
|
||||
|
||||
#define MAX_SIZE 32768
|
||||
|
||||
.text
|
||||
|
||||
#if defined(__BIG_ENDIAN__)
|
||||
#define BYTESWAP_DATA
|
||||
#else
|
||||
#undef BYTESWAP_DATA
|
||||
#endif
|
||||
|
||||
#define off16 r25
|
||||
#define off32 r26
|
||||
#define off48 r27
|
||||
#define off64 r28
|
||||
#define off80 r29
|
||||
#define off96 r30
|
||||
#define off112 r31
|
||||
|
||||
#define const1 v24
|
||||
#define const2 v25
|
||||
|
||||
#define byteswap v26
|
||||
#define mask_32bit v27
|
||||
#define mask_64bit v28
|
||||
#define zeroes v29
|
||||
|
||||
#ifdef BYTESWAP_DATA
|
||||
#define VPERM(A, B, C, D) vperm A, B, C, D
|
||||
#else
|
||||
#define VPERM(A, B, C, D)
|
||||
#endif
|
||||
|
||||
/*
 * unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len)
 *
 * Registers as used by the code below:
 *   r3  = crc, the initial value (also receives the result before return)
 *   r4  = p, the data pointer (used as the base of every data lvx)
 *   r5  = len, the length in bytes
 *   r10 = copy of the incoming crc, returned unchanged by .Lzero
 *   r0  = 0 on the first pass, set to 1 before looping back for more data
 *
 * r25-r31 and v20-v29 are saved here and restored at .Lout.
 */
|
||||
FUNC_START(CRC_FUNCTION_NAME)
|
||||
/* Save r25-r31 below the stack pointer; r1 itself is not moved. */
std r31,-8(r1)
|
||||
std r30,-16(r1)
|
||||
std r29,-24(r1)
|
||||
std r28,-32(r1)
|
||||
std r27,-40(r1)
|
||||
std r26,-48(r1)
|
||||
std r25,-56(r1)
|
||||
|
||||
/* r25-r31 now hold the fixed byte offsets used as lvx/stvx indexes. */
li off16,16
|
||||
li off32,32
|
||||
li off48,48
|
||||
li off64,64
|
||||
li off80,80
|
||||
li off96,96
|
||||
li off112,112
|
||||
li r0,0 /* first pass marker, compared against 1 further down */
|
||||
|
||||
/* Enough room for saving 10 non volatile VMX registers */
|
||||
subi r6,r1,56+10*16
|
||||
subi r7,r1,56+2*16
|
||||
|
||||
/* v20-v27 are stored relative to r6, v28-v29 relative to r7. */
stvx v20,0,r6
|
||||
stvx v21,off16,r6
|
||||
stvx v22,off32,r6
|
||||
stvx v23,off48,r6
|
||||
stvx v24,off64,r6
|
||||
stvx v25,off80,r6
|
||||
stvx v26,off96,r6
|
||||
stvx v27,off112,r6
|
||||
stvx v28,0,r7
|
||||
stvx v29,off16,r7
|
||||
|
||||
mr r10,r3 /* keep the incoming crc for the len == 0 return */
|
||||
|
||||
/* v0 = all ones here; it is only used to build the two masks. */
vxor zeroes,zeroes,zeroes
|
||||
vspltisw v0,-1
|
||||
|
||||
/* Ones in the rightmost 32 (resp. 64) bits, zero everywhere else. */
vsldoi mask_32bit,zeroes,v0,4
|
||||
vsldoi mask_64bit,zeroes,v0,8
|
||||
|
||||
/* Get the initial value into v8 */
|
||||
vxor v8,v8,v8
|
||||
MTVRD(v8, R3)
|
||||
vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
|
||||
|
||||
/* Big endian only: load the permute control that VPERM() applies to data. */
#ifdef BYTESWAP_DATA
|
||||
addis r3,r2,.byteswap_constant@toc@ha
|
||||
addi r3,r3,.byteswap_constant@toc@l
|
||||
|
||||
lvx byteswap,0,r3
|
||||
addi r3,r3,16
|
||||
#endif
|
||||
|
||||
/* Fewer than 256 bytes: take the .Lshort path instead of the main loop. */
cmpdi r5,256
|
||||
blt .Lshort
|
||||
|
||||
rldicr r6,r5,0,56
|
||||
|
||||
/* Checksum in blocks of MAX_SIZE */
|
||||
1: lis r7,MAX_SIZE@h
|
||||
ori r7,r7,MAX_SIZE@l
|
||||
mr r9,r7
|
||||
cmpd r6,r7
|
||||
bgt 2f
|
||||
mr r7,r6
|
||||
2: subf r6,r7,r6
|
||||
|
||||
/* our main loop does 128 bytes at a time */
|
||||
srdi r7,r7,7
|
||||
|
||||
/*
|
||||
* Work out the offset into the constants table to start at. Each
|
||||
* constant is 16 bytes, and it is used against 128 bytes of input
|
||||
* data - 128 / 16 = 8
|
||||
*/
|
||||
sldi r8,r7,4
|
||||
srdi r9,r9,3
|
||||
subf r8,r8,r9
|
||||
|
||||
/* We reduce our final 128 bytes in a separate step */
|
||||
addi r7,r7,-1
|
||||
mtctr r7
|
||||
|
||||
addis r3,r2,.constants@toc@ha
|
||||
addi r3,r3,.constants@toc@l
|
||||
|
||||
/* Find the start of our constants */
|
||||
add r3,r3,r8
|
||||
|
||||
/* zero v0-v7 which will contain our checksums */
|
||||
vxor v0,v0,v0
|
||||
vxor v1,v1,v1
|
||||
vxor v2,v2,v2
|
||||
vxor v3,v3,v3
|
||||
vxor v4,v4,v4
|
||||
vxor v5,v5,v5
|
||||
vxor v6,v6,v6
|
||||
vxor v7,v7,v7
|
||||
|
||||
lvx const1,0,r3
|
||||
|
||||
/*
|
||||
* If we are looping back to consume more data we use the values
|
||||
* already in v16-v23.
|
||||
*/
|
||||
cmpdi r0,1
|
||||
beq 2f
|
||||
|
||||
/* First warm up pass */
|
||||
lvx v16,0,r4
|
||||
lvx v17,off16,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
lvx v18,off32,r4
|
||||
lvx v19,off48,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
lvx v20,off64,r4
|
||||
lvx v21,off80,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
lvx v22,off96,r4
|
||||
lvx v23,off112,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
addi r4,r4,8*16
|
||||
|
||||
/* xor in initial value */
|
||||
vxor v16,v16,v8
|
||||
|
||||
2: bdz .Lfirst_warm_up_done
|
||||
|
||||
addi r3,r3,16
|
||||
lvx const2,0,r3
|
||||
|
||||
/* Second warm up pass */
|
||||
VPMSUMD(v8,v16,const1)
|
||||
lvx v16,0,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v9,v17,const1)
|
||||
lvx v17,off16,r4
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v10,v18,const1)
|
||||
lvx v18,off32,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v11,v19,const1)
|
||||
lvx v19,off48,r4
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v12,v20,const1)
|
||||
lvx v20,off64,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v13,v21,const1)
|
||||
lvx v21,off80,r4
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v14,v22,const1)
|
||||
lvx v22,off96,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v15,v23,const1)
|
||||
lvx v23,off112,r4
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
bdz .Lfirst_cool_down
|
||||
|
||||
/*
|
||||
* main loop. We modulo schedule it such that it takes three iterations
|
||||
* to complete - first iteration load, second iteration vpmsum, third
|
||||
* iteration xor.
|
||||
*/
|
||||
.balign 16
|
||||
4: lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v0,v0,v8
|
||||
VPMSUMD(v8,v16,const2)
|
||||
lvx v16,0,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v1,v1,v9
|
||||
VPMSUMD(v9,v17,const2)
|
||||
lvx v17,off16,r4
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v2,v2,v10
|
||||
VPMSUMD(v10,v18,const2)
|
||||
lvx v18,off32,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v3,v3,v11
|
||||
VPMSUMD(v11,v19,const2)
|
||||
lvx v19,off48,r4
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
lvx const2,0,r3
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v4,v4,v12
|
||||
VPMSUMD(v12,v20,const1)
|
||||
lvx v20,off64,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v5,v5,v13
|
||||
VPMSUMD(v13,v21,const1)
|
||||
lvx v21,off80,r4
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v6,v6,v14
|
||||
VPMSUMD(v14,v22,const1)
|
||||
lvx v22,off96,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v7,v7,v15
|
||||
VPMSUMD(v15,v23,const1)
|
||||
lvx v23,off112,r4
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
bdnz 4b
|
||||
|
||||
.Lfirst_cool_down:
|
||||
/* First cool down pass */
|
||||
lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
|
||||
vxor v0,v0,v8
|
||||
VPMSUMD(v8,v16,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v1,v1,v9
|
||||
VPMSUMD(v9,v17,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v2,v2,v10
|
||||
VPMSUMD(v10,v18,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v3,v3,v11
|
||||
VPMSUMD(v11,v19,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v4,v4,v12
|
||||
VPMSUMD(v12,v20,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v5,v5,v13
|
||||
VPMSUMD(v13,v21,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v6,v6,v14
|
||||
VPMSUMD(v14,v22,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v7,v7,v15
|
||||
VPMSUMD(v15,v23,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
.Lsecond_cool_down:
|
||||
/* Second cool down pass */
|
||||
vxor v0,v0,v8
|
||||
vxor v1,v1,v9
|
||||
vxor v2,v2,v10
|
||||
vxor v3,v3,v11
|
||||
vxor v4,v4,v12
|
||||
vxor v5,v5,v13
|
||||
vxor v6,v6,v14
|
||||
vxor v7,v7,v15
|
||||
|
||||
/*
|
||||
* vpmsumd produces a 96 bit result in the least significant bits
|
||||
* of the register. Since we are bit reflected we have to shift it
|
||||
* left 32 bits so it occupies the least significant bits in the
|
||||
* bit reflected domain.
|
||||
*/
|
||||
vsldoi v0,v0,zeroes,4
|
||||
vsldoi v1,v1,zeroes,4
|
||||
vsldoi v2,v2,zeroes,4
|
||||
vsldoi v3,v3,zeroes,4
|
||||
vsldoi v4,v4,zeroes,4
|
||||
vsldoi v5,v5,zeroes,4
|
||||
vsldoi v6,v6,zeroes,4
|
||||
vsldoi v7,v7,zeroes,4
|
||||
|
||||
/* xor with last 1024 bits */
|
||||
lvx v8,0,r4
|
||||
lvx v9,off16,r4
|
||||
VPERM(v8,v8,v8,byteswap)
|
||||
VPERM(v9,v9,v9,byteswap)
|
||||
lvx v10,off32,r4
|
||||
lvx v11,off48,r4
|
||||
VPERM(v10,v10,v10,byteswap)
|
||||
VPERM(v11,v11,v11,byteswap)
|
||||
lvx v12,off64,r4
|
||||
lvx v13,off80,r4
|
||||
VPERM(v12,v12,v12,byteswap)
|
||||
VPERM(v13,v13,v13,byteswap)
|
||||
lvx v14,off96,r4
|
||||
lvx v15,off112,r4
|
||||
VPERM(v14,v14,v14,byteswap)
|
||||
VPERM(v15,v15,v15,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
vxor v16,v0,v8
|
||||
vxor v17,v1,v9
|
||||
vxor v18,v2,v10
|
||||
vxor v19,v3,v11
|
||||
vxor v20,v4,v12
|
||||
vxor v21,v5,v13
|
||||
vxor v22,v6,v14
|
||||
vxor v23,v7,v15
|
||||
|
||||
li r0,1
|
||||
cmpdi r6,0
|
||||
addi r6,r6,128
|
||||
bne 1b
|
||||
|
||||
/* Work out how many bytes we have left */
|
||||
andi. r5,r5,127
|
||||
|
||||
/* Calculate where in the constant table we need to start */
|
||||
subfic r6,r5,128
|
||||
add r3,r3,r6
|
||||
|
||||
/* How many 16 byte chunks are in the tail */
|
||||
srdi r7,r5,4
|
||||
mtctr r7
|
||||
|
||||
/*
|
||||
* Reduce the previously calculated 1024 bits to 64 bits, shifting
|
||||
* 32 bits to include the trailing 32 bits of zeros
|
||||
*/
|
||||
lvx v0,0,r3
|
||||
lvx v1,off16,r3
|
||||
lvx v2,off32,r3
|
||||
lvx v3,off48,r3
|
||||
lvx v4,off64,r3
|
||||
lvx v5,off80,r3
|
||||
lvx v6,off96,r3
|
||||
lvx v7,off112,r3
|
||||
addi r3,r3,8*16
|
||||
|
||||
VPMSUMW(v0,v16,v0)
|
||||
VPMSUMW(v1,v17,v1)
|
||||
VPMSUMW(v2,v18,v2)
|
||||
VPMSUMW(v3,v19,v3)
|
||||
VPMSUMW(v4,v20,v4)
|
||||
VPMSUMW(v5,v21,v5)
|
||||
VPMSUMW(v6,v22,v6)
|
||||
VPMSUMW(v7,v23,v7)
|
||||
|
||||
/* Now reduce the tail (0 - 112 bytes) */
|
||||
cmpdi r7,0
|
||||
beq 1f
|
||||
|
||||
lvx v16,0,r4
|
||||
lvx v17,0,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off32,r4
|
||||
lvx v17,off32,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off64,r4
|
||||
lvx v17,off64,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off96,r4
|
||||
lvx v17,off96,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
|
||||
/* Now xor all the parallel chunks together */
|
||||
1: vxor v0,v0,v1
|
||||
vxor v2,v2,v3
|
||||
vxor v4,v4,v5
|
||||
vxor v6,v6,v7
|
||||
|
||||
vxor v0,v0,v2
|
||||
vxor v4,v4,v6
|
||||
|
||||
vxor v0,v0,v4
|
||||
|
||||
.Lbarrett_reduction:
/*
 * Common final step for the long and the short path. On entry v0 holds
 * the xor of all partial results. The two 64 bit halves of v0 are folded
 * together, reduced with the two constants loaded from .barrett_constants
 * and the result is moved to r3. Execution then continues into .Lout.
 */
|
||||
/* Barrett constants */
|
||||
addis r3,r2,.barrett_constants@toc@ha
|
||||
addi r3,r3,.barrett_constants@toc@l
|
||||
|
||||
lvx const1,0,r3
|
||||
lvx const2,off16,r3
|
||||
|
||||
vsldoi v1,v0,v0,8
|
||||
vxor v0,v0,v1 /* xor two 64 bit results together */
|
||||
|
||||
/* shift left one bit */
|
||||
vspltisb v1,1
|
||||
vsl v0,v0,v1
|
||||
|
||||
vand v0,v0,mask_64bit
|
||||
|
||||
/*
|
||||
* The reflected version of Barrett reduction. Instead of bit
|
||||
* reflecting our data (which is expensive to do), we bit reflect our
|
||||
* constants and our algorithm, which means the intermediate data in
|
||||
* our vector registers goes from 0-63 instead of 63-0. We can reflect
|
||||
* the algorithm because we don't carry in mod 2 arithmetic.
|
||||
*/
|
||||
vand v1,v0,mask_32bit /* bottom 32 bits of a */
|
||||
VPMSUMD(v1,v1,const1) /* ma */
|
||||
vand v1,v1,mask_32bit /* bottom 32 bits of ma */
|
||||
VPMSUMD(v1,v1,const2) /* qn */
|
||||
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
|
||||
|
||||
/*
|
||||
* Since we are bit reflected, the result (ie the low 32 bits) is in
|
||||
* the high 32 bits. We just need to shift it left 4 bytes
|
||||
* V0 [ 0 1 X 3 ]
|
||||
* V0 [ 0 X 2 3 ]
|
||||
*/
|
||||
vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of v0 */
|
||||
|
||||
/* Get it into r3 */
|
||||
MFVRD(R3, v0)
|
||||
|
||||
.Lout:
/*
 * Common exit: undo the register saves done at function entry and
 * return with the result already in r3.
 */
|
||||
/* Same two save-area addresses that were computed at entry. */
subi r6,r1,56+10*16
|
||||
subi r7,r1,56+2*16
|
||||
|
||||
/*
 * Reload v20-v29 first: off16-off112 are r25-r31 and must still hold
 * their offsets while they are used as lvx indexes.
 */
lvx v20,0,r6
|
||||
lvx v21,off16,r6
|
||||
lvx v22,off32,r6
|
||||
lvx v23,off48,r6
|
||||
lvx v24,off64,r6
|
||||
lvx v25,off80,r6
|
||||
lvx v26,off96,r6
|
||||
lvx v27,off112,r6
|
||||
lvx v28,0,r7
|
||||
lvx v29,off16,r7
|
||||
|
||||
/* Now the offset registers themselves can be restored. */
ld r31,-8(r1)
|
||||
ld r30,-16(r1)
|
||||
ld r29,-24(r1)
|
||||
ld r28,-32(r1)
|
||||
ld r27,-40(r1)
|
||||
ld r26,-48(r1)
|
||||
ld r25,-56(r1)
|
||||
|
||||
blr
|
||||
|
||||
.Lfirst_warm_up_done:
/*
 * Target of the bdz that follows the first warm up pass. No further
 * input is loaded here: v16-v23 are multiplied by the constant at r3
 * into v8-v15, and .Lsecond_cool_down then xors those into v0-v7.
 */
|
||||
lvx const1,0,r3
|
||||
addi r3,r3,16 /* step r3 past the constant just loaded */
|
||||
|
||||
VPMSUMD(v8,v16,const1)
|
||||
VPMSUMD(v9,v17,const1)
|
||||
VPMSUMD(v10,v18,const1)
|
||||
VPMSUMD(v11,v19,const1)
|
||||
VPMSUMD(v12,v20,const1)
|
||||
VPMSUMD(v13,v21,const1)
|
||||
VPMSUMD(v14,v22,const1)
|
||||
VPMSUMD(v15,v23,const1)
|
||||
|
||||
b .Lsecond_cool_down
|
||||
|
||||
.Lshort:
|
||||
cmpdi r5,0
|
||||
beq .Lzero
|
||||
|
||||
addis r3,r2,.short_constants@toc@ha
|
||||
addi r3,r3,.short_constants@toc@l
|
||||
|
||||
/* Calculate where in the constant table we need to start */
|
||||
subfic r6,r5,256
|
||||
add r3,r3,r6
|
||||
|
||||
/* How many 16 byte chunks? */
|
||||
srdi r7,r5,4
|
||||
mtctr r7
|
||||
|
||||
vxor v19,v19,v19
|
||||
vxor v20,v20,v20
|
||||
|
||||
lvx v0,0,r4
|
||||
lvx v16,0,r3
|
||||
VPERM(v0,v0,v16,byteswap)
|
||||
vxor v0,v0,v8 /* xor in initial value */
|
||||
VPMSUMW(v0,v0,v16)
|
||||
bdz .Lv0
|
||||
|
||||
lvx v1,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v1,v1,v17,byteswap)
|
||||
VPMSUMW(v1,v1,v17)
|
||||
bdz .Lv1
|
||||
|
||||
lvx v2,off32,r4
|
||||
lvx v16,off32,r3
|
||||
VPERM(v2,v2,v16,byteswap)
|
||||
VPMSUMW(v2,v2,v16)
|
||||
bdz .Lv2
|
||||
|
||||
lvx v3,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v3,v3,v17,byteswap)
|
||||
VPMSUMW(v3,v3,v17)
|
||||
bdz .Lv3
|
||||
|
||||
lvx v4,off64,r4
|
||||
lvx v16,off64,r3
|
||||
VPERM(v4,v4,v16,byteswap)
|
||||
VPMSUMW(v4,v4,v16)
|
||||
bdz .Lv4
|
||||
|
||||
lvx v5,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v5,v5,v17,byteswap)
|
||||
VPMSUMW(v5,v5,v17)
|
||||
bdz .Lv5
|
||||
|
||||
lvx v6,off96,r4
|
||||
lvx v16,off96,r3
|
||||
VPERM(v6,v6,v16,byteswap)
|
||||
VPMSUMW(v6,v6,v16)
|
||||
bdz .Lv6
|
||||
|
||||
lvx v7,off112,r4
|
||||
lvx v17,off112,r3
|
||||
VPERM(v7,v7,v17,byteswap)
|
||||
VPMSUMW(v7,v7,v17)
|
||||
bdz .Lv7
|
||||
|
||||
addi r3,r3,128
|
||||
addi r4,r4,128
|
||||
|
||||
lvx v8,0,r4
|
||||
lvx v16,0,r3
|
||||
VPERM(v8,v8,v16,byteswap)
|
||||
VPMSUMW(v8,v8,v16)
|
||||
bdz .Lv8
|
||||
|
||||
lvx v9,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v9,v9,v17,byteswap)
|
||||
VPMSUMW(v9,v9,v17)
|
||||
bdz .Lv9
|
||||
|
||||
lvx v10,off32,r4
|
||||
lvx v16,off32,r3
|
||||
VPERM(v10,v10,v16,byteswap)
|
||||
VPMSUMW(v10,v10,v16)
|
||||
bdz .Lv10
|
||||
|
||||
lvx v11,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v11,v11,v17,byteswap)
|
||||
VPMSUMW(v11,v11,v17)
|
||||
bdz .Lv11
|
||||
|
||||
lvx v12,off64,r4
|
||||
lvx v16,off64,r3
|
||||
VPERM(v12,v12,v16,byteswap)
|
||||
VPMSUMW(v12,v12,v16)
|
||||
bdz .Lv12
|
||||
|
||||
lvx v13,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v13,v13,v17,byteswap)
|
||||
VPMSUMW(v13,v13,v17)
|
||||
bdz .Lv13
|
||||
|
||||
lvx v14,off96,r4
|
||||
lvx v16,off96,r3
|
||||
VPERM(v14,v14,v16,byteswap)
|
||||
VPMSUMW(v14,v14,v16)
|
||||
bdz .Lv14
|
||||
|
||||
lvx v15,off112,r4
|
||||
lvx v17,off112,r3
|
||||
VPERM(v15,v15,v17,byteswap)
|
||||
VPMSUMW(v15,v15,v17)
|
||||
|
||||
.Lv15: vxor v19,v19,v15
|
||||
.Lv14: vxor v20,v20,v14
|
||||
.Lv13: vxor v19,v19,v13
|
||||
.Lv12: vxor v20,v20,v12
|
||||
.Lv11: vxor v19,v19,v11
|
||||
.Lv10: vxor v20,v20,v10
|
||||
.Lv9: vxor v19,v19,v9
|
||||
.Lv8: vxor v20,v20,v8
|
||||
.Lv7: vxor v19,v19,v7
|
||||
.Lv6: vxor v20,v20,v6
|
||||
.Lv5: vxor v19,v19,v5
|
||||
.Lv4: vxor v20,v20,v4
|
||||
.Lv3: vxor v19,v19,v3
|
||||
.Lv2: vxor v20,v20,v2
|
||||
.Lv1: vxor v19,v19,v1
|
||||
.Lv0: vxor v20,v20,v0
|
||||
|
||||
vxor v0,v19,v20
|
||||
|
||||
b .Lbarrett_reduction
|
||||
|
||||
.Lzero:
/* len == 0: return the crc that was passed in, via the common exit. */
|
||||
mr r3,r10 /* r10 is the copy of r3 taken at function entry */
|
||||
b .Lout
|
||||
|
||||
FUNC_END(CRC_FUNCTION_NAME)
|
|
@ -1,20 +1,5 @@
|
|||
/*
|
||||
* Calculate the checksum of data that is 16 byte aligned and a multiple of
|
||||
* 16 bytes.
|
||||
*
|
||||
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
|
||||
* chunks in order to mask the latency of the vpmsum instructions. If we
|
||||
* have more than 32 kB of data to checksum we repeat this step multiple
|
||||
* times, passing in the previous 1024 bits.
|
||||
*
|
||||
* The next step is to reduce the 1024 bits to 64 bits. This step adds
|
||||
* 32 bits of 0s to the end - this matches what a CRC does. We just
|
||||
* calculate constants that land the data in this 32 bits.
|
||||
*
|
||||
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
|
||||
* for n = CRC using POWER8 instructions. We use x = 32.
|
||||
*
|
||||
* http://en.wikipedia.org/wiki/Barrett_reduction
|
||||
* Calculate a crc32c with vpmsum acceleration
|
||||
*
|
||||
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
|
||||
*
|
||||
|
@ -23,9 +8,6 @@
|
|||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
#include <asm/ppc_asm.h>
|
||||
#include <asm/ppc-opcode.h>
|
||||
|
||||
.section .rodata
|
||||
.balign 16
|
||||
|
||||
|
@ -33,7 +15,6 @@
|
|||
/* byte reverse permute constant */
|
||||
.octa 0x0F0E0D0C0B0A09080706050403020100
|
||||
|
||||
#define MAX_SIZE 32768
|
||||
.constants:
|
||||
|
||||
/* Reduce 262144 bits to 1024 bits */
|
||||
|
@ -860,694 +841,5 @@
|
|||
/* 33 bit reflected Barrett constant n */
|
||||
.octa 0x00000000000000000000000105ec76f1
|
||||
|
||||
.text
|
||||
|
||||
#if defined(__BIG_ENDIAN__)
|
||||
#define BYTESWAP_DATA
|
||||
#else
|
||||
#undef BYTESWAP_DATA
|
||||
#endif
|
||||
|
||||
#define off16 r25
|
||||
#define off32 r26
|
||||
#define off48 r27
|
||||
#define off64 r28
|
||||
#define off80 r29
|
||||
#define off96 r30
|
||||
#define off112 r31
|
||||
|
||||
#define const1 v24
|
||||
#define const2 v25
|
||||
|
||||
#define byteswap v26
|
||||
#define mask_32bit v27
|
||||
#define mask_64bit v28
|
||||
#define zeroes v29
|
||||
|
||||
#ifdef BYTESWAP_DATA
|
||||
#define VPERM(A, B, C, D) vperm A, B, C, D
|
||||
#else
|
||||
#define VPERM(A, B, C, D)
|
||||
#endif
|
||||
|
||||
/* unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len) */
|
||||
FUNC_START(__crc32c_vpmsum)
|
||||
std r31,-8(r1)
|
||||
std r30,-16(r1)
|
||||
std r29,-24(r1)
|
||||
std r28,-32(r1)
|
||||
std r27,-40(r1)
|
||||
std r26,-48(r1)
|
||||
std r25,-56(r1)
|
||||
|
||||
li off16,16
|
||||
li off32,32
|
||||
li off48,48
|
||||
li off64,64
|
||||
li off80,80
|
||||
li off96,96
|
||||
li off112,112
|
||||
li r0,0
|
||||
|
||||
/* Enough room for saving 10 non volatile VMX registers */
|
||||
subi r6,r1,56+10*16
|
||||
subi r7,r1,56+2*16
|
||||
|
||||
stvx v20,0,r6
|
||||
stvx v21,off16,r6
|
||||
stvx v22,off32,r6
|
||||
stvx v23,off48,r6
|
||||
stvx v24,off64,r6
|
||||
stvx v25,off80,r6
|
||||
stvx v26,off96,r6
|
||||
stvx v27,off112,r6
|
||||
stvx v28,0,r7
|
||||
stvx v29,off16,r7
|
||||
|
||||
mr r10,r3
|
||||
|
||||
vxor zeroes,zeroes,zeroes
|
||||
vspltisw v0,-1
|
||||
|
||||
vsldoi mask_32bit,zeroes,v0,4
|
||||
vsldoi mask_64bit,zeroes,v0,8
|
||||
|
||||
/* Get the initial value into v8 */
|
||||
vxor v8,v8,v8
|
||||
MTVRD(v8, R3)
|
||||
vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
|
||||
|
||||
#ifdef BYTESWAP_DATA
|
||||
addis r3,r2,.byteswap_constant@toc@ha
|
||||
addi r3,r3,.byteswap_constant@toc@l
|
||||
|
||||
lvx byteswap,0,r3
|
||||
addi r3,r3,16
|
||||
#endif
|
||||
|
||||
cmpdi r5,256
|
||||
blt .Lshort
|
||||
|
||||
rldicr r6,r5,0,56
|
||||
|
||||
/* Checksum in blocks of MAX_SIZE */
|
||||
1: lis r7,MAX_SIZE@h
|
||||
ori r7,r7,MAX_SIZE@l
|
||||
mr r9,r7
|
||||
cmpd r6,r7
|
||||
bgt 2f
|
||||
mr r7,r6
|
||||
2: subf r6,r7,r6
|
||||
|
||||
/* our main loop does 128 bytes at a time */
|
||||
srdi r7,r7,7
|
||||
|
||||
/*
|
||||
* Work out the offset into the constants table to start at. Each
|
||||
* constant is 16 bytes, and it is used against 128 bytes of input
|
||||
* data - 128 / 16 = 8
|
||||
*/
|
||||
sldi r8,r7,4
|
||||
srdi r9,r9,3
|
||||
subf r8,r8,r9
|
||||
|
||||
/* We reduce our final 128 bytes in a separate step */
|
||||
addi r7,r7,-1
|
||||
mtctr r7
|
||||
|
||||
addis r3,r2,.constants@toc@ha
|
||||
addi r3,r3,.constants@toc@l
|
||||
|
||||
/* Find the start of our constants */
|
||||
add r3,r3,r8
|
||||
|
||||
/* zero v0-v7 which will contain our checksums */
|
||||
vxor v0,v0,v0
|
||||
vxor v1,v1,v1
|
||||
vxor v2,v2,v2
|
||||
vxor v3,v3,v3
|
||||
vxor v4,v4,v4
|
||||
vxor v5,v5,v5
|
||||
vxor v6,v6,v6
|
||||
vxor v7,v7,v7
|
||||
|
||||
lvx const1,0,r3
|
||||
|
||||
/*
|
||||
* If we are looping back to consume more data we use the values
|
||||
* already in v16-v23.
|
||||
*/
|
||||
cmpdi r0,1
|
||||
beq 2f
|
||||
|
||||
/* First warm up pass */
|
||||
lvx v16,0,r4
|
||||
lvx v17,off16,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
lvx v18,off32,r4
|
||||
lvx v19,off48,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
lvx v20,off64,r4
|
||||
lvx v21,off80,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
lvx v22,off96,r4
|
||||
lvx v23,off112,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
addi r4,r4,8*16
|
||||
|
||||
/* xor in initial value */
|
||||
vxor v16,v16,v8
|
||||
|
||||
2: bdz .Lfirst_warm_up_done
|
||||
|
||||
addi r3,r3,16
|
||||
lvx const2,0,r3
|
||||
|
||||
/* Second warm up pass */
|
||||
VPMSUMD(v8,v16,const1)
|
||||
lvx v16,0,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v9,v17,const1)
|
||||
lvx v17,off16,r4
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v10,v18,const1)
|
||||
lvx v18,off32,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v11,v19,const1)
|
||||
lvx v19,off48,r4
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v12,v20,const1)
|
||||
lvx v20,off64,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v13,v21,const1)
|
||||
lvx v21,off80,r4
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v14,v22,const1)
|
||||
lvx v22,off96,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v15,v23,const1)
|
||||
lvx v23,off112,r4
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
bdz .Lfirst_cool_down
|
||||
|
||||
/*
|
||||
* main loop. We modulo schedule it such that it takes three iterations
|
||||
* to complete - first iteration load, second iteration vpmsum, third
|
||||
* iteration xor.
|
||||
*/
|
||||
.balign 16
|
||||
4: lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v0,v0,v8
|
||||
VPMSUMD(v8,v16,const2)
|
||||
lvx v16,0,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v1,v1,v9
|
||||
VPMSUMD(v9,v17,const2)
|
||||
lvx v17,off16,r4
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v2,v2,v10
|
||||
VPMSUMD(v10,v18,const2)
|
||||
lvx v18,off32,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v3,v3,v11
|
||||
VPMSUMD(v11,v19,const2)
|
||||
lvx v19,off48,r4
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
lvx const2,0,r3
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v4,v4,v12
|
||||
VPMSUMD(v12,v20,const1)
|
||||
lvx v20,off64,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v5,v5,v13
|
||||
VPMSUMD(v13,v21,const1)
|
||||
lvx v21,off80,r4
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v6,v6,v14
|
||||
VPMSUMD(v14,v22,const1)
|
||||
lvx v22,off96,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v7,v7,v15
|
||||
VPMSUMD(v15,v23,const1)
|
||||
lvx v23,off112,r4
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
bdnz 4b
|
||||
|
||||
.Lfirst_cool_down:
|
||||
/* First cool down pass */
|
||||
lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
|
||||
vxor v0,v0,v8
|
||||
VPMSUMD(v8,v16,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v1,v1,v9
|
||||
VPMSUMD(v9,v17,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v2,v2,v10
|
||||
VPMSUMD(v10,v18,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v3,v3,v11
|
||||
VPMSUMD(v11,v19,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v4,v4,v12
|
||||
VPMSUMD(v12,v20,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v5,v5,v13
|
||||
VPMSUMD(v13,v21,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v6,v6,v14
|
||||
VPMSUMD(v14,v22,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v7,v7,v15
|
||||
VPMSUMD(v15,v23,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
.Lsecond_cool_down:
|
||||
/* Second cool down pass */
|
||||
vxor v0,v0,v8
|
||||
vxor v1,v1,v9
|
||||
vxor v2,v2,v10
|
||||
vxor v3,v3,v11
|
||||
vxor v4,v4,v12
|
||||
vxor v5,v5,v13
|
||||
vxor v6,v6,v14
|
||||
vxor v7,v7,v15
|
||||
|
||||
/*
|
||||
* vpmsumd produces a 96 bit result in the least significant bits
|
||||
* of the register. Since we are bit reflected we have to shift it
|
||||
* left 32 bits so it occupies the least significant bits in the
|
||||
* bit reflected domain.
|
||||
*/
|
||||
vsldoi v0,v0,zeroes,4
|
||||
vsldoi v1,v1,zeroes,4
|
||||
vsldoi v2,v2,zeroes,4
|
||||
vsldoi v3,v3,zeroes,4
|
||||
vsldoi v4,v4,zeroes,4
|
||||
vsldoi v5,v5,zeroes,4
|
||||
vsldoi v6,v6,zeroes,4
|
||||
vsldoi v7,v7,zeroes,4
|
||||
|
||||
/* xor with last 1024 bits */
|
||||
lvx v8,0,r4
|
||||
lvx v9,off16,r4
|
||||
VPERM(v8,v8,v8,byteswap)
|
||||
VPERM(v9,v9,v9,byteswap)
|
||||
lvx v10,off32,r4
|
||||
lvx v11,off48,r4
|
||||
VPERM(v10,v10,v10,byteswap)
|
||||
VPERM(v11,v11,v11,byteswap)
|
||||
lvx v12,off64,r4
|
||||
lvx v13,off80,r4
|
||||
VPERM(v12,v12,v12,byteswap)
|
||||
VPERM(v13,v13,v13,byteswap)
|
||||
lvx v14,off96,r4
|
||||
lvx v15,off112,r4
|
||||
VPERM(v14,v14,v14,byteswap)
|
||||
VPERM(v15,v15,v15,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
vxor v16,v0,v8
|
||||
vxor v17,v1,v9
|
||||
vxor v18,v2,v10
|
||||
vxor v19,v3,v11
|
||||
vxor v20,v4,v12
|
||||
vxor v21,v5,v13
|
||||
vxor v22,v6,v14
|
||||
vxor v23,v7,v15
|
||||
|
||||
li r0,1
|
||||
cmpdi r6,0
|
||||
addi r6,r6,128
|
||||
bne 1b
|
||||
|
||||
/* Work out how many bytes we have left */
|
||||
andi. r5,r5,127
|
||||
|
||||
/* Calculate where in the constant table we need to start */
|
||||
subfic r6,r5,128
|
||||
add r3,r3,r6
|
||||
|
||||
/* How many 16 byte chunks are in the tail */
|
||||
srdi r7,r5,4
|
||||
mtctr r7
|
||||
|
||||
/*
|
||||
* Reduce the previously calculated 1024 bits to 64 bits, shifting
|
||||
* 32 bits to include the trailing 32 bits of zeros
|
||||
*/
|
||||
lvx v0,0,r3
|
||||
lvx v1,off16,r3
|
||||
lvx v2,off32,r3
|
||||
lvx v3,off48,r3
|
||||
lvx v4,off64,r3
|
||||
lvx v5,off80,r3
|
||||
lvx v6,off96,r3
|
||||
lvx v7,off112,r3
|
||||
addi r3,r3,8*16
|
||||
|
||||
VPMSUMW(v0,v16,v0)
|
||||
VPMSUMW(v1,v17,v1)
|
||||
VPMSUMW(v2,v18,v2)
|
||||
VPMSUMW(v3,v19,v3)
|
||||
VPMSUMW(v4,v20,v4)
|
||||
VPMSUMW(v5,v21,v5)
|
||||
VPMSUMW(v6,v22,v6)
|
||||
VPMSUMW(v7,v23,v7)
|
||||
|
||||
/* Now reduce the tail (0 - 112 bytes) */
|
||||
cmpdi r7,0
|
||||
beq 1f
|
||||
|
||||
lvx v16,0,r4
|
||||
lvx v17,0,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off32,r4
|
||||
lvx v17,off32,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off64,r4
|
||||
lvx v17,off64,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off96,r4
|
||||
lvx v17,off96,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
|
||||
/* Now xor all the parallel chunks together */
|
||||
1: vxor v0,v0,v1
|
||||
vxor v2,v2,v3
|
||||
vxor v4,v4,v5
|
||||
vxor v6,v6,v7
|
||||
|
||||
vxor v0,v0,v2
|
||||
vxor v4,v4,v6
|
||||
|
||||
vxor v0,v0,v4
|
||||
|
||||
.Lbarrett_reduction:
|
||||
/* Barrett constants */
|
||||
addis r3,r2,.barrett_constants@toc@ha
|
||||
addi r3,r3,.barrett_constants@toc@l
|
||||
|
||||
lvx const1,0,r3
|
||||
lvx const2,off16,r3
|
||||
|
||||
vsldoi v1,v0,v0,8
|
||||
vxor v0,v0,v1 /* xor two 64 bit results together */
|
||||
|
||||
/* shift left one bit */
|
||||
vspltisb v1,1
|
||||
vsl v0,v0,v1
|
||||
|
||||
vand v0,v0,mask_64bit
|
||||
|
||||
/*
|
||||
* The reflected version of Barrett reduction. Instead of bit
|
||||
* reflecting our data (which is expensive to do), we bit reflect our
|
||||
* constants and our algorithm, which means the intermediate data in
|
||||
* our vector registers goes from 0-63 instead of 63-0. We can reflect
|
||||
* the algorithm because we don't carry in mod 2 arithmetic.
|
||||
*/
|
||||
vand v1,v0,mask_32bit /* bottom 32 bits of a */
|
||||
VPMSUMD(v1,v1,const1) /* ma */
|
||||
vand v1,v1,mask_32bit /* bottom 32bits of ma */
|
||||
VPMSUMD(v1,v1,const2) /* qn */
|
||||
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
|
||||
|
||||
/*
|
||||
* Since we are bit reflected, the result (ie the low 32 bits) is in
|
||||
* the high 32 bits. We just need to shift it left 4 bytes
|
||||
* V0 [ 0 1 X 3 ]
|
||||
* V0 [ 0 X 2 3 ]
|
||||
*/
|
||||
vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of v0 */
|
||||
|
||||
/* Get it into r3 */
|
||||
MFVRD(R3, v0)
|
||||
|
||||
.Lout:
|
||||
subi r6,r1,56+10*16
|
||||
subi r7,r1,56+2*16
|
||||
|
||||
lvx v20,0,r6
|
||||
lvx v21,off16,r6
|
||||
lvx v22,off32,r6
|
||||
lvx v23,off48,r6
|
||||
lvx v24,off64,r6
|
||||
lvx v25,off80,r6
|
||||
lvx v26,off96,r6
|
||||
lvx v27,off112,r6
|
||||
lvx v28,0,r7
|
||||
lvx v29,off16,r7
|
||||
|
||||
ld r31,-8(r1)
|
||||
ld r30,-16(r1)
|
||||
ld r29,-24(r1)
|
||||
ld r28,-32(r1)
|
||||
ld r27,-40(r1)
|
||||
ld r26,-48(r1)
|
||||
ld r25,-56(r1)
|
||||
|
||||
blr
|
||||
|
||||
.Lfirst_warm_up_done:
|
||||
lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
|
||||
VPMSUMD(v8,v16,const1)
|
||||
VPMSUMD(v9,v17,const1)
|
||||
VPMSUMD(v10,v18,const1)
|
||||
VPMSUMD(v11,v19,const1)
|
||||
VPMSUMD(v12,v20,const1)
|
||||
VPMSUMD(v13,v21,const1)
|
||||
VPMSUMD(v14,v22,const1)
|
||||
VPMSUMD(v15,v23,const1)
|
||||
|
||||
b .Lsecond_cool_down
|
||||
|
||||
.Lshort:
|
||||
cmpdi r5,0
|
||||
beq .Lzero
|
||||
|
||||
addis r3,r2,.short_constants@toc@ha
|
||||
addi r3,r3,.short_constants@toc@l
|
||||
|
||||
/* Calculate where in the constant table we need to start */
|
||||
subfic r6,r5,256
|
||||
add r3,r3,r6
|
||||
|
||||
/* How many 16 byte chunks? */
|
||||
srdi r7,r5,4
|
||||
mtctr r7
|
||||
|
||||
vxor v19,v19,v19
|
||||
vxor v20,v20,v20
|
||||
|
||||
lvx v0,0,r4
|
||||
lvx v16,0,r3
|
||||
VPERM(v0,v0,v16,byteswap)
|
||||
vxor v0,v0,v8 /* xor in initial value */
|
||||
VPMSUMW(v0,v0,v16)
|
||||
bdz .Lv0
|
||||
|
||||
lvx v1,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v1,v1,v17,byteswap)
|
||||
VPMSUMW(v1,v1,v17)
|
||||
bdz .Lv1
|
||||
|
||||
lvx v2,off32,r4
|
||||
lvx v16,off32,r3
|
||||
VPERM(v2,v2,v16,byteswap)
|
||||
VPMSUMW(v2,v2,v16)
|
||||
bdz .Lv2
|
||||
|
||||
lvx v3,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v3,v3,v17,byteswap)
|
||||
VPMSUMW(v3,v3,v17)
|
||||
bdz .Lv3
|
||||
|
||||
lvx v4,off64,r4
|
||||
lvx v16,off64,r3
|
||||
VPERM(v4,v4,v16,byteswap)
|
||||
VPMSUMW(v4,v4,v16)
|
||||
bdz .Lv4
|
||||
|
||||
lvx v5,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v5,v5,v17,byteswap)
|
||||
VPMSUMW(v5,v5,v17)
|
||||
bdz .Lv5
|
||||
|
||||
lvx v6,off96,r4
|
||||
lvx v16,off96,r3
|
||||
VPERM(v6,v6,v16,byteswap)
|
||||
VPMSUMW(v6,v6,v16)
|
||||
bdz .Lv6
|
||||
|
||||
lvx v7,off112,r4
|
||||
lvx v17,off112,r3
|
||||
VPERM(v7,v7,v17,byteswap)
|
||||
VPMSUMW(v7,v7,v17)
|
||||
bdz .Lv7
|
||||
|
||||
addi r3,r3,128
|
||||
addi r4,r4,128
|
||||
|
||||
lvx v8,0,r4
|
||||
lvx v16,0,r3
|
||||
VPERM(v8,v8,v16,byteswap)
|
||||
VPMSUMW(v8,v8,v16)
|
||||
bdz .Lv8
|
||||
|
||||
lvx v9,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v9,v9,v17,byteswap)
|
||||
VPMSUMW(v9,v9,v17)
|
||||
bdz .Lv9
|
||||
|
||||
lvx v10,off32,r4
|
||||
lvx v16,off32,r3
|
||||
VPERM(v10,v10,v16,byteswap)
|
||||
VPMSUMW(v10,v10,v16)
|
||||
bdz .Lv10
|
||||
|
||||
lvx v11,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v11,v11,v17,byteswap)
|
||||
VPMSUMW(v11,v11,v17)
|
||||
bdz .Lv11
|
||||
|
||||
lvx v12,off64,r4
|
||||
lvx v16,off64,r3
|
||||
VPERM(v12,v12,v16,byteswap)
|
||||
VPMSUMW(v12,v12,v16)
|
||||
bdz .Lv12
|
||||
|
||||
lvx v13,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v13,v13,v17,byteswap)
|
||||
VPMSUMW(v13,v13,v17)
|
||||
bdz .Lv13
|
||||
|
||||
lvx v14,off96,r4
|
||||
lvx v16,off96,r3
|
||||
VPERM(v14,v14,v16,byteswap)
|
||||
VPMSUMW(v14,v14,v16)
|
||||
bdz .Lv14
|
||||
|
||||
lvx v15,off112,r4
|
||||
lvx v17,off112,r3
|
||||
VPERM(v15,v15,v17,byteswap)
|
||||
VPMSUMW(v15,v15,v17)
|
||||
|
||||
.Lv15: vxor v19,v19,v15
|
||||
.Lv14: vxor v20,v20,v14
|
||||
.Lv13: vxor v19,v19,v13
|
||||
.Lv12: vxor v20,v20,v12
|
||||
.Lv11: vxor v19,v19,v11
|
||||
.Lv10: vxor v20,v20,v10
|
||||
.Lv9: vxor v19,v19,v9
|
||||
.Lv8: vxor v20,v20,v8
|
||||
.Lv7: vxor v19,v19,v7
|
||||
.Lv6: vxor v20,v20,v6
|
||||
.Lv5: vxor v19,v19,v5
|
||||
.Lv4: vxor v20,v20,v4
|
||||
.Lv3: vxor v19,v19,v3
|
||||
.Lv2: vxor v20,v20,v2
|
||||
.Lv1: vxor v19,v19,v1
|
||||
.Lv0: vxor v20,v20,v0
|
||||
|
||||
vxor v0,v19,v20
|
||||
|
||||
b .Lbarrett_reduction
|
||||
|
||||
.Lzero:
|
||||
mr r3,r10
|
||||
b .Lout
|
||||
|
||||
FUNC_END(__crc32_vpmsum)
|
||||
/*
 * Name under which the shared core algorithm is emitted for this user.
 * Per the core file's header, CRC_FUNCTION_NAME (and the constants) must
 * be defined before the core file is included.
 */
#define CRC_FUNCTION_NAME __crc32c_vpmsum
|
||||
/* Pull in the polynomial-independent vpmsum CRC core. */
#include "crc32-vpmsum_core.S"
|
||||
|
|
Loading…
Reference in New Issue