461 lines
14 KiB
ArmAsm
461 lines
14 KiB
ArmAsm
|
/*
|
||
|
* Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
|
||
|
*
|
||
|
* The white paper on CRC32C calculations with PCLMULQDQ instruction can be
|
||
|
* downloaded from:
|
||
|
* http://download.intel.com/design/intarch/papers/323405.pdf
|
||
|
*
|
||
|
* Copyright (C) 2012 Intel Corporation.
|
||
|
*
|
||
|
* Authors:
|
||
|
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||
|
* James Guilford <james.guilford@intel.com>
|
||
|
* David Cote <david.m.cote@intel.com>
|
||
|
* Tim Chen <tim.c.chen@linux.intel.com>
|
||
|
*
|
||
|
* This software is available to you under a choice of one of two
|
||
|
* licenses. You may choose to be licensed under the terms of the GNU
|
||
|
* General Public License (GPL) Version 2, available from the file
|
||
|
* COPYING in the main directory of this source tree, or the
|
||
|
* OpenIB.org BSD license below:
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or
|
||
|
* without modification, are permitted provided that the following
|
||
|
* conditions are met:
|
||
|
*
|
||
|
* - Redistributions of source code must retain the above
|
||
|
* copyright notice, this list of conditions and the following
|
||
|
* disclaimer.
|
||
|
*
|
||
|
* - Redistributions in binary form must reproduce the above
|
||
|
* copyright notice, this list of conditions and the following
|
||
|
* disclaimer in the documentation and/or other materials
|
||
|
* provided with the distribution.
|
||
|
*
|
||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||
|
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||
|
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||
|
* SOFTWARE.
|
||
|
*/
|
||
|
|
||
|
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
|
||
|
|
||
|
## LABEL prefix n
## Emit a label whose name is the two arguments pasted together, i.e.
## <prefix><n>, at the current location.  Callers in this file pass n as
## %sym under .altmacro so that the numeric value of sym is pasted.
.macro LABEL prefix n
\prefix\()\n\():
.endm
|
||
|
|
||
|
## JMPTBL_ENTRY i
## Emit one 16-bit jump table entry holding the distance from crc_array to
## the label crc_<i>.
.macro JMPTBL_ENTRY i
	.short	crc_\i - crc_array
.endm
|
||
|
|
||
|
## JNC_LESS_THAN j
## Branch to the label less_than_<j> when the carry flag is clear.
.macro JNC_LESS_THAN j
	jnc	less_than_\j
.endm
|
||
|
|
||
|
# Define threshold where buffers are considered "small" and routed to more
# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
# SMALL_SIZE can be no larger than 255.

#define SMALL_SIZE 200

## Build-time guard: refuse to assemble when the threshold exceeds what the
## by-1 code can handle.
.if (SMALL_SIZE > 255)
.error "SMALL_SIZE must be < 256"
.endif
|
||
|
|
||
|
/*
 * unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
 *
 * Computes a CRC over len bytes at buffer, starting from crc_init, using
 * the crc32 instruction; large inputs are split into three interleaved
 * streams whose partial results are merged with pclmulqdq and K_table.
 *
 * In:   rdi = buffer, esi = len, edx = crc_init
 *       (System V AMD64 argument order).  len is compared with unsigned
 *       condition codes throughout.
 * Out:  eax = resulting CRC.
 * Saved and restored: rbx, rdi, rsi.
 * Also written: rax, rcx, rdx, r8, r9, r10, r11, xmm0, xmm1, xmm2, flags.
 *
 * Register roles (bufptmp and block_0 share rcx; crc_init_arg and block_1
 * share rdx, which is why crc_init is copied to r8 first):
 *   bufp      scratch: misalignment count, later jump/K table pointer
 *   bufptmp   current read position in the buffer
 *   block_0/1/2  end addresses of the three interleaved streams
 *   len       bytes remaining (also reused as scratch for the table entry)
 *   tmp       copy of the remaining length / alignment data bytes
 *   crc_init  running CRC (stream 0); crc1, crc2 are streams 1 and 2
 */

.global crc_pcl
crc_pcl:
#define    bufp		%rdi
#define    bufp_dw	%edi
#define    bufp_w	%di
#define    bufp_b	%dil
#define    bufptmp	%rcx
#define    block_0	%rcx
#define    block_1	%rdx
#define    block_2	%r11
#define    len		%rsi
#define    len_dw	%esi
#define    len_w	%si
#define    len_b	%sil
#define    crc_init_arg %rdx
#define    crc_init_arg_dw %edx
#define    tmp		%rbx
#define    crc_init	%r8
#define    crc_init_dw	%r8d
#define    crc1		%r9
#define    crc2		%r10

	pushq   %rbx
	pushq   %rdi
	pushq   %rsi

	## len and crc_init are 32-bit C arguments, so the upper halves of
	## the 64-bit argument registers are not guaranteed to be zero.
	## A 32-bit register write clears bits 63:32, which makes the 64-bit
	## compares and arithmetic on len below well defined.
	mov     len_dw, len_dw

	## Move crc_init to a different register, because rdx is reused
	## as block_1 and is overwritten by mul.
	mov     crc_init_arg_dw, crc_init_dw

	################################################################
	## 1) ALIGN:
	################################################################

	mov     bufp, bufptmp		# bufptmp = buffer address
	neg     bufp
	and     $7, bufp		# bufp = number of bytes up to the
					# next 8-byte boundary
	je      proc_block		# Skip if aligned

	## If len is less than 8 and we are unaligned, we need to jump
	## to special code to avoid reading beyond the end of the buffer
	cmp     $8, len
	jae     do_align
	## less_than_8 expects length in upper 3 bits of len_dw
	## less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
	shl     $32-3+1, len_dw
	jmp     less_than_8_post_shl1

do_align:
	#### Calculate CRC of unaligned bytes of the buffer (if any)
	movq    (bufptmp), tmp		# load a quadword from the buffer
	add     bufp, bufptmp		# align buffer pointer for quadword
					# processing
	sub     bufp, len		# update buffer length
align_loop:
	crc32b  %bl, crc_init_dw 	# compute crc32 of 1-byte
	shr     $8, tmp			# get next byte
	dec     bufp
	jne     align_loop

proc_block:

	################################################################
	## 2) PROCESS BLOCKS:
	################################################################

	## compute num of bytes to be processed
	movq    len, tmp		# save num bytes in tmp

	cmpq    $128*24, len
	jae     full_block

continue_block:
	cmpq    $SMALL_SIZE, len
	jb      small

	## len < 128*24
	movq    $2731, %rax		# 2731 = ceil(2^16 / 24)
	mul     len_dw			# edx:eax = eax * len_dw
	shrq    $16, %rax

	## eax contains floor(bytes / 24) = num 24-byte chunks to do

	## process rax 24-byte chunks (128 >= rax >= 0)

	## compute end address of each block
	## block 0 (base addr + RAX * 8)
	## block 1 (base addr + RAX * 16)
	## block 2 (base addr + RAX * 24)
	lea     (bufptmp, %rax, 8), block_0
	lea     (block_0, %rax, 8), block_1
	lea     (block_1, %rax, 8), block_2

	xor     crc1, crc1
	xor     crc2, crc2

	## branch into array: jump_table[rax] holds crc_<rax> - crc_array,
	## and offset rebases that onto the jump_table pointer in bufp.
	lea     jump_table(%rip), bufp
	movzxw  (bufp, %rax, 2), len
	offset=crc_array-jump_table
	lea     offset(bufp, len, 1), bufp
	jmp     *bufp

	################################################################
	## 2a) PROCESS FULL BLOCKS:
	################################################################
full_block:
	movq    $128,%rax
	lea     128*8*2(block_0), block_1
	lea     128*8*3(block_0), block_2
	add     $128*8*1, block_0

	xor     crc1,crc1
	xor     crc2,crc2

	## Fall through into top of crc array (crc_128)

	################################################################
	## 3) CRC Array:
	################################################################

	## Entry crc_<i> handles the quadword i*8 bytes before the end of
	## each of the three blocks, then falls into crc_<i-1>.
crc_array:
	i=128
.rept 128-1
.altmacro
LABEL crc_ %i
.noaltmacro
	crc32q   -i*8(block_0), crc_init
	crc32q   -i*8(block_1), crc1
	crc32q   -i*8(block_2), crc2
	i=(i-1)
.endr

	## Last entry (i is 1 here): the final quadword of block 2 is
	## deliberately left out and folded in during the combine step.
.altmacro
LABEL crc_ %i
.noaltmacro
	crc32q   -i*8(block_0), crc_init
	crc32q   -i*8(block_1), crc1
	## SKIP  crc32  -i*8(block_2), crc2 ; Do not do this one yet

	mov     block_2, block_0	# block_0 aliases bufptmp: advance
					# the read position past all 3 blocks

	################################################################
	## 4) Combine three results:
	################################################################

	lea	(K_table-16)(%rip), bufp	# first entry is for idx 1
	shlq    $3, %rax			# rax *= 8
	subq    %rax, tmp			# tmp -= rax*8
	shlq    $1, %rax
	subq    %rax, tmp			# tmp -= rax*16
						# (total tmp -= rax*24)
	addq    %rax, bufp			# bufp = &K_table[idx - 1]

	movdqa  (bufp), %xmm0			# 2 consts: K1:K2

	movq    crc_init, %xmm1			# CRC for block 1
	pclmulqdq $0x00,%xmm0,%xmm1		# Multiply by K2

	movq    crc1, %xmm2			# CRC for block 2
	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1

	pxor    %xmm2,%xmm1
	movq    %xmm1, %rax
	xor     -i*8(block_2), %rax		# fold in the skipped last
						# quadword of block 2
	mov     crc2, crc_init
	crc32   %rax, crc_init

	################################################################
	## 5) Check for end:
	################################################################

LABEL crc_ 0
	mov     tmp, len
	cmp     $128*24, tmp
	jae     full_block
	cmp     $24, tmp
	jae     continue_block

	## The short-length paths below keep the remaining length in the
	## top bits of len_dw.  Each shl moves the next length bit into the
	## carry flag and sets ZF when no length bits remain; the jz after
	## the crc32 instructions tests the flags left by that shl.
less_than_24:
	shl     $32-4, len_dw			# less_than_16 expects length
						# in upper 4 bits of len_dw
	jnc     less_than_16
	crc32q  (bufptmp), crc_init
	crc32q  8(bufptmp), crc_init
	jz      do_return
	add     $16, bufptmp
	## len is less than 8 if we got here
	## less_than_8 expects length in upper 3 bits of len_dw
	## less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
	shl     $2, len_dw
	jmp     less_than_8_post_shl1

	#######################################################################
	## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
	#######################################################################
small:
	shl $32-8, len_dw		# Prepare len_dw for less_than_256
	j=256
.rept 5					# j = {256, 128, 64, 32, 16}
.altmacro
LABEL less_than_ %j			# less_than_j: Length should be in
					# upper lg(j) bits of len_dw
	j=(j/2)
	shl     $1, len_dw		# Get next MSB
	JNC_LESS_THAN %j
.noaltmacro
	i=0
.rept (j/8)
	crc32q  i(bufptmp), crc_init	# Compute crc32 of 8-byte data
	i=i+8
.endr
	jz      do_return		# Return if remaining length is zero
	add     $j, bufptmp		# Advance buf
.endr

less_than_8:				# Length should be stored in
					# upper 3 bits of len_dw
	shl     $1, len_dw
less_than_8_post_shl1:
	jnc     less_than_4
	crc32l  (bufptmp), crc_init_dw	# CRC of 4 bytes
	jz      do_return		# return if remaining data is zero
	add     $4, bufptmp
less_than_4:				# Length should be stored in
					# upper 2 bits of len_dw
	shl     $1, len_dw
	jnc     less_than_2
	crc32w  (bufptmp), crc_init_dw	# CRC of 2 bytes
	jz      do_return		# return if remaining data is zero
	add     $2, bufptmp
less_than_2:				# Length should be stored in the MSB
					# of len_dw
	shl     $1, len_dw
	jnc     less_than_1
	crc32b  (bufptmp), crc_init_dw	# CRC of 1 byte
less_than_1:				# Length should be zero
do_return:
	movq    crc_init, %rax		# return value
	popq    %rsi
	popq    %rdi
	popq    %rbx
	ret
|
||
|
|
||
|
################################################################
|
||
|
## jump table Table is 129 entries x 2 bytes each
|
||
|
################################################################
|
||
|
.align 4
|
||
|
jump_table:
|
||
|
i=0
|
||
|
.rept 129
|
||
|
.altmacro
|
||
|
JMPTBL_ENTRY %i
|
||
|
.noaltmacro
|
||
|
i=i+1
|
||
|
.endr
|
||
|
################################################################
|
||
|
## PCLMULQDQ tables
|
||
|
## Table is 128 entries x 2 quad words each
|
||
|
################################################################
|
||
|
.data
|
||
|
.align 64
|
||
|
K_table:
|
||
|
.quad 0x14cd00bd6,0x105ec76f0
|
||
|
.quad 0x0ba4fc28e,0x14cd00bd6
|
||
|
.quad 0x1d82c63da,0x0f20c0dfe
|
||
|
.quad 0x09e4addf8,0x0ba4fc28e
|
||
|
.quad 0x039d3b296,0x1384aa63a
|
||
|
.quad 0x102f9b8a2,0x1d82c63da
|
||
|
.quad 0x14237f5e6,0x01c291d04
|
||
|
.quad 0x00d3b6092,0x09e4addf8
|
||
|
.quad 0x0c96cfdc0,0x0740eef02
|
||
|
.quad 0x18266e456,0x039d3b296
|
||
|
.quad 0x0daece73e,0x0083a6eec
|
||
|
.quad 0x0ab7aff2a,0x102f9b8a2
|
||
|
.quad 0x1248ea574,0x1c1733996
|
||
|
.quad 0x083348832,0x14237f5e6
|
||
|
.quad 0x12c743124,0x02ad91c30
|
||
|
.quad 0x0b9e02b86,0x00d3b6092
|
||
|
.quad 0x018b33a4e,0x06992cea2
|
||
|
.quad 0x1b331e26a,0x0c96cfdc0
|
||
|
.quad 0x17d35ba46,0x07e908048
|
||
|
.quad 0x1bf2e8b8a,0x18266e456
|
||
|
.quad 0x1a3e0968a,0x11ed1f9d8
|
||
|
.quad 0x0ce7f39f4,0x0daece73e
|
||
|
.quad 0x061d82e56,0x0f1d0f55e
|
||
|
.quad 0x0d270f1a2,0x0ab7aff2a
|
||
|
.quad 0x1c3f5f66c,0x0a87ab8a8
|
||
|
.quad 0x12ed0daac,0x1248ea574
|
||
|
.quad 0x065863b64,0x08462d800
|
||
|
.quad 0x11eef4f8e,0x083348832
|
||
|
.quad 0x1ee54f54c,0x071d111a8
|
||
|
.quad 0x0b3e32c28,0x12c743124
|
||
|
.quad 0x0064f7f26,0x0ffd852c6
|
||
|
.quad 0x0dd7e3b0c,0x0b9e02b86
|
||
|
.quad 0x0f285651c,0x0dcb17aa4
|
||
|
.quad 0x010746f3c,0x018b33a4e
|
||
|
.quad 0x1c24afea4,0x0f37c5aee
|
||
|
.quad 0x0271d9844,0x1b331e26a
|
||
|
.quad 0x08e766a0c,0x06051d5a2
|
||
|
.quad 0x093a5f730,0x17d35ba46
|
||
|
.quad 0x06cb08e5c,0x11d5ca20e
|
||
|
.quad 0x06b749fb2,0x1bf2e8b8a
|
||
|
.quad 0x1167f94f2,0x021f3d99c
|
||
|
.quad 0x0cec3662e,0x1a3e0968a
|
||
|
.quad 0x19329634a,0x08f158014
|
||
|
.quad 0x0e6fc4e6a,0x0ce7f39f4
|
||
|
.quad 0x08227bb8a,0x1a5e82106
|
||
|
.quad 0x0b0cd4768,0x061d82e56
|
||
|
.quad 0x13c2b89c4,0x188815ab2
|
||
|
.quad 0x0d7a4825c,0x0d270f1a2
|
||
|
.quad 0x10f5ff2ba,0x105405f3e
|
||
|
.quad 0x00167d312,0x1c3f5f66c
|
||
|
.quad 0x0f6076544,0x0e9adf796
|
||
|
.quad 0x026f6a60a,0x12ed0daac
|
||
|
.quad 0x1a2adb74e,0x096638b34
|
||
|
.quad 0x19d34af3a,0x065863b64
|
||
|
.quad 0x049c3cc9c,0x1e50585a0
|
||
|
.quad 0x068bce87a,0x11eef4f8e
|
||
|
.quad 0x1524fa6c6,0x19f1c69dc
|
||
|
.quad 0x16cba8aca,0x1ee54f54c
|
||
|
.quad 0x042d98888,0x12913343e
|
||
|
.quad 0x1329d9f7e,0x0b3e32c28
|
||
|
.quad 0x1b1c69528,0x088f25a3a
|
||
|
.quad 0x02178513a,0x0064f7f26
|
||
|
.quad 0x0e0ac139e,0x04e36f0b0
|
||
|
.quad 0x0170076fa,0x0dd7e3b0c
|
||
|
.quad 0x141a1a2e2,0x0bd6f81f8
|
||
|
.quad 0x16ad828b4,0x0f285651c
|
||
|
.quad 0x041d17b64,0x19425cbba
|
||
|
.quad 0x1fae1cc66,0x010746f3c
|
||
|
.quad 0x1a75b4b00,0x18db37e8a
|
||
|
.quad 0x0f872e54c,0x1c24afea4
|
||
|
.quad 0x01e41e9fc,0x04c144932
|
||
|
.quad 0x086d8e4d2,0x0271d9844
|
||
|
.quad 0x160f7af7a,0x052148f02
|
||
|
.quad 0x05bb8f1bc,0x08e766a0c
|
||
|
.quad 0x0a90fd27a,0x0a3c6f37a
|
||
|
.quad 0x0b3af077a,0x093a5f730
|
||
|
.quad 0x04984d782,0x1d22c238e
|
||
|
.quad 0x0ca6ef3ac,0x06cb08e5c
|
||
|
.quad 0x0234e0b26,0x063ded06a
|
||
|
.quad 0x1d88abd4a,0x06b749fb2
|
||
|
.quad 0x04597456a,0x04d56973c
|
||
|
.quad 0x0e9e28eb4,0x1167f94f2
|
||
|
.quad 0x07b3ff57a,0x19385bf2e
|
||
|
.quad 0x0c9c8b782,0x0cec3662e
|
||
|
.quad 0x13a9cba9e,0x0e417f38a
|
||
|
.quad 0x093e106a4,0x19329634a
|
||
|
.quad 0x167001a9c,0x14e727980
|
||
|
.quad 0x1ddffc5d4,0x0e6fc4e6a
|
||
|
.quad 0x00df04680,0x0d104b8fc
|
||
|
.quad 0x02342001e,0x08227bb8a
|
||
|
.quad 0x00a2a8d7e,0x05b397730
|
||
|
.quad 0x168763fa6,0x0b0cd4768
|
||
|
.quad 0x1ed5a407a,0x0e78eb416
|
||
|
.quad 0x0d2c3ed1a,0x13c2b89c4
|
||
|
.quad 0x0995a5724,0x1641378f0
|
||
|
.quad 0x19b1afbc4,0x0d7a4825c
|
||
|
.quad 0x109ffedc0,0x08d96551c
|
||
|
.quad 0x0f2271e60,0x10f5ff2ba
|
||
|
.quad 0x00b0bf8ca,0x00bf80dd2
|
||
|
.quad 0x123888b7a,0x00167d312
|
||
|
.quad 0x1e888f7dc,0x18dcddd1c
|
||
|
.quad 0x002ee03b2,0x0f6076544
|
||
|
.quad 0x183e8d8fe,0x06a45d2b2
|
||
|
.quad 0x133d7a042,0x026f6a60a
|
||
|
.quad 0x116b0f50c,0x1dd3e10e8
|
||
|
.quad 0x05fabe670,0x1a2adb74e
|
||
|
.quad 0x130004488,0x0de87806c
|
||
|
.quad 0x000bcf5f6,0x19d34af3a
|
||
|
.quad 0x18f0c7078,0x014338754
|
||
|
.quad 0x017f27698,0x049c3cc9c
|
||
|
.quad 0x058ca5f00,0x15e3e77ee
|
||
|
.quad 0x1af900c24,0x068bce87a
|
||
|
.quad 0x0b5cfca28,0x0dd07448e
|
||
|
.quad 0x0ded288f8,0x1524fa6c6
|
||
|
.quad 0x059f229bc,0x1d8048348
|
||
|
.quad 0x06d390dec,0x16cba8aca
|
||
|
.quad 0x037170390,0x0a3e3e02c
|
||
|
.quad 0x06353c1cc,0x042d98888
|
||
|
.quad 0x0c4584f5c,0x0d73c7bea
|
||
|
.quad 0x1f16a3418,0x1329d9f7e
|
||
|
.quad 0x0531377e2,0x185137662
|
||
|
.quad 0x1d8d9ca7c,0x1b1c69528
|
||
|
.quad 0x0b25b29f2,0x18a08b5bc
|
||
|
.quad 0x19fb2a8b0,0x02178513a
|
||
|
.quad 0x1a08fe6ac,0x1da758ae0
|
||
|
.quad 0x045cddf4e,0x0e0ac139e
|
||
|
.quad 0x1a91647f2,0x169cf9eb0
|
||
|
.quad 0x1a0f717c4,0x0170076fa
|