forked from OSchip/llvm-project
641 lines
13 KiB
ArmAsm
641 lines
13 KiB
ArmAsm
# Test performs a BB reordering with unsupported
|
|
# instruction jrcxz. Reordering works correctly with the
|
|
# following options: None, Normal or Reverse. Other strategies
|
|
# fail with the assertion `isIntN(Size * 8 + 1, Value)`.
|
|
# The cause is the distance between basic blocks when one of them contains
|
|
# jrcxz instruction.
|
|
# Example: OpenSSL
|
|
# https://github.com/openssl/openssl/blob/master/crypto/bn/asm/x86_64-mont5.pl#L3319
|
|
|
|
# REQUIRES: system-linux
|
|
|
|
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
|
|
# RUN: %s -o %t.o
|
|
# RUN: link_fdata %s %t.o %t.fdata
|
|
# RUN: %clang %cflags %t.o -falign-labels -march=native -o %t.exe -Wl,-q
|
|
|
|
# RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \
|
|
# RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort \
|
|
# RUN: --split-functions --split-all-cold --split-eh --dyno-stats \
|
|
# RUN: --print-finalized 2>&1 | FileCheck %s
|
|
|
|
# CHECK-NOT: value of -2105 is too large for field of 1 byte.
|
|
|
|
.text
|
|
.section .text.startup,"ax",@progbits
|
|
.p2align 5,,31
|
|
.globl main
|
|
.type main, @function
|
|
main:
|
|
jmp bn_sqrx8x_internal
|
|
|
|
.globl bn_sqrx8x_internal
|
|
.hidden bn_sqrx8x_internal
|
|
.type bn_sqrx8x_internal,@function
|
|
.align 32
|
|
bn_sqrx8x_internal:
|
|
__bn_sqrx8x_internal:
|
|
# FDATA: 1 bn_from_mont8x 160 1 bn_sqrx8x_internal 0 0 56
|
|
# FDATA: 1 bn_sqrx8x_internal 13 1 bn_sqrx8x_internal 40 0 60972
|
|
# FDATA: 1 bn_sqrx8x_internal 5f 1 bn_sqrx8x_internal 2c 0 60972
|
|
# FDATA: 1 bn_sqrx8x_internal 2f1 1 bn_sqrx8x_internal 500 0 60972
|
|
# FDATA: 1 bn_sqrx8x_internal 34a 1 bn_sqrx8x_internal 360 0 60972
|
|
# FDATA: 1 bn_sqrx8x_internal 411 1 bn_sqrx8x_internal 360 0 447888
|
|
# FDATA: 1 bn_sqrx8x_internal 411 1 bn_sqrx8x_internal 417 0 63984
|
|
# FDATA: 1 bn_sqrx8x_internal 427 1 bn_sqrx8x_internal 480 0 60972
|
|
# FDATA: 1 bn_sqrx8x_internal 427 1 bn_sqrx8x_internal 429 0 3012
|
|
# FDATA: 1 bn_sqrx8x_internal 467 1 bn_sqrx8x_internal 360 0 3012
|
|
# FDATA: 1 bn_sqrx8x_internal 4ba 1 bn_sqrx8x_internal 80 0 58964
|
|
# FDATA: 1 bn_sqrx8x_internal 4ba 1 bn_sqrx8x_internal 4c0 0 2008
|
|
# FDATA: 1 bn_sqrx8x_internal 4fb 1 bn_sqrx8x_internal 80 0 2008
|
|
# FDATA: 1 bn_sqrx8x_internal 5f0 1 bn_sqrx8x_internal 5f2 0 180908
|
|
# FDATA: 1 bn_sqrx8x_internal 61b 1 bn_sqrx8x_internal 540 0 180908
|
|
# FDATA: 1 bn_sqrx8x_internal 632 1 bn_sqrx8x_internal 637 0 59020
|
|
# FDATA: 1 bn_sqrx8x_internal 657 1 bn_sqrx8x_internal 660 0 59020
|
|
# FDATA: 1 bn_sqrx8x_internal 696 1 bn_sqrx8x_internal 6a0 0 120048
|
|
# FDATA: 1 bn_sqrx8x_internal 75a 1 bn_sqrx8x_internal 6a0 0 840336
|
|
# FDATA: 1 bn_sqrx8x_internal 75a 1 bn_sqrx8x_internal 760 0 120048
|
|
# FDATA: 1 bn_sqrx8x_internal 768 1 bn_sqrx8x_internal 76e 0 120048
|
|
# FDATA: 1 bn_sqrx8x_internal 7b2 1 bn_sqrx8x_internal 7c0 0 120048
|
|
# FDATA: 1 bn_sqrx8x_internal 86e 1 bn_sqrx8x_internal 7c0 0 896560
|
|
# FDATA: 1 bn_sqrx8x_internal 86e 1 bn_sqrx8x_internal 874 0 128080
|
|
# FDATA: 1 bn_sqrx8x_internal 879 1 bn_sqrx8x_internal 8c0 0 120048
|
|
# FDATA: 1 bn_sqrx8x_internal 879 1 bn_sqrx8x_internal 87b 0 8032
|
|
# FDATA: 1 bn_sqrx8x_internal 8bb 1 bn_sqrx8x_internal 7c0 0 8032
|
|
# FDATA: 1 bn_sqrx8x_internal 8e8 1 bn_sqrx8x_internal 8ed 0 120048
|
|
# FDATA: 1 bn_sqrx8x_internal 955 1 bn_sqrx8x_internal 660 0 61028
|
|
# FDATA: 1 bn_sqrx8x_internal 955 1 bn_sqrx8x_internal 95b 0 59020
|
|
# FDATA: 0 [unknown] 0 1 bn_sqrx8x_internal 5f0 0 59020
|
|
.cfi_startproc
|
|
leaq 48+8(%rsp),%rdi
|
|
leaq (%rsi,%r9,1),%rbp
|
|
movq %r9,0+8(%rsp)
|
|
movq %rbp,8+8(%rsp)
|
|
jmp .Lsqr8x_zero_start
|
|
|
|
.align 32
|
|
.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
|
|
.Lsqrx8x_zero:
|
|
.byte 0x3e
|
|
movdqa %xmm0,0(%rdi)
|
|
movdqa %xmm0,16(%rdi)
|
|
movdqa %xmm0,32(%rdi)
|
|
movdqa %xmm0,48(%rdi)
|
|
.Lsqr8x_zero_start:
|
|
movdqa %xmm0,64(%rdi)
|
|
movdqa %xmm0,80(%rdi)
|
|
movdqa %xmm0,96(%rdi)
|
|
movdqa %xmm0,112(%rdi)
|
|
leaq 128(%rdi),%rdi
|
|
subq $64,%r9
|
|
jnz .Lsqrx8x_zero
|
|
|
|
movq 0(%rsi),%rdx
|
|
|
|
xorq %r10,%r10
|
|
xorq %r11,%r11
|
|
xorq %r12,%r12
|
|
xorq %r13,%r13
|
|
xorq %r14,%r14
|
|
xorq %r15,%r15
|
|
leaq 48+8(%rsp),%rdi
|
|
xorq %rbp,%rbp
|
|
jmp .Lsqrx8x_outer_loop
|
|
|
|
.align 32
|
|
.Lsqrx8x_outer_loop:
|
|
mulxq 8(%rsi),%r8,%rax
|
|
adcxq %r9,%r8
|
|
adoxq %rax,%r10
|
|
mulxq 16(%rsi),%r9,%rax
|
|
adcxq %r10,%r9
|
|
adoxq %rax,%r11
|
|
.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
|
|
adcxq %r11,%r10
|
|
adoxq %rax,%r12
|
|
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
|
|
adcxq %r12,%r11
|
|
adoxq %rax,%r13
|
|
mulxq 40(%rsi),%r12,%rax
|
|
adcxq %r13,%r12
|
|
adoxq %rax,%r14
|
|
mulxq 48(%rsi),%r13,%rax
|
|
adcxq %r14,%r13
|
|
adoxq %r15,%rax
|
|
mulxq 56(%rsi),%r14,%r15
|
|
movq 8(%rsi),%rdx
|
|
adcxq %rax,%r14
|
|
adoxq %rbp,%r15
|
|
adcq 64(%rdi),%r15
|
|
movq %r8,8(%rdi)
|
|
movq %r9,16(%rdi)
|
|
sbbq %rcx,%rcx
|
|
xorq %rbp,%rbp
|
|
|
|
mulxq 16(%rsi),%r8,%rbx
|
|
mulxq 24(%rsi),%r9,%rax
|
|
adcxq %r10,%r8
|
|
adoxq %rbx,%r9
|
|
mulxq 32(%rsi),%r10,%rbx
|
|
adcxq %r11,%r9
|
|
adoxq %rax,%r10
|
|
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
|
|
adcxq %r12,%r10
|
|
adoxq %rbx,%r11
|
|
.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
|
|
adcxq %r13,%r11
|
|
adoxq %r14,%r12
|
|
.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
|
|
movq 16(%rsi),%rdx
|
|
adcxq %rax,%r12
|
|
adoxq %rbx,%r13
|
|
adcxq %r15,%r13
|
|
adoxq %rbp,%r14
|
|
adcxq %rbp,%r14
|
|
|
|
movq %r8,24(%rdi)
|
|
movq %r9,32(%rdi)
|
|
|
|
mulxq 24(%rsi),%r8,%rbx
|
|
mulxq 32(%rsi),%r9,%rax
|
|
adcxq %r10,%r8
|
|
adoxq %rbx,%r9
|
|
mulxq 40(%rsi),%r10,%rbx
|
|
adcxq %r11,%r9
|
|
adoxq %rax,%r10
|
|
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
|
|
adcxq %r12,%r10
|
|
adoxq %r13,%r11
|
|
.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
|
|
.byte 0x3e
|
|
movq 24(%rsi),%rdx
|
|
adcxq %rbx,%r11
|
|
adoxq %rax,%r12
|
|
adcxq %r14,%r12
|
|
movq %r8,40(%rdi)
|
|
movq %r9,48(%rdi)
|
|
mulxq 32(%rsi),%r8,%rax
|
|
adoxq %rbp,%r13
|
|
adcxq %rbp,%r13
|
|
|
|
mulxq 40(%rsi),%r9,%rbx
|
|
adcxq %r10,%r8
|
|
adoxq %rax,%r9
|
|
mulxq 48(%rsi),%r10,%rax
|
|
adcxq %r11,%r9
|
|
adoxq %r12,%r10
|
|
mulxq 56(%rsi),%r11,%r12
|
|
movq 32(%rsi),%rdx
|
|
movq 40(%rsi),%r14
|
|
adcxq %rbx,%r10
|
|
adoxq %rax,%r11
|
|
movq 48(%rsi),%r15
|
|
adcxq %r13,%r11
|
|
adoxq %rbp,%r12
|
|
adcxq %rbp,%r12
|
|
|
|
movq %r8,56(%rdi)
|
|
movq %r9,64(%rdi)
|
|
|
|
mulxq %r14,%r9,%rax
|
|
movq 56(%rsi),%r8
|
|
adcxq %r10,%r9
|
|
mulxq %r15,%r10,%rbx
|
|
adoxq %rax,%r10
|
|
adcxq %r11,%r10
|
|
mulxq %r8,%r11,%rax
|
|
movq %r14,%rdx
|
|
adoxq %rbx,%r11
|
|
adcxq %r12,%r11
|
|
|
|
adcxq %rbp,%rax
|
|
|
|
mulxq %r15,%r14,%rbx
|
|
mulxq %r8,%r12,%r13
|
|
movq %r15,%rdx
|
|
leaq 64(%rsi),%rsi
|
|
adcxq %r14,%r11
|
|
adoxq %rbx,%r12
|
|
adcxq %rax,%r12
|
|
adoxq %rbp,%r13
|
|
|
|
.byte 0x67,0x67
|
|
mulxq %r8,%r8,%r14
|
|
adcxq %r8,%r13
|
|
adcxq %rbp,%r14
|
|
|
|
cmpq 8+8(%rsp),%rsi
|
|
je .Lsqrx8x_outer_break
|
|
|
|
negq %rcx
|
|
movq $-8,%rcx
|
|
movq %rbp,%r15
|
|
movq 64(%rdi),%r8
|
|
adcxq 72(%rdi),%r9
|
|
adcxq 80(%rdi),%r10
|
|
adcxq 88(%rdi),%r11
|
|
adcq 96(%rdi),%r12
|
|
adcq 104(%rdi),%r13
|
|
adcq 112(%rdi),%r14
|
|
adcq 120(%rdi),%r15
|
|
leaq (%rsi),%rbp
|
|
leaq 128(%rdi),%rdi
|
|
sbbq %rax,%rax
|
|
|
|
movq -64(%rsi),%rdx
|
|
movq %rax,16+8(%rsp)
|
|
movq %rdi,24+8(%rsp)
|
|
|
|
|
|
xorl %eax,%eax
|
|
jmp .Lsqrx8x_loop
|
|
|
|
.align 32
|
|
.Lsqrx8x_loop:
|
|
movq %r8,%rbx
|
|
mulxq 0(%rbp),%rax,%r8
|
|
adcxq %rax,%rbx
|
|
adoxq %r9,%r8
|
|
|
|
mulxq 8(%rbp),%rax,%r9
|
|
adcxq %rax,%r8
|
|
adoxq %r10,%r9
|
|
|
|
mulxq 16(%rbp),%rax,%r10
|
|
adcxq %rax,%r9
|
|
adoxq %r11,%r10
|
|
|
|
mulxq 24(%rbp),%rax,%r11
|
|
adcxq %rax,%r10
|
|
adoxq %r12,%r11
|
|
|
|
.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
|
|
adcxq %rax,%r11
|
|
adoxq %r13,%r12
|
|
|
|
mulxq 40(%rbp),%rax,%r13
|
|
adcxq %rax,%r12
|
|
adoxq %r14,%r13
|
|
|
|
mulxq 48(%rbp),%rax,%r14
|
|
movq %rbx,(%rdi,%rcx,8)
|
|
movl $0,%ebx
|
|
adcxq %rax,%r13
|
|
adoxq %r15,%r14
|
|
|
|
.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
|
|
movq 8(%rsi,%rcx,8),%rdx
|
|
adcxq %rax,%r14
|
|
adoxq %rbx,%r15
|
|
adcxq %rbx,%r15
|
|
|
|
.byte 0x67
|
|
incq %rcx
|
|
jnz .Lsqrx8x_loop
|
|
|
|
leaq 64(%rbp),%rbp
|
|
movq $-8,%rcx
|
|
cmpq 8+8(%rsp),%rbp
|
|
je .Lsqrx8x_break
|
|
|
|
subq 16+8(%rsp),%rbx
|
|
.byte 0x66
|
|
movq -64(%rsi),%rdx
|
|
adcxq 0(%rdi),%r8
|
|
adcxq 8(%rdi),%r9
|
|
adcq 16(%rdi),%r10
|
|
adcq 24(%rdi),%r11
|
|
adcq 32(%rdi),%r12
|
|
adcq 40(%rdi),%r13
|
|
adcq 48(%rdi),%r14
|
|
adcq 56(%rdi),%r15
|
|
leaq 64(%rdi),%rdi
|
|
.byte 0x67
|
|
sbbq %rax,%rax
|
|
xorl %ebx,%ebx
|
|
movq %rax,16+8(%rsp)
|
|
jmp .Lsqrx8x_loop
|
|
|
|
.align 32
|
|
.Lsqrx8x_break:
|
|
xorq %rbp,%rbp
|
|
subq 16+8(%rsp),%rbx
|
|
adcxq %rbp,%r8
|
|
movq 24+8(%rsp),%rcx
|
|
adcxq %rbp,%r9
|
|
movq 0(%rsi),%rdx
|
|
adcq $0,%r10
|
|
movq %r8,0(%rdi)
|
|
adcq $0,%r11
|
|
adcq $0,%r12
|
|
adcq $0,%r13
|
|
adcq $0,%r14
|
|
adcq $0,%r15
|
|
cmpq %rcx,%rdi
|
|
je .Lsqrx8x_outer_loop
|
|
|
|
movq %r9,8(%rdi)
|
|
movq 8(%rcx),%r9
|
|
movq %r10,16(%rdi)
|
|
movq 16(%rcx),%r10
|
|
movq %r11,24(%rdi)
|
|
movq 24(%rcx),%r11
|
|
movq %r12,32(%rdi)
|
|
movq 32(%rcx),%r12
|
|
movq %r13,40(%rdi)
|
|
movq 40(%rcx),%r13
|
|
movq %r14,48(%rdi)
|
|
movq 48(%rcx),%r14
|
|
movq %r15,56(%rdi)
|
|
movq 56(%rcx),%r15
|
|
movq %rcx,%rdi
|
|
jmp .Lsqrx8x_outer_loop
|
|
|
|
.align 32
|
|
.Lsqrx8x_outer_break:
|
|
movq %r9,72(%rdi)
|
|
.byte 102,72,15,126,217
|
|
movq %r10,80(%rdi)
|
|
movq %r11,88(%rdi)
|
|
movq %r12,96(%rdi)
|
|
movq %r13,104(%rdi)
|
|
movq %r14,112(%rdi)
|
|
leaq 48+8(%rsp),%rdi
|
|
movq (%rsi,%rcx,1),%rdx
|
|
|
|
movq 8(%rdi),%r11
|
|
xorq %r10,%r10
|
|
movq 0+8(%rsp),%r9
|
|
adoxq %r11,%r11
|
|
movq 16(%rdi),%r12
|
|
movq 24(%rdi),%r13
|
|
|
|
.align 32
|
|
.Lsqrx4x_shift_n_add:
|
|
mulxq %rdx,%rax,%rbx
|
|
adoxq %r12,%r12
|
|
adcxq %r10,%rax
|
|
.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
|
|
.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
|
|
adoxq %r13,%r13
|
|
adcxq %r11,%rbx
|
|
movq 40(%rdi),%r11
|
|
movq %rax,0(%rdi)
|
|
movq %rbx,8(%rdi)
|
|
|
|
mulxq %rdx,%rax,%rbx
|
|
adoxq %r10,%r10
|
|
adcxq %r12,%rax
|
|
movq 16(%rsi,%rcx,1),%rdx
|
|
movq 48(%rdi),%r12
|
|
adoxq %r11,%r11
|
|
adcxq %r13,%rbx
|
|
movq 56(%rdi),%r13
|
|
movq %rax,16(%rdi)
|
|
movq %rbx,24(%rdi)
|
|
|
|
mulxq %rdx,%rax,%rbx
|
|
adoxq %r12,%r12
|
|
adcxq %r10,%rax
|
|
movq 24(%rsi,%rcx,1),%rdx
|
|
leaq 32(%rcx),%rcx
|
|
movq 64(%rdi),%r10
|
|
adoxq %r13,%r13
|
|
adcxq %r11,%rbx
|
|
movq 72(%rdi),%r11
|
|
movq %rax,32(%rdi)
|
|
movq %rbx,40(%rdi)
|
|
|
|
mulxq %rdx,%rax,%rbx
|
|
adoxq %r10,%r10
|
|
adcxq %r12,%rax
|
|
jrcxz .Lsqrx4x_shift_n_add_break
|
|
.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
|
|
adoxq %r11,%r11
|
|
adcxq %r13,%rbx
|
|
movq 80(%rdi),%r12
|
|
movq 88(%rdi),%r13
|
|
movq %rax,48(%rdi)
|
|
movq %rbx,56(%rdi)
|
|
leaq 64(%rdi),%rdi
|
|
nop
|
|
jmp .Lsqrx4x_shift_n_add
|
|
|
|
.align 32
|
|
.Lsqrx4x_shift_n_add_break:
|
|
adcxq %r13,%rbx
|
|
movq %rax,48(%rdi)
|
|
movq %rbx,56(%rdi)
|
|
leaq 64(%rdi),%rdi
|
|
.byte 102,72,15,126,213
|
|
__bn_sqrx8x_reduction:
|
|
xorl %eax,%eax
|
|
movq 32+8(%rsp),%rbx
|
|
movq 48+8(%rsp),%rdx
|
|
leaq -64(%rbp,%r9,1),%rcx
|
|
|
|
movq %rcx,0+8(%rsp)
|
|
movq %rdi,8+8(%rsp)
|
|
|
|
leaq 48+8(%rsp),%rdi
|
|
jmp .Lsqrx8x_reduction_loop
|
|
|
|
.align 32
|
|
.Lsqrx8x_reduction_loop:
|
|
movq 8(%rdi),%r9
|
|
movq 16(%rdi),%r10
|
|
movq 24(%rdi),%r11
|
|
movq 32(%rdi),%r12
|
|
movq %rdx,%r8
|
|
imulq %rbx,%rdx
|
|
movq 40(%rdi),%r13
|
|
movq 48(%rdi),%r14
|
|
movq 56(%rdi),%r15
|
|
movq %rax,24+8(%rsp)
|
|
|
|
leaq 64(%rdi),%rdi
|
|
xorq %rsi,%rsi
|
|
movq $-8,%rcx
|
|
jmp .Lsqrx8x_reduce
|
|
|
|
.align 32
|
|
.Lsqrx8x_reduce:
|
|
movq %r8,%rbx
|
|
mulxq 0(%rbp),%rax,%r8
|
|
adcxq %rbx,%rax
|
|
adoxq %r9,%r8
|
|
|
|
mulxq 8(%rbp),%rbx,%r9
|
|
adcxq %rbx,%r8
|
|
adoxq %r10,%r9
|
|
|
|
mulxq 16(%rbp),%rbx,%r10
|
|
adcxq %rbx,%r9
|
|
adoxq %r11,%r10
|
|
|
|
mulxq 24(%rbp),%rbx,%r11
|
|
adcxq %rbx,%r10
|
|
adoxq %r12,%r11
|
|
|
|
.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
|
|
movq %rdx,%rax
|
|
movq %r8,%rdx
|
|
adcxq %rbx,%r11
|
|
adoxq %r13,%r12
|
|
|
|
mulxq 32+8(%rsp),%rbx,%rdx
|
|
movq %rax,%rdx
|
|
movq %rax,64+48+8(%rsp,%rcx,8)
|
|
|
|
mulxq 40(%rbp),%rax,%r13
|
|
adcxq %rax,%r12
|
|
adoxq %r14,%r13
|
|
|
|
mulxq 48(%rbp),%rax,%r14
|
|
adcxq %rax,%r13
|
|
adoxq %r15,%r14
|
|
|
|
mulxq 56(%rbp),%rax,%r15
|
|
movq %rbx,%rdx
|
|
adcxq %rax,%r14
|
|
adoxq %rsi,%r15
|
|
adcxq %rsi,%r15
|
|
|
|
.byte 0x67,0x67,0x67
|
|
incq %rcx
|
|
jnz .Lsqrx8x_reduce
|
|
|
|
movq %rsi,%rax
|
|
cmpq 0+8(%rsp),%rbp
|
|
jae .Lsqrx8x_no_tail
|
|
|
|
movq 48+8(%rsp),%rdx
|
|
addq 0(%rdi),%r8
|
|
leaq 64(%rbp),%rbp
|
|
movq $-8,%rcx
|
|
adcxq 8(%rdi),%r9
|
|
adcxq 16(%rdi),%r10
|
|
adcq 24(%rdi),%r11
|
|
adcq 32(%rdi),%r12
|
|
adcq 40(%rdi),%r13
|
|
adcq 48(%rdi),%r14
|
|
adcq 56(%rdi),%r15
|
|
leaq 64(%rdi),%rdi
|
|
sbbq %rax,%rax
|
|
|
|
xorq %rsi,%rsi
|
|
movq %rax,16+8(%rsp)
|
|
jmp .Lsqrx8x_tail
|
|
|
|
.align 32
|
|
.Lsqrx8x_tail:
|
|
movq %r8,%rbx
|
|
mulxq 0(%rbp),%rax,%r8
|
|
adcxq %rax,%rbx
|
|
adoxq %r9,%r8
|
|
|
|
mulxq 8(%rbp),%rax,%r9
|
|
adcxq %rax,%r8
|
|
adoxq %r10,%r9
|
|
|
|
mulxq 16(%rbp),%rax,%r10
|
|
adcxq %rax,%r9
|
|
adoxq %r11,%r10
|
|
|
|
mulxq 24(%rbp),%rax,%r11
|
|
adcxq %rax,%r10
|
|
adoxq %r12,%r11
|
|
|
|
.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
|
|
adcxq %rax,%r11
|
|
adoxq %r13,%r12
|
|
|
|
mulxq 40(%rbp),%rax,%r13
|
|
adcxq %rax,%r12
|
|
adoxq %r14,%r13
|
|
|
|
mulxq 48(%rbp),%rax,%r14
|
|
adcxq %rax,%r13
|
|
adoxq %r15,%r14
|
|
|
|
mulxq 56(%rbp),%rax,%r15
|
|
movq 72+48+8(%rsp,%rcx,8),%rdx
|
|
adcxq %rax,%r14
|
|
adoxq %rsi,%r15
|
|
movq %rbx,(%rdi,%rcx,8)
|
|
movq %r8,%rbx
|
|
adcxq %rsi,%r15
|
|
|
|
incq %rcx
|
|
jnz .Lsqrx8x_tail
|
|
|
|
cmpq 0+8(%rsp),%rbp
|
|
jae .Lsqrx8x_tail_done
|
|
|
|
subq 16+8(%rsp),%rsi
|
|
movq 48+8(%rsp),%rdx
|
|
leaq 64(%rbp),%rbp
|
|
adcq 0(%rdi),%r8
|
|
adcq 8(%rdi),%r9
|
|
adcq 16(%rdi),%r10
|
|
adcq 24(%rdi),%r11
|
|
adcq 32(%rdi),%r12
|
|
adcq 40(%rdi),%r13
|
|
adcq 48(%rdi),%r14
|
|
adcq 56(%rdi),%r15
|
|
leaq 64(%rdi),%rdi
|
|
sbbq %rax,%rax
|
|
subq $8,%rcx
|
|
|
|
xorq %rsi,%rsi
|
|
movq %rax,16+8(%rsp)
|
|
jmp .Lsqrx8x_tail
|
|
|
|
.align 32
|
|
.Lsqrx8x_tail_done:
|
|
xorq %rax,%rax
|
|
addq 24+8(%rsp),%r8
|
|
adcq $0,%r9
|
|
adcq $0,%r10
|
|
adcq $0,%r11
|
|
adcq $0,%r12
|
|
adcq $0,%r13
|
|
adcq $0,%r14
|
|
adcq $0,%r15
|
|
adcq $0,%rax
|
|
|
|
subq 16+8(%rsp),%rsi
|
|
.Lsqrx8x_no_tail:
|
|
adcq 0(%rdi),%r8
|
|
.byte 102,72,15,126,217
|
|
adcq 8(%rdi),%r9
|
|
movq 56(%rbp),%rsi
|
|
.byte 102,72,15,126,213
|
|
adcq 16(%rdi),%r10
|
|
adcq 24(%rdi),%r11
|
|
adcq 32(%rdi),%r12
|
|
adcq 40(%rdi),%r13
|
|
adcq 48(%rdi),%r14
|
|
adcq 56(%rdi),%r15
|
|
adcq $0,%rax
|
|
|
|
movq 32+8(%rsp),%rbx
|
|
movq 64(%rdi,%rcx,1),%rdx
|
|
|
|
movq %r8,0(%rdi)
|
|
leaq 64(%rdi),%r8
|
|
movq %r9,8(%rdi)
|
|
movq %r10,16(%rdi)
|
|
movq %r11,24(%rdi)
|
|
movq %r12,32(%rdi)
|
|
movq %r13,40(%rdi)
|
|
movq %r14,48(%rdi)
|
|
movq %r15,56(%rdi)
|
|
|
|
leaq 64(%rdi,%rcx,1),%rdi
|
|
cmpq 8+8(%rsp),%r8
|
|
jb .Lsqrx8x_reduction_loop
|
|
.byte 0xf3,0xc3
|
|
.cfi_endproc
|
|
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
|