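# Source: llvm-project/bolt/test/X86/bug-reorder-bb-jrcxz.s
# (File-viewer metadata: 641 lines, 13 KiB, listed as "ArmAsm"; the content
# is x86-64 assembly in AT&T syntax.)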

# This test performs basic-block (BB) reordering on a function that
# contains the unsupported instruction jrcxz. Reordering works correctly
# with the following options: None, Normal or Reverse. Other strategies
# terminate with the assertion `isIntN(Size * 8 + 1, Value)`.
# The cause is the distance between basic blocks when one of them
# contains a jrcxz instruction.
# Example: OpenSSL
# https://github.com/openssl/openssl/blob/master/crypto/bn/asm/x86_64-mont5.pl#L3319
# REQUIRES: system-linux
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
# RUN: %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: %clang %cflags %t.o -falign-labels -march=native -o %t.exe -Wl,-q
# RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \
# RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort \
# RUN: --split-functions --split-all-cold --split-eh --dyno-stats \
# RUN: --print-finalized 2>&1 | FileCheck %s
# CHECK-NOT: value of -2105 is too large for field of 1 byte.
# Test driver. `main` is placed in .text.startup, aligned to a 32-byte
# boundary (.p2align 5, skipping at most 31 bytes), and does nothing except
# jump into bn_sqrx8x_internal. No other section directive appears before
# bn_sqrx8x_internal, so that function is emitted into this same section.
.text
.section .text.startup,"ax",@progbits
.p2align 5,,31
.globl main
.type main, @function
main:
# A jmp rather than a call: the return at the end of bn_sqrx8x_internal
# therefore goes straight back to whoever called main.
jmp bn_sqrx8x_internal
# ---------------------------------------------------------------------------
# bn_sqrx8x_internal / __bn_sqrx8x_internal
#
# Function under test: a hidden, global symbol aligned to 32 bytes. Both
# labels below name the same address. The body contains the jrcxz
# instruction (in .Lsqrx4x_shift_n_add) that this test is about.
#
# Values read before this function writes them (their meaning is defined by
# the caller and cannot be determined from this file):
#   rsi, r9      - rsi+r9 is computed at entry and used as an end address
#   xmm0         - stored repeatedly into the stack scratch area by the
#                  first loop
#   xmm2, xmm3   - copied into rbp / rcx by hand-encoded movq instructions
#   32+8(%rsp)   - multiplier used by the reduction phase
#
# Stack slots written by this function:
#   0+8(%rsp)    - r9 at entry; later rbp+r9-64 (end marker for reduction)
#   8+8(%rsp)    - rsi+r9 at entry; later the rdi value at reduction start
#   16+8(%rsp)   - carry flag saved as 0 / -1 by `sbbq reg,reg`
#   24+8(%rsp)   - saved rdi (first phase) / saved rax (reduction phase)
#   48+8(%rsp).. - scratch buffer addressed through rdi
#
# The profile-data comment lines that follow the labels are consumed by
# link_fdata (see the lit command lines at the top of the file). The
# hexadecimal numbers in them appear to be byte positions inside this
# function, so instructions here must not be added, removed or re-encoded.
# ---------------------------------------------------------------------------
.globl bn_sqrx8x_internal
.hidden bn_sqrx8x_internal
.type bn_sqrx8x_internal,@function
.align 32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
# FDATA: 1 bn_from_mont8x 160 1 bn_sqrx8x_internal 0 0 56
# FDATA: 1 bn_sqrx8x_internal 13 1 bn_sqrx8x_internal 40 0 60972
# FDATA: 1 bn_sqrx8x_internal 5f 1 bn_sqrx8x_internal 2c 0 60972
# FDATA: 1 bn_sqrx8x_internal 2f1 1 bn_sqrx8x_internal 500 0 60972
# FDATA: 1 bn_sqrx8x_internal 34a 1 bn_sqrx8x_internal 360 0 60972
# FDATA: 1 bn_sqrx8x_internal 411 1 bn_sqrx8x_internal 360 0 447888
# FDATA: 1 bn_sqrx8x_internal 411 1 bn_sqrx8x_internal 417 0 63984
# FDATA: 1 bn_sqrx8x_internal 427 1 bn_sqrx8x_internal 480 0 60972
# FDATA: 1 bn_sqrx8x_internal 427 1 bn_sqrx8x_internal 429 0 3012
# FDATA: 1 bn_sqrx8x_internal 467 1 bn_sqrx8x_internal 360 0 3012
# FDATA: 1 bn_sqrx8x_internal 4ba 1 bn_sqrx8x_internal 80 0 58964
# FDATA: 1 bn_sqrx8x_internal 4ba 1 bn_sqrx8x_internal 4c0 0 2008
# FDATA: 1 bn_sqrx8x_internal 4fb 1 bn_sqrx8x_internal 80 0 2008
# FDATA: 1 bn_sqrx8x_internal 5f0 1 bn_sqrx8x_internal 5f2 0 180908
# FDATA: 1 bn_sqrx8x_internal 61b 1 bn_sqrx8x_internal 540 0 180908
# FDATA: 1 bn_sqrx8x_internal 632 1 bn_sqrx8x_internal 637 0 59020
# FDATA: 1 bn_sqrx8x_internal 657 1 bn_sqrx8x_internal 660 0 59020
# FDATA: 1 bn_sqrx8x_internal 696 1 bn_sqrx8x_internal 6a0 0 120048
# FDATA: 1 bn_sqrx8x_internal 75a 1 bn_sqrx8x_internal 6a0 0 840336
# FDATA: 1 bn_sqrx8x_internal 75a 1 bn_sqrx8x_internal 760 0 120048
# FDATA: 1 bn_sqrx8x_internal 768 1 bn_sqrx8x_internal 76e 0 120048
# FDATA: 1 bn_sqrx8x_internal 7b2 1 bn_sqrx8x_internal 7c0 0 120048
# FDATA: 1 bn_sqrx8x_internal 86e 1 bn_sqrx8x_internal 7c0 0 896560
# FDATA: 1 bn_sqrx8x_internal 86e 1 bn_sqrx8x_internal 874 0 128080
# FDATA: 1 bn_sqrx8x_internal 879 1 bn_sqrx8x_internal 8c0 0 120048
# FDATA: 1 bn_sqrx8x_internal 879 1 bn_sqrx8x_internal 87b 0 8032
# FDATA: 1 bn_sqrx8x_internal 8bb 1 bn_sqrx8x_internal 7c0 0 8032
# FDATA: 1 bn_sqrx8x_internal 8e8 1 bn_sqrx8x_internal 8ed 0 120048
# FDATA: 1 bn_sqrx8x_internal 955 1 bn_sqrx8x_internal 660 0 61028
# FDATA: 1 bn_sqrx8x_internal 955 1 bn_sqrx8x_internal 95b 0 59020
# FDATA: 0 [unknown] 0 1 bn_sqrx8x_internal 5f0 0 59020
# Entry sequence and scratch-area fill loop.
.cfi_startproc
# rdi = rsp+56: start of the scratch area on the stack.
leaq 48+8(%rsp),%rdi
# rbp = rsi + r9; both r9 and this sum are saved in stack slots for later
# comparisons.
leaq (%rsi,%r9,1),%rbp
movq %r9,0+8(%rsp)
movq %rbp,8+8(%rsp)
# Enter the fill loop in the middle, so the first pass performs only the
# second group of four stores.
jmp .Lsqr8x_zero_start
.align 32
# Hand-encoded 12-byte instruction: three 0x66 prefixes, one 0x2e prefix and
# the long NOP opcode (0f 1f /0 with SIB byte and 32-bit displacement). It
# follows an unconditional jmp and precedes the loop label, so it is never
# executed; presumably it only positions .Lsqrx8x_zero.
.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_zero:
# Stand-alone 0x3e prefix byte that attaches to the following movdqa
# (presumably padding as well).
.byte 0x3e
# Store xmm0 to 0..63(%rdi). movdqa requires a 16-byte aligned address.
movdqa %xmm0,0(%rdi)
movdqa %xmm0,16(%rdi)
movdqa %xmm0,32(%rdi)
movdqa %xmm0,48(%rdi)
.Lsqr8x_zero_start:
# Store xmm0 to 64..127(%rdi), advance rdi by 128 bytes and count r9 down
# by 64; repeat until r9 reaches zero.
movdqa %xmm0,64(%rdi)
movdqa %xmm0,80(%rdi)
movdqa %xmm0,96(%rdi)
movdqa %xmm0,112(%rdi)
leaq 128(%rdi),%rdi
subq $64,%r9
jnz .Lsqrx8x_zero
# Loop finished (r9 == 0). Load the first mulx multiplicand into rdx and
# clear the accumulators r10-r15.
movq 0(%rsi),%rdx
xorq %r10,%r10
xorq %r11,%r11
xorq %r12,%r12
xorq %r13,%r13
xorq %r14,%r14
xorq %r15,%r15
# Rewind rdi to the start of the scratch area.
leaq 48+8(%rsp),%rdi
# rbp = 0. The xor also clears CF and OF, which the first adcx/adox
# instructions of the outer loop consume.
xorq %rbp,%rbp
jmp .Lsqrx8x_outer_loop
.align 32
.Lsqrx8x_outer_loop:
# One pass per 64-byte group at rsi (rsi advances by 64 near the end of the
# pass). mulx multiplies its memory/register operand by the implicit operand
# rdx and leaves the flags alone; adcx adds with carry through CF only and
# adox through OF only, so two independent carry chains run interleaved.
#
# On entry rdx holds the qword at 0(%rsi); it is multiplied by the qwords at
# 8..56(%rsi).
mulxq 8(%rsi),%r8,%rax
adcxq %r9,%r8
adoxq %rax,%r10
mulxq 16(%rsi),%r9,%rax
adcxq %r10,%r9
adoxq %rax,%r11
# Hand-encoded `mulxq 24(%rsi),%r10,%rax` (32-bit displacement form).
.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
adcxq %r11,%r10
adoxq %rax,%r12
# Hand-encoded `mulxq 32(%rsi),%r11,%rax` (32-bit displacement form).
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
adcxq %r12,%r11
adoxq %rax,%r13
mulxq 40(%rsi),%r12,%rax
adcxq %r13,%r12
adoxq %rax,%r14
mulxq 48(%rsi),%r13,%rax
adcxq %r14,%r13
adoxq %r15,%rax
mulxq 56(%rsi),%r14,%r15
# Next multiplicand: rdx = qword at 8(%rsi). mov does not disturb the carry
# chains still being finished below.
movq 8(%rsi),%rdx
adcxq %rax,%r14
adoxq %rbp,%r15
adcq 64(%rdi),%r15
movq %r8,8(%rdi)
movq %r9,16(%rdi)
# rcx = 0 or -1: remembers the carry out of the adcq above (the stores in
# between do not touch flags). It is turned back into CF by the negq near
# the end of this pass.
sbbq %rcx,%rcx
# rbp = 0 and CF = OF = 0 for the next pair of carry chains.
xorq %rbp,%rbp
mulxq 16(%rsi),%r8,%rbx
mulxq 24(%rsi),%r9,%rax
adcxq %r10,%r8
adoxq %rbx,%r9
mulxq 32(%rsi),%r10,%rbx
adcxq %r11,%r9
adoxq %rax,%r10
# Hand-encoded `mulxq 40(%rsi),%r11,%rax` (32-bit displacement form).
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
adcxq %r12,%r10
adoxq %rbx,%r11
# Hand-encoded `mulxq 48(%rsi),%r12,%rbx` (32-bit displacement form).
.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
adcxq %r13,%r11
adoxq %r14,%r12
# Hand-encoded `mulxq 56(%rsi),%r13,%r14` (32-bit displacement form).
.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
# Next multiplicand: rdx = qword at 16(%rsi).
movq 16(%rsi),%rdx
adcxq %rax,%r12
adoxq %rbx,%r13
adcxq %r15,%r13
adoxq %rbp,%r14
adcxq %rbp,%r14
movq %r8,24(%rdi)
movq %r9,32(%rdi)
mulxq 24(%rsi),%r8,%rbx
mulxq 32(%rsi),%r9,%rax
adcxq %r10,%r8
adoxq %rbx,%r9
mulxq 40(%rsi),%r10,%rbx
adcxq %r11,%r9
adoxq %rax,%r10
# Hand-encoded `mulxq 48(%rsi),%r11,%rax` (32-bit displacement form).
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
adcxq %r12,%r10
adoxq %r13,%r11
# Hand-encoded `mulxq 56(%rsi),%r12,%r13` (32-bit displacement form).
.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
# Stand-alone 0x3e prefix byte attached to the following movq (presumably
# padding).
.byte 0x3e
# Next multiplicand: rdx = qword at 24(%rsi).
movq 24(%rsi),%rdx
adcxq %rbx,%r11
adoxq %rax,%r12
adcxq %r14,%r12
movq %r8,40(%rdi)
movq %r9,48(%rdi)
mulxq 32(%rsi),%r8,%rax
adoxq %rbp,%r13
adcxq %rbp,%r13
mulxq 40(%rsi),%r9,%rbx
adcxq %r10,%r8
adoxq %rax,%r9
mulxq 48(%rsi),%r10,%rax
adcxq %r11,%r9
adoxq %r12,%r10
mulxq 56(%rsi),%r11,%r12
# Next multiplicand: rdx = qword at 32(%rsi). The qwords at 40, 48 and 56
# are also loaded into registers (r14, r15, r8) so the remaining products
# use register operands.
movq 32(%rsi),%rdx
movq 40(%rsi),%r14
adcxq %rbx,%r10
adoxq %rax,%r11
movq 48(%rsi),%r15
adcxq %r13,%r11
adoxq %rbp,%r12
adcxq %rbp,%r12
movq %r8,56(%rdi)
movq %r9,64(%rdi)
mulxq %r14,%r9,%rax
movq 56(%rsi),%r8
adcxq %r10,%r9
mulxq %r15,%r10,%rbx
adoxq %rax,%r10
adcxq %r11,%r10
mulxq %r8,%r11,%rax
# Multiplicand = r14 (the qword loaded from 40(%rsi)).
movq %r14,%rdx
adoxq %rbx,%r11
adcxq %r12,%r11
adcxq %rbp,%rax
mulxq %r15,%r14,%rbx
mulxq %r8,%r12,%r13
# Multiplicand = r15 (the qword loaded from 48(%rsi)).
movq %r15,%rdx
# Advance rsi to the next 64-byte group; lea leaves the flags untouched.
leaq 64(%rsi),%rsi
adcxq %r14,%r11
adoxq %rbx,%r12
adcxq %rax,%r12
adoxq %rbp,%r13
# Two stand-alone 0x67 prefix bytes attached to the following mulxq
# (presumably padding).
.byte 0x67,0x67
mulxq %r8,%r8,%r14
adcxq %r8,%r13
adcxq %rbp,%r14
# Leave the outer loop once rsi has reached the end address saved at entry
# (rsi+r9 in 8+8(%rsp)).
cmpq 8+8(%rsp),%rsi
je .Lsqrx8x_outer_break
# negq sets CF when rcx is non-zero: this re-creates the carry that sbbq
# stored in rcx earlier in this pass (rcx is not written in between).
negq %rcx
# rcx = -8: loop counter for .Lsqrx8x_loop. mov preserves CF.
movq $-8,%rcx
# r15 = 0 (rbp was cleared above).
movq %rbp,%r15
# Load r8 from 64(%rdi) and add the qwords at 72..120(%rdi) into r9..r15,
# propagating the restored carry.
movq 64(%rdi),%r8
adcxq 72(%rdi),%r9
adcxq 80(%rdi),%r10
adcxq 88(%rdi),%r11
adcq 96(%rdi),%r12
adcq 104(%rdi),%r13
adcq 112(%rdi),%r14
adcq 120(%rdi),%r15
# rbp = rsi: the inner loop reads its multiplier operands through rbp.
leaq (%rsi),%rbp
leaq 128(%rdi),%rdi
# rax = 0 or -1: carry out of the addition chain, saved to 16+8(%rsp) below.
sbbq %rax,%rax
# rdx = first qword of the 64-byte group before the current rsi.
movq -64(%rsi),%rdx
movq %rax,16+8(%rsp)
# Remember where this row started in the scratch buffer; .Lsqrx8x_break
# reloads it.
movq %rdi,24+8(%rsp)
# Clears rax and CF/OF before the inner loop's adcx/adox chains.
xorl %eax,%eax
jmp .Lsqrx8x_loop
.align 32
.Lsqrx8x_loop:
# Inner loop, 8 iterations (rcx counts from -8 up to 0). Each iteration
# multiplies rdx by the eight qwords at 0..56(%rbp) and folds the products
# into the accumulators r8..r15 using the CF (adcx) and OF (adox) chains.
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8
mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rbp),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10
mulxq 24(%rbp),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
# Hand-encoded `mulxq 32(%rbp),%rax,%r12` (32-bit displacement form).
.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
mulxq 48(%rbp),%rax,%r14
# Store the finished low qword at rdi + rcx*8 (rcx is negative here).
movq %rbx,(%rdi,%rcx,8)
# rbx = 0 via mov rather than xor: mov leaves CF and OF intact, and both
# chains are still in flight.
movl $0,%ebx
adcxq %rax,%r13
adoxq %r15,%r14
# Hand-encoded `mulxq 56(%rbp),%rax,%r15` (32-bit displacement form).
.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
# Multiplier for the next iteration.
movq 8(%rsi,%rcx,8),%rdx
adcxq %rax,%r14
adoxq %rbx,%r15
adcxq %rbx,%r15
# Stand-alone 0x67 prefix byte attached to the following incq (presumably
# padding).
.byte 0x67
incq %rcx
jnz .Lsqrx8x_loop
# Eight iterations done: step rbp to the next 64-byte group and reset the
# counter.
leaq 64(%rbp),%rbp
movq $-8,%rcx
# Finished when rbp reaches the end address saved at entry.
cmpq 8+8(%rsp),%rbp
je .Lsqrx8x_break
# rbx is 0 here and the slot holds 0 or -1, so this subtraction sets CF
# exactly when a carry had been saved.
subq 16+8(%rsp),%rbx
# Stand-alone 0x66 prefix byte attached to the following movq (presumably
# padding).
.byte 0x66
movq -64(%rsi),%rdx
# Add the next eight qwords of the scratch buffer into r8..r15 with the
# restored carry.
adcxq 0(%rdi),%r8
adcxq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
# Stand-alone 0x67 prefix byte attached to the following sbbq (presumably
# padding).
.byte 0x67
# rax = 0 or -1: carry out of the chain, saved to 16+8(%rsp) below.
sbbq %rax,%rax
# Clears rbx and CF/OF for the next run of the loop.
xorl %ebx,%ebx
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_loop
.align 32
.Lsqrx8x_break:
# Reached when the inner loop has consumed the last 64-byte group. Folds the
# saved carry into r8..r15 and prepares the registers for the next pass of
# .Lsqrx8x_outer_loop.
#
# rbp = 0 (also clears CF/OF).
xorq %rbp,%rbp
# rbx is 0 here (cleared inside the loop), so this sets CF exactly when the
# saved carry slot holds -1.
subq 16+8(%rsp),%rbx
adcxq %rbp,%r8
# rcx = scratch-buffer position saved in 24+8(%rsp) before the inner loop.
movq 24+8(%rsp),%rcx
adcxq %rbp,%r9
# Multiplicand for the next outer pass.
movq 0(%rsi),%rdx
adcq $0,%r10
# mov does not affect flags, so the carry propagation continues below.
movq %r8,0(%rdi)
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15
# If rdi already equals the saved position, the accumulators are kept in
# registers and the outer loop is re-entered directly.
cmpq %rcx,%rdi
je .Lsqrx8x_outer_loop
# Otherwise spill r9..r15 to 8..56(%rdi) and reload them from the same
# offsets at the saved position, then continue from that position.
movq %r9,8(%rdi)
movq 8(%rcx),%r9
movq %r10,16(%rdi)
movq 16(%rcx),%r10
movq %r11,24(%rdi)
movq 24(%rcx),%r11
movq %r12,32(%rdi)
movq 32(%rcx),%r12
movq %r13,40(%rdi)
movq 40(%rcx),%r13
movq %r14,48(%rdi)
movq 48(%rcx),%r14
movq %r15,56(%rdi)
movq 56(%rcx),%r15
movq %rcx,%rdi
jmp .Lsqrx8x_outer_loop
.align 32
.Lsqrx8x_outer_break:
# Reached from .Lsqrx8x_outer_loop once rsi equals the saved end address.
# Writes the remaining accumulators r9..r14 to 72..112(%rdi) and sets up the
# doubling/squaring loop below.
movq %r9,72(%rdi)
# Hand-encoded `movq %xmm3,%rcx`.
.byte 102,72,15,126,217
movq %r10,80(%rdi)
movq %r11,88(%rdi)
movq %r12,96(%rdi)
movq %r13,104(%rdi)
movq %r14,112(%rdi)
# rdi = start of the scratch buffer; rdx = qword at rsi+rcx.
leaq 48+8(%rsp),%rdi
movq (%rsi,%rcx,1),%rdx
movq 8(%rdi),%r11
# r10 = 0 and CF = OF = 0 before the adox/adcx chains start.
xorq %r10,%r10
# r9 = value saved in 0+8(%rsp) at function entry.
movq 0+8(%rsp),%r9
# adox reg,reg doubles the register and adds OF, i.e. a shift left by one
# bit carried through OF.
adoxq %r11,%r11
movq 16(%rdi),%r12
movq 24(%rdi),%r13
.align 32
.Lsqrx4x_shift_n_add:
# Each iteration handles four source qwords (rcx advances by 32): every
# `mulxq %rdx,%rax,%rbx` squares rdx into rbx:rax, the adox instructions
# double the values loaded from the scratch buffer (OF chain), and the adcx
# instructions add the doubled values to the square (CF chain). Results are
# written back to 0..56(%rdi).
mulxq %rdx,%rax,%rbx
adoxq %r12,%r12
adcxq %r10,%rax
# Hand-encoded `movq 8(%rsi,%rcx,1),%rdx` (32-bit displacement form).
.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
# Hand-encoded `movq 32(%rdi),%r10` (32-bit displacement form).
.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
adoxq %r13,%r13
adcxq %r11,%rbx
movq 40(%rdi),%r11
movq %rax,0(%rdi)
movq %rbx,8(%rdi)
mulxq %rdx,%rax,%rbx
adoxq %r10,%r10
adcxq %r12,%rax
movq 16(%rsi,%rcx,1),%rdx
movq 48(%rdi),%r12
adoxq %r11,%r11
adcxq %r13,%rbx
movq 56(%rdi),%r13
movq %rax,16(%rdi)
movq %rbx,24(%rdi)
mulxq %rdx,%rax,%rbx
adoxq %r12,%r12
adcxq %r10,%rax
movq 24(%rsi,%rcx,1),%rdx
# Advance the index with lea so that CF and OF stay intact.
leaq 32(%rcx),%rcx
movq 64(%rdi),%r10
adoxq %r13,%r13
adcxq %r11,%rbx
movq 72(%rdi),%r11
movq %rax,32(%rdi)
movq %rbx,40(%rdi)
mulxq %rdx,%rax,%rbx
adoxq %r10,%r10
adcxq %r12,%rax
# Loop exit test. jrcxz branches when rcx == 0 without reading or writing
# any flag, so both carry chains survive the test. The instruction exists
# only with an 8-bit relative displacement, so its target must stay within
# -128..+127 bytes of the next instruction. That is the constraint this
# test exercises: moving the target block far away during block reordering
# makes the displacement unencodable (the "too large for field of 1 byte"
# message quoted in the file header).
jrcxz .Lsqrx4x_shift_n_add_break
# Hand-encoded `movq 0(%rsi,%rcx,1),%rdx` (32-bit displacement form).
.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
adoxq %r11,%r11
adcxq %r13,%rbx
movq 80(%rdi),%r12
movq 88(%rdi),%r13
movq %rax,48(%rdi)
movq %rbx,56(%rdi)
leaq 64(%rdi),%rdi
nop
jmp .Lsqrx4x_shift_n_add
.align 32
.Lsqrx4x_shift_n_add_break:
# Finish the last iteration: complete the CF chain and store the final two
# qwords, then advance rdi past them.
adcxq %r13,%rbx
movq %rax,48(%rdi)
movq %rbx,56(%rdi)
leaq 64(%rdi),%rdi
# Hand-encoded `movq %xmm2,%rbp`. Execution falls through into
# __bn_sqrx8x_reduction.
.byte 102,72,15,126,213
# Start of the reduction phase. This label has no .globl/.type directive in
# this file; it is entered by fall-through from .Lsqrx4x_shift_n_add_break.
__bn_sqrx8x_reduction:
# rax = 0 (also clears CF/OF).
xorl %eax,%eax
# rbx = multiplier from 32+8(%rsp); this slot is only read, never written,
# by this function. rdx = first qword of the scratch buffer.
movq 32+8(%rsp),%rbx
movq 48+8(%rsp),%rdx
# rcx = rbp + r9 - 64, stored in 0+8(%rsp) and later compared against rbp
# to decide when the per-row work is finished.
leaq -64(%rbp,%r9,1),%rcx
movq %rcx,0+8(%rsp)
# Current rdi is stored in 8+8(%rsp); the end of this function compares
# against it to terminate the reduction.
movq %rdi,8+8(%rsp)
# rdi = start of the scratch buffer.
leaq 48+8(%rsp),%rdi
jmp .Lsqrx8x_reduction_loop
.align 32
.Lsqrx8x_reduction_loop:
# Per-row setup: load qwords 8..56(%rdi) into r9..r15, keep the incoming rdx
# in r8, and form the first multiplier rdx = rdx * rbx (low 64 bits).
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq 24(%rdi),%r11
movq 32(%rdi),%r12
movq %rdx,%r8
imulq %rbx,%rdx
movq 40(%rdi),%r13
movq 48(%rdi),%r14
movq 56(%rdi),%r15
# Save rax in 24+8(%rsp); .Lsqrx8x_tail_done adds this slot back into r8.
movq %rax,24+8(%rsp)
leaq 64(%rdi),%rdi
# rsi = 0 and CF = OF = 0 for the carry chains in .Lsqrx8x_reduce.
xorq %rsi,%rsi
# rcx = -8: iteration counter for .Lsqrx8x_reduce.
movq $-8,%rcx
jmp .Lsqrx8x_reduce
.align 32
.Lsqrx8x_reduce:
# Eight iterations (rcx counts from -8 up to 0). Each iteration multiplies
# the current multiplier rdx by the eight qwords at 0..56(%rbp), folds the
# products into r8..r15 through the CF (adcx) and OF (adox) chains, and
# derives the multiplier for the next iteration.
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rbx,%rax
adoxq %r9,%r8
mulxq 8(%rbp),%rbx,%r9
adcxq %rbx,%r8
adoxq %r10,%r9
mulxq 16(%rbp),%rbx,%r10
adcxq %rbx,%r9
adoxq %r11,%r10
mulxq 24(%rbp),%rbx,%r11
adcxq %rbx,%r10
adoxq %r12,%r11
# Hand-encoded `mulxq 32(%rbp),%rbx,%r12` (32-bit displacement form).
.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
# Park the current multiplier in rax and temporarily use r8 as the mulx
# operand to compute the next multiplier.
movq %rdx,%rax
movq %r8,%rdx
adcxq %rbx,%r11
adoxq %r13,%r12
# rbx = low 64 bits of r8 * 32+8(%rsp). The high half lands in rdx and is
# overwritten by the next instruction.
mulxq 32+8(%rsp),%rbx,%rdx
# Restore the current multiplier and record it in the stack area at
# 64+48+8(%rsp) + rcx*8; .Lsqrx8x_tail reads multipliers back from there.
movq %rax,%rdx
movq %rax,64+48+8(%rsp,%rcx,8)
mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
mulxq 48(%rbp),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14
mulxq 56(%rbp),%rax,%r15
# Switch to the multiplier computed above for the next iteration.
movq %rbx,%rdx
adcxq %rax,%r14
adoxq %rsi,%r15
adcxq %rsi,%r15
# Three stand-alone 0x67 prefix bytes attached to the following incq
# (presumably padding).
.byte 0x67,0x67,0x67
incq %rcx
jnz .Lsqrx8x_reduce
# rax = 0 (rsi is zero throughout the loop).
movq %rsi,%rax
# Unsigned compare: when rbp has reached the end marker in 0+8(%rsp) there
# is no tail to process.
cmpq 0+8(%rsp),%rbp
jae .Lsqrx8x_no_tail
# Otherwise prepare .Lsqrx8x_tail: reload rdx from the start of the scratch
# buffer, advance rbp by 64 bytes and add the next eight qwords at
# 0..56(%rdi) into r8..r15.
movq 48+8(%rsp),%rdx
addq 0(%rdi),%r8
leaq 64(%rbp),%rbp
movq $-8,%rcx
adcxq 8(%rdi),%r9
adcxq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
# rax = 0 or -1: carry out of the chain, saved to 16+8(%rsp) below.
sbbq %rax,%rax
# rsi = 0 and CF = OF = 0 for the tail loop.
xorq %rsi,%rsi
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_tail
.align 32
.Lsqrx8x_tail:
# Eight iterations (rcx counts from -8 up to 0). Each iteration multiplies
# rdx by the eight qwords at 0..56(%rbp) and folds the products into
# r8..r15, storing the finished low qword at rdi + rcx*8.
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8
mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rbp),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10
mulxq 24(%rbp),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
# Hand-encoded `mulxq 32(%rbp),%rax,%r12` (32-bit displacement form).
.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
mulxq 48(%rbp),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14
mulxq 56(%rbp),%rax,%r15
# Multiplier for the next iteration, read from the stack area that
# .Lsqrx8x_reduce filled.
movq 72+48+8(%rsp,%rcx,8),%rdx
adcxq %rax,%r14
adoxq %rsi,%r15
movq %rbx,(%rdi,%rcx,8)
movq %r8,%rbx
adcxq %rsi,%r15
incq %rcx
jnz .Lsqrx8x_tail
# Unsigned compare: done with the tail once rbp has reached the end marker
# in 0+8(%rsp).
cmpq 0+8(%rsp),%rbp
jae .Lsqrx8x_tail_done
# rsi is 0 and the slot holds 0 or -1, so this sets CF exactly when a carry
# had been saved.
subq 16+8(%rsp),%rsi
movq 48+8(%rsp),%rdx
leaq 64(%rbp),%rbp
# Add the next eight qwords of the scratch buffer into r8..r15 with the
# restored carry.
adcq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
# rax = 0 or -1: carry out of the chain, saved to 16+8(%rsp) below.
sbbq %rax,%rax
# rcx is 0 after the loop, so this resets the counter to -8.
subq $8,%rcx
# rsi = 0 and CF = OF = 0 for the next run of the loop.
xorq %rsi,%rsi
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_tail
.align 32
.Lsqrx8x_tail_done:
# Tail finished: add the value saved in 24+8(%rsp) by
# .Lsqrx8x_reduction_loop into r8 and ripple the carry through r9..r15 into
# rax.
xorq %rax,%rax
addq 24+8(%rsp),%r8
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15
adcq $0,%rax
# rsi is 0 and the slot holds 0 or -1, so this sets CF exactly when a carry
# had been saved; the adcq chain below consumes it.
subq 16+8(%rsp),%rsi
.Lsqrx8x_no_tail:
# Add the eight qwords at 0..56(%rdi) into r8..r15 with carry, and reload
# the registers needed for the next row. The interleaved movq instructions
# do not affect flags.
adcq 0(%rdi),%r8
# Hand-encoded `movq %xmm3,%rcx`.
.byte 102,72,15,126,217
adcq 8(%rdi),%r9
movq 56(%rbp),%rsi
# Hand-encoded `movq %xmm2,%rbp`.
.byte 102,72,15,126,213
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax
# rbx = multiplier from 32+8(%rsp); rdx = qword at rdi + rcx + 64, the
# first qword of the row the next pass will process.
movq 32+8(%rsp),%rbx
movq 64(%rdi,%rcx,1),%rdx
# Write the eight result qwords back to 0..56(%rdi).
movq %r8,0(%rdi)
# r8 = rdi + 64, used only for the termination compare below.
leaq 64(%rdi),%r8
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)
# rdi = rdi + rcx + 64: position of the next row.
leaq 64(%rdi,%rcx,1),%rdi
# Unsigned compare against the address stored in 8+8(%rsp) by
# __bn_sqrx8x_reduction; keep reducing while r8 is below it.
cmpq 8+8(%rsp),%r8
jb .Lsqrx8x_reduction_loop
# Hand-encoded `rep ret` (0xf3 prefix followed by the ret opcode 0xc3).
.byte 0xf3,0xc3
.cfi_endproc
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal