llvm-project/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86,X86-SSE1
; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86,X86-SSE41
; This tests codegen-time inlining/optimization of memcmp
; rdar://6480398
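; The two flags on the RUN lines raise the expansion thresholds above the
; target defaults: -max-loads-per-memcmp=4 lets an inlined expansion use up to
; four load pairs in total, and -memcmp-num-loads-per-block=4 lets up to four
; of those load pairs share one comparison block when the result is only
; tested against zero. Compares that would need more loads than that (the
; 24-byte and larger three-way compares on this 32-bit target) stay as calls
; to the memcmp library function, as the checks below show.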
@.str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
declare dso_local i32 @memcmp(i8*, i8*, i32)
define i32 @length0(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length0:
; X86: # %bb.0:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 0) nounwind
ret i32 %m
}
define i1 @length0_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length0_eq:
; X86: # %bb.0:
; X86-NEXT: movb $1, %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 0) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i1 @length0_lt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length0_lt:
; X86: # %bb.0:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 0) nounwind
%c = icmp slt i32 %m, 0
ret i1 %c
}
define i32 @length2(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: movzwl (%eax), %edx
; X86-NEXT: rolw $8, %cx
; X86-NEXT: rolw $8, %dx
; X86-NEXT: movzwl %cx, %eax
; X86-NEXT: movzwl %dx, %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind
ret i32 %m
}
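; In the 2-byte expansions, rolw $8 byte-swaps each 16-bit load so that the
; byte at the lower address becomes the more significant one; after
; zero-extension, the subtraction then has the same sign as the value memcmp
; would return.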
define i1 @length2_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2_eq:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: cmpw (%eax), %cx
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i1 @length2_lt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2_lt:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: movzwl (%eax), %edx
; X86-NEXT: rolw $8, %cx
; X86-NEXT: rolw $8, %dx
; X86-NEXT: movzwl %cx, %eax
; X86-NEXT: movzwl %dx, %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind
%c = icmp slt i32 %m, 0
ret i1 %c
}
define i1 @length2_gt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2_gt:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: rolw $8, %cx
; X86-NEXT: rolw $8, %ax
; X86-NEXT: movzwl %cx, %ecx
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setg %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind
%c = icmp sgt i32 %m, 0
ret i1 %c
}
define i1 @length2_eq_const(i8* %X) nounwind {
; X86-LABEL: length2_eq_const:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: cmpl $12849, %eax # imm = 0x3231
; X86-NEXT: setne %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i32 2) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2_eq_nobuiltin_attr:
; X86: # %bb.0:
; X86-NEXT: pushl $2
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind nobuiltin
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i32 @length3(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length3:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %edx
; X86-NEXT: movzwl (%ecx), %esi
; X86-NEXT: rolw $8, %dx
; X86-NEXT: rolw $8, %si
; X86-NEXT: cmpw %si, %dx
; X86-NEXT: jne .LBB9_3
; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movzbl 2(%eax), %eax
; X86-NEXT: movzbl 2(%ecx), %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
; X86-NEXT: .LBB9_3: # %res_block
; X86-NEXT: setae %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind
ret i32 %m
}
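; In the multi-block expansions, res_block converts the last unsigned
; comparison into a -1/+1 result: setae produces 1 when the first operand is
; not below the second, and leal -1(%eax,%eax) maps that 0/1 onto -1/+1.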
define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length3_eq:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %edx
; X86-NEXT: xorw (%eax), %dx
; X86-NEXT: movb 2(%ecx), %cl
; X86-NEXT: xorb 2(%eax), %cl
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: orw %dx, %ax
; X86-NEXT: setne %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
define i32 @length4(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: seta %al
; X86-NEXT: sbbl $0, %eax
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind
ret i32 %m
}
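; For a single dword the expansion is branchless: after the bswaps, seta
; leaves 1 in %al when the first compared value is above the second, and
; sbbl $0, %eax then subtracts the carry from the below case, so the result
; is -1, 0, or +1 without a branch.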
define i1 @length4_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length4_eq:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
; X86-NEXT: cmpl (%eax), %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
define i1 @length4_lt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length4_lt:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: seta %al
; X86-NEXT: sbbl $0, %eax
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind
%c = icmp slt i32 %m, 0
ret i1 %c
}
define i1 @length4_gt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length4_gt:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
; X86-NEXT: movl (%eax), %eax
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: cmpl %eax, %ecx
; X86-NEXT: seta %dl
; X86-NEXT: sbbl $0, %edx
; X86-NEXT: testl %edx, %edx
; X86-NEXT: setg %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind
%c = icmp sgt i32 %m, 0
ret i1 %c
}
define i1 @length4_eq_const(i8* %X) nounwind {
; X86-LABEL: length4_eq_const:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i32 4) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i32 @length5(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: movl (%ecx), %esi
; X86-NEXT: bswapl %edx
; X86-NEXT: bswapl %esi
; X86-NEXT: cmpl %esi, %edx
; X86-NEXT: jne .LBB16_3
; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movzbl 4(%eax), %eax
; X86-NEXT: movzbl 4(%ecx), %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
; X86-NEXT: .LBB16_3: # %res_block
; X86-NEXT: setae %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind
ret i32 %m
}
define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5_eq:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: movb 4(%ecx), %cl
; X86-NEXT: xorb 4(%eax), %cl
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: orl %edx, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
define i1 @length5_lt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5_lt:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: movl (%ecx), %esi
; X86-NEXT: bswapl %edx
; X86-NEXT: bswapl %esi
; X86-NEXT: cmpl %esi, %edx
; X86-NEXT: jne .LBB18_3
; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movzbl 4(%eax), %eax
; X86-NEXT: movzbl 4(%ecx), %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: jmp .LBB18_2
; X86-NEXT: .LBB18_3: # %res_block
; X86-NEXT: setae %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB18_2: # %endblock
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind
%c = icmp slt i32 %m, 0
ret i1 %c
}
define i32 @length7(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length7:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl (%esi), %ecx
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB19_2
; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movl 3(%esi), %ecx
; X86-NEXT: movl 3(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB19_3
; X86-NEXT: .LBB19_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB19_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 7) nounwind
ret i32 %m
}
define i1 @length7_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length7_eq:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 3(%ecx), %ecx
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: xorl 3(%eax), %ecx
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 7) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
define i1 @length7_lt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length7_lt:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl (%esi), %ecx
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB21_2
; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movl 3(%esi), %ecx
; X86-NEXT: movl 3(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB21_3
; X86-NEXT: .LBB21_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB21_3: # %endblock
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 7) nounwind
%c = icmp slt i32 %m, 0
ret i1 %c
}
define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl (%esi), %ecx
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB22_2
; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB22_3
; X86-NEXT: .LBB22_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB22_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind
ret i32 %m
}
define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8_eq:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 4(%ecx), %ecx
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: xorl 4(%eax), %ecx
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i1 @length8_eq_const(i8* %X) nounwind {
; X86-LABEL: length8_eq_const:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130
; X86-NEXT: xorl (%eax), %ecx
; X86-NEXT: movl $926299444, %edx # imm = 0x37363534
; X86-NEXT: xorl 4(%eax), %edx
; X86-NEXT: orl %ecx, %edx
; X86-NEXT: setne %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 8) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
define i1 @length9_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length9_eq:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 4(%ecx), %esi
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: xorl 4(%eax), %esi
; X86-NEXT: orl %edx, %esi
; X86-NEXT: movb 8(%ecx), %cl
; X86-NEXT: xorb 8(%eax), %cl
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: sete %al
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 9) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i1 @length10_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length10_eq:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 4(%ecx), %esi
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: xorl 4(%eax), %esi
; X86-NEXT: orl %edx, %esi
; X86-NEXT: movzwl 8(%ecx), %ecx
; X86-NEXT: xorw 8(%eax), %cx
; X86-NEXT: movzwl %cx, %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: sete %al
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 10) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i1 @length11_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length11_eq:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 4(%ecx), %esi
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: xorl 4(%eax), %esi
; X86-NEXT: orl %edx, %esi
; X86-NEXT: movl 7(%ecx), %ecx
; X86-NEXT: xorl 7(%eax), %ecx
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: sete %al
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 11) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length12_eq:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 4(%ecx), %esi
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: xorl 4(%eax), %esi
; X86-NEXT: orl %edx, %esi
; X86-NEXT: movl 8(%ecx), %ecx
; X86-NEXT: xorl 8(%eax), %ecx
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: setne %al
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
define i32 @length12(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length12:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl (%esi), %ecx
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB29_3
; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB29_3
; X86-NEXT: # %bb.2: # %loadbb2
; X86-NEXT: movl 8(%esi), %ecx
; X86-NEXT: movl 8(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB29_4
; X86-NEXT: .LBB29_3: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB29_4: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind
ret i32 %m
}
define i1 @length13_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length13_eq:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%edx), %esi
; X86-NEXT: movl 4(%edx), %eax
; X86-NEXT: xorl (%ecx), %esi
; X86-NEXT: xorl 4(%ecx), %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl 8(%edx), %esi
; X86-NEXT: xorl 8(%ecx), %esi
; X86-NEXT: movb 12(%edx), %dl
; X86-NEXT: xorb 12(%ecx), %dl
; X86-NEXT: movzbl %dl, %ecx
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %al
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 13) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i1 @length14_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length14_eq:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%edx), %esi
; X86-NEXT: movl 4(%edx), %eax
; X86-NEXT: xorl (%ecx), %esi
; X86-NEXT: xorl 4(%ecx), %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl 8(%edx), %esi
; X86-NEXT: xorl 8(%ecx), %esi
; X86-NEXT: movzwl 12(%edx), %edx
; X86-NEXT: xorw 12(%ecx), %dx
; X86-NEXT: movzwl %dx, %ecx
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %al
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 14) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i1 @length15_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length15_eq:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%edx), %esi
; X86-NEXT: movl 4(%edx), %eax
; X86-NEXT: xorl (%ecx), %esi
; X86-NEXT: xorl 4(%ecx), %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl 8(%edx), %esi
; X86-NEXT: xorl 8(%ecx), %esi
; X86-NEXT: movl 11(%edx), %edx
; X86-NEXT: xorl 11(%ecx), %edx
; X86-NEXT: orl %esi, %edx
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 15) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
define i32 @length16(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length16:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl (%esi), %ecx
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB33_4
; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB33_4
; X86-NEXT: # %bb.2: # %loadbb2
; X86-NEXT: movl 8(%esi), %ecx
; X86-NEXT: movl 8(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB33_4
; X86-NEXT: # %bb.3: # %loadbb3
; X86-NEXT: movl 12(%esi), %ecx
; X86-NEXT: movl 12(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB33_5
; X86-NEXT: .LBB33_4: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB33_5: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 16) nounwind
ret i32 %m
}
define i1 @length16_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length16_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl %esi
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOSSE-NEXT: movl (%edx), %esi
; X86-NOSSE-NEXT: movl 4(%edx), %eax
; X86-NOSSE-NEXT: xorl (%ecx), %esi
; X86-NOSSE-NEXT: xorl 4(%ecx), %eax
; X86-NOSSE-NEXT: orl %esi, %eax
; X86-NOSSE-NEXT: movl 8(%edx), %esi
; X86-NOSSE-NEXT: xorl 8(%ecx), %esi
; X86-NOSSE-NEXT: movl 12(%edx), %edx
; X86-NOSSE-NEXT: xorl 12(%ecx), %edx
; X86-NOSSE-NEXT: orl %esi, %edx
; X86-NOSSE-NEXT: orl %eax, %edx
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: popl %esi
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length16_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl %esi
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE1-NEXT: movl (%edx), %esi
; X86-SSE1-NEXT: movl 4(%edx), %eax
; X86-SSE1-NEXT: xorl (%ecx), %esi
; X86-SSE1-NEXT: xorl 4(%ecx), %eax
; X86-SSE1-NEXT: orl %esi, %eax
; X86-SSE1-NEXT: movl 8(%edx), %esi
; X86-SSE1-NEXT: xorl 8(%ecx), %esi
; X86-SSE1-NEXT: movl 12(%edx), %edx
; X86-SSE1-NEXT: xorl 12(%ecx), %edx
; X86-SSE1-NEXT: orl %esi, %edx
; X86-SSE1-NEXT: orl %eax, %edx
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: popl %esi
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length16_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu (%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X86-SSE2-NEXT: pmovmskb %xmm1, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length16_eq:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE41-NEXT: movdqu (%ecx), %xmm0
; X86-SSE41-NEXT: movdqu (%eax), %xmm1
; X86-SSE41-NEXT: pxor %xmm0, %xmm1
; X86-SSE41-NEXT: ptest %xmm1, %xmm1
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
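; With SSE2 the 16-byte equality check compares all bytes at once: pcmpeqb
; produces a per-byte mask, pmovmskb collects it into a 16-bit value, and the
; compare against 0xFFFF tests that every byte matched. SSE4.1 instead xors
; the two vectors and uses ptest to check for an all-zero difference.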
define i1 @length16_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length16_lt:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl (%esi), %ecx
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB35_4
; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB35_4
; X86-NEXT: # %bb.2: # %loadbb2
; X86-NEXT: movl 8(%esi), %ecx
; X86-NEXT: movl 8(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: jne .LBB35_4
; X86-NEXT: # %bb.3: # %loadbb3
; X86-NEXT: movl 12(%esi), %ecx
; X86-NEXT: movl 12(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB35_5
; X86-NEXT: .LBB35_4: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB35_5: # %endblock
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length16_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length16_gt:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl (%esi), %eax
; X86-NEXT: movl (%edx), %ecx
; X86-NEXT: bswapl %eax
; X86-NEXT: bswapl %ecx
; X86-NEXT: cmpl %ecx, %eax
; X86-NEXT: jne .LBB36_4
; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movl 4(%esi), %eax
; X86-NEXT: movl 4(%edx), %ecx
; X86-NEXT: bswapl %eax
; X86-NEXT: bswapl %ecx
; X86-NEXT: cmpl %ecx, %eax
; X86-NEXT: jne .LBB36_4
; X86-NEXT: # %bb.2: # %loadbb2
; X86-NEXT: movl 8(%esi), %eax
; X86-NEXT: movl 8(%edx), %ecx
; X86-NEXT: bswapl %eax
; X86-NEXT: bswapl %ecx
; X86-NEXT: cmpl %ecx, %eax
; X86-NEXT: jne .LBB36_4
; X86-NEXT: # %bb.3: # %loadbb3
; X86-NEXT: movl 12(%esi), %eax
; X86-NEXT: movl 12(%edx), %ecx
; X86-NEXT: bswapl %eax
; X86-NEXT: bswapl %ecx
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: cmpl %ecx, %eax
; X86-NEXT: je .LBB36_5
; X86-NEXT: .LBB36_4: # %res_block
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: cmpl %ecx, %eax
; X86-NEXT: setae %dl
; X86-NEXT: leal -1(%edx,%edx), %edx
; X86-NEXT: .LBB36_5: # %endblock
; X86-NEXT: testl %edx, %edx
; X86-NEXT: setg %al
; X86-NEXT: popl %esi
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length16_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length16_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl %esi
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOSSE-NEXT: movl $858927408, %ecx # imm = 0x33323130
; X86-NOSSE-NEXT: xorl (%eax), %ecx
; X86-NOSSE-NEXT: movl $926299444, %edx # imm = 0x37363534
; X86-NOSSE-NEXT: xorl 4(%eax), %edx
; X86-NOSSE-NEXT: orl %ecx, %edx
; X86-NOSSE-NEXT: movl $825243960, %ecx # imm = 0x31303938
; X86-NOSSE-NEXT: xorl 8(%eax), %ecx
; X86-NOSSE-NEXT: movl $892613426, %esi # imm = 0x35343332
; X86-NOSSE-NEXT: xorl 12(%eax), %esi
; X86-NOSSE-NEXT: orl %ecx, %esi
; X86-NOSSE-NEXT: orl %edx, %esi
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: popl %esi
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length16_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl %esi
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT: movl $858927408, %ecx # imm = 0x33323130
; X86-SSE1-NEXT: xorl (%eax), %ecx
; X86-SSE1-NEXT: movl $926299444, %edx # imm = 0x37363534
; X86-SSE1-NEXT: xorl 4(%eax), %edx
; X86-SSE1-NEXT: orl %ecx, %edx
; X86-SSE1-NEXT: movl $825243960, %ecx # imm = 0x31303938
; X86-SSE1-NEXT: xorl 8(%eax), %ecx
; X86-SSE1-NEXT: movl $892613426, %esi # imm = 0x35343332
; X86-SSE1-NEXT: xorl 12(%eax), %esi
; X86-SSE1-NEXT: orl %ecx, %esi
; X86-SSE1-NEXT: orl %edx, %esi
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: popl %esi
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length16_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length16_eq_const:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 16) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
define i32 @length24(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length24:
; X86: # %bb.0:
; X86-NEXT: pushl $24
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 24) nounwind
ret i32 %m
}
define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length24_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $24
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length24_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $24
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length24_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length24_eq:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE41-NEXT: movdqu (%ecx), %xmm0
; X86-SSE41-NEXT: movdqu 8(%ecx), %xmm1
; X86-SSE41-NEXT: movdqu (%eax), %xmm2
; X86-SSE41-NEXT: pxor %xmm0, %xmm2
; X86-SSE41-NEXT: movdqu 8(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind
%cmp = icmp eq i32 %call, 0
ret i1 %cmp
}
define i1 @length24_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length24_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $24
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length24_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length24_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $24
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length24_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length24_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $24
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length24_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $24
; X86-SSE1-NEXT: pushl $.L.str
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length24_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length24_eq_const:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
; X86-SSE41-NEXT: movdqu 8(%eax), %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE41-NEXT: por %xmm1, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 24) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
define i32 @length31(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length31:
; X86: # %bb.0:
; X86-NEXT: pushl $31
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 31) nounwind
ret i32 %m
}
define i1 @length31_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length31_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $31
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length31_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $31
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length31_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 15(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length31_eq:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE41-NEXT: movdqu (%ecx), %xmm0
; X86-SSE41-NEXT: movdqu 15(%ecx), %xmm1
; X86-SSE41-NEXT: movdqu (%eax), %xmm2
; X86-SSE41-NEXT: pxor %xmm0, %xmm2
; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind
%cmp = icmp eq i32 %call, 0
ret i1 %cmp
}
define i1 @length31_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length31_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $31
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length31_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length31_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $31
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length31_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" {
; X86-NOSSE-LABEL: length31_eq_prefer128:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $31
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length31_eq_prefer128:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $31
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length31_eq_prefer128:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 15(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length31_eq_prefer128:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE41-NEXT: movdqu (%ecx), %xmm0
; X86-SSE41-NEXT: movdqu 15(%ecx), %xmm1
; X86-SSE41-NEXT: movdqu (%eax), %xmm2
; X86-SSE41-NEXT: pxor %xmm0, %xmm2
; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind
%cmp = icmp eq i32 %call, 0
ret i1 %cmp
}
define i1 @length31_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length31_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $31
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length31_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $31
; X86-SSE1-NEXT: pushl $.L.str
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length31_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: movdqu 15(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length31_eq_const:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
; X86-SSE41-NEXT: movdqu 15(%eax), %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE41-NEXT: por %xmm1, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 31) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
define i32 @length32(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length32:
; X86: # %bb.0:
; X86-NEXT: pushl $32
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 32) nounwind
ret i32 %m
}
; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length32_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $32
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length32_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $32
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length32_eq:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE41-NEXT: movdqu (%ecx), %xmm0
; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE41-NEXT: movdqu (%eax), %xmm2
; X86-SSE41-NEXT: pxor %xmm0, %xmm2
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind
%cmp = icmp eq i32 %call, 0
ret i1 %cmp
}
define i1 @length32_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length32_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $32
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length32_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length32_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $32
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length32_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" {
; X86-NOSSE-LABEL: length32_eq_prefer128:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $32
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length32_eq_prefer128:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $32
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq_prefer128:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length32_eq_prefer128:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE41-NEXT: movdqu (%ecx), %xmm0
; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE41-NEXT: movdqu (%eax), %xmm2
; X86-SSE41-NEXT: pxor %xmm0, %xmm2
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind
%cmp = icmp eq i32 %call, 0
ret i1 %cmp
}
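; The *_eq_const variants compare against @.str; the vector expansions fold the constant data into pcmpeqb/pxor memory operands (the LCPI constant-pool references).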
define i1 @length32_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length32_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $32
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length32_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $32
; X86-SSE1-NEXT: pushl $.L.str
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length32_eq_const:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE41-NEXT: por %xmm1, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 32) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
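; length48: returning the full ordered i32 result is not inlined on 32-bit x86, so this stays a memcmp libcall; only the equality forms below are expanded with 16-byte SSE loads.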
define i32 @length48(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length48:
; X86: # %bb.0:
; X86-NEXT: pushl $48
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 48) nounwind
ret i32 %m
}
define i1 @length48_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length48_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $48
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length48_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $48
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length48_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3
; X86-SSE2-NEXT: pand %xmm0, %xmm3
; X86-SSE2-NEXT: pand %xmm2, %xmm3
; X86-SSE2-NEXT: pmovmskb %xmm3, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length48_eq:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE41-NEXT: movdqu (%ecx), %xmm0
; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE41-NEXT: movdqu (%eax), %xmm2
; X86-SSE41-NEXT: pxor %xmm0, %xmm2
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1
; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3
; X86-SSE41-NEXT: pxor %xmm1, %xmm3
; X86-SSE41-NEXT: por %xmm0, %xmm3
; X86-SSE41-NEXT: por %xmm2, %xmm3
; X86-SSE41-NEXT: ptest %xmm3, %xmm3
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind
%cmp = icmp eq i32 %call, 0
ret i1 %cmp
}
define i1 @length48_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length48_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $48
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length48_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length48_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $48
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length48_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" {
; X86-NOSSE-LABEL: length48_eq_prefer128:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $48
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length48_eq_prefer128:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $48
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length48_eq_prefer128:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3
; X86-SSE2-NEXT: pand %xmm0, %xmm3
; X86-SSE2-NEXT: pand %xmm2, %xmm3
; X86-SSE2-NEXT: pmovmskb %xmm3, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length48_eq_prefer128:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE41-NEXT: movdqu (%ecx), %xmm0
; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE41-NEXT: movdqu (%eax), %xmm2
; X86-SSE41-NEXT: pxor %xmm0, %xmm2
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1
; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3
; X86-SSE41-NEXT: pxor %xmm1, %xmm3
; X86-SSE41-NEXT: por %xmm0, %xmm3
; X86-SSE41-NEXT: por %xmm2, %xmm3
; X86-SSE41-NEXT: ptest %xmm3, %xmm3
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind
%cmp = icmp eq i32 %call, 0
ret i1 %cmp
}
define i1 @length48_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length48_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $48
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length48_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $48
; X86-SSE1-NEXT: pushl $.L.str
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length48_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: pand %xmm0, %xmm2
; X86-SSE2-NEXT: pmovmskb %xmm2, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length48_eq_const:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE41-NEXT: por %xmm1, %xmm2
; X86-SSE41-NEXT: por %xmm0, %xmm2
; X86-SSE41-NEXT: ptest %xmm2, %xmm2
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 48) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}
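; length63: the SSE equality expansions handle the odd size with a final overlapping load at offset 47, so four 16-byte loads per side still cover all 63 bytes.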
define i32 @length63(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length63:
; X86: # %bb.0:
; X86-NEXT: pushl $63
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 63) nounwind
ret i32 %m
}
define i1 @length63_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length63_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $63
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length63_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $63
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length63_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm2
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm1
; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm2
; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3
; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm3
; X86-SSE2-NEXT: movdqu 47(%ecx), %xmm2
; X86-SSE2-NEXT: movdqu 47(%eax), %xmm4
; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm4
; X86-SSE2-NEXT: pand %xmm3, %xmm4
; X86-SSE2-NEXT: pand %xmm1, %xmm4
; X86-SSE2-NEXT: pand %xmm0, %xmm4
; X86-SSE2-NEXT: pmovmskb %xmm4, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length63_eq:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE41-NEXT: movdqu (%ecx), %xmm1
; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm2
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE41-NEXT: pxor %xmm2, %xmm1
; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm2
; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3
; X86-SSE41-NEXT: pxor %xmm2, %xmm3
; X86-SSE41-NEXT: movdqu 47(%ecx), %xmm2
; X86-SSE41-NEXT: movdqu 47(%eax), %xmm4
; X86-SSE41-NEXT: pxor %xmm2, %xmm4
; X86-SSE41-NEXT: por %xmm3, %xmm4
; X86-SSE41-NEXT: por %xmm1, %xmm4
; X86-SSE41-NEXT: por %xmm0, %xmm4
; X86-SSE41-NEXT: ptest %xmm4, %xmm4
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 63) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length63_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length63_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $63
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 63) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length63_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length63_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $63
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 63) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length63_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length63_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $63
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length63_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $63
; X86-SSE1-NEXT: pushl $.L.str
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length63_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2
; X86-SSE2-NEXT: movdqu 47(%eax), %xmm3
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length63_eq_const:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2
; X86-SSE41-NEXT: movdqu 47(%eax), %xmm3
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE41-NEXT: por %xmm3, %xmm2
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE41-NEXT: por %xmm2, %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE41-NEXT: por %xmm1, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 63) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
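; length64: the largest size still expanded inline (four 16-byte loads per side); SSE2 combines pcmpeqb results with pand/pmovmskb, SSE4.1 combines pxor results with por/ptest.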
define i32 @length64(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length64:
; X86: # %bb.0:
; X86-NEXT: pushl $64
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 64) nounwind
ret i32 %m
}
define i1 @length64_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length64_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $64
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length64_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $64
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length64_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm2
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm1
; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm2
; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3
; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm3
; X86-SSE2-NEXT: movdqu 48(%ecx), %xmm2
; X86-SSE2-NEXT: movdqu 48(%eax), %xmm4
; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm4
; X86-SSE2-NEXT: pand %xmm3, %xmm4
; X86-SSE2-NEXT: pand %xmm1, %xmm4
; X86-SSE2-NEXT: pand %xmm0, %xmm4
; X86-SSE2-NEXT: pmovmskb %xmm4, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length64_eq:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE41-NEXT: movdqu (%ecx), %xmm1
; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm2
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
; X86-SSE41-NEXT: pxor %xmm1, %xmm0
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE41-NEXT: pxor %xmm2, %xmm1
; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm2
; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3
; X86-SSE41-NEXT: pxor %xmm2, %xmm3
; X86-SSE41-NEXT: movdqu 48(%ecx), %xmm2
; X86-SSE41-NEXT: movdqu 48(%eax), %xmm4
; X86-SSE41-NEXT: pxor %xmm2, %xmm4
; X86-SSE41-NEXT: por %xmm3, %xmm4
; X86-SSE41-NEXT: por %xmm1, %xmm4
; X86-SSE41-NEXT: por %xmm0, %xmm4
; X86-SSE41-NEXT: ptest %xmm4, %xmm4
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length64_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length64_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $64
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length64_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length64_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $64
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length64_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length64_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $64
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $12, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length64_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $64
; X86-SSE1-NEXT: pushl $.L.str
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $12, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length64_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2
; X86-SSE2-NEXT: movdqu 48(%eax), %xmm3
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X86-SSE41-LABEL: length64_eq_const:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2
; X86-SSE41-NEXT: movdqu 48(%eax), %xmm3
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE41-NEXT: por %xmm3, %xmm2
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE41-NEXT: por %xmm2, %xmm1
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE41-NEXT: por %xmm1, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 64) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
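; Sizes of 96 bytes and up exceed the inline expansion budget, so every variant from here on, including the equality checks, is a plain memcmp libcall.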
define i32 @length96(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length96:
; X86: # %bb.0:
; X86-NEXT: pushl $96
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 96) nounwind
ret i32 %m
}
define i1 @length96_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length96_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $96
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 96) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length96_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length96_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $96
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 96) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length96_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length96_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $96
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 96) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length96_eq_const(i8* %X) nounwind {
; X86-LABEL: length96_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $96
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 96) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i32 @length127(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length127:
; X86: # %bb.0:
; X86-NEXT: pushl $127
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 127) nounwind
ret i32 %m
}
define i1 @length127_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length127_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $127
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 127) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length127_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length127_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $127
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 127) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length127_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length127_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $127
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 127) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length127_eq_const(i8* %X) nounwind {
; X86-LABEL: length127_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $127
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 127) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i32 @length128(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length128:
; X86: # %bb.0:
; X86-NEXT: pushl $128
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 128) nounwind
ret i32 %m
}
define i1 @length128_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length128_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $128
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 128) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length128_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length128_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $128
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 128) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length128_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length128_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $128
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 128) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length128_eq_const(i8* %X) nounwind {
; X86-LABEL: length128_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $128
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 128) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i32 @length192(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length192:
; X86: # %bb.0:
; X86-NEXT: pushl $192
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 192) nounwind
ret i32 %m
}
define i1 @length192_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length192_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $192
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 192) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length192_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length192_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $192
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 192) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length192_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length192_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $192
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 192) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length192_eq_const(i8* %X) nounwind {
; X86-LABEL: length192_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $192
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 192) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i32 @length255(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length255:
; X86: # %bb.0:
; X86-NEXT: pushl $255
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 255) nounwind
ret i32 %m
}
define i1 @length255_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length255_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $255
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 255) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length255_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length255_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $255
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 255) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length255_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length255_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $255
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 255) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length255_eq_const(i8* %X) nounwind {
; X86-LABEL: length255_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $255
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 255) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i32 @length256(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length256:
; X86: # %bb.0:
; X86-NEXT: pushl $256 # imm = 0x100
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 256) nounwind
ret i32 %m
}
define i1 @length256_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length256_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $256 # imm = 0x100
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 256) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length256_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length256_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $256 # imm = 0x100
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 256) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length256_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length256_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $256 # imm = 0x100
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 256) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length256_eq_const(i8* %X) nounwind {
; X86-LABEL: length256_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $256 # imm = 0x100
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 256) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i32 @length384(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length384:
; X86: # %bb.0:
; X86-NEXT: pushl $384 # imm = 0x180
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 384) nounwind
ret i32 %m
}
define i1 @length384_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length384_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $384 # imm = 0x180
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 384) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length384_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length384_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $384 # imm = 0x180
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 384) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length384_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length384_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $384 # imm = 0x180
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 384) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length384_eq_const(i8* %X) nounwind {
; X86-LABEL: length384_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $384 # imm = 0x180
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 384) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i32 @length511(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length511:
; X86: # %bb.0:
; X86-NEXT: pushl $511 # imm = 0x1FF
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 511) nounwind
ret i32 %m
}
define i1 @length511_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length511_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $511 # imm = 0x1FF
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 511) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length511_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length511_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $511 # imm = 0x1FF
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 511) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length511_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length511_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $511 # imm = 0x1FF
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 511) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length511_eq_const(i8* %X) nounwind {
; X86-LABEL: length511_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $511 # imm = 0x1FF
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 511) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
define i32 @length512(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length512:
; X86: # %bb.0:
; X86-NEXT: pushl $512 # imm = 0x200
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 512) nounwind
ret i32 %m
}
define i1 @length512_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length512_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $512 # imm = 0x200
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 512) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
}
define i1 @length512_lt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length512_lt:
; X86: # %bb.0:
; X86-NEXT: pushl $512 # imm = 0x200
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 512) nounwind
%cmp = icmp slt i32 %call, 0
ret i1 %cmp
}
define i1 @length512_gt(i8* %x, i8* %y) nounwind {
; X86-LABEL: length512_gt:
; X86: # %bb.0:
; X86-NEXT: pushl $512 # imm = 0x200
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 512) nounwind
%cmp = icmp sgt i32 %call, 0
ret i1 %cmp
}
define i1 @length512_eq_const(i8* %X) nounwind {
; X86-LABEL: length512_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $512 # imm = 0x200
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 512) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
; This checks that we do not do anything unreasonable with huge sizes: the compare is never expanded inline and simply lowers to a memcmp libcall.
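; Note that the 9223372036854775807 byte count below does not fit in the i32 size parameter and is truncated to -1 (0xFFFFFFFF), which is why the lowered calls push $-1.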
define i32 @huge_length(i8* %X, i8* %Y) nounwind {
; X86-LABEL: huge_length:
; X86: # %bb.0:
; X86-NEXT: pushl $-1
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 9223372036854775807) nounwind
ret i32 %m
}
define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: huge_length_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $-1
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 9223372036854775807) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
; This checks non-constant sizes.
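; With an unknown size the expansion pass cannot emit inline loads, so both functions fall back to the libcall: the first is a plain tail jump because the memcmp result is returned unchanged, while the _eq variant needs a real call so it can test the result afterwards.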
define i32 @nonconst_length(i8* %X, i8* %Y, i32 %size) nounwind {
; X86-LABEL: nonconst_length:
; X86: # %bb.0:
; X86-NEXT: jmp memcmp # TAILCALL
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 %size) nounwind
ret i32 %m
}
define i1 @nonconst_length_eq(i8* %X, i8* %Y, i32 %size) nounwind {
; X86-LABEL: nonconst_length_eq:
; X86: # %bb.0:
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $12, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 %size) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}