; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X86 --check-prefix=SSE --check-prefix=X86-SSE1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=SSE --check-prefix=X86-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2

; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
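;
; A rough illustration (comment only, not checked by FileCheck, with made-up
; value names): a small fixed-length call such as
;   %r = call i32 @memcmp(i8* %a, i8* %b, i64 4)
; is expected to be expanded inline into loads, byte swaps, and a compare
; rather than lowered to a call to the library memcmp; the functions below
; exercise that expansion for a range of small sizes.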
@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1

declare i32 @memcmp(i8*, i8*, i64)

define i32 @length0(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length0:
; X86:       # %bb.0:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: length0:
; X64:       # %bb.0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 0) nounwind
ret i32 %m
}

define i1 @length0_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length0_eq:
; X86:       # %bb.0:
; X86-NEXT:    movb $1, %al
; X86-NEXT:    retl
;
; X64-LABEL: length0_eq:
; X64:       # %bb.0:
; X64-NEXT:    movb $1, %al
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 0) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}

define i1 @length0_lt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length0_lt:
; X86:       # %bb.0:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: length0_lt:
; X64:       # %bb.0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 0) nounwind
%c = icmp slt i32 %m, 0
ret i1 %c
}

define i32 @length2(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    movzwl (%eax), %edx
; X86-NEXT:    rolw $8, %cx
; X86-NEXT:    rolw $8, %dx
; X86-NEXT:    movzwl %cx, %eax
; X86-NEXT:    movzwl %dx, %ecx
; X86-NEXT:    subl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: length2:
; X64:       # %bb.0:
; X64-NEXT:    movzwl (%rdi), %eax
; X64-NEXT:    movzwl (%rsi), %ecx
; X64-NEXT:    rolw $8, %ax
; X64-NEXT:    rolw $8, %cx
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    movzwl %cx, %ecx
; X64-NEXT:    subl %ecx, %eax
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
ret i32 %m
}

define i1 @length2_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2_eq:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    cmpw (%eax), %cx
; X86-NEXT:    sete %al
; X86-NEXT:    retl
;
; X64-LABEL: length2_eq:
; X64:       # %bb.0:
; X64-NEXT:    movzwl (%rdi), %eax
; X64-NEXT:    cmpw (%rsi), %ax
; X64-NEXT:    sete %al
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}

define i1 @length2_lt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2_lt:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    movzwl (%eax), %edx
; X86-NEXT:    rolw $8, %cx
; X86-NEXT:    rolw $8, %dx
; X86-NEXT:    movzwl %cx, %eax
; X86-NEXT:    movzwl %dx, %ecx
; X86-NEXT:    subl %ecx, %eax
; X86-NEXT:    shrl $31, %eax
; X86-NEXT:    # kill: def $al killed $al killed $eax
; X86-NEXT:    retl
;
; X64-LABEL: length2_lt:
; X64:       # %bb.0:
; X64-NEXT:    movzwl (%rdi), %eax
; X64-NEXT:    movzwl (%rsi), %ecx
; X64-NEXT:    rolw $8, %ax
; X64-NEXT:    rolw $8, %cx
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    movzwl %cx, %ecx
; X64-NEXT:    subl %ecx, %eax
; X64-NEXT:    shrl $31, %eax
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
%c = icmp slt i32 %m, 0
ret i1 %c
}

define i1 @length2_gt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2_gt:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    movzwl (%eax), %eax
; X86-NEXT:    rolw $8, %cx
; X86-NEXT:    rolw $8, %ax
; X86-NEXT:    movzwl %cx, %ecx
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    testl %ecx, %ecx
; X86-NEXT:    setg %al
; X86-NEXT:    retl
;
; X64-LABEL: length2_gt:
; X64:       # %bb.0:
; X64-NEXT:    movzwl (%rdi), %eax
; X64-NEXT:    movzwl (%rsi), %ecx
; X64-NEXT:    rolw $8, %ax
; X64-NEXT:    rolw $8, %cx
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    movzwl %cx, %ecx
; X64-NEXT:    subl %ecx, %eax
; X64-NEXT:    testl %eax, %eax
; X64-NEXT:    setg %al
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
%c = icmp sgt i32 %m, 0
ret i1 %c
}

define i1 @length2_eq_const(i8* %X) nounwind {
; X86-LABEL: length2_eq_const:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl (%eax), %eax
; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
; X86-NEXT:    setne %al
; X86-NEXT:    retl
;
; X64-LABEL: length2_eq_const:
; X64:       # %bb.0:
; X64-NEXT:    movzwl (%rdi), %eax
; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
; X64-NEXT:    setne %al
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}

define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2_eq_nobuiltin_attr:
; X86:       # %bb.0:
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl $2
; X86-NEXT:    pushl {{[0-9]+}}(%esp)
; X86-NEXT:    pushl {{[0-9]+}}(%esp)
; X86-NEXT:    calll memcmp
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    sete %al
; X86-NEXT:    retl
;
; X64-LABEL: length2_eq_nobuiltin_attr:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rax
; X64-NEXT:    movl $2, %edx
; X64-NEXT:    callq memcmp
; X64-NEXT:    testl %eax, %eax
; X64-NEXT:    sete %al
; X64-NEXT:    popq %rcx
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin
%c = icmp eq i32 %m, 0
ret i1 %c
}

define i32 @length3(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length3:
; X86:       # %bb.0: # %loadbb
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl (%eax), %edx
; X86-NEXT:    movzwl (%ecx), %esi
; X86-NEXT:    rolw $8, %dx
; X86-NEXT:    rolw $8, %si
; X86-NEXT:    cmpw %si, %dx
; X86-NEXT:    jne .LBB9_1
; X86-NEXT:  # %bb.2: # %loadbb1
; X86-NEXT:    movzbl 2(%eax), %eax
; X86-NEXT:    movzbl 2(%ecx), %ecx
; X86-NEXT:    subl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
; X86-NEXT:  .LBB9_1: # %res_block
; X86-NEXT:    setae %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    leal -1(%eax,%eax), %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: length3:
; X64:       # %bb.0: # %loadbb
; X64-NEXT:    movzwl (%rdi), %eax
; X64-NEXT:    movzwl (%rsi), %ecx
; X64-NEXT:    rolw $8, %ax
; X64-NEXT:    rolw $8, %cx
; X64-NEXT:    cmpw %cx, %ax
; X64-NEXT:    jne .LBB9_1
; X64-NEXT:  # %bb.2: # %loadbb1
; X64-NEXT:    movzbl 2(%rdi), %eax
; X64-NEXT:    movzbl 2(%rsi), %ecx
; X64-NEXT:    subl %ecx, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB9_1: # %res_block
; X64-NEXT:    setae %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    leal -1(%rax,%rax), %eax
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
ret i32 %m
}

define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length3_eq:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %edx
; X86-NEXT:    xorw (%eax), %dx
; X86-NEXT:    movb 2(%ecx), %cl
; X86-NEXT:    xorb 2(%eax), %cl
; X86-NEXT:    movzbl %cl, %eax
; X86-NEXT:    orw %dx, %ax
; X86-NEXT:    setne %al
; X86-NEXT:    retl
;
; X64-LABEL: length3_eq:
; X64:       # %bb.0:
; X64-NEXT:    movzwl (%rdi), %eax
; X64-NEXT:    xorw (%rsi), %ax
; X64-NEXT:    movb 2(%rdi), %cl
; X64-NEXT:    xorb 2(%rsi), %cl
; X64-NEXT:    movzbl %cl, %ecx
; X64-NEXT:    orw %ax, %cx
; X64-NEXT:    setne %al
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}

define i32 @length4(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length4:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl (%ecx), %ecx
; X86-NEXT:    movl (%eax), %edx
; X86-NEXT:    bswapl %ecx
; X86-NEXT:    bswapl %edx
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    cmpl %edx, %ecx
; X86-NEXT:    seta %al
; X86-NEXT:    sbbl $0, %eax
; X86-NEXT:    retl
;
; X64-LABEL: length4:
; X64:       # %bb.0:
; X64-NEXT:    movl (%rdi), %ecx
; X64-NEXT:    movl (%rsi), %edx
; X64-NEXT:    bswapl %ecx
; X64-NEXT:    bswapl %edx
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    cmpl %edx, %ecx
; X64-NEXT:    seta %al
; X64-NEXT:    sbbl $0, %eax
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
ret i32 %m
}

define i1 @length4_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length4_eq:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl (%ecx), %ecx
; X86-NEXT:    cmpl (%eax), %ecx
; X86-NEXT:    setne %al
; X86-NEXT:    retl
;
; X64-LABEL: length4_eq:
; X64:       # %bb.0:
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    cmpl (%rsi), %eax
; X64-NEXT:    setne %al
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}

define i1 @length4_lt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length4_lt:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl (%ecx), %ecx
; X86-NEXT:    movl (%eax), %edx
; X86-NEXT:    bswapl %ecx
; X86-NEXT:    bswapl %edx
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    cmpl %edx, %ecx
; X86-NEXT:    seta %al
; X86-NEXT:    sbbl $0, %eax
; X86-NEXT:    shrl $31, %eax
; X86-NEXT:    # kill: def $al killed $al killed $eax
; X86-NEXT:    retl
;
; X64-LABEL: length4_lt:
; X64:       # %bb.0:
; X64-NEXT:    movl (%rdi), %ecx
; X64-NEXT:    movl (%rsi), %edx
; X64-NEXT:    bswapl %ecx
; X64-NEXT:    bswapl %edx
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    cmpl %edx, %ecx
; X64-NEXT:    seta %al
; X64-NEXT:    sbbl $0, %eax
; X64-NEXT:    shrl $31, %eax
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
%c = icmp slt i32 %m, 0
ret i1 %c
}

define i1 @length4_gt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length4_gt:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl (%ecx), %ecx
; X86-NEXT:    movl (%eax), %eax
; X86-NEXT:    bswapl %ecx
; X86-NEXT:    bswapl %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    cmpl %eax, %ecx
; X86-NEXT:    seta %dl
; X86-NEXT:    sbbl $0, %edx
; X86-NEXT:    testl %edx, %edx
; X86-NEXT:    setg %al
; X86-NEXT:    retl
;
; X64-LABEL: length4_gt:
; X64:       # %bb.0:
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    movl (%rsi), %ecx
; X64-NEXT:    bswapl %eax
; X64-NEXT:    bswapl %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    cmpl %ecx, %eax
; X64-NEXT:    seta %dl
; X64-NEXT:    sbbl $0, %edx
; X64-NEXT:    testl %edx, %edx
; X64-NEXT:    setg %al
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
%c = icmp sgt i32 %m, 0
ret i1 %c
}

define i1 @length4_eq_const(i8* %X) nounwind {
; X86-LABEL: length4_eq_const:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
; X86-NEXT:    sete %al
; X86-NEXT:    retl
;
; X64-LABEL: length4_eq_const:
; X64:       # %bb.0:
; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
; X64-NEXT:    sete %al
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}

define i32 @length5(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5:
; X86:       # %bb.0: # %loadbb
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl (%eax), %edx
; X86-NEXT:    movl (%ecx), %esi
; X86-NEXT:    bswapl %edx
; X86-NEXT:    bswapl %esi
; X86-NEXT:    cmpl %esi, %edx
; X86-NEXT:    jne .LBB16_1
; X86-NEXT:  # %bb.2: # %loadbb1
; X86-NEXT:    movzbl 4(%eax), %eax
; X86-NEXT:    movzbl 4(%ecx), %ecx
; X86-NEXT:    subl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
; X86-NEXT:  .LBB16_1: # %res_block
; X86-NEXT:    setae %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    leal -1(%eax,%eax), %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: length5:
; X64:       # %bb.0: # %loadbb
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    movl (%rsi), %ecx
; X64-NEXT:    bswapl %eax
; X64-NEXT:    bswapl %ecx
; X64-NEXT:    cmpl %ecx, %eax
; X64-NEXT:    jne .LBB16_1
; X64-NEXT:  # %bb.2: # %loadbb1
; X64-NEXT:    movzbl 4(%rdi), %eax
; X64-NEXT:    movzbl 4(%rsi), %ecx
; X64-NEXT:    subl %ecx, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB16_1: # %res_block
; X64-NEXT:    setae %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    leal -1(%rax,%rax), %eax
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
ret i32 %m
}

define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5_eq:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl (%ecx), %edx
; X86-NEXT:    xorl (%eax), %edx
; X86-NEXT:    movb 4(%ecx), %cl
; X86-NEXT:    xorb 4(%eax), %cl
; X86-NEXT:    movzbl %cl, %eax
; X86-NEXT:    orl %edx, %eax
; X86-NEXT:    setne %al
; X86-NEXT:    retl
;
; X64-LABEL: length5_eq:
; X64:       # %bb.0:
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    xorl (%rsi), %eax
; X64-NEXT:    movb 4(%rdi), %cl
; X64-NEXT:    xorb 4(%rsi), %cl
; X64-NEXT:    movzbl %cl, %ecx
; X64-NEXT:    orl %eax, %ecx
; X64-NEXT:    setne %al
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}

define i1 @length5_lt(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5_lt:
; X86:       # %bb.0: # %loadbb
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl (%eax), %edx
; X86-NEXT:    movl (%ecx), %esi
; X86-NEXT:    bswapl %edx
; X86-NEXT:    bswapl %esi
; X86-NEXT:    cmpl %esi, %edx
; X86-NEXT:    jne .LBB18_1
; X86-NEXT:  # %bb.2: # %loadbb1
; X86-NEXT:    movzbl 4(%eax), %eax
; X86-NEXT:    movzbl 4(%ecx), %ecx
; X86-NEXT:    subl %ecx, %eax
; X86-NEXT:    jmp .LBB18_3
; X86-NEXT:  .LBB18_1: # %res_block
; X86-NEXT:    setae %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    leal -1(%eax,%eax), %eax
; X86-NEXT:  .LBB18_3: # %endblock
; X86-NEXT:    shrl $31, %eax
; X86-NEXT:    # kill: def $al killed $al killed $eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: length5_lt:
; X64:       # %bb.0: # %loadbb
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    movl (%rsi), %ecx
; X64-NEXT:    bswapl %eax
; X64-NEXT:    bswapl %ecx
; X64-NEXT:    cmpl %ecx, %eax
; X64-NEXT:    jne .LBB18_1
; X64-NEXT:  # %bb.2: # %loadbb1
; X64-NEXT:    movzbl 4(%rdi), %eax
; X64-NEXT:    movzbl 4(%rsi), %ecx
; X64-NEXT:    subl %ecx, %eax
; X64-NEXT:    shrl $31, %eax
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB18_1: # %res_block
; X64-NEXT:    setae %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    leal -1(%rax,%rax), %eax
; X64-NEXT:    shrl $31, %eax
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
%c = icmp slt i32 %m, 0
ret i1 %c
}

define i1 @length7_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length7_eq:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl (%ecx), %edx
; X86-NEXT:    movl 3(%ecx), %ecx
; X86-NEXT:    xorl (%eax), %edx
; X86-NEXT:    xorl 3(%eax), %ecx
; X86-NEXT:    orl %edx, %ecx
; X86-NEXT:    setne %al
; X86-NEXT:    retl
;
; X64-LABEL: length7_eq:
; X64:       # %bb.0:
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    movl 3(%rdi), %ecx
; X64-NEXT:    xorl (%rsi), %eax
; X64-NEXT:    xorl 3(%rsi), %ecx
; X64-NEXT:    orl %eax, %ecx
; X64-NEXT:    setne %al
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 7) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
}

define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl (%esi), %ecx
; X86-NEXT:    movl (%eax), %edx
; X86-NEXT:    bswapl %ecx
; X86-NEXT:    bswapl %edx
; X86-NEXT:    cmpl %edx, %ecx
; X86-NEXT:    jne .LBB20_2
; X86-NEXT:  # %bb.1: # %loadbb1
; X86-NEXT:    movl 4(%esi), %ecx
; X86-NEXT:    movl 4(%eax), %edx
; X86-NEXT:    bswapl %ecx
; X86-NEXT:    bswapl %edx
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    cmpl %edx, %ecx
; X86-NEXT:    je .LBB20_3
; X86-NEXT:  .LBB20_2: # %res_block
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    cmpl %edx, %ecx
; X86-NEXT:    setae %al
; X86-NEXT:    leal -1(%eax,%eax), %eax
; X86-NEXT:  .LBB20_3: # %endblock
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: length8:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %rcx
; X64-NEXT:    movq (%rsi), %rdx
; X64-NEXT:    bswapq %rcx
; X64-NEXT:    bswapq %rdx
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    cmpq %rdx, %rcx
; X64-NEXT:    seta %al
; X64-NEXT:    sbbl $0, %eax
; X64-NEXT:    retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
ret i32 %m
}
|
|
|
|
|
define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8_eq:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 4(%ecx), %ecx
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: xorl 4(%eax), %ecx
; X86-NEXT: orl %edx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length8_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
; X64-NEXT: sete %al
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

define i1 @length8_eq_const(i8* %X) nounwind {
; X86-LABEL: length8_eq_const:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130
; X86-NEXT: xorl (%eax), %ecx
; X86-NEXT: movl $926299444, %edx # imm = 0x37363534
; X86-NEXT: xorl 4(%eax), %edx
; X86-NEXT: orl %ecx, %edx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length8_eq_const:
; X64: # %bb.0:
; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
; X64-NEXT: cmpq %rax, (%rdi)
; X64-NEXT: setne %al
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind
  %c = icmp ne i32 %m, 0
  ret i1 %c
}

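; The 9-15 byte equality tests below are expanded on x86-64 with an 8-byte load
; plus a second smaller or overlapping 8-byte load; 32-bit x86 calls the memcmp
; library routine instead.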
define i1 @length9_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length9_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $9
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length9_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: movb 8(%rdi), %cl
; X64-NEXT: xorb 8(%rsi), %cl
; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: sete %al
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

define i1 @length10_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length10_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $10
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length10_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: movzwl 8(%rdi), %ecx
; X64-NEXT: xorw 8(%rsi), %cx
; X64-NEXT: movzwl %cx, %ecx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: sete %al
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 10) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

define i1 @length11_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length11_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $11
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length11_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq 3(%rdi), %rcx
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: xorq 3(%rsi), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: sete %al
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 11) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length12_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $12
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length12_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: xorl 8(%rsi), %ecx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: setne %al
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
  %c = icmp ne i32 %m, 0
  ret i1 %c
}

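; Ordering (non-equality) compares of 12 and 16 bytes are expanded on x86-64
; into byte-swapped compares with a res_block/endblock diamond; 32-bit x86
; keeps the libcall.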
define i32 @length12(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length12:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $12
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length12:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB27_2
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
; X64-NEXT: bswapl %ecx
; X64-NEXT: bswapl %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: je .LBB27_3
; X64-NEXT: .LBB27_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: .LBB27_3: # %endblock
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
  ret i32 %m
}

define i1 @length13_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length13_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $13
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length13_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq 5(%rdi), %rcx
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: xorq 5(%rsi), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: sete %al
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 13) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

define i1 @length14_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length14_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $14
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length14_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq 6(%rdi), %rcx
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: xorq 6(%rsi), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: sete %al
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 14) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

define i1 @length15_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length15_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $15
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length15_eq:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq 7(%rdi), %rcx
; X64-NEXT: xorq (%rsi), %rax
; X64-NEXT: xorq 7(%rsi), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: sete %al
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 15) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329

define i32 @length16(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length16:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $16
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length16:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB31_2
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: je .LBB31_3
; X64-NEXT: .LBB31_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: .LBB31_3: # %endblock
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
  ret i32 %m
}

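; 16-byte equality is expanded to a single vector compare: movdqu/pcmpeqb/pmovmskb
; with SSE2 and vpcmpeqb with AVX; targets without SSE2 call memcmp.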
define i1 @length16_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length16_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $16
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length16_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $0
; X86-SSE1-NEXT: pushl $16
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $16, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length16_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu (%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X86-SSE2-NEXT: pmovmskb %xmm1, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length16_eq:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: length16_eq:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-AVX-NEXT: setne %al
; X64-AVX-NEXT: retq
  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
  %cmp = icmp ne i32 %call, 0
  ret i1 %cmp
}

define i1 @length16_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length16_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $16
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length16_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $0
; X86-SSE1-NEXT: pushl $16
; X86-SSE1-NEXT: pushl $.L.str
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $16, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length16_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length16_eq_const:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: sete %al
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: length16_eq_const:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-AVX-NEXT: sete %al
; X64-AVX-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914

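; From 24 bytes up, the ordering compare stays a libcall on every target below;
; the equality tests still expand to vector loads where SSE2/AVX is available.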
define i32 @length24(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length24:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length24:
; X64: # %bb.0:
; X64-NEXT: movl $24, %edx
; X64-NEXT: jmp memcmp # TAILCALL
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind
  ret i32 %m
}

define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length24_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $24
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length24_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $0
; X86-SSE1-NEXT: pushl $24
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $16, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length24_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X64-SSE2-NEXT: pand %xmm1, %xmm2
; X64-SSE2-NEXT: pmovmskb %xmm2, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: sete %al
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: length24_eq:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X64-AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; X64-AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-AVX-NEXT: sete %al
; X64-AVX-NEXT: retq
  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
  %cmp = icmp eq i32 %call, 0
  ret i1 %cmp
}

define i1 @length24_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length24_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $24
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length24_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $0
; X86-SSE1-NEXT: pushl $24
; X86-SSE1-NEXT: pushl $.L.str
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $16, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length24_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq_const:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: length24_eq_const:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %xmm1
; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-AVX-NEXT: setne %al
; X64-AVX-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
  %c = icmp ne i32 %m, 0
  ret i1 %c
}

define i32 @length32(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length32:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $32
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length32:
; X64: # %bb.0:
; X64-NEXT: movl $32, %edx
; X64-NEXT: jmp memcmp # TAILCALL
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind
  ret i32 %m
}

; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325

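; 32-byte equality uses two 16-byte compares on SSE2/AVX1 and a single 32-byte
; ymm compare on AVX2; 32-bit targets without SSE2 call memcmp.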
define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length32_eq:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $32
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length32_eq:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $0
; X86-SSE1-NEXT: pushl $32
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $16, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT: movdqu (%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT: movdqu (%rsi), %xmm2
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0
; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: sete %al
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: length32_eq:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm1, %xmm1
; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-AVX1-NEXT: sete %al
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: length32_eq:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
; X64-AVX2-NEXT: sete %al
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
  %cmp = icmp eq i32 %call, 0
  ret i1 %cmp
}

define i1 @length32_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length32_eq_const:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $32
; X86-NOSSE-NEXT: pushl $.L.str
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length32_eq_const:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl $0
; X86-SSE1-NEXT: pushl $32
; X86-SSE1-NEXT: pushl $.L.str
; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
; X86-SSE1-NEXT: calll memcmp
; X86-SSE1-NEXT: addl $16, %esp
; X86-SSE1-NEXT: testl %eax, %eax
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq_const:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: length32_eq_const:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm1, %xmm1
; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-AVX1-NEXT: setne %al
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: length32_eq_const:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
; X64-AVX2-NEXT: setne %al
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
  %c = icmp ne i32 %m, 0
  ret i1 %c
}

define i32 @length64(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length64:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $64
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length64:
; X64: # %bb.0:
; X64-NEXT: movl $64, %edx
; X64-NEXT: jmp memcmp # TAILCALL
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind
  ret i32 %m
}

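; At 64 bytes only AVX2 expands the equality test (two 32-byte compares);
; SSE2 and AVX1 still call memcmp.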
define i1 @length64_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length64_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $64
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length64_eq:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pushq %rax
; X64-SSE2-NEXT: movl $64, %edx
; X64-SSE2-NEXT: callq memcmp
; X64-SSE2-NEXT: testl %eax, %eax
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: popq %rcx
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: length64_eq:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: pushq %rax
; X64-AVX1-NEXT: movl $64, %edx
; X64-AVX1-NEXT: callq memcmp
; X64-AVX1-NEXT: testl %eax, %eax
; X64-AVX1-NEXT: setne %al
; X64-AVX1-NEXT: popq %rcx
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm1, %ymm1
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
; X64-AVX2-NEXT: setne %al
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
  %cmp = icmp ne i32 %call, 0
  ret i1 %cmp
}

define i1 @length64_eq_const(i8* %X) nounwind {
; X86-LABEL: length64_eq_const:
; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $64
; X86-NEXT: pushl $.L.str
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length64_eq_const:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pushq %rax
; X64-SSE2-NEXT: movl $.L.str, %esi
; X64-SSE2-NEXT: movl $64, %edx
; X64-SSE2-NEXT: callq memcmp
; X64-SSE2-NEXT: testl %eax, %eax
; X64-SSE2-NEXT: sete %al
; X64-SSE2-NEXT: popq %rcx
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: length64_eq_const:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: pushq %rax
; X64-AVX1-NEXT: movl $.L.str, %esi
; X64-AVX1-NEXT: movl $64, %edx
; X64-AVX1-NEXT: callq memcmp
; X64-AVX1-NEXT: testl %eax, %eax
; X64-AVX1-NEXT: sete %al
; X64-AVX1-NEXT: popq %rcx
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq_const:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
; X64-AVX2-NEXT: sete %al
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

; This checks that we do not do stupid things with huge sizes.
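; On 32-bit x86 the i64 length 0x7FFFFFFFFFFFFFFF is passed to memcmp as two
; 32-bit halves: pushl $2147483647 (high half) and pushl $-1 (low half).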
define i32 @huge_length(i8* %X, i8* %Y) nounwind {
; X86-LABEL: huge_length:
; X86: # %bb.0:
; X86-NEXT: pushl $2147483647 # imm = 0x7FFFFFFF
; X86-NEXT: pushl $-1
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: huge_length:
; X64: # %bb.0:
; X64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: jmp memcmp # TAILCALL
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
  ret i32 %m
}

define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: huge_length_eq:
; X86: # %bb.0:
; X86-NEXT: pushl $2147483647 # imm = 0x7FFFFFFF
; X86-NEXT: pushl $-1
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: huge_length_eq:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: callq memcmp
; X64-NEXT: testl %eax, %eax
; X64-NEXT: sete %al
; X64-NEXT: popq %rcx
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

; This checks non-constant sizes.
define i32 @nonconst_length(i8* %X, i8* %Y, i64 %size) nounwind {
; X86-LABEL: nonconst_length:
; X86: # %bb.0:
; X86-NEXT: jmp memcmp # TAILCALL
;
; X64-LABEL: nonconst_length:
; X64: # %bb.0:
; X64-NEXT: jmp memcmp # TAILCALL
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind
  ret i32 %m
}

define i1 @nonconst_length_eq(i8* %X, i8* %Y, i64 %size) nounwind {
; X86-LABEL: nonconst_length_eq:
; X86: # %bb.0:
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: nonconst_length_eq:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: callq memcmp
; X64-NEXT: testl %eax, %eax
; X64-NEXT: sete %al
; X64-NEXT: popq %rcx
; X64-NEXT: retq
  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind
  %c = icmp eq i32 %m, 0
  ret i1 %c
}