llvm-project/llvm/test/CodeGen/X86/popcnt.ll

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1042 lines
38 KiB
LLVM
Raw Normal View History

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X32,X32-NOSSE
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X32-POPCNT
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT
; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X32,X32-SSE2
; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 | FileCheck %s --check-prefixes=X32,X32-SSSE3
define i8 @cnt8(i8 %x) nounwind readnone {
; X32-LABEL: cnt8:
; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrb %al
; X32-NEXT: andb $85, %al
; X32-NEXT: subb %al, %cl
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: andb $51, %al
; X32-NEXT: shrb $2, %cl
; X32-NEXT: andb $51, %cl
; X32-NEXT: addb %al, %cl
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrb $4, %al
; X32-NEXT: addb %cl, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: retl
;
; X64-LABEL: cnt8:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrb %al
; X64-NEXT: andb $85, %al
; X64-NEXT: subb %al, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $51, %al
; X64-NEXT: shrb $2, %dil
; X64-NEXT: andb $51, %dil
; X64-NEXT: addb %al, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrb $4, %al
; X64-NEXT: addl %edi, %eax
; X64-NEXT: andb $15, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt8:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: popcntl %eax, %eax
; X32-POPCNT-NEXT: # kill: def $al killed $al killed $eax
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt8:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: movzbl %dil, %eax
; X64-POPCNT-NEXT: popcntl %eax, %eax
; X64-POPCNT-NEXT: # kill: def $al killed $al killed $eax
; X64-POPCNT-NEXT: retq
%cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
ret i8 %cnt
}
define i16 @cnt16(i16 %x) nounwind readnone {
; X32-LABEL: cnt16:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl %ecx
; X32-NEXT: andl $21845, %ecx # imm = 0x5555
; X32-NEXT: subl %ecx, %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $13107, %ecx # imm = 0x3333
; X32-NEXT: shrl $2, %eax
; X32-NEXT: andl $13107, %eax # imm = 0x3333
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $4, %ecx
; X32-NEXT: addl %eax, %ecx
; X32-NEXT: andl $3855, %ecx # imm = 0xF0F
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shll $8, %eax
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movzbl %ah, %eax
; X32-NEXT: # kill: def $ax killed $ax killed $eax
; X32-NEXT: retl
;
; X64-LABEL: cnt16:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl %eax
; X64-NEXT: andl $21845, %eax # imm = 0x5555
; X64-NEXT: subl %eax, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $13107, %eax # imm = 0x3333
; X64-NEXT: shrl $2, %edi
; X64-NEXT: andl $13107, %edi # imm = 0x3333
; X64-NEXT: addl %eax, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl $4, %eax
; X64-NEXT: addl %edi, %eax
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shll $8, %ecx
; X64-NEXT: addl %eax, %ecx
; X64-NEXT: movzbl %ch, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt16:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntw {{[0-9]+}}(%esp), %ax
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt16:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntw %di, %ax
; X64-POPCNT-NEXT: retq
%cnt = tail call i16 @llvm.ctpop.i16(i16 %x)
ret i16 %cnt
}
define i32 @cnt32(i32 %x) nounwind readnone {
; X32-LABEL: cnt32:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl %ecx
; X32-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; X32-NEXT: subl %ecx, %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $858993459, %ecx # imm = 0x33333333
; X32-NEXT: shrl $2, %eax
; X32-NEXT: andl $858993459, %eax # imm = 0x33333333
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $4, %ecx
; X32-NEXT: addl %eax, %ecx
; X32-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X32-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101
; X32-NEXT: shrl $24, %eax
; X32-NEXT: retl
;
; X64-LABEL: cnt32:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl %eax
; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X64-NEXT: subl %eax, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $858993459, %eax # imm = 0x33333333
; X64-NEXT: shrl $2, %edi
; X64-NEXT: andl $858993459, %edi # imm = 0x33333333
; X64-NEXT: addl %eax, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl $4, %eax
; X64-NEXT: addl %edi, %eax
; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X64-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
; X64-NEXT: shrl $24, %eax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt32:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt32:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntl %edi, %eax
; X64-POPCNT-NEXT: retq
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
ret i32 %cnt
}
define i64 @cnt64(i64 %x) nounwind readnone {
; X32-NOSSE-LABEL: cnt64:
; X32-NOSSE: # %bb.0:
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOSSE-NEXT: movl %ecx, %edx
; X32-NOSSE-NEXT: shrl %edx
; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
; X32-NOSSE-NEXT: subl %edx, %ecx
; X32-NOSSE-NEXT: movl %ecx, %edx
; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
; X32-NOSSE-NEXT: shrl $2, %ecx
; X32-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333
; X32-NOSSE-NEXT: addl %edx, %ecx
; X32-NOSSE-NEXT: movl %ecx, %edx
; X32-NOSSE-NEXT: shrl $4, %edx
; X32-NOSSE-NEXT: addl %ecx, %edx
; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
; X32-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %ecx
; X32-NOSSE-NEXT: movl %eax, %edx
; X32-NOSSE-NEXT: shrl %edx
; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
; X32-NOSSE-NEXT: subl %edx, %eax
; X32-NOSSE-NEXT: movl %eax, %edx
; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
; X32-NOSSE-NEXT: shrl $2, %eax
; X32-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333
; X32-NOSSE-NEXT: addl %edx, %eax
; X32-NOSSE-NEXT: movl %eax, %edx
; X32-NOSSE-NEXT: shrl $4, %edx
; X32-NOSSE-NEXT: addl %eax, %edx
; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %eax
; X32-NOSSE-NEXT: addl %ecx, %eax
; X32-NOSSE-NEXT: xorl %edx, %edx
; X32-NOSSE-NEXT: retl
;
; X64-LABEL: cnt64:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq %rax
; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: subq %rcx, %rdi
; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rdi
; X64-NEXT: andq %rax, %rdi
; X64-NEXT: addq %rcx, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq $4, %rax
; X64-NEXT: addq %rdi, %rax
; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
; X64-NEXT: imulq %rcx, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt64:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: addl %ecx, %eax
; X32-POPCNT-NEXT: xorl %edx, %edx
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt64:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntq %rdi, %rax
; X64-POPCNT-NEXT: retq
;
; X32-SSE2-LABEL: cnt64:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
; X32-SSE2-NEXT: psrlw $1, %xmm1
; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE2-NEXT: psubb %xmm1, %xmm0
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; X32-SSE2-NEXT: movdqa %xmm0, %xmm2
; X32-SSE2-NEXT: pand %xmm1, %xmm2
; X32-SSE2-NEXT: psrlw $2, %xmm0
; X32-SSE2-NEXT: pand %xmm1, %xmm0
; X32-SSE2-NEXT: paddb %xmm2, %xmm0
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
; X32-SSE2-NEXT: psrlw $4, %xmm1
; X32-SSE2-NEXT: paddb %xmm0, %xmm1
; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE2-NEXT: pxor %xmm0, %xmm0
; X32-SSE2-NEXT: psadbw %xmm1, %xmm0
; X32-SSE2-NEXT: movd %xmm0, %eax
; X32-SSE2-NEXT: xorl %edx, %edx
; X32-SSE2-NEXT: retl
;
; X32-SSSE3-LABEL: cnt64:
; X32-SSSE3: # %bb.0:
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2
; X32-SSSE3-NEXT: pand %xmm0, %xmm2
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4
; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4
; X32-SSSE3-NEXT: psrlw $4, %xmm1
; X32-SSSE3-NEXT: pand %xmm0, %xmm1
; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3
; X32-SSSE3-NEXT: paddb %xmm4, %xmm3
; X32-SSSE3-NEXT: pxor %xmm0, %xmm0
; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0
; X32-SSSE3-NEXT: movd %xmm0, %eax
; X32-SSSE3-NEXT: xorl %edx, %edx
; X32-SSSE3-NEXT: retl
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
ret i64 %cnt
}
define i128 @cnt128(i128 %x) nounwind readnone {
; X32-NOSSE-LABEL: cnt128:
; X32-NOSSE: # %bb.0:
; X32-NOSSE-NEXT: pushl %ebx
; X32-NOSSE-NEXT: pushl %edi
; X32-NOSSE-NEXT: pushl %esi
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NOSSE-NEXT: movl %edi, %ebx
; X32-NOSSE-NEXT: shrl %ebx
; X32-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555
; X32-NOSSE-NEXT: subl %ebx, %edi
; X32-NOSSE-NEXT: movl %edi, %ebx
; X32-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333
; X32-NOSSE-NEXT: shrl $2, %edi
; X32-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333
; X32-NOSSE-NEXT: addl %ebx, %edi
; X32-NOSSE-NEXT: movl %edi, %ebx
; X32-NOSSE-NEXT: shrl $4, %ebx
; X32-NOSSE-NEXT: addl %edi, %ebx
; X32-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F
; X32-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %edi
; X32-NOSSE-NEXT: movl %esi, %ebx
; X32-NOSSE-NEXT: shrl %ebx
; X32-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555
; X32-NOSSE-NEXT: subl %ebx, %esi
; X32-NOSSE-NEXT: movl %esi, %ebx
; X32-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333
; X32-NOSSE-NEXT: shrl $2, %esi
; X32-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333
; X32-NOSSE-NEXT: addl %ebx, %esi
; X32-NOSSE-NEXT: movl %esi, %ebx
; X32-NOSSE-NEXT: shrl $4, %ebx
; X32-NOSSE-NEXT: addl %esi, %ebx
; X32-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F
; X32-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %esi
; X32-NOSSE-NEXT: addl %edi, %esi
; X32-NOSSE-NEXT: movl %edx, %edi
; X32-NOSSE-NEXT: shrl %edi
; X32-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555
; X32-NOSSE-NEXT: subl %edi, %edx
; X32-NOSSE-NEXT: movl %edx, %edi
; X32-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333
; X32-NOSSE-NEXT: shrl $2, %edx
; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
; X32-NOSSE-NEXT: addl %edi, %edx
; X32-NOSSE-NEXT: movl %edx, %edi
; X32-NOSSE-NEXT: shrl $4, %edi
; X32-NOSSE-NEXT: addl %edx, %edi
; X32-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; X32-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %edx
; X32-NOSSE-NEXT: movl %ecx, %edi
; X32-NOSSE-NEXT: shrl %edi
; X32-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555
; X32-NOSSE-NEXT: subl %edi, %ecx
; X32-NOSSE-NEXT: movl %ecx, %edi
; X32-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333
; X32-NOSSE-NEXT: shrl $2, %ecx
; X32-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333
; X32-NOSSE-NEXT: addl %edi, %ecx
; X32-NOSSE-NEXT: movl %ecx, %edi
; X32-NOSSE-NEXT: shrl $4, %edi
; X32-NOSSE-NEXT: addl %ecx, %edi
; X32-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
; X32-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %ecx
; X32-NOSSE-NEXT: addl %edx, %ecx
; X32-NOSSE-NEXT: addl %esi, %ecx
; X32-NOSSE-NEXT: movl %ecx, (%eax)
; X32-NOSSE-NEXT: movl $0, 12(%eax)
; X32-NOSSE-NEXT: movl $0, 8(%eax)
; X32-NOSSE-NEXT: movl $0, 4(%eax)
; X32-NOSSE-NEXT: popl %esi
; X32-NOSSE-NEXT: popl %edi
; X32-NOSSE-NEXT: popl %ebx
; X32-NOSSE-NEXT: retl $4
;
; X64-LABEL: cnt128:
; X64: # %bb.0:
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: shrq %rax
; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
; X64-NEXT: andq %r8, %rax
; X64-NEXT: subq %rax, %rsi
; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; X64-NEXT: movq %rsi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rsi
; X64-NEXT: andq %rax, %rsi
; X64-NEXT: addq %rcx, %rsi
; X64-NEXT: movq %rsi, %rcx
; X64-NEXT: shrq $4, %rcx
; X64-NEXT: addq %rsi, %rcx
; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: andq %r9, %rcx
; X64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101
; X64-NEXT: imulq %rdx, %rcx
; X64-NEXT: shrq $56, %rcx
; X64-NEXT: movq %rdi, %rsi
; X64-NEXT: shrq %rsi
; X64-NEXT: andq %r8, %rsi
; X64-NEXT: subq %rsi, %rdi
; X64-NEXT: movq %rdi, %rsi
; X64-NEXT: andq %rax, %rsi
; X64-NEXT: shrq $2, %rdi
; X64-NEXT: andq %rax, %rdi
; X64-NEXT: addq %rsi, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq $4, %rax
; X64-NEXT: addq %rdi, %rax
; X64-NEXT: andq %r9, %rax
; X64-NEXT: imulq %rdx, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: addq %rcx, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt128:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: pushl %esi
; X32-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx
; X32-POPCNT-NEXT: addl %ecx, %edx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi
; X32-POPCNT-NEXT: addl %ecx, %esi
; X32-POPCNT-NEXT: addl %edx, %esi
; X32-POPCNT-NEXT: movl %esi, (%eax)
; X32-POPCNT-NEXT: movl $0, 12(%eax)
; X32-POPCNT-NEXT: movl $0, 8(%eax)
; X32-POPCNT-NEXT: movl $0, 4(%eax)
; X32-POPCNT-NEXT: popl %esi
; X32-POPCNT-NEXT: retl $4
;
; X64-POPCNT-LABEL: cnt128:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntq %rsi, %rcx
; X64-POPCNT-NEXT: popcntq %rdi, %rax
; X64-POPCNT-NEXT: addq %rcx, %rax
; X64-POPCNT-NEXT: xorl %edx, %edx
; X64-POPCNT-NEXT: retq
;
; X32-SSE2-LABEL: cnt128:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
; X32-SSE2-NEXT: psrlw $1, %xmm1
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; X32-SSE2-NEXT: pand %xmm2, %xmm1
; X32-SSE2-NEXT: psubb %xmm1, %xmm0
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; X32-SSE2-NEXT: movdqa %xmm0, %xmm3
; X32-SSE2-NEXT: pand %xmm1, %xmm3
; X32-SSE2-NEXT: psrlw $2, %xmm0
; X32-SSE2-NEXT: pand %xmm1, %xmm0
; X32-SSE2-NEXT: paddb %xmm3, %xmm0
; X32-SSE2-NEXT: movdqa %xmm0, %xmm3
; X32-SSE2-NEXT: psrlw $4, %xmm3
; X32-SSE2-NEXT: paddb %xmm0, %xmm3
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE2-NEXT: pand %xmm0, %xmm3
; X32-SSE2-NEXT: pxor %xmm4, %xmm4
; X32-SSE2-NEXT: psadbw %xmm4, %xmm3
; X32-SSE2-NEXT: movd %xmm3, %ecx
; X32-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; X32-SSE2-NEXT: movdqa %xmm3, %xmm5
; X32-SSE2-NEXT: psrlw $1, %xmm5
; X32-SSE2-NEXT: pand %xmm2, %xmm5
; X32-SSE2-NEXT: psubb %xmm5, %xmm3
; X32-SSE2-NEXT: movdqa %xmm3, %xmm2
; X32-SSE2-NEXT: pand %xmm1, %xmm2
; X32-SSE2-NEXT: psrlw $2, %xmm3
; X32-SSE2-NEXT: pand %xmm1, %xmm3
; X32-SSE2-NEXT: paddb %xmm2, %xmm3
; X32-SSE2-NEXT: movdqa %xmm3, %xmm1
; X32-SSE2-NEXT: psrlw $4, %xmm1
; X32-SSE2-NEXT: paddb %xmm3, %xmm1
; X32-SSE2-NEXT: pand %xmm0, %xmm1
; X32-SSE2-NEXT: psadbw %xmm4, %xmm1
; X32-SSE2-NEXT: movd %xmm1, %edx
; X32-SSE2-NEXT: addl %ecx, %edx
; X32-SSE2-NEXT: movl %edx, (%eax)
; X32-SSE2-NEXT: movl $0, 12(%eax)
; X32-SSE2-NEXT: movl $0, 8(%eax)
; X32-SSE2-NEXT: movl $0, 4(%eax)
; X32-SSE2-NEXT: retl $4
;
; X32-SSSE3-LABEL: cnt128:
; X32-SSSE3: # %bb.0:
; X32-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2
; X32-SSSE3-NEXT: pand %xmm0, %xmm2
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4
; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4
; X32-SSSE3-NEXT: psrlw $4, %xmm1
; X32-SSSE3-NEXT: pand %xmm0, %xmm1
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm2
; X32-SSSE3-NEXT: pshufb %xmm1, %xmm2
; X32-SSSE3-NEXT: paddb %xmm4, %xmm2
; X32-SSSE3-NEXT: pxor %xmm1, %xmm1
; X32-SSSE3-NEXT: psadbw %xmm1, %xmm2
; X32-SSSE3-NEXT: movd %xmm2, %ecx
; X32-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; X32-SSSE3-NEXT: movdqa %xmm2, %xmm4
; X32-SSSE3-NEXT: pand %xmm0, %xmm4
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm5
; X32-SSSE3-NEXT: pshufb %xmm4, %xmm5
; X32-SSSE3-NEXT: psrlw $4, %xmm2
; X32-SSSE3-NEXT: pand %xmm0, %xmm2
; X32-SSSE3-NEXT: pshufb %xmm2, %xmm3
; X32-SSSE3-NEXT: paddb %xmm5, %xmm3
; X32-SSSE3-NEXT: psadbw %xmm1, %xmm3
; X32-SSSE3-NEXT: movd %xmm3, %edx
; X32-SSSE3-NEXT: addl %ecx, %edx
; X32-SSSE3-NEXT: movl %edx, (%eax)
; X32-SSSE3-NEXT: movl $0, 12(%eax)
; X32-SSSE3-NEXT: movl $0, 8(%eax)
; X32-SSSE3-NEXT: movl $0, 4(%eax)
; X32-SSSE3-NEXT: retl $4
%cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
ret i128 %cnt
}
define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat {
; X32-LABEL: cnt64_noimplicitfloat:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl %edx
; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555
; X32-NEXT: subl %edx, %ecx
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: andl $858993459, %edx # imm = 0x33333333
; X32-NEXT: shrl $2, %ecx
; X32-NEXT: andl $858993459, %ecx # imm = 0x33333333
; X32-NEXT: addl %edx, %ecx
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $4, %edx
; X32-NEXT: addl %ecx, %edx
; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
; X32-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101
; X32-NEXT: shrl $24, %ecx
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrl %edx
; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555
; X32-NEXT: subl %edx, %eax
; X32-NEXT: movl %eax, %edx
; X32-NEXT: andl $858993459, %edx # imm = 0x33333333
; X32-NEXT: shrl $2, %eax
; X32-NEXT: andl $858993459, %eax # imm = 0x33333333
; X32-NEXT: addl %edx, %eax
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrl $4, %edx
; X32-NEXT: addl %eax, %edx
; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
; X32-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
; X32-NEXT: shrl $24, %eax
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
;
; X64-LABEL: cnt64_noimplicitfloat:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq %rax
; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: subq %rcx, %rdi
; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rdi
; X64-NEXT: andq %rax, %rdi
; X64-NEXT: addq %rcx, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq $4, %rax
; X64-NEXT: addq %rdi, %rax
; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
; X64-NEXT: imulq %rcx, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt64_noimplicitfloat:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: addl %ecx, %eax
; X32-POPCNT-NEXT: xorl %edx, %edx
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt64_noimplicitfloat:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntq %rdi, %rax
; X64-POPCNT-NEXT: retq
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
ret i64 %cnt
}
define i32 @cnt32_optsize(i32 %x) nounwind readnone optsize {
; X32-LABEL: cnt32_optsize:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl %ecx
; X32-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; X32-NEXT: subl %ecx, %eax
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NEXT: movl $858993459, %ecx # imm = 0x33333333
; X32-NEXT: movl %eax, %edx
; X32-NEXT: andl %ecx, %edx
; X32-NEXT: shrl $2, %eax
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NEXT: andl %ecx, %eax
; X32-NEXT: addl %edx, %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $4, %ecx
; X32-NEXT: addl %eax, %ecx
; X32-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
; X32-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101
; X32-NEXT: shrl $24, %eax
; X32-NEXT: retl
;
; X64-LABEL: cnt32_optsize:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl %eax
; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555
; X64-NEXT: subl %eax, %edi
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X64-NEXT: movl $858993459, %eax # imm = 0x33333333
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: andl %eax, %ecx
; X64-NEXT: shrl $2, %edi
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X64-NEXT: andl %eax, %edi
; X64-NEXT: addl %ecx, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl $4, %eax
; X64-NEXT: addl %edi, %eax
; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X64-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
; X64-NEXT: shrl $24, %eax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt32_optsize:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt32_optsize:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntl %edi, %eax
; X64-POPCNT-NEXT: retq
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
ret i32 %cnt
}
define i64 @cnt64_optsize(i64 %x) nounwind readnone optsize {
; X32-NOSSE-LABEL: cnt64_optsize:
; X32-NOSSE: # %bb.0:
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: pushl %ebx
; X32-NOSSE-NEXT: pushl %edi
; X32-NOSSE-NEXT: pushl %esi
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOSSE-NEXT: movl %ecx, %edx
; X32-NOSSE-NEXT: shrl %edx
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555
; X32-NOSSE-NEXT: andl %esi, %edx
; X32-NOSSE-NEXT: subl %edx, %ecx
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: movl $858993459, %edx # imm = 0x33333333
; X32-NOSSE-NEXT: movl %ecx, %edi
; X32-NOSSE-NEXT: andl %edx, %edi
; X32-NOSSE-NEXT: shrl $2, %ecx
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: andl %edx, %ecx
; X32-NOSSE-NEXT: addl %edi, %ecx
; X32-NOSSE-NEXT: movl %ecx, %edi
; X32-NOSSE-NEXT: shrl $4, %edi
; X32-NOSSE-NEXT: addl %ecx, %edi
; X32-NOSSE-NEXT: movl $252645135, %ecx # imm = 0xF0F0F0F
; X32-NOSSE-NEXT: andl %ecx, %edi
; X32-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %edi
; X32-NOSSE-NEXT: movl %eax, %ebx
; X32-NOSSE-NEXT: shrl %ebx
; X32-NOSSE-NEXT: andl %esi, %ebx
; X32-NOSSE-NEXT: subl %ebx, %eax
; X32-NOSSE-NEXT: movl %eax, %esi
; X32-NOSSE-NEXT: andl %edx, %esi
; X32-NOSSE-NEXT: shrl $2, %eax
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: andl %edx, %eax
; X32-NOSSE-NEXT: addl %esi, %eax
; X32-NOSSE-NEXT: movl %eax, %edx
; X32-NOSSE-NEXT: shrl $4, %edx
; X32-NOSSE-NEXT: addl %eax, %edx
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: andl %ecx, %edx
; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %eax
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: addl %edi, %eax
; X32-NOSSE-NEXT: xorl %edx, %edx
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: popl %esi
; X32-NOSSE-NEXT: popl %edi
; X32-NOSSE-NEXT: popl %ebx
; X32-NOSSE-NEXT: retl
;
; X64-LABEL: cnt64_optsize:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq %rax
; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: subq %rcx, %rdi
; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rdi
; X64-NEXT: andq %rax, %rdi
; X64-NEXT: addq %rcx, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq $4, %rax
; X64-NEXT: addq %rdi, %rax
; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
; X64-NEXT: imulq %rcx, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt64_optsize:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: addl %ecx, %eax
; X32-POPCNT-NEXT: xorl %edx, %edx
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt64_optsize:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntq %rdi, %rax
; X64-POPCNT-NEXT: retq
;
; X32-SSE2-LABEL: cnt64_optsize:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
; X32-SSE2-NEXT: psrlw $1, %xmm1
; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE2-NEXT: psubb %xmm1, %xmm0
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; X32-SSE2-NEXT: movdqa %xmm0, %xmm2
; X32-SSE2-NEXT: pand %xmm1, %xmm2
; X32-SSE2-NEXT: psrlw $2, %xmm0
; X32-SSE2-NEXT: pand %xmm1, %xmm0
; X32-SSE2-NEXT: paddb %xmm2, %xmm0
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
; X32-SSE2-NEXT: psrlw $4, %xmm1
; X32-SSE2-NEXT: paddb %xmm0, %xmm1
; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE2-NEXT: pxor %xmm0, %xmm0
; X32-SSE2-NEXT: psadbw %xmm1, %xmm0
; X32-SSE2-NEXT: movd %xmm0, %eax
; X32-SSE2-NEXT: xorl %edx, %edx
; X32-SSE2-NEXT: retl
;
; X32-SSSE3-LABEL: cnt64_optsize:
; X32-SSSE3: # %bb.0:
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2
; X32-SSSE3-NEXT: pand %xmm0, %xmm2
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4
; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4
; X32-SSSE3-NEXT: psrlw $4, %xmm1
; X32-SSSE3-NEXT: pand %xmm0, %xmm1
; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3
; X32-SSSE3-NEXT: paddb %xmm4, %xmm3
; X32-SSSE3-NEXT: pxor %xmm0, %xmm0
; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0
; X32-SSSE3-NEXT: movd %xmm0, %eax
; X32-SSSE3-NEXT: xorl %edx, %edx
; X32-SSSE3-NEXT: retl
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
ret i64 %cnt
}
define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
; X32-NOSSE-LABEL: cnt128_optsize:
; X32-NOSSE: # %bb.0:
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: pushl %ebp
; X32-NOSSE-NEXT: pushl %ebx
; X32-NOSSE-NEXT: pushl %edi
; X32-NOSSE-NEXT: pushl %esi
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NOSSE-NEXT: movl %ebx, %ecx
; X32-NOSSE-NEXT: shrl %ecx
; X32-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555
; X32-NOSSE-NEXT: andl %edi, %ecx
; X32-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555
; X32-NOSSE-NEXT: subl %ecx, %ebx
; X32-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333
; X32-NOSSE-NEXT: movl %ebx, %ebp
; X32-NOSSE-NEXT: andl %ecx, %ebp
; X32-NOSSE-NEXT: shrl $2, %ebx
; X32-NOSSE-NEXT: andl %ecx, %ebx
; X32-NOSSE-NEXT: addl %ebp, %ebx
; X32-NOSSE-NEXT: movl %ebx, %ebp
; X32-NOSSE-NEXT: shrl $4, %ebp
; X32-NOSSE-NEXT: addl %ebx, %ebp
; X32-NOSSE-NEXT: movl %eax, %ebx
; X32-NOSSE-NEXT: shrl %ebx
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: andl %edi, %ebx
; X32-NOSSE-NEXT: subl %ebx, %eax
; X32-NOSSE-NEXT: movl %eax, %ebx
; X32-NOSSE-NEXT: andl %ecx, %ebx
; X32-NOSSE-NEXT: shrl $2, %eax
; X32-NOSSE-NEXT: andl %ecx, %eax
; X32-NOSSE-NEXT: addl %ebx, %eax
; X32-NOSSE-NEXT: movl %eax, %edi
; X32-NOSSE-NEXT: shrl $4, %edi
; X32-NOSSE-NEXT: addl %eax, %edi
; X32-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F
; X32-NOSSE-NEXT: andl %ebx, %ebp
; X32-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %eax
; X32-NOSSE-NEXT: andl %ebx, %edi
; X32-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %edi
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: addl %eax, %edi
; X32-NOSSE-NEXT: movl %esi, %eax
; X32-NOSSE-NEXT: shrl %eax
; X32-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555
; X32-NOSSE-NEXT: andl %ebp, %eax
; X32-NOSSE-NEXT: subl %eax, %esi
; X32-NOSSE-NEXT: movl %esi, %eax
; X32-NOSSE-NEXT: andl %ecx, %eax
; X32-NOSSE-NEXT: shrl $2, %esi
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: andl %ecx, %esi
; X32-NOSSE-NEXT: addl %eax, %esi
; X32-NOSSE-NEXT: movl %esi, %eax
; X32-NOSSE-NEXT: shrl $4, %eax
; X32-NOSSE-NEXT: addl %esi, %eax
; X32-NOSSE-NEXT: movl %edx, %esi
; X32-NOSSE-NEXT: shrl %esi
; X32-NOSSE-NEXT: andl %ebp, %esi
; X32-NOSSE-NEXT: subl %esi, %edx
; X32-NOSSE-NEXT: movl %edx, %esi
; X32-NOSSE-NEXT: andl %ecx, %esi
; X32-NOSSE-NEXT: shrl $2, %edx
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: andl %ecx, %edx
; X32-NOSSE-NEXT: addl %esi, %edx
; X32-NOSSE-NEXT: movl %edx, %ecx
; X32-NOSSE-NEXT: shrl $4, %ecx
; X32-NOSSE-NEXT: addl %edx, %ecx
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: andl %ebx, %eax
; X32-NOSSE-NEXT: andl %ebx, %ecx
; X32-NOSSE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %eax
; X32-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %ecx
; X32-NOSSE-NEXT: addl %eax, %ecx
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NOSSE-NEXT: addl %edi, %ecx
; X32-NOSSE-NEXT: xorl %edx, %edx
; X32-NOSSE-NEXT: movl %edx, 12(%eax)
; X32-NOSSE-NEXT: movl %edx, 8(%eax)
; X32-NOSSE-NEXT: movl %edx, 4(%eax)
; X32-NOSSE-NEXT: movl %ecx, (%eax)
; X32-NOSSE-NEXT: popl %esi
; X32-NOSSE-NEXT: popl %edi
; X32-NOSSE-NEXT: popl %ebx
[X86] Add custom isel to select ADD/SUB/OR/XOR/AND to their non-immediate forms under optsize when the immediate has additional users. Summary: We attempt to prevent folding immediates with multiple users under optsize. But we only do this from store nodes and X86ISD::ADD/SUB/XOR/OR/AND patterns. We don't do it for ISD::ADD/SUB/XOR/OR/AND even though we count them as users when deciding whether to fold into other nodes. This leads to situations where we block folding to a compare for example, but still fold into an AND or OR as seen in PR27202. Unfortunately touching the isel patterns in tablegen for the ISD::ADD/SUB/XOR/OR/AND opcodes will cause the patterns to be unusable for fast isel. And we don't have a way to make a fast isel only pattern. To workaround this, this patch adds custom isel in front of the isel table that will select the non-immediate forms if the immediate has additional users. This may create some issues for ANDN and NOT matching. And there's room for improvement with unsigned 32 immediates on 64-bit AND. This patch needs more thorough test cases, but I wanted to get feedback on the direction. Please send me any other test cases you've seen in the wild. I think we probably have the same issue with the immediate matching when we fold RMW from X86ISD::ADD/SUB/XOR/OR/AND. And our TEST immedaite shrinking logic. Our cost modeling for immediates that can fit in a sign extended 8-bit immediate on a 16/32/64 bit operation is completely wrong. I also wonder if we should update the ConstantHoisting cost model and block folding for "opaque" constants. But of course constants can still be created by DAG combine and lowering optimizations. Fixes PR27202 Reviewers: spatel, RKSimon, andreadb Reviewed By: RKSimon Subscribers: jsji, hiraditya, jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59909 llvm-svn: 365163
2019-07-05 06:53:57 +08:00
; X32-NOSSE-NEXT: popl %ebp
; X32-NOSSE-NEXT: retl $4
;
; X64-LABEL: cnt128_optsize:
; X64: # %bb.0:
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: shrq %rax
; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
; X64-NEXT: andq %r8, %rax
; X64-NEXT: subq %rax, %rsi
; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; X64-NEXT: movq %rsi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rsi
; X64-NEXT: andq %rax, %rsi
; X64-NEXT: addq %rcx, %rsi
; X64-NEXT: movq %rsi, %rcx
; X64-NEXT: shrq $4, %rcx
; X64-NEXT: addq %rsi, %rcx
; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: andq %r9, %rcx
; X64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101
; X64-NEXT: imulq %rdx, %rcx
; X64-NEXT: shrq $56, %rcx
; X64-NEXT: movq %rdi, %rsi
; X64-NEXT: shrq %rsi
; X64-NEXT: andq %r8, %rsi
; X64-NEXT: subq %rsi, %rdi
; X64-NEXT: movq %rdi, %rsi
; X64-NEXT: andq %rax, %rsi
; X64-NEXT: shrq $2, %rdi
; X64-NEXT: andq %rax, %rdi
; X64-NEXT: addq %rsi, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq $4, %rax
; X64-NEXT: addq %rdi, %rax
; X64-NEXT: andq %r9, %rax
; X64-NEXT: imulq %rdx, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: addq %rcx, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt128_optsize:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: pushl %esi
; X32-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx
; X32-POPCNT-NEXT: addl %ecx, %edx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi
; X32-POPCNT-NEXT: addl %ecx, %esi
; X32-POPCNT-NEXT: addl %edx, %esi
; X32-POPCNT-NEXT: xorl %ecx, %ecx
; X32-POPCNT-NEXT: movl %ecx, 12(%eax)
; X32-POPCNT-NEXT: movl %ecx, 8(%eax)
; X32-POPCNT-NEXT: movl %ecx, 4(%eax)
; X32-POPCNT-NEXT: movl %esi, (%eax)
; X32-POPCNT-NEXT: popl %esi
; X32-POPCNT-NEXT: retl $4
;
; X64-POPCNT-LABEL: cnt128_optsize:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntq %rsi, %rcx
; X64-POPCNT-NEXT: popcntq %rdi, %rax
; X64-POPCNT-NEXT: addq %rcx, %rax
; X64-POPCNT-NEXT: xorl %edx, %edx
; X64-POPCNT-NEXT: retq
;
; X32-SSE2-LABEL: cnt128_optsize:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
; X32-SSE2-NEXT: psrlw $1, %xmm1
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; X32-SSE2-NEXT: pand %xmm2, %xmm1
; X32-SSE2-NEXT: psubb %xmm1, %xmm0
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; X32-SSE2-NEXT: movdqa %xmm0, %xmm3
; X32-SSE2-NEXT: pand %xmm1, %xmm3
; X32-SSE2-NEXT: psrlw $2, %xmm0
; X32-SSE2-NEXT: pand %xmm1, %xmm0
; X32-SSE2-NEXT: paddb %xmm3, %xmm0
; X32-SSE2-NEXT: movdqa %xmm0, %xmm3
; X32-SSE2-NEXT: psrlw $4, %xmm3
; X32-SSE2-NEXT: paddb %xmm0, %xmm3
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE2-NEXT: pand %xmm0, %xmm3
; X32-SSE2-NEXT: pxor %xmm4, %xmm4
; X32-SSE2-NEXT: psadbw %xmm4, %xmm3
; X32-SSE2-NEXT: movd %xmm3, %ecx
; X32-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; X32-SSE2-NEXT: movdqa %xmm3, %xmm5
; X32-SSE2-NEXT: psrlw $1, %xmm5
; X32-SSE2-NEXT: pand %xmm2, %xmm5
; X32-SSE2-NEXT: psubb %xmm5, %xmm3
; X32-SSE2-NEXT: movdqa %xmm3, %xmm2
; X32-SSE2-NEXT: pand %xmm1, %xmm2
; X32-SSE2-NEXT: psrlw $2, %xmm3
; X32-SSE2-NEXT: pand %xmm1, %xmm3
; X32-SSE2-NEXT: paddb %xmm2, %xmm3
; X32-SSE2-NEXT: movdqa %xmm3, %xmm1
; X32-SSE2-NEXT: psrlw $4, %xmm1
; X32-SSE2-NEXT: paddb %xmm3, %xmm1
; X32-SSE2-NEXT: pand %xmm0, %xmm1
; X32-SSE2-NEXT: psadbw %xmm4, %xmm1
; X32-SSE2-NEXT: movd %xmm1, %edx
; X32-SSE2-NEXT: addl %ecx, %edx
; X32-SSE2-NEXT: xorl %ecx, %ecx
; X32-SSE2-NEXT: movl %ecx, 12(%eax)
; X32-SSE2-NEXT: movl %ecx, 8(%eax)
; X32-SSE2-NEXT: movl %ecx, 4(%eax)
; X32-SSE2-NEXT: movl %edx, (%eax)
; X32-SSE2-NEXT: retl $4
;
; X32-SSSE3-LABEL: cnt128_optsize:
; X32-SSSE3: # %bb.0:
; X32-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2
; X32-SSSE3-NEXT: pand %xmm0, %xmm2
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4
; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4
; X32-SSSE3-NEXT: psrlw $4, %xmm1
; X32-SSSE3-NEXT: pand %xmm0, %xmm1
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm2
; X32-SSSE3-NEXT: pshufb %xmm1, %xmm2
; X32-SSSE3-NEXT: paddb %xmm4, %xmm2
; X32-SSSE3-NEXT: pxor %xmm1, %xmm1
; X32-SSSE3-NEXT: psadbw %xmm1, %xmm2
; X32-SSSE3-NEXT: movd %xmm2, %ecx
; X32-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; X32-SSSE3-NEXT: movdqa %xmm2, %xmm4
; X32-SSSE3-NEXT: pand %xmm0, %xmm4
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm5
; X32-SSSE3-NEXT: pshufb %xmm4, %xmm5
; X32-SSSE3-NEXT: psrlw $4, %xmm2
; X32-SSSE3-NEXT: pand %xmm0, %xmm2
; X32-SSSE3-NEXT: pshufb %xmm2, %xmm3
; X32-SSSE3-NEXT: paddb %xmm5, %xmm3
; X32-SSSE3-NEXT: psadbw %xmm1, %xmm3
; X32-SSSE3-NEXT: movd %xmm3, %edx
; X32-SSSE3-NEXT: addl %ecx, %edx
; X32-SSSE3-NEXT: xorl %ecx, %ecx
; X32-SSSE3-NEXT: movl %ecx, 12(%eax)
; X32-SSSE3-NEXT: movl %ecx, 8(%eax)
; X32-SSSE3-NEXT: movl %ecx, 4(%eax)
; X32-SSSE3-NEXT: movl %edx, (%eax)
; X32-SSSE3-NEXT: retl $4
%cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
ret i128 %cnt
}
declare i8 @llvm.ctpop.i8(i8) nounwind readnone
declare i16 @llvm.ctpop.i16(i16) nounwind readnone
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
declare i128 @llvm.ctpop.i128(i128) nounwind readnone