forked from OSchip/llvm-project
[X86] Use xmm registers to implement 64-bit popcnt on 32-bit targets if possible if popcnt instruction is not available
On 32-bit targets without popcnt, we currently expand 64-bit popcnt to sequences of arithmetic and logic ops for each 32-bit half and then add the 32-bit halves together. If we have xmm registers, we can use those to implement the operation instead. This results in fewer instructions than doing two separate 32-bit popcnt sequences. This mitigates some of PR41151 for the i64 on i686 case when we have SSE2. Differential Revision: https://reviews.llvm.org/D59662 llvm-svn: 356808
This commit is contained in:
parent
1ffd8e8114
commit
ce1ed55a4a
|
@ -414,6 +414,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
|
|||
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
|
||||
if (Subtarget.is64Bit())
|
||||
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
|
||||
else
|
||||
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
|
||||
}
|
||||
|
||||
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
|
||||
|
@ -26715,6 +26717,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
|
|||
switch (N->getOpcode()) {
|
||||
default:
|
||||
llvm_unreachable("Do not know how to custom type legalize this operation!");
|
||||
case ISD::CTPOP: {
|
||||
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
|
||||
// Use a v2i64 if possible.
|
||||
bool NoImplicitFloatOps =
|
||||
DAG.getMachineFunction().getFunction().hasFnAttribute(
|
||||
Attribute::NoImplicitFloat);
|
||||
if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
|
||||
SDValue Wide =
|
||||
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
|
||||
Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
|
||||
// Bit count should fit in 32-bits, extract it as that and then zero
|
||||
// extend to i64. Otherwise we end up extracting bits 63:32 separately.
|
||||
Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
|
||||
Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
|
||||
DAG.getIntPtrConstant(0, dl));
|
||||
Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
|
||||
Results.push_back(Wide);
|
||||
}
|
||||
return;
|
||||
}
|
||||
case ISD::MUL: {
|
||||
EVT VT = N->getValueType(0);
|
||||
assert(VT.isVector() && "Unexpected VT");
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X32
|
||||
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X32,X32-NOSSE
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
|
||||
; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X32-POPCNT
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT
|
||||
; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X32,X32-SSE2
|
||||
; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 | FileCheck %s --check-prefixes=X32,X32-SSSE3
|
||||
|
||||
define i8 @cnt8(i8 %x) nounwind readnone {
|
||||
; X32-LABEL: cnt8:
|
||||
|
@ -172,7 +174,127 @@ define i32 @cnt32(i32 %x) nounwind readnone {
|
|||
}
|
||||
|
||||
define i64 @cnt64(i64 %x) nounwind readnone {
|
||||
; X32-LABEL: cnt64:
|
||||
; X32-NOSSE-LABEL: cnt64:
|
||||
; X32-NOSSE: # %bb.0:
|
||||
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X32-NOSSE-NEXT: movl %ecx, %edx
|
||||
; X32-NOSSE-NEXT: shrl %edx
|
||||
; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
|
||||
; X32-NOSSE-NEXT: subl %edx, %ecx
|
||||
; X32-NOSSE-NEXT: movl %ecx, %edx
|
||||
; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
|
||||
; X32-NOSSE-NEXT: shrl $2, %ecx
|
||||
; X32-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333
|
||||
; X32-NOSSE-NEXT: addl %edx, %ecx
|
||||
; X32-NOSSE-NEXT: movl %ecx, %edx
|
||||
; X32-NOSSE-NEXT: shrl $4, %edx
|
||||
; X32-NOSSE-NEXT: addl %ecx, %edx
|
||||
; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
|
||||
; X32-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101
|
||||
; X32-NOSSE-NEXT: shrl $24, %ecx
|
||||
; X32-NOSSE-NEXT: movl %eax, %edx
|
||||
; X32-NOSSE-NEXT: shrl %edx
|
||||
; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
|
||||
; X32-NOSSE-NEXT: subl %edx, %eax
|
||||
; X32-NOSSE-NEXT: movl %eax, %edx
|
||||
; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
|
||||
; X32-NOSSE-NEXT: shrl $2, %eax
|
||||
; X32-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333
|
||||
; X32-NOSSE-NEXT: addl %edx, %eax
|
||||
; X32-NOSSE-NEXT: movl %eax, %edx
|
||||
; X32-NOSSE-NEXT: shrl $4, %edx
|
||||
; X32-NOSSE-NEXT: addl %eax, %edx
|
||||
; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
|
||||
; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
|
||||
; X32-NOSSE-NEXT: shrl $24, %eax
|
||||
; X32-NOSSE-NEXT: addl %ecx, %eax
|
||||
; X32-NOSSE-NEXT: xorl %edx, %edx
|
||||
; X32-NOSSE-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: cnt64:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: movq %rdi, %rax
|
||||
; X64-NEXT: shrq %rax
|
||||
; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
|
||||
; X64-NEXT: andq %rax, %rcx
|
||||
; X64-NEXT: subq %rcx, %rdi
|
||||
; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
|
||||
; X64-NEXT: movq %rdi, %rcx
|
||||
; X64-NEXT: andq %rax, %rcx
|
||||
; X64-NEXT: shrq $2, %rdi
|
||||
; X64-NEXT: andq %rax, %rdi
|
||||
; X64-NEXT: addq %rcx, %rdi
|
||||
; X64-NEXT: movq %rdi, %rax
|
||||
; X64-NEXT: shrq $4, %rax
|
||||
; X64-NEXT: leaq (%rax,%rdi), %rax
|
||||
; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
|
||||
; X64-NEXT: andq %rax, %rcx
|
||||
; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
|
||||
; X64-NEXT: imulq %rcx, %rax
|
||||
; X64-NEXT: shrq $56, %rax
|
||||
; X64-NEXT: retq
|
||||
;
|
||||
; X32-POPCNT-LABEL: cnt64:
|
||||
; X32-POPCNT: # %bb.0:
|
||||
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
|
||||
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
|
||||
; X32-POPCNT-NEXT: addl %ecx, %eax
|
||||
; X32-POPCNT-NEXT: xorl %edx, %edx
|
||||
; X32-POPCNT-NEXT: retl
|
||||
;
|
||||
; X64-POPCNT-LABEL: cnt64:
|
||||
; X64-POPCNT: # %bb.0:
|
||||
; X64-POPCNT-NEXT: popcntq %rdi, %rax
|
||||
; X64-POPCNT-NEXT: retq
|
||||
;
|
||||
; X32-SSE2-LABEL: cnt64:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
|
||||
; X32-SSE2-NEXT: psrlw $1, %xmm1
|
||||
; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
|
||||
; X32-SSE2-NEXT: psubb %xmm1, %xmm0
|
||||
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm2
|
||||
; X32-SSE2-NEXT: pand %xmm1, %xmm2
|
||||
; X32-SSE2-NEXT: psrlw $2, %xmm0
|
||||
; X32-SSE2-NEXT: pand %xmm1, %xmm0
|
||||
; X32-SSE2-NEXT: paddb %xmm2, %xmm0
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
|
||||
; X32-SSE2-NEXT: psrlw $4, %xmm1
|
||||
; X32-SSE2-NEXT: paddb %xmm0, %xmm1
|
||||
; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
|
||||
; X32-SSE2-NEXT: pxor %xmm0, %xmm0
|
||||
; X32-SSE2-NEXT: psadbw %xmm1, %xmm0
|
||||
; X32-SSE2-NEXT: movd %xmm0, %eax
|
||||
; X32-SSE2-NEXT: xorl %edx, %edx
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X32-SSSE3-LABEL: cnt64:
|
||||
; X32-SSSE3: # %bb.0:
|
||||
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
|
||||
; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
|
||||
; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2
|
||||
; X32-SSSE3-NEXT: pand %xmm0, %xmm2
|
||||
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
|
||||
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4
|
||||
; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4
|
||||
; X32-SSSE3-NEXT: psrlw $4, %xmm1
|
||||
; X32-SSSE3-NEXT: pand %xmm0, %xmm1
|
||||
; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3
|
||||
; X32-SSSE3-NEXT: paddb %xmm4, %xmm3
|
||||
; X32-SSSE3-NEXT: pxor %xmm0, %xmm0
|
||||
; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0
|
||||
; X32-SSSE3-NEXT: movd %xmm0, %eax
|
||||
; X32-SSSE3-NEXT: xorl %edx, %edx
|
||||
; X32-SSSE3-NEXT: retl
|
||||
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
|
||||
ret i64 %cnt
|
||||
}
|
||||
|
||||
define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat {
|
||||
; X32-LABEL: cnt64_noimplicitfloat:
|
||||
; X32: # %bb.0:
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
|
@ -210,7 +332,7 @@ define i64 @cnt64(i64 %x) nounwind readnone {
|
|||
; X32-NEXT: xorl %edx, %edx
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: cnt64:
|
||||
; X64-LABEL: cnt64_noimplicitfloat:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: movq %rdi, %rax
|
||||
; X64-NEXT: shrq %rax
|
||||
|
@ -233,7 +355,7 @@ define i64 @cnt64(i64 %x) nounwind readnone {
|
|||
; X64-NEXT: shrq $56, %rax
|
||||
; X64-NEXT: retq
|
||||
;
|
||||
; X32-POPCNT-LABEL: cnt64:
|
||||
; X32-POPCNT-LABEL: cnt64_noimplicitfloat:
|
||||
; X32-POPCNT: # %bb.0:
|
||||
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
|
||||
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
|
||||
|
@ -241,7 +363,7 @@ define i64 @cnt64(i64 %x) nounwind readnone {
|
|||
; X32-POPCNT-NEXT: xorl %edx, %edx
|
||||
; X32-POPCNT-NEXT: retl
|
||||
;
|
||||
; X64-POPCNT-LABEL: cnt64:
|
||||
; X64-POPCNT-LABEL: cnt64_noimplicitfloat:
|
||||
; X64-POPCNT: # %bb.0:
|
||||
; X64-POPCNT-NEXT: popcntq %rdi, %rax
|
||||
; X64-POPCNT-NEXT: retq
|
||||
|
|
Loading…
Reference in New Issue