[X86] Use xmm registers to implement 64-bit popcnt on 32-bit targets when possible if the popcnt instruction is not available

On 32-bit targets without popcnt, we currently expand 64-bit popcnt to sequences of arithmetic and logic ops for each 32-bit half and then add the 32-bit halves together. If we have xmm registers, we can use those to implement the operation instead. This results in fewer instructions than doing two separate 32-bit popcnt sequences.

This mitigates some of PR41151 for the i64 on i686 case when we have SSE2.

Differential Revision: https://reviews.llvm.org/D59662

llvm-svn: 356808
This commit is contained in:
Craig Topper 2019-03-22 20:47:02 +00:00
parent 1ffd8e8114
commit ce1ed55a4a
2 changed files with 149 additions and 5 deletions

View File

@ -414,6 +414,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
else
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
@ -26715,6 +26717,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
// Custom type legalization of a 64-bit population count: when a v2i64 vector
// type is available, compute the count in a vector register instead of
// handling the two 32-bit halves separately.
case ISD::CTPOP: {
// Only the i64 result type is expected to reach this case.
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
// Functions marked noimplicitfloat must not have vector ops introduced on
// their behalf, so the vector path is skipped for them.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
// Place the i64 operand into a v2i64 and take the vector population count.
SDValue Wide =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
// Bit count should fit in 32-bits, extract it as that and then zero
// extend to i64. Otherwise we end up extracting bits 63:32 separately.
// Reinterpret the result as v4i32 and read element 0 as an i32.
Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
DAG.getIntPtrConstant(0, dl));
// Widen back to the i64 type of the node being replaced.
Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
Results.push_back(Wide);
}
// When the vector path is not taken, Results is left untouched.
// NOTE(review): presumably the caller then falls back to the default
// expansion of the node -- confirm against the type legalizer.
return;
}
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Unexpected VT");

View File

@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X32,X32-NOSSE
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X32-POPCNT
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT
; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X32,X32-SSE2
; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 | FileCheck %s --check-prefixes=X32,X32-SSSE3
define i8 @cnt8(i8 %x) nounwind readnone {
; X32-LABEL: cnt8:
@ -172,7 +174,127 @@ define i32 @cnt32(i32 %x) nounwind readnone {
}
define i64 @cnt64(i64 %x) nounwind readnone {
; X32-LABEL: cnt64:
; X32-NOSSE-LABEL: cnt64:
; X32-NOSSE: # %bb.0:
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOSSE-NEXT: movl %ecx, %edx
; X32-NOSSE-NEXT: shrl %edx
; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
; X32-NOSSE-NEXT: subl %edx, %ecx
; X32-NOSSE-NEXT: movl %ecx, %edx
; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
; X32-NOSSE-NEXT: shrl $2, %ecx
; X32-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333
; X32-NOSSE-NEXT: addl %edx, %ecx
; X32-NOSSE-NEXT: movl %ecx, %edx
; X32-NOSSE-NEXT: shrl $4, %edx
; X32-NOSSE-NEXT: addl %ecx, %edx
; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
; X32-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %ecx
; X32-NOSSE-NEXT: movl %eax, %edx
; X32-NOSSE-NEXT: shrl %edx
; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
; X32-NOSSE-NEXT: subl %edx, %eax
; X32-NOSSE-NEXT: movl %eax, %edx
; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
; X32-NOSSE-NEXT: shrl $2, %eax
; X32-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333
; X32-NOSSE-NEXT: addl %edx, %eax
; X32-NOSSE-NEXT: movl %eax, %edx
; X32-NOSSE-NEXT: shrl $4, %edx
; X32-NOSSE-NEXT: addl %eax, %edx
; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
; X32-NOSSE-NEXT: shrl $24, %eax
; X32-NOSSE-NEXT: addl %ecx, %eax
; X32-NOSSE-NEXT: xorl %edx, %edx
; X32-NOSSE-NEXT: retl
;
; X64-LABEL: cnt64:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq %rax
; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: subq %rcx, %rdi
; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rdi
; X64-NEXT: andq %rax, %rdi
; X64-NEXT: addq %rcx, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq $4, %rax
; X64-NEXT: leaq (%rax,%rdi), %rax
; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
; X64-NEXT: imulq %rcx, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt64:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: addl %ecx, %eax
; X32-POPCNT-NEXT: xorl %edx, %edx
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt64:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntq %rdi, %rax
; X64-POPCNT-NEXT: retq
;
; X32-SSE2-LABEL: cnt64:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
; X32-SSE2-NEXT: psrlw $1, %xmm1
; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE2-NEXT: psubb %xmm1, %xmm0
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; X32-SSE2-NEXT: movdqa %xmm0, %xmm2
; X32-SSE2-NEXT: pand %xmm1, %xmm2
; X32-SSE2-NEXT: psrlw $2, %xmm0
; X32-SSE2-NEXT: pand %xmm1, %xmm0
; X32-SSE2-NEXT: paddb %xmm2, %xmm0
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
; X32-SSE2-NEXT: psrlw $4, %xmm1
; X32-SSE2-NEXT: paddb %xmm0, %xmm1
; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE2-NEXT: pxor %xmm0, %xmm0
; X32-SSE2-NEXT: psadbw %xmm1, %xmm0
; X32-SSE2-NEXT: movd %xmm0, %eax
; X32-SSE2-NEXT: xorl %edx, %edx
; X32-SSE2-NEXT: retl
;
; X32-SSSE3-LABEL: cnt64:
; X32-SSSE3: # %bb.0:
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2
; X32-SSSE3-NEXT: pand %xmm0, %xmm2
; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4
; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4
; X32-SSSE3-NEXT: psrlw $4, %xmm1
; X32-SSSE3-NEXT: pand %xmm0, %xmm1
; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3
; X32-SSSE3-NEXT: paddb %xmm4, %xmm3
; X32-SSSE3-NEXT: pxor %xmm0, %xmm0
; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0
; X32-SSSE3-NEXT: movd %xmm0, %eax
; X32-SSSE3-NEXT: xorl %edx, %edx
; X32-SSSE3-NEXT: retl
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
ret i64 %cnt
}
define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat {
; X32-LABEL: cnt64_noimplicitfloat:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@ -210,7 +332,7 @@ define i64 @cnt64(i64 %x) nounwind readnone {
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
;
; X64-LABEL: cnt64:
; X64-LABEL: cnt64_noimplicitfloat:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq %rax
@ -233,7 +355,7 @@ define i64 @cnt64(i64 %x) nounwind readnone {
; X64-NEXT: shrq $56, %rax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt64:
; X32-POPCNT-LABEL: cnt64_noimplicitfloat:
; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
@ -241,7 +363,7 @@ define i64 @cnt64(i64 %x) nounwind readnone {
; X32-POPCNT-NEXT: xorl %edx, %edx
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt64:
; X64-POPCNT-LABEL: cnt64_noimplicitfloat:
; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntq %rdi, %rax
; X64-POPCNT-NEXT: retq