[X86] SimplifyMultipleUseDemandedBitsForTargetNode - add initial X86ISD::VSRAI handling.

This initial version only peeks through cases where we just demand the sign bit of an ashr shift, but we could generalize this further depending on how many sign bits we already have.

The pr18014.ll case is a minor annoyance - we've failed to move the psrad/paddd sequence after the blendvps, which would have avoided the extra register move, but we have still increased the ILP.
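
To see why the sign-bit-only case is safe: an arithmetic right shift only replicates the sign bit downwards, so the sign bit of the result is always the sign bit of the source. Below is a minimal standalone C++ check of that identity (illustrative only, not code from this commit):

// Illustrative only: the sign bit of an arithmetic right shift equals the
// sign bit of its source for every shift amount, which is what lets a
// sign-bit-only user peek through the VSRAI.
#include <cassert>
#include <cstdint>
#include <limits>

static bool signBit(int32_t V) { return V < 0; }

int main() {
  const int32_t Vals[] = {0, 1, -1, 42, -42,
                          std::numeric_limits<int32_t>::min(),
                          std::numeric_limits<int32_t>::max()};
  for (int32_t V : Vals)
    for (unsigned Amt = 0; Amt < 32; ++Amt)
      // '>>' on a negative int is an arithmetic shift on mainstream
      // compilers (and guaranteed arithmetic from C++20 onwards).
      assert(signBit(V >> Amt) == signBit(V));
  return 0;
}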
Simon Pilgrim 2020-05-24 16:07:46 +01:00
parent 71bed8206b
commit 1e7865d946
4 changed files with 11 additions and 10 deletions

@@ -37404,6 +37404,12 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
       return Vec;
     break;
   }
+  case X86ISD::VSRAI:
+    // iff we only need the sign bit then we can use the source directly.
+    // TODO: generalize where we only demand extended signbits.
+    if (DemandedBits.isSignMask())
+      return Op.getOperand(0);
+    break;
   case X86ISD::PCMPGT:
     // icmp sgt(0, R) == ashr(R, BitWidth-1).
     // iff we only need the sign bit then we can use R directly.
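
The TODO above could, for instance, take the following shape. This is a hedged sketch of the "extended signbits" idea, not code from this commit, and it assumes the same enclosing SimplifyMultipleUseDemandedBitsForTargetNode scope as the hunk above (Op, DemandedBits, DAG, Depth):

// Sketch of the TODO only; not what this commit implements.
case X86ISD::VSRAI: {
  SDValue Src = Op.getOperand(0);
  unsigned BitWidth = DemandedBits.getBitWidth();
  // If Src already has S sign bits, then bits [BitWidth - S, BitWidth - 1]
  // of both Src and ashr(Src, Amt) are copies of the same sign bit, for any
  // shift amount, so Src can stand in for the shift whenever nothing below
  // that region is demanded.
  unsigned NumSignBits = DAG.ComputeNumSignBits(Src, Depth + 1);
  if (DemandedBits.countTrailingZeros() >= BitWidth - NumSignBits)
    return Src;
  break;
}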

@@ -347,7 +347,6 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3
 ; X64: # %bb.0: # %entry
 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT: vpslld $31, %ymm0, %ymm0
-; X64-NEXT: vpsrad $31, %ymm0, %ymm0
 ; X64-NEXT: vmovdqa (%rdi), %ymm2
 ; X64-NEXT: vmovdqa 32(%rdi), %ymm3
 ; X64-NEXT: vextracti128 $1, %ymm1, %xmm4
@@ -462,7 +461,6 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <
 ; X64: # %bb.0: # %entry
 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT: vpslld $31, %ymm0, %ymm0
-; X64-NEXT: vpsrad $31, %ymm0, %ymm0
 ; X64-NEXT: vmovaps (%rdi), %ymm2
 ; X64-NEXT: vmovaps 32(%rdi), %ymm3
 ; X64-NEXT: vextractf128 $1, %ymm1, %xmm4

@@ -1602,7 +1602,6 @@ define i1 @allones_v16i16_and1(<16 x i16> %arg) {
 ; AVX2-LABEL: allones_v16i16_and1:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
-; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpmovmskb %xmm0, %eax
@@ -1823,7 +1822,6 @@ define i1 @allzeros_v16i16_and1(<16 x i16> %arg) {
 ; AVX2-LABEL: allzeros_v16i16_and1:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
-; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpmovmskb %xmm0, %eax
@@ -3005,7 +3003,6 @@ define i1 @allones_v16i16_and4(<16 x i16> %arg) {
 ; AVX2-LABEL: allones_v16i16_and4:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0
-; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpmovmskb %xmm0, %eax
@@ -3226,7 +3223,6 @@ define i1 @allzeros_v16i16_and4(<16 x i16> %arg) {
 ; AVX2-LABEL: allzeros_v16i16_and4:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0
-; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpmovmskb %xmm0, %eax

@@ -2,17 +2,18 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s
 ; Ensure PSRAD is generated as the condition is consumed by both PADD and
-; BLENDVPS. PAND requires all bits setting properly.
+; BLENDVPS. PADD requires all bits setting properly.
 define <4 x i32> @foo(<4 x i32>* %p, <4 x i1> %cond, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
 ; CHECK-LABEL: foo:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pslld $31, %xmm0
-; CHECK-NEXT: psrad $31, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: psrad $31, %xmm3
+; CHECK-NEXT: paddd %xmm1, %xmm3
 ; CHECK-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; CHECK-NEXT: paddd %xmm0, %xmm1
 ; CHECK-NEXT: movaps %xmm2, (%rdi)
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %sext_cond = sext <4 x i1> %cond to <4 x i32>
 %t1 = add <4 x i32> %v1, %sext_cond