[X86] Add vector shift by immediate to SimplifyDemandedBitsForTargetNode.

Summary: This also enables some constant folding from KnownBits propagation. It helps some vXi64 cases in 32-bit mode where the constant vector operand appears as a vXi32 constant plus a bitcast, which can prevent getNode from constant folding sra/shl/srl.

Reviewers: RKSimon, spatel

Reviewed By: spatel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D54069

llvm-svn: 346102
Craig Topper 2018-11-04 17:31:27 +00:00
parent bc5c3f5727
commit ed6a0a817f
8 changed files with 166 additions and 168 deletions
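For orientation before the diff: the new target hook maps the caller's demanded-bits mask across the shift immediate and then recurses on the shift's input. Below is a minimal standalone sketch of that mapping, using a plain uint64_t for a single 64-bit lane instead of LLVM's APInt/KnownBits machinery; the names and the fixed 64-bit width are assumptions for illustration, not code from this commit.

// Standalone illustration (not LLVM code): how the demanded-bits mask of an
// x86 immediate vector shift maps onto its input, per 64-bit lane. ShAmt is
// assumed to be < 64; the real hook bails out on out-of-range immediates.
#include <cstdint>

enum ShiftOpc { VSHLI, VSRLI, VSRAI }; // stand-ins for the X86ISD opcodes

uint64_t demandedSrcBits(ShiftOpc Opc, uint64_t Demanded, unsigned ShAmt) {
  switch (Opc) {
  case VSHLI:
    // Result bit i comes from source bit i - ShAmt.
    return Demanded >> ShAmt;
  case VSRLI:
    // Result bit i comes from source bit i + ShAmt.
    return Demanded << ShAmt;
  case VSRAI: {
    uint64_t Mask = Demanded << ShAmt;
    // The top ShAmt result bits are copies of the source sign bit, so if any
    // of them are demanded, the source sign bit is demanded too.
    if (ShAmt != 0 && (Demanded >> (64 - ShAmt)) != 0)
      Mask |= 1ull << 63;
    return Mask;
  }
  }
  return ~0ull; // not reached for the opcodes above
}

Demand that is shifted out simply disappears (a left shift by 7 with all 64 result bits demanded only needs the low 57 source bits), which is what lets the KnownBits propagation and constant folding mentioned in the summary fire.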


@ -31817,6 +31817,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits, KnownBits &Known,
    TargetLoweringOpt &TLO, unsigned Depth) const {
  unsigned BitWidth = OriginalDemandedBits.getBitWidth();
  unsigned Opc = Op.getOpcode();
  switch(Opc) {
  case X86ISD::PMULDQ:
@ -31833,6 +31834,42 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
      return true;
    break;
  }
  case X86ISD::VSHLI: {
    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      if (ShiftImm->getAPIntValue().uge(BitWidth))
        break;
      KnownBits KnownOp;
      unsigned ShAmt = ShiftImm->getZExtValue();
      APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO,
                               Depth + 1))
        return true;
    }
    break;
  }
  case X86ISD::VSRAI:
  case X86ISD::VSRLI: {
    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      if (ShiftImm->getAPIntValue().uge(BitWidth))
        break;
      KnownBits KnownOp;
      unsigned ShAmt = ShiftImm->getZExtValue();
      APInt DemandedMask = OriginalDemandedBits << ShAmt;
      // If any of the demanded bits are produced by the sign extension, we also
      // demand the input sign bit.
      if (Opc == X86ISD::VSRAI &&
          OriginalDemandedBits.countLeadingZeros() < ShAmt)
        DemandedMask.setSignBit();
      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO,
                               Depth + 1))
        return true;
    }
    break;
  }
  }
  return TargetLowering::SimplifyDemandedBitsForTargetNode(
@ -34861,6 +34898,11 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
    return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
  }
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(SDValue(N, 0),
                               APInt::getAllOnesValue(NumBitsPerElt), DCI))
    return SDValue(N, 0);
  return SDValue();
}
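As a worked example of the VSRAI sign-bit rule above (made-up values, one 64-bit lane): if only the low 32 bits of an arithmetic shift right by 63 are demanded, every demanded bit is a copy of the source sign bit, so only bit 63 of the source is demanded. A self-contained check of that arithmetic:

// Worked example (assumed values, one 64-bit lane): demand only the low 32
// bits of (x sra 63). Every result bit is a copy of x's sign bit, so only
// bit 63 of x ends up demanded.
#include <cassert>
#include <cstdint>

int main() {
  const unsigned ShAmt = 63;
  const uint64_t Demanded = 0xFFFFFFFFull;  // low 32 bits of the result
  uint64_t SrcMask = Demanded << ShAmt;     // survives only as bit 63
  if ((Demanded >> (64 - ShAmt)) != 0)      // demanded bits overlap the
    SrcMask |= 1ull << 63;                  // sign-extended copies
  assert(SrcMask == (1ull << 63));
  return 0;
}

In the test diffs that follow, the more visible payoff is on the constant-folding side: the X32 sign-extension idiom no longer shifts its [0,2147483648,0,2147483648] mask at run time, because the shifted mask is now folded to a constant directly.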


@ -63,17 +63,7 @@ define <4 x i32> @combine_vec_lshr_known_zero0(<4 x i32> %x) {
define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero1:
; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $11, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrld $9, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $10, %xmm1
; SSE-NEXT: psrld $8, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_known_zero1:


@ -669,20 +669,15 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmullw %xmm0, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pmullw %xmm3, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: packuswb %xmm0, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: psllw $1, %xmm3
; SSE41-NEXT: psllw $8, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: psllw $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
@ -693,21 +688,16 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; AVX1-NEXT: movl $171, %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpsllw $1, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq


@ -91,17 +91,14 @@ define float @signbits_ashr_extract_sitofp_1(<2 x i64> %a0) nounwind {
; X32-LABEL: signbits_ashr_extract_sitofp_1:
; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
; X32-NEXT: vpsrlq $63, %xmm1, %xmm2
; X32-NEXT: vpsrlq $32, %xmm1, %xmm1
; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; X32-NEXT: vpsrlq $63, %xmm0, %xmm2
; X32-NEXT: vpsrlq $63, %xmm0, %xmm1
; X32-NEXT: vpsrlq $32, %xmm0, %xmm0
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0]
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovd %xmm0, %eax
; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
@ -128,18 +125,15 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
; X32-LABEL: signbits_ashr_shl_extract_sitofp:
; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
; X32-NEXT: vpsrlq $60, %xmm1, %xmm2
; X32-NEXT: vpsrlq $61, %xmm1, %xmm1
; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; X32-NEXT: vpsrlq $60, %xmm0, %xmm2
; X32-NEXT: vpsrlq $60, %xmm0, %xmm1
; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [4,0,0,0,8,0,0,0]
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsllq $20, %xmm0, %xmm0
; X32-NEXT: vmovd %xmm0, %eax
; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
@ -263,13 +257,10 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X32-NEXT: vpsrlq $60, %xmm2, %xmm3
; X32-NEXT: vpsrlq $61, %xmm2, %xmm2
; X32-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; X32-NEXT: vpsrlq $60, %xmm0, %xmm3
; X32-NEXT: vpsrlq $60, %xmm0, %xmm2
; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
; X32-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
@ -281,7 +272,7 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X32-NEXT: vpand %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovd %xmm0, %eax
; X32-NEXT: vcvtsi2ssl %eax, %xmm4, %xmm0
; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
@ -320,13 +311,10 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
; X32-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp:
; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X32-NEXT: vpsrlq $60, %xmm2, %xmm3
; X32-NEXT: vpsrlq $61, %xmm2, %xmm2
; X32-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; X32-NEXT: vpsrlq $60, %xmm0, %xmm3
; X32-NEXT: vpsrlq $60, %xmm0, %xmm2
; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
; X32-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X32-NEXT: vpmovsxdq %xmm1, %xmm1
@ -334,7 +322,7 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
; X32-NEXT: vpor %xmm1, %xmm2, %xmm1
; X32-NEXT: vpxor %xmm0, %xmm1, %xmm0
; X32-NEXT: vmovd %xmm0, %eax
; X32-NEXT: vcvtsi2ssl %eax, %xmm4, %xmm0
; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
@ -375,22 +363,19 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
; X32-NEXT: subl $16, %esp
; X32-NEXT: vpmovsxdq 16(%ebp), %xmm3
; X32-NEXT: vpmovsxdq 8(%ebp), %xmm4
; X32-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648]
; X32-NEXT: vextractf128 $1, %ymm2, %xmm5
; X32-NEXT: vpsrlq $63, %xmm5, %xmm6
; X32-NEXT: vpsrlq $33, %xmm5, %xmm5
; X32-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
; X32-NEXT: vextractf128 $1, %ymm2, %xmm6
; X32-NEXT: vpsrlq $63, %xmm6, %xmm7
; X32-NEXT: vpsrlq $33, %xmm6, %xmm6
; X32-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4,5,6,7]
; X32-NEXT: vpxor %xmm5, %xmm6, %xmm6
; X32-NEXT: vpsubq %xmm5, %xmm6, %xmm6
; X32-NEXT: vmovdqa {{.*#+}} xmm6 = [0,16384,0,0,1,0,0,0]
; X32-NEXT: vpxor %xmm6, %xmm5, %xmm5
; X32-NEXT: vpsubq %xmm6, %xmm5, %xmm5
; X32-NEXT: vpsrlq $63, %xmm2, %xmm7
; X32-NEXT: vpsrlq $33, %xmm2, %xmm2
; X32-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4,5,6,7]
; X32-NEXT: vpxor %xmm5, %xmm2, %xmm2
; X32-NEXT: vpsubq %xmm5, %xmm2, %xmm2
; X32-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X32-NEXT: vpxor %xmm6, %xmm2, %xmm2
; X32-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; X32-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; X32-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; X32-NEXT: vextractf128 $1, %ymm1, %xmm4
; X32-NEXT: vextractf128 $1, %ymm0, %xmm5


@ -11,9 +11,9 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-SKYLAKE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0
; X86-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; X86-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0
@ -29,7 +29,7 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
; X86-SKX-NEXT: subl $8, %esp
; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u]
; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
; X86-SKX-NEXT: vpsrad $16, %xmm0, %xmm0
; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@ -50,9 +50,9 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
; X64-SKYLAKE: # %bb.0: # %entry
; X64-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-SKYLAKE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SKYLAKE-NEXT: vpsrad $16, %xmm0, %xmm0
; X64-SKYLAKE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; X64-SKYLAKE-NEXT: vpsrld $7, %xmm0, %xmm0
@ -65,7 +65,7 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
; X64-SKX-LABEL: fetch_r16g16_snorm_unorm8:
; X64-SKX: # %bb.0: # %entry
; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u]
; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
; X64-SKX-NEXT: vpsrad $16, %xmm0, %xmm0
; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0


@ -990,15 +990,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psrlq $1, %xmm2
; X32-SSE-NEXT: psrlq $7, %xmm1
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: psrlq $1, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlq $1, %xmm1
; X32-SSE-NEXT: psrlq $7, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: movapd {{.*#+}} xmm1 = [2.0E+0,7.2911220195563975E-304]
; X32-SSE-NEXT: xorpd %xmm1, %xmm0
; X32-SSE-NEXT: psubq %xmm1, %xmm0
; X32-SSE-NEXT: retl


@ -1066,25 +1066,20 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
;
; X32-AVX1-LABEL: constant_shift_v4i64:
; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
; X32-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm3
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT: vpsrlq $62, %xmm3, %xmm4
; X32-AVX1-NEXT: vpsrlq $31, %xmm3, %xmm3
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; X32-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
; X32-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; X32-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm3
; X32-AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm3
; X32-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,0,0]
; X32-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; X32-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
; X32-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; X32-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,16384,0,0,0,256]
; X32-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; X32-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v4i64:


@ -716,26 +716,26 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) {
define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
; SSE2-LABEL: trunc_usat_v8i32_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm6
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pandn %xmm2, %xmm5
; SSE2-NEXT: por %xmm1, %xmm5
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm5
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm5, %xmm0
; SSE2-NEXT: packssdw %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v8i32_v8i16:
@ -826,36 +826,36 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) {
; SSE2-LABEL: trunc_usat_v16i32_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: pxor %xmm7, %xmm6
; SSE2-NEXT: movdqa %xmm1, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: pxor %xmm6, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: pcmpgtd %xmm6, %xmm1
; SSE2-NEXT: pcmpgtd %xmm7, %xmm1
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm8, %xmm1
; SSE2-NEXT: pxor %xmm7, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm7, %xmm6
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm5, %xmm2
; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm2
; SSE2-NEXT: pxor %xmm7, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm6
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm7
; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: pandn %xmm8, %xmm5
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm7, %xmm4
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm8, %xmm6
; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
; SSE2-NEXT: pxor %xmm5, %xmm7
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: por %xmm7, %xmm5
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pslld $16, %xmm0
@ -870,36 +870,36 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) {
;
; SSSE3-LABEL: trunc_usat_v16i32_v16i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535]
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm2, %xmm6
; SSSE3-NEXT: pxor %xmm7, %xmm6
; SSSE3-NEXT: movdqa %xmm1, %xmm8
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm2, %xmm7
; SSSE3-NEXT: pxor %xmm6, %xmm7
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: pandn %xmm8, %xmm1
; SSSE3-NEXT: pxor %xmm7, %xmm1
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm6
; SSSE3-NEXT: pxor %xmm7, %xmm6
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pxor %xmm6, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm2
; SSSE3-NEXT: pxor %xmm7, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm7, %xmm3
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm6
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pxor %xmm4, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm4
; SSSE3-NEXT: pandn %xmm8, %xmm5
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pxor %xmm6, %xmm3
; SSSE3-NEXT: movdqa %xmm5, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pxor %xmm7, %xmm4
; SSSE3-NEXT: por %xmm4, %xmm0
; SSSE3-NEXT: pxor %xmm8, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
; SSSE3-NEXT: pxor %xmm5, %xmm7
; SSSE3-NEXT: pand %xmm8, %xmm5
; SSSE3-NEXT: por %xmm7, %xmm5
; SSSE3-NEXT: pslld $16, %xmm5
; SSSE3-NEXT: psrad $16, %xmm5
; SSSE3-NEXT: pslld $16, %xmm0