[X86][SSE] Prefer BLEND(SHL(v,c1),SHL(v,c2)) over MUL(v, c3)

Now that rL336250 has landed, we should prefer 2 immediate shifts + a shuffle blend over performing a multiply. Despite the increase in instructions, this is quicker (especially for slow v4i32 multiplies), avoid loads and constant pool usage. It does mean however that we increase register pressure. The code size will go up a little but by less than what we save on the constant pool data.

This patch also adds support for v16i16 to the BLEND(SHIFT(v,c1),SHIFT(v,c2)) combine, and also prevents blending on pre-SSE41 shifts if it would introduce extra blend masks/constant pool usage.

Differential Revision: https://reviews.llvm.org/D48936

llvm-svn: 336642
This commit is contained in:
Simon Pilgrim 2018-07-10 07:58:33 +00:00
parent 5fd020c082
commit d32ca2c0b7
5 changed files with 73 additions and 48 deletions

View File

@ -4975,6 +4975,11 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
return true;
}
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
SmallVector<int, 32> WidenedMask;
return canWidenShuffleElements(Mask, WidenedMask);
}
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
@ -8954,6 +8959,12 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
SmallVector<int, 32> RepeatedMask;
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
@ -23438,12 +23449,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return R;
}
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
if (Op.getOpcode() == ISD::SHL)
if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
// If possible, lower this shift as a sequence of two shifts by
// constant plus a BLENDing shuffle instead of scalarizing it.
// Example:
@ -23454,7 +23459,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
//
// The advantage is that the two shifts from the example would be
// lowered as X86ISD::VSRLI nodes in parallel before blending.
if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
(VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue Amt1, Amt2;
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 8> ShuffleMask;
@ -23477,8 +23483,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
break;
}
// Only perform this blend if we can perform it without loading a mask.
if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) {
isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2) &&
(VT != MVT::v16i16 ||
is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
(VT == MVT::v4i32 || Subtarget.hasSSE41() ||
Op.getOpcode() != ISD::SHL || canWidenShuffleElements(ShuffleMask))) {
SDValue Splat1 =
DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
@ -23489,6 +23500,12 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
}
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
if (Op.getOpcode() == ISD::SHL)
if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
// v4i32 Non Uniform Shifts.
// If the shift amount is constant we can shift each lane using the SSE2
// immediate shifts, else we need to zero-extend each lane to the lower i64

View File

@ -264,22 +264,14 @@ define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,1073741824,1073741824]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [536870912,536870912,268435456,268435456]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pslld $31, %xmm2
; SSE2-NEXT: pslld $30, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pslld $29, %xmm2
; SSE2-NEXT: pslld $28, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl1:
@ -288,8 +280,14 @@ define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pslld $30, %xmm2
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pslld $28, %xmm2
; SSE41-NEXT: pslld $29, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ext_shl1:

View File

@ -266,10 +266,14 @@ define <16 x i16> @test11(<16 x i16> %a) {
;
; AVX1-LABEL: test11:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11:
@ -291,15 +295,20 @@ define <16 x i16> @test12(<16 x i16> %a) {
; AVX1-LABEL: test12:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsllw $3, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
; AVX2-NEXT: retq
%lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
ret <16 x i16> %lshr

View File

@ -67,19 +67,18 @@ define <4 x i32> @test3(<4 x i32> %a) {
define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pslld $1, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test4:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test4:

View File

@ -49,7 +49,6 @@ define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
; SSE41-NEXT: movw $0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movl $0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u>
; SSE41-NEXT: jmp .LBB0_1
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: .LBB0_2: # %forbody
@ -58,10 +57,13 @@ define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT: shlq $4, %rax
; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: movdqa (%rdx,%rax), %xmm2
; SSE41-NEXT: psubw %xmm0, %xmm2
; SSE41-NEXT: pmullw %xmm1, %xmm2
; SSE41-NEXT: pextrw $4, %xmm2, 8(%rcx,%rax)
; SSE41-NEXT: movdqa (%rdx,%rax), %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $2, %xmm2
; SSE41-NEXT: psllw $1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: pextrw $4, %xmm1, 8(%rcx,%rax)
; SSE41-NEXT: movq %xmm2, (%rcx,%rax)
; SSE41-NEXT: incl -{{[0-9]+}}(%rsp)
; SSE41-NEXT: .LBB0_1: # %forcond