[X86][SSE] SimplifyDemandedVectorEltsForTargetNode - add SSE shift multiple use handling

Add SimplifyMultipleUseDemandedVectorElts peek through for imm/var SSE shifts
Simon Pilgrim 2020-07-23 14:09:31 +01:00
parent 68a80a4436
commit d720ba1e4b
7 changed files with 414 additions and 469 deletions
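
Why peeking through is safe here: SSE shifts like PSRLD/PSRAW operate lane-wise, so when only some lanes of the shift result are demanded, the shift's source may be replaced by any value that agrees with it on those lanes, even if the original source has other uses. A minimal standalone C++ sketch of that invariant (the V4/srl/blend helpers are invented for illustration and are not LLVM code):

// Plain C++ sketch (invented helpers, not LLVM code): per-lane shifts commute
// with per-lane selection, so a shift user that demands only the lanes taken
// from `a` can shift `a` directly and skip the blend.
#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<uint32_t, 4>;

// Lane-wise logical right shift by an immediate, like PSRLD.
static V4 srl(const V4 &v, unsigned amt) {
  V4 r{};
  for (int i = 0; i != 4; ++i)
    r[i] = v[i] >> amt;
  return r;
}

// Per-lane select, like a dword-granular PBLENDW.
static V4 blend(const V4 &a, const V4 &b, const std::array<bool, 4> &fromA) {
  V4 r{};
  for (int i = 0; i != 4; ++i)
    r[i] = fromA[i] ? a[i] : b[i];
  return r;
}

int main() {
  V4 a{100, 200, 300, 400}, b{111, 222, 333, 444};
  std::array<bool, 4> fromA{true, false, true, false};

  V4 full = srl(blend(a, b, fromA), 3); // shift of the multi-use blend
  V4 peek = srl(a, 3);                  // shift of the simpler source

  for (int i = 0; i != 4; ++i)
    if (fromA[i]) // demanded lanes: the ones the blend takes from `a`
      assert(full[i] == peek[i]);
  return 0;
}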


@@ -37042,7 +37042,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
                                    Depth + 1))
       return true;
     // TODO convert SrcUndef to KnownUndef.
+    // Aggressively peek through ops to get at the demanded elts.
+    if (!DemandedElts.isAllOnesValue())
+      if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+              Src, DemandedElts, TLO.DAG, Depth + 1))
+        return TLO.CombineTo(
+            Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
     break;
   }
   case X86ISD::KSHIFTL: {
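
Two properties of the added code are worth noting: the peek-through only fires when some lanes are provably unused (the !DemandedElts.isAllOnesValue() guard), and SimplifyMultipleUseDemandedVectorElts never rewrites Src in place. It returns an alternative source that matches Src on the demanded lanes while Src keeps its other uses, and the combine rebuilds the shift node with the same opcode and shift-amount operand. A toy model of that contract (hypothetical types; the real function works on SDValues and APInt element masks):

// Toy model of the "multiple use" contract (hypothetical, much simpler than
// the real SimplifyMultipleUseDemandedVectorElts): return a value that agrees
// with V on every demanded lane, without modifying V itself.
#include <cstdint>

struct Value {
  enum Kind { Leaf, Blend } kind;
  // For Blend: lane i is taken from lhs when bit i of laneMask is set.
  uint8_t laneMask = 0;
  Value *lhs = nullptr, *rhs = nullptr;
};

Value *simplifyDemandedLanes(Value *V, uint8_t demanded) {
  if (V->kind == Value::Blend) {
    if ((demanded & V->laneMask) == demanded)
      return V->lhs; // every demanded lane comes from lhs
    if ((demanded & V->laneMask) == 0)
      return V->rhs; // every demanded lane comes from rhs
  }
  return nullptr; // no simpler source; caller keeps the original
}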


@@ -2676,11 +2676,11 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
 ; SSE2-NEXT:    pandn %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    psraw $4, %xmm1
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psraw $4, %xmm3
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    pand %xmm1, %xmm3
@@ -2787,27 +2787,27 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,0,65535,0]
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    psraw $4, %xmm1
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,65535,65535]
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    psraw $2, %xmm2
-; SSE2-NEXT:    pandn %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,0]
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    psraw $1, %xmm1
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $6, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,0]
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psraw $12, %xmm5
+; SSE2-NEXT:    pandn %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psraw $1, %xmm3
+; SSE2-NEXT:    pandn %xmm3, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    psrlw $15, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_vec_sdiv_nonuniform6:
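
The combine-sdiv deltas above show the effect on codegen: in the rewritten sequences the psraw shifts are applied to fresh copies of the original %xmm0 (movdqa %xmm0, %xmm3 before psraw $4, and movdqa %xmm0, %xmm5 before psraw $12) instead of to the blended intermediate, so the blend no longer sits on the shift's critical path, and the final paddw picks up the new result register.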


@@ -196,14 +196,13 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
 ; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    psrad $2, %xmm1
+; SSE-NEXT:    psrad $2, %xmm2
+; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT:    psrad $1, %xmm0
+; SSE-NEXT:    psrad $3, %xmm1
 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    psrad $3, %xmm0
-; SSE-NEXT:    psrad $1, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
@@ -235,14 +234,13 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
 ; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    psrad $2, %xmm1
+; SSE-NEXT:    psrad $2, %xmm2
+; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT:    psrad $1, %xmm0
+; SSE-NEXT:    psrad $3, %xmm1
 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    psrad $3, %xmm0
-; SSE-NEXT:    psrad $1, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:


@@ -188,18 +188,17 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
 ; SSE-NEXT:    movdqa %xmm0, %xmm2
 ; SSE-NEXT:    psrlq $33, %xmm2
 ; SSE-NEXT:    psrlq $32, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    psrld $19, %xmm1
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    psrld $17, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    psrld $18, %xmm1
+; SSE-NEXT:    movaps %xmm2, %xmm3
+; SSE-NEXT:    psrld $17, %xmm3
+; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT:    psrld $18, %xmm2
 ; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
 ; SSE-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
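
Both ashr tests and this lshr test drop one instruction (14 to 13 and 18 to 17 lines per the hunk headers): one of the per-lane shifts (psrad $3, %xmm1 and psrld $16, %xmm0) now reads its input from before the truncating shuffle, which is legal because only the lanes that survive the shuffle are demanded from it.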

File diff suppressed because it is too large.


@@ -280,24 +280,20 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,2,2]
-; CHECK-SSE2-NEXT:    psrld $1, %xmm2
-; CHECK-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,2,3]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm3
+; CHECK-SSE2-NEXT:    psrld $1, %xmm3
+; CHECK-SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,2,3]
+; CHECK-SSE2-NEXT:    movapd %xmm2, %xmm3
+; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; CHECK-SSE2-NEXT:    psubd %xmm3, %xmm0
 ; CHECK-SSE2-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
@@ -306,15 +302,13 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
 ; CHECK-SSE41-LABEL: t32_tautological:
 ; CHECK-SSE41:       # %bb.0:
 ; CHECK-SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-SSE41-NEXT:    pmuludq %xmm0, %xmm1
+; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm2
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-SSE41-NEXT:    psrld $1, %xmm2
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
+; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-SSE41-NEXT:    pmuludq %xmm1, %xmm3
+; CHECK-SSE41-NEXT:    psrld $1, %xmm3
+; CHECK-SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
 ; CHECK-SSE41-NEXT:    psubd %xmm2, %xmm0
@@ -324,14 +318,12 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind {
 ; CHECK-AVX1-LABEL: t32_tautological:
 ; CHECK-AVX1:       # %bb.0:
 ; CHECK-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $1, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
+; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
+; CHECK-AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
 ; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
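
In t32_tautological the psrld $1 that produces the shifted mulhi now runs directly on the 64-bit pmuludq result, since only the high dword of each product is demanded from it, so the shuffle/blend packing that previously preceded the shift disappears; the SSE2/SSE41/AVX1 variants shrink from 24/15/14 to 20/13/12 lines per the hunk headers.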


@@ -197,22 +197,19 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    psrld $5, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; CHECK-SSE2-NEXT:    psrld $2, %xmm2
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    psrld $27, %xmm1
-; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    psrld $5, %xmm2
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT:    psrld $27, %xmm2
 ; CHECK-SSE2-NEXT:    pmuludq {{.*}}(%rip), %xmm2
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    psubd %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-SSE2-NEXT:    psubd %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pxor %xmm1, %xmm1
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    psrld $31, %xmm0
@@ -222,13 +219,11 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
 ; CHECK-SSE41:       # %bb.0:
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm1
+; CHECK-SSE41-NEXT:    psrld $5, %xmm1
 ; CHECK-SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; CHECK-SSE41-NEXT:    psrld $2, %xmm2
 ; CHECK-SSE41-NEXT:    pmuludq {{.*}}(%rip), %xmm2
 ; CHECK-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-SSE41-NEXT:    psrld $5, %xmm1
 ; CHECK-SSE41-NEXT:    psrld $27, %xmm2
 ; CHECK-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
 ; CHECK-SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
@@ -242,13 +237,12 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
 ; CHECK-AVX1:       # %bb.0:
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX1-NEXT:    vpsrld $5, %xmm1, %xmm1
 ; CHECK-AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
 ; CHECK-AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX1-NEXT:    vpsrld $27, %xmm2, %xmm2
 ; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT:    vpsrld $5, %xmm1, %xmm2
-; CHECK-AVX1-NEXT:    vpsrld $27, %xmm1, %xmm1
-; CHECK-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; CHECK-AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
 ; CHECK-AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; CHECK-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
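
test_urem_even_neg100 shows the same movement for psrld $5 and psrld $27: both shifts migrate to before the shuffle/blend packing, which lets one blend and its register copy fold away, shrinking the SSE2/SSE41/AVX1 sequences from 22/13/13 to 19/11/12 lines per the hunk headers.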