[X86][AVX] combineBitcastvxi1 - improve handling of vectors truncated to vXi1

If we're truncating to vXi1 from a wider type, then prefer the original wider vector, as it simplifies folding away the separate truncations + extensions.

On AVX1 this is only worth it for the v8i1 case, not v4i1, where we're always better off truncating down to v4i32 for movmsk.

Helps with some regressions encountered in D96609
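As a rough illustration, the pattern this targets looks something like the IR below. This is a minimal sketch of the reduction-of-truncated-vXi1 idiom the updated tests exercise, not copied from the test files; the function name and reduction intrinsic are assumptions.

define i1 @trunc_v8i32_v8i1(<8 x i32> %a) {
  ; Truncate to <8 x i1>, then reduce; the backend turns the <8 x i1> result
  ; into an i8 bitcast that combineBitcastvxi1 lowers via sign-extend + movmsk.
  ; With this change the sign-extend can peek through the truncate and reuse
  ; the original <8 x i32> source directly.
  %t = trunc <8 x i32> %a to <8 x i1>
  %r = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %t)
  ret i1 %r
}
declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>)
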
Simon Pilgrim 2021-03-24 14:05:43 +00:00
parent 338d162755
commit 7920527796
4 changed files with 124 additions and 122 deletions

@@ -39201,17 +39201,22 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
Op, DemandedBits, DemandedElts, DAG, Depth);
}
// Helper to peek through bitops/setcc to determine size of source vector.
// Helper to peek through bitops/trunc/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
bool AllowTruncate) {
switch (Src.getOpcode()) {
case ISD::TRUNCATE:
if (!AllowTruncate)
return false;
LLVM_FALLTHROUGH;
case ISD::SETCC:
return Src.getOperand(0).getValueSizeInBits() == Size;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
checkBitcastSrcVectorSize(Src.getOperand(1), Size);
return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
}
return false;
}
@@ -39266,6 +39271,7 @@ static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
SDValue Src, const SDLoc &DL) {
switch (Src.getOpcode()) {
case ISD::SETCC:
case ISD::TRUNCATE:
return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
case ISD::AND:
case ISD::XOR:
@@ -39349,7 +39355,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
if (Subtarget.hasAVX() &&
checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
SExtVT = MVT::v4i64;
PropagateSExt = true;
}
@@ -39361,8 +39368,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
checkBitcastSrcVectorSize(Src, 512))) {
if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
checkBitcastSrcVectorSize(Src, 512, true))) {
SExtVT = MVT::v8i32;
PropagateSExt = true;
}
@@ -39387,7 +39394,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
break;
}
// Split if this is a <64 x i8> comparison result.
if (checkBitcastSrcVectorSize(Src, 512)) {
if (checkBitcastSrcVectorSize(Src, 512, false)) {
SExtVT = MVT::v64i8;
break;
}

@@ -219,16 +219,25 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
; SSE-NEXT: sete %al
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_v4i64_v4i1:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: cmpb $15, %al
; AVX-NEXT: sete %al
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX1-LABEL: trunc_v4i64_v4i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vmovmskps %xmm0, %eax
; AVX1-NEXT: cmpb $15, %al
; AVX1-NEXT: sete %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_v4i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0
; AVX2-NEXT: vmovmskpd %ymm0, %eax
; AVX2-NEXT: cmpb $15, %al
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_v4i1:
; AVX512F: # %bb.0:
@@ -296,14 +305,11 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
;
; AVX1-LABEL: trunc_v8i32_v8i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: vpslld $31, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: cmpb $-1, %al
; AVX1-NEXT: sete %al
; AVX1-NEXT: vzeroupper
@@ -311,11 +317,8 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
;
; AVX2-LABEL: trunc_v8i32_v8i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: cmpb $-1, %al
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
@@ -536,17 +539,14 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
;
; AVX1-LABEL: trunc_v8i64_v8i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: cmpb $-1, %al
; AVX1-NEXT: sete %al
; AVX1-NEXT: vzeroupper
@@ -557,11 +557,8 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: cmpb $-1, %al
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper

@@ -212,16 +212,25 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_v4i64_v4i1:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: testb %al, %al
; AVX-NEXT: setne %al
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX1-LABEL: trunc_v4i64_v4i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vmovmskps %xmm0, %eax
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setne %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_v4i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0
; AVX2-NEXT: vmovmskpd %ymm0, %eax
; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_v4i1:
; AVX512F: # %bb.0:
@@ -285,25 +294,21 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
;
; AVX1-LABEL: trunc_v8i32_v8i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX1-NEXT: vpslld $31, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setne %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_v8i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -521,17 +526,15 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
;
; AVX1-LABEL: trunc_v8i64_v8i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setne %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -541,11 +544,9 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: testl $43690, %eax # imm = 0xAAAA
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq

@@ -215,16 +215,25 @@ define i1 @trunc_v4i64_v4i1(<4 x i64>) {
; SSE-NEXT: setnp %al
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_v4i64_v4i1:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: testb %al, %al
; AVX-NEXT: setnp %al
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX1-LABEL: trunc_v4i64_v4i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vmovmskps %xmm0, %eax
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setnp %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_v4i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0
; AVX2-NEXT: vmovmskpd %ymm0, %eax
; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: setnp %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_v4i1:
; AVX512F: # %bb.0:
@@ -290,14 +299,11 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
;
; AVX1-LABEL: trunc_v8i32_v8i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: vpslld $31, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setnp %al
; AVX1-NEXT: vzeroupper
@@ -305,11 +311,8 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
;
; AVX2-LABEL: trunc_v8i32_v8i1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: setnp %al
; AVX2-NEXT: vzeroupper
@@ -548,17 +551,14 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
;
; AVX1-LABEL: trunc_v8i64_v8i1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setnp %al
; AVX1-NEXT: vzeroupper
@@ -569,11 +569,8 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) {
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: testb %al, %al
; AVX2-NEXT: setnp %al
; AVX2-NEXT: vzeroupper