[X86] Call lowerShuffleAsBitMask for 512-bit vectors in lowerShuffleAsBlend.

This patch enables the use of lowerShuffleAsBitMask for 512-bit blends before
falling back to move immedate, GPR to k-register, and masked op.

I had to make some changes to support v8i64 when i64 is not a legal type. And to
support floating point types.

This trades a load for the move immediate and GPR move which is higher latency.
But its probably better for register pressure not having to hop through other
register classes. The load+and should play better with LICM and
rematerialization I think.

Differential Revision: https://reviews.llvm.org/D59479

llvm-svn: 356618
This commit is contained in:
Craig Topper 2019-03-20 21:30:20 +00:00
parent bbcb95a64e
commit 0367553304
4 changed files with 188 additions and 161 deletions

View File

@ -10364,11 +10364,30 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
/// one of the inputs being zeroable.
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() && "Floating point types are not supported");
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT MaskVT = VT;
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SDValue Zero, AllOnes;
// Use f64 if i64 isn't legal.
if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
EltVT = MVT::f64;
MaskVT = MVT::getVectorVT(EltVT, Mask.size());
}
MVT LogicVT = VT;
if (EltVT == MVT::f32 || EltVT == MVT::f64) {
Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
AllOnes = DAG.getConstantFP(APInt::getAllOnesValue(64).bitsToDouble(), DL,
EltVT);
LogicVT = MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32,
Mask.size());
} else {
Zero = DAG.getConstant(0, DL, EltVT);
AllOnes = DAG.getAllOnesConstant(DL, EltVT);
}
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
@ -10386,8 +10405,11 @@ static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
if (!V)
return SDValue(); // No non-zeroable elements!
SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
return DAG.getNode(ISD::AND, DL, VT, V, VMask);
SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
VMask = DAG.getBitcast(LogicVT, VMask);
V = DAG.getBitcast(LogicVT, V);
SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
return DAG.getBitcast(VT, And);
}
/// Try to emit a blend instruction for a shuffle using bit math.
@ -10552,7 +10574,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
DAG))
Subtarget, DAG))
return Masked;
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
@ -10610,6 +10632,16 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
if (!OptForSize) {
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
}
// Otherwise load an immediate into a GPR, cast to k-register, and use a
// masked move.
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
@ -12766,7 +12798,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Blend;
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, DAG))
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
@ -13467,7 +13499,7 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Blend;
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG))
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
@ -13735,7 +13767,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
@ -15571,7 +15603,7 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
// No floating point type available, if we can't use the bit operations
// for masking/blending then decompose into 128-bit vectors.
if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
DAG))
Subtarget, DAG))
return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;

View File

@ -1860,16 +1860,12 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
;
; SKX-LABEL: test_build_vec_v32i1:
; SKX: ## %bb.0:
; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_build_vec_v32i1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movl $1497715861, %eax ## imm = 0x59455495
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_build_vec_v32i1:
@ -1880,6 +1876,41 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
;
; X86-LABEL: test_build_vec_v32i1:
; X86: ## %bb.0:
; X86-NEXT: vandps LCPI40_0, %zmm0, %zmm0
; X86-NEXT: retl
%ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
}
define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize {
; KNL-LABEL: test_build_vec_v32i1_optsize:
; KNL: ## %bb.0:
; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1_optsize:
; SKX: ## %bb.0:
; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_build_vec_v32i1_optsize:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movl $1497715861, %eax ## imm = 0x59455495
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_build_vec_v32i1_optsize:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v32i1_optsize:
; X86: ## %bb.0:
; X86-NEXT: movl $1497715861, %eax ## imm = 0x59455495
; X86-NEXT: kmovd %eax, %k1
; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
@ -1928,12 +1959,12 @@ define void @ktest_1(<8 x double> %in, double * %base) {
; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
; KNL-NEXT: je LBB42_2
; KNL-NEXT: je LBB43_2
; KNL-NEXT: ## %bb.1: ## %L1
; KNL-NEXT: vmovapd %zmm0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB42_2: ## %L2
; KNL-NEXT: LBB43_2: ## %L2
; KNL-NEXT: vmovapd %zmm0, 8(%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@ -1945,12 +1976,12 @@ define void @ktest_1(<8 x double> %in, double * %base) {
; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; SKX-NEXT: kortestb %k0, %k0
; SKX-NEXT: je LBB42_2
; SKX-NEXT: je LBB43_2
; SKX-NEXT: ## %bb.1: ## %L1
; SKX-NEXT: vmovapd %zmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB42_2: ## %L2
; SKX-NEXT: LBB43_2: ## %L2
; SKX-NEXT: vmovapd %zmm0, 8(%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@ -1963,12 +1994,12 @@ define void @ktest_1(<8 x double> %in, double * %base) {
; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb %al, %al
; AVX512BW-NEXT: je LBB42_2
; AVX512BW-NEXT: je LBB43_2
; AVX512BW-NEXT: ## %bb.1: ## %L1
; AVX512BW-NEXT: vmovapd %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB42_2: ## %L2
; AVX512BW-NEXT: LBB43_2: ## %L2
; AVX512BW-NEXT: vmovapd %zmm0, 8(%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@ -1980,12 +2011,12 @@ define void @ktest_1(<8 x double> %in, double * %base) {
; AVX512DQ-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; AVX512DQ-NEXT: kortestb %k0, %k0
; AVX512DQ-NEXT: je LBB42_2
; AVX512DQ-NEXT: je LBB43_2
; AVX512DQ-NEXT: ## %bb.1: ## %L1
; AVX512DQ-NEXT: vmovapd %zmm0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB42_2: ## %L2
; AVX512DQ-NEXT: LBB43_2: ## %L2
; AVX512DQ-NEXT: vmovapd %zmm0, 8(%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@ -1998,12 +2029,12 @@ define void @ktest_1(<8 x double> %in, double * %base) {
; X86-NEXT: vmovupd 8(%eax), %zmm1 {%k1} {z}
; X86-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; X86-NEXT: kortestb %k0, %k0
; X86-NEXT: je LBB42_2
; X86-NEXT: je LBB43_2
; X86-NEXT: ## %bb.1: ## %L1
; X86-NEXT: vmovapd %zmm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB42_2: ## %L2
; X86-NEXT: LBB43_2: ## %L2
; X86-NEXT: vmovapd %zmm0, 8(%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
@ -2052,13 +2083,13 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: je LBB43_2
; KNL-NEXT: je LBB44_2
; KNL-NEXT: ## %bb.1: ## %L1
; KNL-NEXT: vmovaps %zmm0, (%rdi)
; KNL-NEXT: vmovaps %zmm1, 64(%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB43_2: ## %L2
; KNL-NEXT: LBB44_2: ## %L2
; KNL-NEXT: vmovaps %zmm0, 4(%rdi)
; KNL-NEXT: vmovaps %zmm1, 68(%rdi)
; KNL-NEXT: vzeroupper
@ -2077,13 +2108,13 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2
; SKX-NEXT: kunpckwd %k1, %k2, %k1
; SKX-NEXT: kortestd %k1, %k0
; SKX-NEXT: je LBB43_2
; SKX-NEXT: je LBB44_2
; SKX-NEXT: ## %bb.1: ## %L1
; SKX-NEXT: vmovaps %zmm0, (%rdi)
; SKX-NEXT: vmovaps %zmm1, 64(%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB43_2: ## %L2
; SKX-NEXT: LBB44_2: ## %L2
; SKX-NEXT: vmovaps %zmm0, 4(%rdi)
; SKX-NEXT: vmovaps %zmm1, 68(%rdi)
; SKX-NEXT: vzeroupper
@ -2102,13 +2133,13 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; AVX512BW-NEXT: vcmpltps %zmm2, %zmm1, %k2
; AVX512BW-NEXT: kunpckwd %k1, %k2, %k1
; AVX512BW-NEXT: kortestd %k1, %k0
; AVX512BW-NEXT: je LBB43_2
; AVX512BW-NEXT: je LBB44_2
; AVX512BW-NEXT: ## %bb.1: ## %L1
; AVX512BW-NEXT: vmovaps %zmm0, (%rdi)
; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB43_2: ## %L2
; AVX512BW-NEXT: LBB44_2: ## %L2
; AVX512BW-NEXT: vmovaps %zmm0, 4(%rdi)
; AVX512BW-NEXT: vmovaps %zmm1, 68(%rdi)
; AVX512BW-NEXT: vzeroupper
@ -2130,13 +2161,13 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; AVX512DQ-NEXT: kmovw %k0, %ecx
; AVX512DQ-NEXT: shll $16, %ecx
; AVX512DQ-NEXT: orl %eax, %ecx
; AVX512DQ-NEXT: je LBB43_2
; AVX512DQ-NEXT: je LBB44_2
; AVX512DQ-NEXT: ## %bb.1: ## %L1
; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi)
; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB43_2: ## %L2
; AVX512DQ-NEXT: LBB44_2: ## %L2
; AVX512DQ-NEXT: vmovaps %zmm0, 4(%rdi)
; AVX512DQ-NEXT: vmovaps %zmm1, 68(%rdi)
; AVX512DQ-NEXT: vzeroupper
@ -2156,13 +2187,13 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; X86-NEXT: vcmpltps %zmm2, %zmm1, %k2
; X86-NEXT: kunpckwd %k1, %k2, %k1
; X86-NEXT: kortestd %k1, %k0
; X86-NEXT: je LBB43_2
; X86-NEXT: je LBB44_2
; X86-NEXT: ## %bb.1: ## %L1
; X86-NEXT: vmovaps %zmm0, (%eax)
; X86-NEXT: vmovaps %zmm1, 64(%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB43_2: ## %L2
; X86-NEXT: LBB44_2: ## %L2
; X86-NEXT: vmovaps %zmm0, 4(%eax)
; X86-NEXT: vmovaps %zmm1, 68(%eax)
; X86-NEXT: vzeroupper
@ -3175,12 +3206,12 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testw %ax, %ax
; KNL-NEXT: jle LBB64_1
; KNL-NEXT: jle LBB65_1
; KNL-NEXT: ## %bb.2: ## %bb.2
; KNL-NEXT: popq %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB64_1: ## %bb.1
; KNL-NEXT: LBB65_1: ## %bb.1
; KNL-NEXT: vzeroupper
; KNL-NEXT: callq _foo
; KNL-NEXT: popq %rax
@ -3194,12 +3225,12 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testw %ax, %ax
; SKX-NEXT: jle LBB64_1
; SKX-NEXT: jle LBB65_1
; SKX-NEXT: ## %bb.2: ## %bb.2
; SKX-NEXT: popq %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB64_1: ## %bb.1
; SKX-NEXT: LBB65_1: ## %bb.1
; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _foo
; SKX-NEXT: popq %rax
@ -3213,12 +3244,12 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testw %ax, %ax
; AVX512BW-NEXT: jle LBB64_1
; AVX512BW-NEXT: jle LBB65_1
; AVX512BW-NEXT: ## %bb.2: ## %bb.2
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB64_1: ## %bb.1
; AVX512BW-NEXT: LBB65_1: ## %bb.1
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: callq _foo
; AVX512BW-NEXT: popq %rax
@ -3232,12 +3263,12 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: testw %ax, %ax
; AVX512DQ-NEXT: jle LBB64_1
; AVX512DQ-NEXT: jle LBB65_1
; AVX512DQ-NEXT: ## %bb.2: ## %bb.2
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB64_1: ## %bb.1
; AVX512DQ-NEXT: LBB65_1: ## %bb.1
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: callq _foo
; AVX512DQ-NEXT: popq %rax
@ -3251,12 +3282,12 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0
; X86-NEXT: kmovd %k0, %eax
; X86-NEXT: testw %ax, %ax
; X86-NEXT: jle LBB64_1
; X86-NEXT: jle LBB65_1
; X86-NEXT: ## %bb.2: ## %bb.2
; X86-NEXT: addl $12, %esp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB64_1: ## %bb.1
; X86-NEXT: LBB65_1: ## %bb.1
; X86-NEXT: vzeroupper
; X86-NEXT: calll _foo
; X86-NEXT: addl $12, %esp
@ -3284,11 +3315,11 @@ define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) {
; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0
; CHECK-NEXT: kortestw %k0, %k0
; CHECK-NEXT: jb LBB65_2
; CHECK-NEXT: jb LBB66_2
; CHECK-NEXT: ## %bb.1: ## %bb.1
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _foo
; CHECK-NEXT: LBB65_2: ## %bb.2
; CHECK-NEXT: LBB66_2: ## %bb.2
; CHECK-NEXT: popq %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@ -3300,11 +3331,11 @@ define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) {
; X86-NEXT: vpord %zmm1, %zmm0, %zmm0
; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0
; X86-NEXT: kortestw %k0, %k0
; X86-NEXT: jb LBB65_2
; X86-NEXT: jb LBB66_2
; X86-NEXT: ## %bb.1: ## %bb.1
; X86-NEXT: vzeroupper
; X86-NEXT: calll _foo
; X86-NEXT: LBB65_2: ## %bb.2
; X86-NEXT: LBB66_2: ## %bb.2
; X86-NEXT: addl $12, %esp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
@ -3492,12 +3523,12 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
; KNL-NEXT: je LBB71_1
; KNL-NEXT: je LBB72_1
; KNL-NEXT: ## %bb.2: ## %exit
; KNL-NEXT: popq %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB71_1: ## %bar
; KNL-NEXT: LBB72_1: ## %bar
; KNL-NEXT: vzeroupper
; KNL-NEXT: callq _foo
; KNL-NEXT: popq %rax
@ -3514,12 +3545,12 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k2
; SKX-NEXT: korb %k2, %k1, %k1
; SKX-NEXT: ktestb %k1, %k0
; SKX-NEXT: je LBB71_1
; SKX-NEXT: je LBB72_1
; SKX-NEXT: ## %bb.2: ## %exit
; SKX-NEXT: popq %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB71_1: ## %bar
; SKX-NEXT: LBB72_1: ## %bar
; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _foo
; SKX-NEXT: popq %rax
@ -3542,12 +3573,12 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb %al, %al
; AVX512BW-NEXT: je LBB71_1
; AVX512BW-NEXT: je LBB72_1
; AVX512BW-NEXT: ## %bb.2: ## %exit
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB71_1: ## %bar
; AVX512BW-NEXT: LBB72_1: ## %bar
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: callq _foo
; AVX512BW-NEXT: popq %rax
@ -3568,12 +3599,12 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX512DQ-NEXT: korb %k1, %k0, %k0
; AVX512DQ-NEXT: korb %k3, %k2, %k1
; AVX512DQ-NEXT: ktestb %k1, %k0
; AVX512DQ-NEXT: je LBB71_1
; AVX512DQ-NEXT: je LBB72_1
; AVX512DQ-NEXT: ## %bb.2: ## %exit
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB71_1: ## %bar
; AVX512DQ-NEXT: LBB72_1: ## %bar
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: callq _foo
; AVX512DQ-NEXT: popq %rax
@ -3590,12 +3621,12 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; X86-NEXT: vptestnmd %ymm3, %ymm3, %k2
; X86-NEXT: korb %k2, %k1, %k1
; X86-NEXT: ktestb %k1, %k0
; X86-NEXT: je LBB71_1
; X86-NEXT: je LBB72_1
; X86-NEXT: ## %bb.2: ## %exit
; X86-NEXT: addl $12, %esp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB71_1: ## %bar
; X86-NEXT: LBB72_1: ## %bar
; X86-NEXT: vzeroupper
; X86-NEXT: calll _foo
; X86-NEXT: addl $12, %esp
@ -3633,12 +3664,12 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
; KNL-NEXT: je LBB72_1
; KNL-NEXT: je LBB73_1
; KNL-NEXT: ## %bb.2: ## %exit
; KNL-NEXT: popq %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB72_1: ## %bar
; KNL-NEXT: LBB73_1: ## %bar
; KNL-NEXT: vzeroupper
; KNL-NEXT: callq _foo
; KNL-NEXT: popq %rax
@ -3655,12 +3686,12 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k2
; SKX-NEXT: korb %k2, %k1, %k1
; SKX-NEXT: ktestb %k1, %k0
; SKX-NEXT: je LBB72_1
; SKX-NEXT: je LBB73_1
; SKX-NEXT: ## %bb.2: ## %exit
; SKX-NEXT: popq %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB72_1: ## %bar
; SKX-NEXT: LBB73_1: ## %bar
; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _foo
; SKX-NEXT: popq %rax
@ -3679,12 +3710,12 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb %al, %al
; AVX512BW-NEXT: je LBB72_1
; AVX512BW-NEXT: je LBB73_1
; AVX512BW-NEXT: ## %bb.2: ## %exit
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB72_1: ## %bar
; AVX512BW-NEXT: LBB73_1: ## %bar
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: callq _foo
; AVX512BW-NEXT: popq %rax
@ -3701,12 +3732,12 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; AVX512DQ-NEXT: vptestnmq %zmm3, %zmm3, %k2
; AVX512DQ-NEXT: korb %k2, %k1, %k1
; AVX512DQ-NEXT: ktestb %k1, %k0
; AVX512DQ-NEXT: je LBB72_1
; AVX512DQ-NEXT: je LBB73_1
; AVX512DQ-NEXT: ## %bb.2: ## %exit
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB72_1: ## %bar
; AVX512DQ-NEXT: LBB73_1: ## %bar
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: callq _foo
; AVX512DQ-NEXT: popq %rax
@ -3723,12 +3754,12 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; X86-NEXT: vptestnmq %zmm3, %zmm3, %k2
; X86-NEXT: korb %k2, %k1, %k1
; X86-NEXT: ktestb %k1, %k0
; X86-NEXT: je LBB72_1
; X86-NEXT: je LBB73_1
; X86-NEXT: ## %bb.2: ## %exit
; X86-NEXT: addl $12, %esp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB72_1: ## %bar
; X86-NEXT: LBB73_1: ## %bar
; X86-NEXT: vzeroupper
; X86-NEXT: calll _foo
; X86-NEXT: addl $12, %esp
@ -3765,12 +3796,12 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
; KNL-NEXT: korw %k2, %k1, %k1
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kortestw %k0, %k0
; KNL-NEXT: je LBB73_1
; KNL-NEXT: je LBB74_1
; KNL-NEXT: ## %bb.2: ## %exit
; KNL-NEXT: popq %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB73_1: ## %bar
; KNL-NEXT: LBB74_1: ## %bar
; KNL-NEXT: vzeroupper
; KNL-NEXT: callq _foo
; KNL-NEXT: popq %rax
@ -3787,12 +3818,12 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k2
; SKX-NEXT: korw %k2, %k1, %k1
; SKX-NEXT: ktestw %k1, %k0
; SKX-NEXT: je LBB73_1
; SKX-NEXT: je LBB74_1
; SKX-NEXT: ## %bb.2: ## %exit
; SKX-NEXT: popq %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB73_1: ## %bar
; SKX-NEXT: LBB74_1: ## %bar
; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _foo
; SKX-NEXT: popq %rax
@ -3810,12 +3841,12 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kortestw %k0, %k0
; AVX512BW-NEXT: je LBB73_1
; AVX512BW-NEXT: je LBB74_1
; AVX512BW-NEXT: ## %bb.2: ## %exit
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB73_1: ## %bar
; AVX512BW-NEXT: LBB74_1: ## %bar
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: callq _foo
; AVX512BW-NEXT: popq %rax
@ -3832,12 +3863,12 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
; AVX512DQ-NEXT: vptestnmd %zmm3, %zmm3, %k2
; AVX512DQ-NEXT: korw %k2, %k1, %k1
; AVX512DQ-NEXT: ktestw %k1, %k0
; AVX512DQ-NEXT: je LBB73_1
; AVX512DQ-NEXT: je LBB74_1
; AVX512DQ-NEXT: ## %bb.2: ## %exit
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB73_1: ## %bar
; AVX512DQ-NEXT: LBB74_1: ## %bar
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: callq _foo
; AVX512DQ-NEXT: popq %rax
@ -3854,12 +3885,12 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
; X86-NEXT: vptestnmd %zmm3, %zmm3, %k2
; X86-NEXT: korw %k2, %k1, %k1
; X86-NEXT: ktestw %k1, %k0
; X86-NEXT: je LBB73_1
; X86-NEXT: je LBB74_1
; X86-NEXT: ## %bb.2: ## %exit
; X86-NEXT: addl $12, %esp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB73_1: ## %bar
; X86-NEXT: LBB74_1: ## %bar
; X86-NEXT: vzeroupper
; X86-NEXT: calll _foo
; X86-NEXT: addl $12, %esp
@ -3911,12 +3942,12 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: je LBB74_1
; KNL-NEXT: je LBB75_1
; KNL-NEXT: ## %bb.2: ## %exit
; KNL-NEXT: popq %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB74_1: ## %bar
; KNL-NEXT: LBB75_1: ## %bar
; KNL-NEXT: vzeroupper
; KNL-NEXT: callq _foo
; KNL-NEXT: popq %rax
@ -3933,12 +3964,12 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
; SKX-NEXT: vptestnmw %zmm3, %zmm3, %k2
; SKX-NEXT: kord %k2, %k1, %k1
; SKX-NEXT: ktestd %k1, %k0
; SKX-NEXT: je LBB74_1
; SKX-NEXT: je LBB75_1
; SKX-NEXT: ## %bb.2: ## %exit
; SKX-NEXT: popq %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB74_1: ## %bar
; SKX-NEXT: LBB75_1: ## %bar
; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _foo
; SKX-NEXT: popq %rax
@ -3955,12 +3986,12 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
; AVX512BW-NEXT: vptestnmw %zmm3, %zmm3, %k2
; AVX512BW-NEXT: kord %k2, %k1, %k1
; AVX512BW-NEXT: ktestd %k1, %k0
; AVX512BW-NEXT: je LBB74_1
; AVX512BW-NEXT: je LBB75_1
; AVX512BW-NEXT: ## %bb.2: ## %exit
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB74_1: ## %bar
; AVX512BW-NEXT: LBB75_1: ## %bar
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: callq _foo
; AVX512BW-NEXT: popq %rax
@ -3993,12 +4024,12 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
; AVX512DQ-NEXT: kmovw %k0, %ecx
; AVX512DQ-NEXT: shll $16, %ecx
; AVX512DQ-NEXT: orl %eax, %ecx
; AVX512DQ-NEXT: je LBB74_1
; AVX512DQ-NEXT: je LBB75_1
; AVX512DQ-NEXT: ## %bb.2: ## %exit
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB74_1: ## %bar
; AVX512DQ-NEXT: LBB75_1: ## %bar
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: callq _foo
; AVX512DQ-NEXT: popq %rax
@ -4015,12 +4046,12 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
; X86-NEXT: vptestnmw %zmm3, %zmm3, %k2
; X86-NEXT: kord %k2, %k1, %k1
; X86-NEXT: ktestd %k1, %k0
; X86-NEXT: je LBB74_1
; X86-NEXT: je LBB75_1
; X86-NEXT: ## %bb.2: ## %exit
; X86-NEXT: addl $12, %esp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB74_1: ## %bar
; X86-NEXT: LBB75_1: ## %bar
; X86-NEXT: vzeroupper
; X86-NEXT: calll _foo
; X86-NEXT: addl $12, %esp
@ -4096,12 +4127,12 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
; KNL-NEXT: orl %eax, %edx
; KNL-NEXT: shlq $32, %rdx
; KNL-NEXT: orq %rcx, %rdx
; KNL-NEXT: je LBB75_1
; KNL-NEXT: je LBB76_1
; KNL-NEXT: ## %bb.2: ## %exit
; KNL-NEXT: popq %rax
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB75_1: ## %bar
; KNL-NEXT: LBB76_1: ## %bar
; KNL-NEXT: vzeroupper
; KNL-NEXT: callq _foo
; KNL-NEXT: popq %rax
@ -4118,12 +4149,12 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
; SKX-NEXT: vptestnmb %zmm3, %zmm3, %k2
; SKX-NEXT: korq %k2, %k1, %k1
; SKX-NEXT: ktestq %k1, %k0
; SKX-NEXT: je LBB75_1
; SKX-NEXT: je LBB76_1
; SKX-NEXT: ## %bb.2: ## %exit
; SKX-NEXT: popq %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB75_1: ## %bar
; SKX-NEXT: LBB76_1: ## %bar
; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _foo
; SKX-NEXT: popq %rax
@ -4140,12 +4171,12 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
; AVX512BW-NEXT: vptestnmb %zmm3, %zmm3, %k2
; AVX512BW-NEXT: korq %k2, %k1, %k1
; AVX512BW-NEXT: ktestq %k1, %k0
; AVX512BW-NEXT: je LBB75_1
; AVX512BW-NEXT: je LBB76_1
; AVX512BW-NEXT: ## %bb.2: ## %exit
; AVX512BW-NEXT: popq %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB75_1: ## %bar
; AVX512BW-NEXT: LBB76_1: ## %bar
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: callq _foo
; AVX512BW-NEXT: popq %rax
@ -4202,12 +4233,12 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
; AVX512DQ-NEXT: orl %eax, %edx
; AVX512DQ-NEXT: shlq $32, %rdx
; AVX512DQ-NEXT: orq %rcx, %rdx
; AVX512DQ-NEXT: je LBB75_1
; AVX512DQ-NEXT: je LBB76_1
; AVX512DQ-NEXT: ## %bb.2: ## %exit
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: LBB75_1: ## %bar
; AVX512DQ-NEXT: LBB76_1: ## %bar
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: callq _foo
; AVX512DQ-NEXT: popq %rax
@ -4226,12 +4257,12 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
; X86-NEXT: kandq %k1, %k0, %k0
; X86-NEXT: kshiftrq $32, %k0, %k1
; X86-NEXT: kortestd %k1, %k0
; X86-NEXT: je LBB75_1
; X86-NEXT: je LBB76_1
; X86-NEXT: ## %bb.2: ## %exit
; X86-NEXT: addl $12, %esp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
; X86-NEXT: LBB75_1: ## %bar
; X86-NEXT: LBB76_1: ## %bar
; X86-NEXT: vzeroupper
; X86-NEXT: calll _foo
; X86-NEXT: addl $12, %esp

View File

@ -128,29 +128,17 @@ define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noin
}
define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movb $32, %al
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: knotw %k0, %k1
; AVX512F-NEXT: vmovupd 8(%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_8f64_f64_1u3u5zu8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movb $32, %al
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: knotw %k0, %k1
; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
; ALL: # %bb.0:
; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
; ALL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movb $32, %cl
; X32-AVX512F-NEXT: kmovw %ecx, %k0
; X32-AVX512F-NEXT: knotw %k0, %k1
; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0 {%k1} {z}
; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
; X32-AVX512F-NEXT: vpandq {{\.LCPI.*}}, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds double, double* %ptr, i64 1
%ptr2 = getelementptr inbounds double, double* %ptr, i64 3
@ -219,29 +207,17 @@ define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline s
}
define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movb $32, %al
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: knotw %k0, %k1
; AVX512F-NEXT: vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_8i64_i64_1u3u5zu8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movb $32, %al
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: knotw %k0, %k1
; AVX512BW-NEXT: vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
; ALL: # %bb.0:
; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
; ALL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movb $32, %cl
; X32-AVX512F-NEXT: kmovw %ecx, %k0
; X32-AVX512F-NEXT: knotw %k0, %k1
; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 {%k1} {z}
; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
; X32-AVX512F-NEXT: vpandq {{\.LCPI.*}}, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
%ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
@ -450,29 +426,17 @@ define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable
}
define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movw $8240, %ax # imm = 0x2030
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: knotw %k0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movw $8240, %ax # imm = 0x2030
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: knotw %k0, %k1
; AVX512BW-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; ALL: # %bb.0:
; ALL-NEXT: vmovdqu64 (%rdi), %zmm0
; ALL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movw $8240, %cx # imm = 0x2030
; X32-AVX512F-NEXT: kmovw %ecx, %k0
; X32-AVX512F-NEXT: knotw %k0, %k1
; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0 {%k1} {z}
; X32-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0
; X32-AVX512F-NEXT: vpandd {{\.LCPI.*}}, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
%ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3

View File

@ -203,9 +203,9 @@ define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a
;
; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; SKX: ## %bb.0:
; SKX-NEXT: movl $1, %eax
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: movl $65535, %eax ## imm = 0xFFFF
; SKX-NEXT: vmovd %eax, %xmm1
; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
ret <32 x i16> %shuffle