[x86] form broadcast of scalar memop even with >1 use

The logic change that is not visible in the diff itself (dropping the one-use check on the load) occurs because MayFoldLoad() is defined like this:

static bool MayFoldLoad(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}

The test diffs here all seem ok to me on screen/paper, but it's hard to know
if that will lead to universally better perf for all targets. For example,
if a target implements broadcast from mem as multiple uops, we would have to
weigh the potential reduction of instructions and register pressure vs.
possible increase in number of uops. I don't know if we can make a truly
informed decision on this at compile-time.

The motivating case that I'm looking at in PR42024:
https://bugs.llvm.org/show_bug.cgi?id=42024
...resembles the diff in extract-concat.ll, but we're not going to change the
larger example there without at least 1 other fix.

Differential Revision: https://reviews.llvm.org/D74088
This commit is contained in:
Sanjay Patel 2020-02-16 10:32:56 -05:00
parent 893c630fbe
commit e48b536be6
11 changed files with 329 additions and 306 deletions

View File

@ -12927,7 +12927,12 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// If we can't broadcast from a register, check that the input is a load. // If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue(); return SDValue();
} else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) { } else if (ISD::isNormalLoad(V.getNode()) &&
cast<LoadSDNode>(V)->isSimple()) {
// We do not check for one-use of the vector load because a broadcast load
// is expected to be a win for code size, register pressure, and possibly
// uops even if the original vector load is not eliminated.
// 32-bit targets need to load i64 as a f64 and then bitcast the result. // 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
@ -12936,8 +12941,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
: Opcode; : Opcode;
} }
// If we are broadcasting a load that is only used by the shuffle // Reduce the vector load and shuffle to a broadcasted scalar load.
// then we can reduce the vector load to the broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(V); LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1); SDValue BaseAddr = Ld->getOperand(1);
EVT SVT = BroadcastVT.getScalarType(); EVT SVT = BroadcastVT.getScalarType();

View File

@ -377,66 +377,60 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
; ;
; AVX2-LABEL: avg_v48i8: ; AVX2-LABEL: avg_v48i8:
; AVX2: # %bb.0: ; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero ; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm2
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX2-NEXT: vmovdqa (%rsi), %xmm6
; AVX2-NEXT: vmovdqa 16(%rsi), %xmm7
; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm5, %ymm3, %ymm3
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm5, %ymm9, %ymm5
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm4
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm6
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm6, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm6
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm6, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastq 40(%rsi), %xmm6
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpaddd %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2 ; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 ; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm4[2,3] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm2[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-NEXT: vpackusdw %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vpackusdw %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vpackuswb %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpackuswb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm5[2,3] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm4[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3
; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqu %xmm1, (%rax) ; AVX2-NEXT: vmovdqu %xmm1, (%rax)

View File

@ -1730,15 +1730,15 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp,
define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) { define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0: ; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm0
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; CHECK-NEXT: vmovd %xmm1, %eax
; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpextrd $3, %xmm1, %eax ; CHECK-NEXT: vextractps $3, %xmm2, %eax
; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm1 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; CHECK-NEXT: vpextrd $2, %xmm0, %eax ; CHECK-NEXT: vpextrd $2, %xmm1, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq ; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp %vec = load <16 x i32>, <16 x i32>* %vp
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
@ -1747,15 +1747,15 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0: ; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm2 ; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm2
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3 ; CHECK-NEXT: vmovdqa (%rdi), %xmm3
; CHECK-NEXT: vmovd %xmm2, %eax ; CHECK-NEXT: vmovaps 16(%rdi), %xmm4
; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] ; CHECK-NEXT: vmovd %xmm3, %eax
; CHECK-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; CHECK-NEXT: vpextrd $3, %xmm3, %eax ; CHECK-NEXT: vextractps $3, %xmm4, %eax
; CHECK-NEXT: vpinsrd $2, %eax, %xmm4, %xmm3 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; CHECK-NEXT: vpextrd $2, %xmm2, %eax ; CHECK-NEXT: vpextrd $2, %xmm3, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq ; CHECK-NEXT: retq
@ -1769,15 +1769,15 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) { define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
; CHECK: # %bb.0: ; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm1
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
; CHECK-NEXT: vmovd %xmm1, %eax ; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] ; CHECK-NEXT: vmovd %xmm2, %eax
; CHECK-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
; CHECK-NEXT: vpextrd $3, %xmm2, %eax ; CHECK-NEXT: vextractps $3, %xmm3, %eax
; CHECK-NEXT: vpinsrd $2, %eax, %xmm3, %xmm2 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
; CHECK-NEXT: vpextrd $2, %xmm1, %eax ; CHECK-NEXT: vpextrd $2, %xmm2, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq ; CHECK-NEXT: retq
@ -4205,12 +4205,11 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x doub
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) { define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5: ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
; CHECK: # %bb.0: ; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [6,1,1,1] ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[0,1]
; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 ; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
; CHECK-NEXT: retq ; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp %vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5> %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
@ -4222,12 +4221,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) { define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5: ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
; CHECK: # %bb.0: ; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [6,1,1,1] ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[0,1]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq ; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp %vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5> %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>

View File

@ -126,11 +126,10 @@ define <16 x i64> @load_catcat(<4 x i64>* %p) {
; ;
; AVX2-LABEL: load_catcat: ; AVX2-LABEL: load_catcat:
; AVX2: # %bb.0: ; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm3
; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 ; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[1,1,1,1] ; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] ; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3
; AVX2-NEXT: retq ; AVX2-NEXT: retq
; ;
; AVX512F-LABEL: load_catcat: ; AVX512F-LABEL: load_catcat:

View File

@ -334,12 +334,12 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT: movups (%ecx), %xmm0 ; X86-SSE4A-NEXT: movups (%ecx), %xmm0
; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1 ; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; X86-SSE4A-NEXT: movntsd %xmm2, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) ; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax) ; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax)
; X86-SSE4A-NEXT: retl ; X86-SSE4A-NEXT: retl
; ;
; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore: ; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
@ -362,12 +362,12 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
; X64-SSE4A: # %bb.0: ; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movups (%rdi), %xmm0 ; X64-SSE4A-NEXT: movups (%rdi), %xmm0
; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1 ; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; X64-SSE4A-NEXT: movntsd %xmm2, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) ; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi) ; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi)
; X64-SSE4A-NEXT: retq ; X64-SSE4A-NEXT: retq
; ;
; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore: ; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
@ -447,12 +447,12 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT: movups (%ecx), %xmm0 ; X86-SSE4A-NEXT: movups (%ecx), %xmm0
; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1 ; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; X86-SSE4A-NEXT: movntsd %xmm2, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) ; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax)
; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax) ; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax)
; X86-SSE4A-NEXT: retl ; X86-SSE4A-NEXT: retl
; ;
; X64-SSE2-LABEL: merge_2_v4f32_align1: ; X64-SSE2-LABEL: merge_2_v4f32_align1:
@ -475,12 +475,12 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X64-SSE4A: # %bb.0: ; X64-SSE4A: # %bb.0:
; X64-SSE4A-NEXT: movups (%rdi), %xmm0 ; X64-SSE4A-NEXT: movups (%rdi), %xmm0
; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1 ; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; X64-SSE4A-NEXT: movntsd %xmm2, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) ; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi)
; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi) ; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi)
; X64-SSE4A-NEXT: retq ; X64-SSE4A-NEXT: retq
; ;
; X64-SSE41-LABEL: merge_2_v4f32_align1: ; X64-SSE41-LABEL: merge_2_v4f32_align1:

View File

@ -1510,35 +1510,32 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; ;
; AVX1-LABEL: interleave_24i32_in: ; AVX1-LABEL: interleave_24i32_in:
; AVX1: # %bb.0: ; AVX1: # %bb.0:
; AVX1-NEXT: vmovupd (%rsi), %ymm0 ; AVX1-NEXT: vmovups (%rdx), %xmm0
; AVX1-NEXT: vmovups (%rdx), %xmm1 ; AVX1-NEXT: vmovups 16(%rdx), %xmm1
; AVX1-NEXT: vmovups 16(%rdx), %xmm2 ; AVX1-NEXT: vmovups (%rsi), %xmm2
; AVX1-NEXT: vmovups (%rsi), %xmm3 ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm0[2,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm1[2,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: vbroadcastsd (%rcx), %ymm2
; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[0,1,0,1] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-NEXT: vmovups 16(%rcx), %xmm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,0],xmm2[3,0]
; AVX1-NEXT: vmovups 16(%rcx), %xmm3 ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,1],xmm3[0,2]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,2]
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2] ; AVX1-NEXT: vbroadcastsd 24(%rsi), %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = mem[1,0,2,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7] ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX1-NEXT: vmovups %ymm0, 32(%rdi) ; AVX1-NEXT: vmovups %ymm2, 32(%rdi)
; AVX1-NEXT: vmovups %ymm2, 64(%rdi) ; AVX1-NEXT: vmovups %ymm1, 64(%rdi)
; AVX1-NEXT: vmovups %ymm1, (%rdi) ; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper ; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq ; AVX1-NEXT: retq
; ;
@ -1557,7 +1554,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
@ -1586,7 +1583,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3] ; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
@ -1601,34 +1598,32 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; ;
; XOP-LABEL: interleave_24i32_in: ; XOP-LABEL: interleave_24i32_in:
; XOP: # %bb.0: ; XOP: # %bb.0:
; XOP-NEXT: vmovupd (%rsi), %ymm0 ; XOP-NEXT: vmovups (%rsi), %ymm0
; XOP-NEXT: vmovups (%rcx), %ymm1 ; XOP-NEXT: vmovups (%rcx), %ymm1
; XOP-NEXT: vmovups (%rdx), %xmm2
; XOP-NEXT: vmovups 16(%rdx), %xmm3
; XOP-NEXT: vmovups (%rsi), %xmm4
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0]
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0]
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; XOP-NEXT: vpermilps {{.*#+}} xmm4 = mem[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
; XOP-NEXT: vmovups 16(%rcx), %xmm4
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0]
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0]
; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2]
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5] ; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5]
; XOP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; XOP-NEXT: vmovups (%rdx), %xmm1
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; XOP-NEXT: vmovups 16(%rdx), %xmm2
; XOP-NEXT: vmovups (%rsi), %xmm3
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm1[2,0]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0]
; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; XOP-NEXT: vbroadcastsd (%rcx), %ymm3
; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
; XOP-NEXT: vmovups 16(%rcx), %xmm3
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0]
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2]
; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0]
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; XOP-NEXT: vbroadcastsd 24(%rsi), %ymm3
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; XOP-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
; XOP-NEXT: vmovups %ymm0, 32(%rdi) ; XOP-NEXT: vmovups %ymm0, 32(%rdi)
; XOP-NEXT: vmovups %ymm3, 64(%rdi) ; XOP-NEXT: vmovups %ymm2, 64(%rdi)
; XOP-NEXT: vmovups %ymm2, (%rdi) ; XOP-NEXT: vmovups %ymm1, (%rdi)
; XOP-NEXT: vzeroupper ; XOP-NEXT: vzeroupper
; XOP-NEXT: retq ; XOP-NEXT: retq
%s1 = load <8 x i32>, <8 x i32>* %q1, align 4 %s1 = load <8 x i32>, <8 x i32>* %q1, align 4

View File

@ -16,25 +16,23 @@ define void @pr34653() {
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; CHECK-NEXT: callq test ; CHECK-NEXT: callq test
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %xmm0, %xmm1 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm1
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm2 ; CHECK-NEXT: vmovaps %xmm1, %xmm2
; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm3
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm4 ; CHECK-NEXT: vmovaps %xmm3, %xmm4
; CHECK-NEXT: vmovaps %xmm4, %xmm5 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm5
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm6 ; CHECK-NEXT: vmovaps %xmm5, %xmm6
; CHECK-NEXT: vmovaps %xmm6, %xmm7 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm7
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm8 ; CHECK-NEXT: vmovaps %xmm7, %xmm8
; CHECK-NEXT: vmovaps %xmm8, %xmm9 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm9
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm10 ; CHECK-NEXT: vmovaps %xmm9, %xmm10
; CHECK-NEXT: vmovaps %xmm10, %xmm11 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm11
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm12 ; CHECK-NEXT: vmovaps %xmm11, %xmm12
; CHECK-NEXT: vmovaps %xmm12, %xmm13 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm13
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm14 ; CHECK-NEXT: vmovaps %xmm13, %xmm14
; CHECK-NEXT: vmovaps %xmm14, %xmm15 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm15
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovaps %zmm15, %zmm16
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps %zmm0, %zmm16
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovaps %zmm0, %zmm17 ; CHECK-NEXT: vmovaps %zmm0, %zmm17
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@ -67,16 +65,14 @@ define void @pr34653() {
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] ; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] ; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] ; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] ; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] ; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm12 = xmm12[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm14 = xmm14[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@ -107,11 +103,7 @@ define void @pr34653() {
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd {{.*#+}} xmm24 = mem[0],zero
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 8-byte Reload
; CHECK-NEXT: # xmm24 = mem[0],zero
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 8-byte Reload ; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 8-byte Reload
; CHECK-NEXT: # xmm25 = mem[0],zero ; CHECK-NEXT: # xmm25 = mem[0],zero
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 8-byte Reload ; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 8-byte Reload

View File

@ -60,15 +60,13 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: movl %esp, %ebp
; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: andl $-16, %esp
; AVX-32-NEXT: subl $16, %esp ; AVX-32-NEXT: subl $16, %esp
; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 ; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
; AVX-32-NEXT: xorl %eax, %eax ; AVX-32-NEXT: xorl %eax, %eax
; AVX-32-NEXT: vcomiss %xmm4, %xmm5 ; AVX-32-NEXT: vcomiss 12(%ebp), %xmm3
; AVX-32-NEXT: movl $-1, %ecx ; AVX-32-NEXT: movl $-1, %ecx
; AVX-32-NEXT: movl $0, %edx ; AVX-32-NEXT: movl $0, %edx
; AVX-32-NEXT: cmoval %ecx, %edx ; AVX-32-NEXT: cmoval %ecx, %edx
; AVX-32-NEXT: vcomiss %xmm3, %xmm2 ; AVX-32-NEXT: vcomiss 8(%ebp), %xmm2
; AVX-32-NEXT: cmoval %ecx, %eax ; AVX-32-NEXT: cmoval %ecx, %eax
; AVX-32-NEXT: vmovd %eax, %xmm2 ; AVX-32-NEXT: vmovd %eax, %xmm2
; AVX-32-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 ; AVX-32-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2
@ -99,17 +97,15 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
; AVX512-32-NEXT: movl %esp, %ebp ; AVX512-32-NEXT: movl %esp, %ebp
; AVX512-32-NEXT: andl $-16, %esp ; AVX512-32-NEXT: andl $-16, %esp
; AVX512-32-NEXT: subl $16, %esp ; AVX512-32-NEXT: subl $16, %esp
; AVX512-32-NEXT: vmovaps 8(%ebp), %xmm3
; AVX512-32-NEXT: movw $-3, %ax ; AVX512-32-NEXT: movw $-3, %ax
; AVX512-32-NEXT: kmovw %eax, %k0 ; AVX512-32-NEXT: kmovw %eax, %k0
; AVX512-32-NEXT: vcomiss %xmm3, %xmm2 ; AVX512-32-NEXT: vcomiss 8(%ebp), %xmm2
; AVX512-32-NEXT: seta %al ; AVX512-32-NEXT: seta %al
; AVX512-32-NEXT: andl $1, %eax ; AVX512-32-NEXT: andl $1, %eax
; AVX512-32-NEXT: kmovw %eax, %k1 ; AVX512-32-NEXT: kmovw %eax, %k1
; AVX512-32-NEXT: kandw %k0, %k1, %k0 ; AVX512-32-NEXT: kandw %k0, %k1, %k0
; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX512-32-NEXT: vcomiss %xmm3, %xmm2 ; AVX512-32-NEXT: vcomiss 12(%ebp), %xmm2
; AVX512-32-NEXT: seta %al ; AVX512-32-NEXT: seta %al
; AVX512-32-NEXT: kmovw %eax, %k1 ; AVX512-32-NEXT: kmovw %eax, %k1
; AVX512-32-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-32-NEXT: kshiftlw $15, %k1, %k1
@ -148,17 +144,15 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: subl $16, %esp
; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3
; AVX512F-32-NEXT: movw $-3, %ax ; AVX512F-32-NEXT: movw $-3, %ax
; AVX512F-32-NEXT: kmovw %eax, %k0 ; AVX512F-32-NEXT: kmovw %eax, %k0
; AVX512F-32-NEXT: vcomiss %xmm3, %xmm2 ; AVX512F-32-NEXT: vcomiss 8(%ebp), %xmm2
; AVX512F-32-NEXT: seta %al ; AVX512F-32-NEXT: seta %al
; AVX512F-32-NEXT: andl $1, %eax ; AVX512F-32-NEXT: andl $1, %eax
; AVX512F-32-NEXT: kmovw %eax, %k1 ; AVX512F-32-NEXT: kmovw %eax, %k1
; AVX512F-32-NEXT: kandw %k0, %k1, %k0 ; AVX512F-32-NEXT: kandw %k0, %k1, %k0
; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX512F-32-NEXT: vcomiss %xmm3, %xmm2 ; AVX512F-32-NEXT: vcomiss 12(%ebp), %xmm2
; AVX512F-32-NEXT: seta %al ; AVX512F-32-NEXT: seta %al
; AVX512F-32-NEXT: kmovw %eax, %k1 ; AVX512F-32-NEXT: kmovw %eax, %k1
; AVX512F-32-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-32-NEXT: kshiftlw $15, %k1, %k1
@ -257,16 +251,14 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: movl %esp, %ebp
; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: andl $-16, %esp
; AVX-32-NEXT: subl $16, %esp ; AVX-32-NEXT: subl $16, %esp
; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 ; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
; AVX-32-NEXT: xorl %eax, %eax ; AVX-32-NEXT: xorl %eax, %eax
; AVX-32-NEXT: vucomiss %xmm4, %xmm5 ; AVX-32-NEXT: vucomiss 12(%ebp), %xmm3
; AVX-32-NEXT: movl $-1, %ecx ; AVX-32-NEXT: movl $-1, %ecx
; AVX-32-NEXT: movl $-1, %edx ; AVX-32-NEXT: movl $-1, %edx
; AVX-32-NEXT: cmovnel %eax, %edx ; AVX-32-NEXT: cmovnel %eax, %edx
; AVX-32-NEXT: cmovpl %eax, %edx ; AVX-32-NEXT: cmovpl %eax, %edx
; AVX-32-NEXT: vucomiss %xmm3, %xmm2 ; AVX-32-NEXT: vucomiss 8(%ebp), %xmm2
; AVX-32-NEXT: cmovnel %eax, %ecx ; AVX-32-NEXT: cmovnel %eax, %ecx
; AVX-32-NEXT: cmovpl %eax, %ecx ; AVX-32-NEXT: cmovpl %eax, %ecx
; AVX-32-NEXT: vmovd %ecx, %xmm2 ; AVX-32-NEXT: vmovd %ecx, %xmm2
@ -300,28 +292,26 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
; AVX512-32-NEXT: movl %esp, %ebp ; AVX512-32-NEXT: movl %esp, %ebp
; AVX512-32-NEXT: andl $-16, %esp ; AVX512-32-NEXT: andl $-16, %esp
; AVX512-32-NEXT: subl $16, %esp ; AVX512-32-NEXT: subl $16, %esp
; AVX512-32-NEXT: vmovaps 8(%ebp), %xmm3 ; AVX512-32-NEXT: movw $-3, %ax
; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
; AVX512-32-NEXT: vucomiss %xmm4, %xmm5
; AVX512-32-NEXT: setnp %al
; AVX512-32-NEXT: sete %cl
; AVX512-32-NEXT: testb %al, %cl
; AVX512-32-NEXT: setne %al
; AVX512-32-NEXT: kmovw %eax, %k0 ; AVX512-32-NEXT: kmovw %eax, %k0
; AVX512-32-NEXT: kshiftlw $15, %k0, %k0 ; AVX512-32-NEXT: vucomiss 8(%ebp), %xmm2
; AVX512-32-NEXT: kshiftrw $14, %k0, %k0
; AVX512-32-NEXT: vucomiss %xmm3, %xmm2
; AVX512-32-NEXT: setnp %al ; AVX512-32-NEXT: setnp %al
; AVX512-32-NEXT: sete %cl ; AVX512-32-NEXT: sete %cl
; AVX512-32-NEXT: testb %al, %cl ; AVX512-32-NEXT: testb %al, %cl
; AVX512-32-NEXT: setne %al ; AVX512-32-NEXT: setne %al
; AVX512-32-NEXT: andl $1, %eax ; AVX512-32-NEXT: andl $1, %eax
; AVX512-32-NEXT: kmovw %eax, %k1 ; AVX512-32-NEXT: kmovw %eax, %k1
; AVX512-32-NEXT: movw $-3, %ax ; AVX512-32-NEXT: kandw %k0, %k1, %k0
; AVX512-32-NEXT: kmovw %eax, %k2 ; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX512-32-NEXT: kandw %k2, %k1, %k1 ; AVX512-32-NEXT: vucomiss 12(%ebp), %xmm2
; AVX512-32-NEXT: korw %k0, %k1, %k1 ; AVX512-32-NEXT: setnp %al
; AVX512-32-NEXT: sete %cl
; AVX512-32-NEXT: testb %al, %cl
; AVX512-32-NEXT: setne %al
; AVX512-32-NEXT: kmovw %eax, %k1
; AVX512-32-NEXT: kshiftlw $15, %k1, %k1
; AVX512-32-NEXT: kshiftrw $14, %k1, %k1
; AVX512-32-NEXT: korw %k1, %k0, %k1
; AVX512-32-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; AVX512-32-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; AVX512-32-NEXT: movl %ebp, %esp ; AVX512-32-NEXT: movl %ebp, %esp
; AVX512-32-NEXT: popl %ebp ; AVX512-32-NEXT: popl %ebp
@ -361,28 +351,26 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: subl $16, %esp
; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 ; AVX512F-32-NEXT: movw $-3, %ax
; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
; AVX512F-32-NEXT: vucomiss %xmm4, %xmm5
; AVX512F-32-NEXT: setnp %al
; AVX512F-32-NEXT: sete %cl
; AVX512F-32-NEXT: testb %al, %cl
; AVX512F-32-NEXT: setne %al
; AVX512F-32-NEXT: kmovw %eax, %k0 ; AVX512F-32-NEXT: kmovw %eax, %k0
; AVX512F-32-NEXT: kshiftlw $15, %k0, %k0 ; AVX512F-32-NEXT: vucomiss 8(%ebp), %xmm2
; AVX512F-32-NEXT: kshiftrw $14, %k0, %k0
; AVX512F-32-NEXT: vucomiss %xmm3, %xmm2
; AVX512F-32-NEXT: setnp %al ; AVX512F-32-NEXT: setnp %al
; AVX512F-32-NEXT: sete %cl ; AVX512F-32-NEXT: sete %cl
; AVX512F-32-NEXT: testb %al, %cl ; AVX512F-32-NEXT: testb %al, %cl
; AVX512F-32-NEXT: setne %al ; AVX512F-32-NEXT: setne %al
; AVX512F-32-NEXT: andl $1, %eax ; AVX512F-32-NEXT: andl $1, %eax
; AVX512F-32-NEXT: kmovw %eax, %k1 ; AVX512F-32-NEXT: kmovw %eax, %k1
; AVX512F-32-NEXT: movw $-3, %ax ; AVX512F-32-NEXT: kandw %k0, %k1, %k0
; AVX512F-32-NEXT: kmovw %eax, %k2 ; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX512F-32-NEXT: kandw %k2, %k1, %k1 ; AVX512F-32-NEXT: vucomiss 12(%ebp), %xmm2
; AVX512F-32-NEXT: korw %k0, %k1, %k1 ; AVX512F-32-NEXT: setnp %al
; AVX512F-32-NEXT: sete %cl
; AVX512F-32-NEXT: testb %al, %cl
; AVX512F-32-NEXT: setne %al
; AVX512F-32-NEXT: kmovw %eax, %k1
; AVX512F-32-NEXT: kshiftlw $15, %k1, %k1
; AVX512F-32-NEXT: kshiftrw $14, %k1, %k1
; AVX512F-32-NEXT: korw %k1, %k0, %k1
; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-32-NEXT: movl %ebp, %esp ; AVX512F-32-NEXT: movl %ebp, %esp

View File

@ -1106,34 +1106,61 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
} }
define double @test_v16f64(double %a0, <16 x double> %a1) { define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64: ; SSE2-LABEL: test_v16f64:
; SSE: # %bb.0: ; SSE2: # %bb.0:
; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 ; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE2-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm1, %xmm0 ; SSE2-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm2, %xmm0 ; SSE2-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: addsd %xmm2, %xmm0 ; SSE2-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm0 ; SSE2-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: addsd %xmm3, %xmm0 ; SSE2-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: addsd %xmm4, %xmm0 ; SSE2-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: addsd %xmm4, %xmm0 ; SSE2-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: addsd %xmm5, %xmm0 ; SSE2-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT: addsd %xmm5, %xmm0 ; SSE2-NEXT: addsd %xmm5, %xmm0
; SSE-NEXT: addsd %xmm6, %xmm0 ; SSE2-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT: addsd %xmm6, %xmm0 ; SSE2-NEXT: addsd %xmm6, %xmm0
; SSE-NEXT: addsd %xmm7, %xmm0 ; SSE2-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT: addsd %xmm7, %xmm0 ; SSE2-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: addsd %xmm8, %xmm0 ; SSE2-NEXT: addsd %xmm8, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
; SSE-NEXT: addsd %xmm8, %xmm0 ; SSE2-NEXT: addsd %xmm8, %xmm0
; SSE-NEXT: retq ; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f64:
; SSE41: # %bb.0:
; SSE41-NEXT: addsd %xmm1, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT: addsd %xmm1, %xmm0
; SSE41-NEXT: addsd %xmm2, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT: addsd %xmm2, %xmm0
; SSE41-NEXT: addsd %xmm3, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE41-NEXT: addsd %xmm3, %xmm0
; SSE41-NEXT: addsd %xmm4, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE41-NEXT: addsd %xmm4, %xmm0
; SSE41-NEXT: addsd %xmm5, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE41-NEXT: addsd %xmm5, %xmm0
; SSE41-NEXT: addsd %xmm6, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE41-NEXT: addsd %xmm6, %xmm0
; SSE41-NEXT: addsd %xmm7, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE41-NEXT: addsd %xmm7, %xmm0
; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: retq
; ;
; AVX-LABEL: test_v16f64: ; AVX-LABEL: test_v16f64:
; AVX: # %bb.0: ; AVX: # %bb.0:

View File

@ -1075,34 +1075,61 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
} }
define double @test_v16f64(double %a0, <16 x double> %a1) { define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64: ; SSE2-LABEL: test_v16f64:
; SSE: # %bb.0: ; SSE2: # %bb.0:
; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 ; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE2-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE2-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE2-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE2-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: mulsd %xmm3, %xmm0 ; SSE2-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: mulsd %xmm3, %xmm0 ; SSE2-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: mulsd %xmm4, %xmm0 ; SSE2-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: mulsd %xmm4, %xmm0 ; SSE2-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: mulsd %xmm5, %xmm0 ; SSE2-NEXT: mulsd %xmm5, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT: mulsd %xmm5, %xmm0 ; SSE2-NEXT: mulsd %xmm5, %xmm0
; SSE-NEXT: mulsd %xmm6, %xmm0 ; SSE2-NEXT: mulsd %xmm6, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT: mulsd %xmm6, %xmm0 ; SSE2-NEXT: mulsd %xmm6, %xmm0
; SSE-NEXT: mulsd %xmm7, %xmm0 ; SSE2-NEXT: mulsd %xmm7, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT: mulsd %xmm7, %xmm0 ; SSE2-NEXT: mulsd %xmm7, %xmm0
; SSE-NEXT: mulsd %xmm8, %xmm0 ; SSE2-NEXT: mulsd %xmm8, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] ; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
; SSE-NEXT: mulsd %xmm8, %xmm0 ; SSE2-NEXT: mulsd %xmm8, %xmm0
; SSE-NEXT: retq ; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f64:
; SSE41: # %bb.0:
; SSE41-NEXT: mulsd %xmm1, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT: mulsd %xmm1, %xmm0
; SSE41-NEXT: mulsd %xmm2, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT: mulsd %xmm2, %xmm0
; SSE41-NEXT: mulsd %xmm3, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE41-NEXT: mulsd %xmm3, %xmm0
; SSE41-NEXT: mulsd %xmm4, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE41-NEXT: mulsd %xmm4, %xmm0
; SSE41-NEXT: mulsd %xmm5, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE41-NEXT: mulsd %xmm5, %xmm0
; SSE41-NEXT: mulsd %xmm6, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE41-NEXT: mulsd %xmm6, %xmm0
; SSE41-NEXT: mulsd %xmm7, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE41-NEXT: mulsd %xmm7, %xmm0
; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: retq
; ;
; AVX-LABEL: test_v16f64: ; AVX-LABEL: test_v16f64:
; AVX: # %bb.0: ; AVX: # %bb.0:

View File

@ -3008,11 +3008,10 @@ define void @PR43024() {
; AVX: # %bb.0: ; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; AVX-NEXT: vmovaps %xmm0, (%rax) ; AVX-NEXT: vmovaps %xmm0, (%rax)
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovss %xmm0, (%rax) ; AVX-NEXT: vmovss %xmm0, (%rax)
; AVX-NEXT: retq ; AVX-NEXT: retq
store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16 store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16