forked from OSchip/llvm-project
[x86] form broadcast of scalar memop even with >1 use
The unseen logic diff occurs because MayFoldLoad() is defined like this:

    static bool MayFoldLoad(SDValue Op) {
      return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
    }

The test diffs here all seem ok to me on screen/paper, but it's hard to know if that will lead to universally better perf for all targets. For example, if a target implements broadcast from mem as multiple uops, we would have to weigh the potential reduction of instructions and register pressure vs. possible increase in number of uops. I don't know if we can make a truly informed decision on this at compile-time.

The motivating case that I'm looking at in PR42024:
https://bugs.llvm.org/show_bug.cgi?id=42024
...resembles the diff in extract-concat.ll, but we're not going to change the larger example there without at least 1 other fix.

Differential Revision: https://reviews.llvm.org/D74088
This commit is contained in:
parent
893c630fbe
commit
e48b536be6
|
@ -12927,7 +12927,12 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
|
|||
// If we can't broadcast from a register, check that the input is a load.
|
||||
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
|
||||
return SDValue();
|
||||
} else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) {
|
||||
} else if (ISD::isNormalLoad(V.getNode()) &&
|
||||
cast<LoadSDNode>(V)->isSimple()) {
|
||||
// We do not check for one-use of the vector load because a broadcast load
|
||||
// is expected to be a win for code size, register pressure, and possibly
|
||||
// uops even if the original vector load is not eliminated.
|
||||
|
||||
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
|
||||
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
|
||||
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
|
||||
|
@ -12936,8 +12941,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
|
|||
: Opcode;
|
||||
}
|
||||
|
||||
// If we are broadcasting a load that is only used by the shuffle
|
||||
// then we can reduce the vector load to the broadcasted scalar load.
|
||||
// Reduce the vector load and shuffle to a broadcasted scalar load.
|
||||
LoadSDNode *Ld = cast<LoadSDNode>(V);
|
||||
SDValue BaseAddr = Ld->getOperand(1);
|
||||
EVT SVT = BroadcastVT.getScalarType();
|
||||
|
|
|
@ -377,66 +377,60 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
|
|||
;
|
||||
; AVX2-LABEL: avg_v48i8:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
|
||||
; AVX2-NEXT: vmovdqa (%rsi), %xmm6
|
||||
; AVX2-NEXT: vmovdqa 16(%rsi), %xmm7
|
||||
; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm5, %ymm3, %ymm3
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm5, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm5, %ymm4, %ymm4
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm5, %ymm9, %ymm5
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm2
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm4
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm6
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm6, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm6
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm6, %ymm3, %ymm3
|
||||
; AVX2-NEXT: vpbroadcastq 40(%rsi), %xmm6
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm6, %ymm4, %ymm4
|
||||
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
||||
; AVX2-NEXT: vpaddd %ymm6, %ymm5, %ymm5
|
||||
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
|
||||
; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
|
||||
; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
|
||||
; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
|
||||
; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
|
||||
; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
|
||||
; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
|
||||
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
|
||||
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
|
||||
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
|
||||
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm4[2,3]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpackusdw %ymm6, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
|
||||
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm2[2,3]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
|
||||
; AVX2-NEXT: vpackusdw %ymm6, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: vpackuswb %ymm0, %ymm4, %ymm0
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm5[2,3]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
|
||||
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
|
||||
; AVX2-NEXT: vpackuswb %ymm0, %ymm3, %ymm0
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm4[2,3]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3
|
||||
; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2
|
||||
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
||||
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vmovdqu %xmm1, (%rax)
|
||||
|
|
|
@ -1730,15 +1730,15 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp,
|
|||
define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
|
||||
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; CHECK-NEXT: vmovd %xmm0, %eax
|
||||
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
|
||||
; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpextrd $3, %xmm1, %eax
|
||||
; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm1
|
||||
; CHECK-NEXT: vpextrd $2, %xmm0, %eax
|
||||
; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
|
||||
; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm0
|
||||
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
|
||||
; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
|
||||
; CHECK-NEXT: vmovd %xmm1, %eax
|
||||
; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vextractps $3, %xmm2, %eax
|
||||
; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vpextrd $2, %xmm1, %eax
|
||||
; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <16 x i32>, <16 x i32>* %vp
|
||||
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
|
||||
|
@ -1747,15 +1747,15 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
|
|||
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
|
||||
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
|
||||
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
|
||||
; CHECK-NEXT: vmovd %xmm2, %eax
|
||||
; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
|
||||
; CHECK-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
|
||||
; CHECK-NEXT: vpextrd $3, %xmm3, %eax
|
||||
; CHECK-NEXT: vpinsrd $2, %eax, %xmm4, %xmm3
|
||||
; CHECK-NEXT: vpextrd $2, %xmm2, %eax
|
||||
; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
|
||||
; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm2
|
||||
; CHECK-NEXT: vmovdqa (%rdi), %xmm3
|
||||
; CHECK-NEXT: vmovaps 16(%rdi), %xmm4
|
||||
; CHECK-NEXT: vmovd %xmm3, %eax
|
||||
; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vextractps $3, %xmm4, %eax
|
||||
; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpextrd $2, %xmm3, %eax
|
||||
; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
|
||||
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
|
||||
; CHECK-NEXT: retq
|
||||
|
@ -1769,15 +1769,15 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4
|
|||
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
|
||||
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovdqa (%rdi), %xmm1
|
||||
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
|
||||
; CHECK-NEXT: vmovd %xmm1, %eax
|
||||
; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
|
||||
; CHECK-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
|
||||
; CHECK-NEXT: vpextrd $3, %xmm2, %eax
|
||||
; CHECK-NEXT: vpinsrd $2, %eax, %xmm3, %xmm2
|
||||
; CHECK-NEXT: vpextrd $2, %xmm1, %eax
|
||||
; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
|
||||
; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm1
|
||||
; CHECK-NEXT: vmovdqa (%rdi), %xmm2
|
||||
; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
|
||||
; CHECK-NEXT: vmovd %xmm2, %eax
|
||||
; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vextractps $3, %xmm3, %eax
|
||||
; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vpextrd $2, %xmm2, %eax
|
||||
; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
|
||||
; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
|
||||
; CHECK-NEXT: retq
|
||||
|
@ -4205,12 +4205,11 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x doub
|
|||
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
|
||||
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
|
||||
; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [6,1,1,1]
|
||||
; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
|
||||
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
|
||||
; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
|
||||
; CHECK-NEXT: vmovapd (%rdi), %ymm2
|
||||
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[0,1]
|
||||
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
|
||||
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
|
||||
; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <8 x double>, <8 x double>* %vp
|
||||
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
|
||||
|
@ -4222,12 +4221,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double
|
|||
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
|
||||
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
|
||||
; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [6,1,1,1]
|
||||
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
|
||||
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
|
||||
; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
|
||||
; CHECK-NEXT: vmovapd %ymm1, %ymm0
|
||||
; CHECK-NEXT: vmovapd (%rdi), %ymm1
|
||||
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[0,1]
|
||||
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
|
||||
; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <8 x double>, <8 x double>* %vp
|
||||
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
|
||||
|
|
|
@ -126,11 +126,10 @@ define <16 x i64> @load_catcat(<4 x i64>* %p) {
|
|||
;
|
||||
; AVX2-LABEL: load_catcat:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vmovaps (%rdi), %ymm3
|
||||
; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
|
||||
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[1,1,1,1]
|
||||
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2]
|
||||
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3]
|
||||
; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm1
|
||||
; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm2
|
||||
; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512F-LABEL: load_catcat:
|
||||
|
|
|
@ -334,12 +334,12 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
|
|||
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X86-SSE4A-NEXT: movups (%ecx), %xmm0
|
||||
; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
|
||||
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
|
||||
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
|
||||
; X86-SSE4A-NEXT: movntsd %xmm2, 8(%eax)
|
||||
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
|
||||
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax)
|
||||
; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
|
||||
; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
|
||||
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax)
|
||||
; X86-SSE4A-NEXT: retl
|
||||
;
|
||||
; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
|
||||
|
@ -362,12 +362,12 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
|
|||
; X64-SSE4A: # %bb.0:
|
||||
; X64-SSE4A-NEXT: movups (%rdi), %xmm0
|
||||
; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
|
||||
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
|
||||
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
|
||||
; X64-SSE4A-NEXT: movntsd %xmm2, 8(%rsi)
|
||||
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
|
||||
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi)
|
||||
; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
|
||||
; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
|
||||
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi)
|
||||
; X64-SSE4A-NEXT: retq
|
||||
;
|
||||
; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
|
||||
|
@ -447,12 +447,12 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
|
|||
; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X86-SSE4A-NEXT: movups (%ecx), %xmm0
|
||||
; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
|
||||
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
|
||||
; X86-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
|
||||
; X86-SSE4A-NEXT: movntsd %xmm2, 8(%eax)
|
||||
; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
|
||||
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax)
|
||||
; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax)
|
||||
; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
|
||||
; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax)
|
||||
; X86-SSE4A-NEXT: retl
|
||||
;
|
||||
; X64-SSE2-LABEL: merge_2_v4f32_align1:
|
||||
|
@ -475,12 +475,12 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
|
|||
; X64-SSE4A: # %bb.0:
|
||||
; X64-SSE4A-NEXT: movups (%rdi), %xmm0
|
||||
; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
|
||||
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
|
||||
; X64-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
|
||||
; X64-SSE4A-NEXT: movntsd %xmm2, 8(%rsi)
|
||||
; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
|
||||
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
|
||||
; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi)
|
||||
; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi)
|
||||
; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
|
||||
; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi)
|
||||
; X64-SSE4A-NEXT: retq
|
||||
;
|
||||
; X64-SSE41-LABEL: merge_2_v4f32_align1:
|
||||
|
|
|
@ -1510,35 +1510,32 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
|
|||
;
|
||||
; AVX1-LABEL: interleave_24i32_in:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vmovupd (%rsi), %ymm0
|
||||
; AVX1-NEXT: vmovups (%rdx), %xmm1
|
||||
; AVX1-NEXT: vmovups 16(%rdx), %xmm2
|
||||
; AVX1-NEXT: vmovups (%rsi), %xmm3
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm1[2,0]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[0,1,0,1]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
|
||||
; AVX1-NEXT: vmovups 16(%rcx), %xmm3
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
|
||||
; AVX1-NEXT: vmovups (%rdx), %xmm0
|
||||
; AVX1-NEXT: vmovups 16(%rdx), %xmm1
|
||||
; AVX1-NEXT: vmovups (%rsi), %xmm2
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm0[2,0]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vbroadcastsd (%rcx), %ymm2
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
|
||||
; AVX1-NEXT: vmovups 16(%rcx), %xmm2
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,0],xmm2[3,0]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,1],xmm3[0,2]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,0]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,2]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
|
||||
; AVX1-NEXT: vbroadcastsd 24(%rsi), %ymm2
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = mem[1,0,2,2]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7]
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
|
||||
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
|
||||
; AVX1-NEXT: vmovups %ymm2, 64(%rdi)
|
||||
; AVX1-NEXT: vmovups %ymm1, (%rdi)
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
|
||||
; AVX1-NEXT: vmovups %ymm2, 32(%rdi)
|
||||
; AVX1-NEXT: vmovups %ymm1, 64(%rdi)
|
||||
; AVX1-NEXT: vmovups %ymm0, (%rdi)
|
||||
; AVX1-NEXT: vzeroupper
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
|
@ -1557,7 +1554,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
|
|||
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
|
||||
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
|
||||
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
|
||||
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
|
||||
; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm5
|
||||
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
|
||||
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
|
||||
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
|
||||
|
@ -1586,7 +1583,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
|
|||
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4
|
||||
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3]
|
||||
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
|
||||
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
|
||||
; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm5
|
||||
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
|
||||
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
|
||||
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
|
||||
|
@ -1601,34 +1598,32 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
|
|||
;
|
||||
; XOP-LABEL: interleave_24i32_in:
|
||||
; XOP: # %bb.0:
|
||||
; XOP-NEXT: vmovupd (%rsi), %ymm0
|
||||
; XOP-NEXT: vmovups (%rsi), %ymm0
|
||||
; XOP-NEXT: vmovups (%rcx), %ymm1
|
||||
; XOP-NEXT: vmovups (%rdx), %xmm2
|
||||
; XOP-NEXT: vmovups 16(%rdx), %xmm3
|
||||
; XOP-NEXT: vmovups (%rsi), %xmm4
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
|
||||
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
|
||||
; XOP-NEXT: vpermilps {{.*#+}} xmm4 = mem[0,1,0,1]
|
||||
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
|
||||
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
|
||||
; XOP-NEXT: vmovups 16(%rcx), %xmm4
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2]
|
||||
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
|
||||
; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
|
||||
; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
|
||||
; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
|
||||
; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5]
|
||||
; XOP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
|
||||
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
|
||||
; XOP-NEXT: vmovups (%rdx), %xmm1
|
||||
; XOP-NEXT: vmovups 16(%rdx), %xmm2
|
||||
; XOP-NEXT: vmovups (%rsi), %xmm3
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm1[2,0]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
|
||||
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
|
||||
; XOP-NEXT: vbroadcastsd (%rcx), %ymm3
|
||||
; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
|
||||
; XOP-NEXT: vmovups 16(%rcx), %xmm3
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0]
|
||||
; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2]
|
||||
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
|
||||
; XOP-NEXT: vbroadcastsd 24(%rsi), %ymm3
|
||||
; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
|
||||
; XOP-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
|
||||
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
|
||||
; XOP-NEXT: vmovups %ymm0, 32(%rdi)
|
||||
; XOP-NEXT: vmovups %ymm3, 64(%rdi)
|
||||
; XOP-NEXT: vmovups %ymm2, (%rdi)
|
||||
; XOP-NEXT: vmovups %ymm2, 64(%rdi)
|
||||
; XOP-NEXT: vmovups %ymm1, (%rdi)
|
||||
; XOP-NEXT: vzeroupper
|
||||
; XOP-NEXT: retq
|
||||
%s1 = load <8 x i32>, <8 x i32>* %q1, align 4
|
||||
|
|
|
@ -16,25 +16,23 @@ define void @pr34653() {
|
|||
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
|
||||
; CHECK-NEXT: callq test
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
|
||||
; CHECK-NEXT: vmovaps %xmm0, %xmm1
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm2
|
||||
; CHECK-NEXT: vmovaps %xmm2, %xmm3
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm4
|
||||
; CHECK-NEXT: vmovaps %xmm4, %xmm5
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm6
|
||||
; CHECK-NEXT: vmovaps %xmm6, %xmm7
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm8
|
||||
; CHECK-NEXT: vmovaps %xmm8, %xmm9
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm10
|
||||
; CHECK-NEXT: vmovaps %xmm10, %xmm11
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm12
|
||||
; CHECK-NEXT: vmovaps %xmm12, %xmm13
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm14
|
||||
; CHECK-NEXT: vmovaps %xmm14, %xmm15
|
||||
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
|
||||
; CHECK-NEXT: vmovaps %zmm0, %zmm16
|
||||
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm1
|
||||
; CHECK-NEXT: vmovaps %xmm1, %xmm2
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm3
|
||||
; CHECK-NEXT: vmovaps %xmm3, %xmm4
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm5
|
||||
; CHECK-NEXT: vmovaps %xmm5, %xmm6
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm7
|
||||
; CHECK-NEXT: vmovaps %xmm7, %xmm8
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm9
|
||||
; CHECK-NEXT: vmovaps %xmm9, %xmm10
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm11
|
||||
; CHECK-NEXT: vmovaps %xmm11, %xmm12
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm13
|
||||
; CHECK-NEXT: vmovaps %xmm13, %xmm14
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm15
|
||||
; CHECK-NEXT: vmovaps %zmm15, %zmm16
|
||||
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
|
||||
; CHECK-NEXT: vmovaps %zmm0, %zmm17
|
||||
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
||||
|
@ -67,16 +65,14 @@ define void @pr34653() {
|
|||
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm12 = xmm12[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm14 = xmm14[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
|
||||
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
|
@ -107,11 +103,7 @@ define void @pr34653() {
|
|||
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
||||
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
|
||||
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
|
||||
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 8-byte Reload
|
||||
; CHECK-NEXT: # xmm24 = mem[0],zero
|
||||
; CHECK-NEXT: vmovsd {{.*#+}} xmm24 = mem[0],zero
|
||||
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 8-byte Reload
|
||||
; CHECK-NEXT: # xmm25 = mem[0],zero
|
||||
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 8-byte Reload
|
||||
|
|
|
@ -60,15 +60,13 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
|
|||
; AVX-32-NEXT: movl %esp, %ebp
|
||||
; AVX-32-NEXT: andl $-16, %esp
|
||||
; AVX-32-NEXT: subl $16, %esp
|
||||
; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3
|
||||
; AVX-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
|
||||
; AVX-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
|
||||
; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
|
||||
; AVX-32-NEXT: xorl %eax, %eax
|
||||
; AVX-32-NEXT: vcomiss %xmm4, %xmm5
|
||||
; AVX-32-NEXT: vcomiss 12(%ebp), %xmm3
|
||||
; AVX-32-NEXT: movl $-1, %ecx
|
||||
; AVX-32-NEXT: movl $0, %edx
|
||||
; AVX-32-NEXT: cmoval %ecx, %edx
|
||||
; AVX-32-NEXT: vcomiss %xmm3, %xmm2
|
||||
; AVX-32-NEXT: vcomiss 8(%ebp), %xmm2
|
||||
; AVX-32-NEXT: cmoval %ecx, %eax
|
||||
; AVX-32-NEXT: vmovd %eax, %xmm2
|
||||
; AVX-32-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2
|
||||
|
@ -99,17 +97,15 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
|
|||
; AVX512-32-NEXT: movl %esp, %ebp
|
||||
; AVX512-32-NEXT: andl $-16, %esp
|
||||
; AVX512-32-NEXT: subl $16, %esp
|
||||
; AVX512-32-NEXT: vmovaps 8(%ebp), %xmm3
|
||||
; AVX512-32-NEXT: movw $-3, %ax
|
||||
; AVX512-32-NEXT: kmovw %eax, %k0
|
||||
; AVX512-32-NEXT: vcomiss %xmm3, %xmm2
|
||||
; AVX512-32-NEXT: vcomiss 8(%ebp), %xmm2
|
||||
; AVX512-32-NEXT: seta %al
|
||||
; AVX512-32-NEXT: andl $1, %eax
|
||||
; AVX512-32-NEXT: kmovw %eax, %k1
|
||||
; AVX512-32-NEXT: kandw %k0, %k1, %k0
|
||||
; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
|
||||
; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX512-32-NEXT: vcomiss %xmm3, %xmm2
|
||||
; AVX512-32-NEXT: vcomiss 12(%ebp), %xmm2
|
||||
; AVX512-32-NEXT: seta %al
|
||||
; AVX512-32-NEXT: kmovw %eax, %k1
|
||||
; AVX512-32-NEXT: kshiftlw $15, %k1, %k1
|
||||
|
@ -148,17 +144,15 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
|
|||
; AVX512F-32-NEXT: subl $16, %esp
|
||||
; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
|
||||
; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
|
||||
; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3
|
||||
; AVX512F-32-NEXT: movw $-3, %ax
|
||||
; AVX512F-32-NEXT: kmovw %eax, %k0
|
||||
; AVX512F-32-NEXT: vcomiss %xmm3, %xmm2
|
||||
; AVX512F-32-NEXT: vcomiss 8(%ebp), %xmm2
|
||||
; AVX512F-32-NEXT: seta %al
|
||||
; AVX512F-32-NEXT: andl $1, %eax
|
||||
; AVX512F-32-NEXT: kmovw %eax, %k1
|
||||
; AVX512F-32-NEXT: kandw %k0, %k1, %k0
|
||||
; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
|
||||
; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX512F-32-NEXT: vcomiss %xmm3, %xmm2
|
||||
; AVX512F-32-NEXT: vcomiss 12(%ebp), %xmm2
|
||||
; AVX512F-32-NEXT: seta %al
|
||||
; AVX512F-32-NEXT: kmovw %eax, %k1
|
||||
; AVX512F-32-NEXT: kshiftlw $15, %k1, %k1
|
||||
|
@ -257,16 +251,14 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
|
|||
; AVX-32-NEXT: movl %esp, %ebp
|
||||
; AVX-32-NEXT: andl $-16, %esp
|
||||
; AVX-32-NEXT: subl $16, %esp
|
||||
; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3
|
||||
; AVX-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
|
||||
; AVX-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
|
||||
; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
|
||||
; AVX-32-NEXT: xorl %eax, %eax
|
||||
; AVX-32-NEXT: vucomiss %xmm4, %xmm5
|
||||
; AVX-32-NEXT: vucomiss 12(%ebp), %xmm3
|
||||
; AVX-32-NEXT: movl $-1, %ecx
|
||||
; AVX-32-NEXT: movl $-1, %edx
|
||||
; AVX-32-NEXT: cmovnel %eax, %edx
|
||||
; AVX-32-NEXT: cmovpl %eax, %edx
|
||||
; AVX-32-NEXT: vucomiss %xmm3, %xmm2
|
||||
; AVX-32-NEXT: vucomiss 8(%ebp), %xmm2
|
||||
; AVX-32-NEXT: cmovnel %eax, %ecx
|
||||
; AVX-32-NEXT: cmovpl %eax, %ecx
|
||||
; AVX-32-NEXT: vmovd %ecx, %xmm2
|
||||
|
@ -300,28 +292,26 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
|
|||
; AVX512-32-NEXT: movl %esp, %ebp
|
||||
; AVX512-32-NEXT: andl $-16, %esp
|
||||
; AVX512-32-NEXT: subl $16, %esp
|
||||
; AVX512-32-NEXT: vmovaps 8(%ebp), %xmm3
|
||||
; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
|
||||
; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
|
||||
; AVX512-32-NEXT: vucomiss %xmm4, %xmm5
|
||||
; AVX512-32-NEXT: setnp %al
|
||||
; AVX512-32-NEXT: sete %cl
|
||||
; AVX512-32-NEXT: testb %al, %cl
|
||||
; AVX512-32-NEXT: setne %al
|
||||
; AVX512-32-NEXT: movw $-3, %ax
|
||||
; AVX512-32-NEXT: kmovw %eax, %k0
|
||||
; AVX512-32-NEXT: kshiftlw $15, %k0, %k0
|
||||
; AVX512-32-NEXT: kshiftrw $14, %k0, %k0
|
||||
; AVX512-32-NEXT: vucomiss %xmm3, %xmm2
|
||||
; AVX512-32-NEXT: vucomiss 8(%ebp), %xmm2
|
||||
; AVX512-32-NEXT: setnp %al
|
||||
; AVX512-32-NEXT: sete %cl
|
||||
; AVX512-32-NEXT: testb %al, %cl
|
||||
; AVX512-32-NEXT: setne %al
|
||||
; AVX512-32-NEXT: andl $1, %eax
|
||||
; AVX512-32-NEXT: kmovw %eax, %k1
|
||||
; AVX512-32-NEXT: movw $-3, %ax
|
||||
; AVX512-32-NEXT: kmovw %eax, %k2
|
||||
; AVX512-32-NEXT: kandw %k2, %k1, %k1
|
||||
; AVX512-32-NEXT: korw %k0, %k1, %k1
|
||||
; AVX512-32-NEXT: kandw %k0, %k1, %k0
|
||||
; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX512-32-NEXT: vucomiss 12(%ebp), %xmm2
|
||||
; AVX512-32-NEXT: setnp %al
|
||||
; AVX512-32-NEXT: sete %cl
|
||||
; AVX512-32-NEXT: testb %al, %cl
|
||||
; AVX512-32-NEXT: setne %al
|
||||
; AVX512-32-NEXT: kmovw %eax, %k1
|
||||
; AVX512-32-NEXT: kshiftlw $15, %k1, %k1
|
||||
; AVX512-32-NEXT: kshiftrw $14, %k1, %k1
|
||||
; AVX512-32-NEXT: korw %k1, %k0, %k1
|
||||
; AVX512-32-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
|
||||
; AVX512-32-NEXT: movl %ebp, %esp
|
||||
; AVX512-32-NEXT: popl %ebp
|
||||
|
@ -361,28 +351,26 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
|
|||
; AVX512F-32-NEXT: subl $16, %esp
|
||||
; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
|
||||
; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
|
||||
; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3
|
||||
; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
|
||||
; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
|
||||
; AVX512F-32-NEXT: vucomiss %xmm4, %xmm5
|
||||
; AVX512F-32-NEXT: setnp %al
|
||||
; AVX512F-32-NEXT: sete %cl
|
||||
; AVX512F-32-NEXT: testb %al, %cl
|
||||
; AVX512F-32-NEXT: setne %al
|
||||
; AVX512F-32-NEXT: movw $-3, %ax
|
||||
; AVX512F-32-NEXT: kmovw %eax, %k0
|
||||
; AVX512F-32-NEXT: kshiftlw $15, %k0, %k0
|
||||
; AVX512F-32-NEXT: kshiftrw $14, %k0, %k0
|
||||
; AVX512F-32-NEXT: vucomiss %xmm3, %xmm2
|
||||
; AVX512F-32-NEXT: vucomiss 8(%ebp), %xmm2
|
||||
; AVX512F-32-NEXT: setnp %al
|
||||
; AVX512F-32-NEXT: sete %cl
|
||||
; AVX512F-32-NEXT: testb %al, %cl
|
||||
; AVX512F-32-NEXT: setne %al
|
||||
; AVX512F-32-NEXT: andl $1, %eax
|
||||
; AVX512F-32-NEXT: kmovw %eax, %k1
|
||||
; AVX512F-32-NEXT: movw $-3, %ax
|
||||
; AVX512F-32-NEXT: kmovw %eax, %k2
|
||||
; AVX512F-32-NEXT: kandw %k2, %k1, %k1
|
||||
; AVX512F-32-NEXT: korw %k0, %k1, %k1
|
||||
; AVX512F-32-NEXT: kandw %k0, %k1, %k0
|
||||
; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX512F-32-NEXT: vucomiss 12(%ebp), %xmm2
|
||||
; AVX512F-32-NEXT: setnp %al
|
||||
; AVX512F-32-NEXT: sete %cl
|
||||
; AVX512F-32-NEXT: testb %al, %cl
|
||||
; AVX512F-32-NEXT: setne %al
|
||||
; AVX512F-32-NEXT: kmovw %eax, %k1
|
||||
; AVX512F-32-NEXT: kshiftlw $15, %k1, %k1
|
||||
; AVX512F-32-NEXT: kshiftrw $14, %k1, %k1
|
||||
; AVX512F-32-NEXT: korw %k1, %k0, %k1
|
||||
; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
|
||||
; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
|
||||
; AVX512F-32-NEXT: movl %ebp, %esp
|
||||
|
|
|
@ -1106,34 +1106,61 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
|
|||
}
|
||||
|
||||
define double @test_v16f64(double %a0, <16 x double> %a1) {
|
||||
; SSE-LABEL: test_v16f64:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
|
||||
; SSE-NEXT: addsd %xmm1, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; SSE-NEXT: addsd %xmm1, %xmm0
|
||||
; SSE-NEXT: addsd %xmm2, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
|
||||
; SSE-NEXT: addsd %xmm2, %xmm0
|
||||
; SSE-NEXT: addsd %xmm3, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
|
||||
; SSE-NEXT: addsd %xmm3, %xmm0
|
||||
; SSE-NEXT: addsd %xmm4, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
|
||||
; SSE-NEXT: addsd %xmm4, %xmm0
|
||||
; SSE-NEXT: addsd %xmm5, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
|
||||
; SSE-NEXT: addsd %xmm5, %xmm0
|
||||
; SSE-NEXT: addsd %xmm6, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
|
||||
; SSE-NEXT: addsd %xmm6, %xmm0
|
||||
; SSE-NEXT: addsd %xmm7, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
|
||||
; SSE-NEXT: addsd %xmm7, %xmm0
|
||||
; SSE-NEXT: addsd %xmm8, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
|
||||
; SSE-NEXT: addsd %xmm8, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
; SSE2-LABEL: test_v16f64:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
|
||||
; SSE2-NEXT: addsd %xmm1, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; SSE2-NEXT: addsd %xmm1, %xmm0
|
||||
; SSE2-NEXT: addsd %xmm2, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
|
||||
; SSE2-NEXT: addsd %xmm2, %xmm0
|
||||
; SSE2-NEXT: addsd %xmm3, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
|
||||
; SSE2-NEXT: addsd %xmm3, %xmm0
|
||||
; SSE2-NEXT: addsd %xmm4, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
|
||||
; SSE2-NEXT: addsd %xmm4, %xmm0
|
||||
; SSE2-NEXT: addsd %xmm5, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
|
||||
; SSE2-NEXT: addsd %xmm5, %xmm0
|
||||
; SSE2-NEXT: addsd %xmm6, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
|
||||
; SSE2-NEXT: addsd %xmm6, %xmm0
|
||||
; SSE2-NEXT: addsd %xmm7, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
|
||||
; SSE2-NEXT: addsd %xmm7, %xmm0
|
||||
; SSE2-NEXT: addsd %xmm8, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
|
||||
; SSE2-NEXT: addsd %xmm8, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_v16f64:
|
||||
; SSE41: # %bb.0:
|
||||
; SSE41-NEXT: addsd %xmm1, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; SSE41-NEXT: addsd %xmm1, %xmm0
|
||||
; SSE41-NEXT: addsd %xmm2, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
|
||||
; SSE41-NEXT: addsd %xmm2, %xmm0
|
||||
; SSE41-NEXT: addsd %xmm3, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
|
||||
; SSE41-NEXT: addsd %xmm3, %xmm0
|
||||
; SSE41-NEXT: addsd %xmm4, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
|
||||
; SSE41-NEXT: addsd %xmm4, %xmm0
|
||||
; SSE41-NEXT: addsd %xmm5, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
|
||||
; SSE41-NEXT: addsd %xmm5, %xmm0
|
||||
; SSE41-NEXT: addsd %xmm6, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
|
||||
; SSE41-NEXT: addsd %xmm6, %xmm0
|
||||
; SSE41-NEXT: addsd %xmm7, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
|
||||
; SSE41-NEXT: addsd %xmm7, %xmm0
|
||||
; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0
|
||||
; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: test_v16f64:
|
||||
; AVX: # %bb.0:
|
||||
|
|
|
@ -1075,34 +1075,61 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
|
|||
}
|
||||
|
||||
define double @test_v16f64(double %a0, <16 x double> %a1) {
|
||||
; SSE-LABEL: test_v16f64:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
|
||||
; SSE-NEXT: mulsd %xmm1, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; SSE-NEXT: mulsd %xmm1, %xmm0
|
||||
; SSE-NEXT: mulsd %xmm2, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
|
||||
; SSE-NEXT: mulsd %xmm2, %xmm0
|
||||
; SSE-NEXT: mulsd %xmm3, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
|
||||
; SSE-NEXT: mulsd %xmm3, %xmm0
|
||||
; SSE-NEXT: mulsd %xmm4, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
|
||||
; SSE-NEXT: mulsd %xmm4, %xmm0
|
||||
; SSE-NEXT: mulsd %xmm5, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
|
||||
; SSE-NEXT: mulsd %xmm5, %xmm0
|
||||
; SSE-NEXT: mulsd %xmm6, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
|
||||
; SSE-NEXT: mulsd %xmm6, %xmm0
|
||||
; SSE-NEXT: mulsd %xmm7, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
|
||||
; SSE-NEXT: mulsd %xmm7, %xmm0
|
||||
; SSE-NEXT: mulsd %xmm8, %xmm0
|
||||
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
|
||||
; SSE-NEXT: mulsd %xmm8, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
; SSE2-LABEL: test_v16f64:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
|
||||
; SSE2-NEXT: mulsd %xmm1, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; SSE2-NEXT: mulsd %xmm1, %xmm0
|
||||
; SSE2-NEXT: mulsd %xmm2, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
|
||||
; SSE2-NEXT: mulsd %xmm2, %xmm0
|
||||
; SSE2-NEXT: mulsd %xmm3, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
|
||||
; SSE2-NEXT: mulsd %xmm3, %xmm0
|
||||
; SSE2-NEXT: mulsd %xmm4, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
|
||||
; SSE2-NEXT: mulsd %xmm4, %xmm0
|
||||
; SSE2-NEXT: mulsd %xmm5, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
|
||||
; SSE2-NEXT: mulsd %xmm5, %xmm0
|
||||
; SSE2-NEXT: mulsd %xmm6, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
|
||||
; SSE2-NEXT: mulsd %xmm6, %xmm0
|
||||
; SSE2-NEXT: mulsd %xmm7, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
|
||||
; SSE2-NEXT: mulsd %xmm7, %xmm0
|
||||
; SSE2-NEXT: mulsd %xmm8, %xmm0
|
||||
; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
|
||||
; SSE2-NEXT: mulsd %xmm8, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_v16f64:
|
||||
; SSE41: # %bb.0:
|
||||
; SSE41-NEXT: mulsd %xmm1, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
|
||||
; SSE41-NEXT: mulsd %xmm1, %xmm0
|
||||
; SSE41-NEXT: mulsd %xmm2, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
|
||||
; SSE41-NEXT: mulsd %xmm2, %xmm0
|
||||
; SSE41-NEXT: mulsd %xmm3, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
|
||||
; SSE41-NEXT: mulsd %xmm3, %xmm0
|
||||
; SSE41-NEXT: mulsd %xmm4, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
|
||||
; SSE41-NEXT: mulsd %xmm4, %xmm0
|
||||
; SSE41-NEXT: mulsd %xmm5, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
|
||||
; SSE41-NEXT: mulsd %xmm5, %xmm0
|
||||
; SSE41-NEXT: mulsd %xmm6, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
|
||||
; SSE41-NEXT: mulsd %xmm6, %xmm0
|
||||
; SSE41-NEXT: mulsd %xmm7, %xmm0
|
||||
; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
|
||||
; SSE41-NEXT: mulsd %xmm7, %xmm0
|
||||
; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0
|
||||
; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0
|
||||
; SSE41-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: test_v16f64:
|
||||
; AVX: # %bb.0:
|
||||
|
|
|
@ -3008,11 +3008,10 @@ define void @PR43024() {
|
|||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
|
||||
; AVX-NEXT: vmovaps %xmm0, (%rax)
|
||||
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX-NEXT: vmovss %xmm0, (%rax)
|
||||
; AVX-NEXT: retq
|
||||
store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16
|
||||
|
|
Loading…
Reference in New Issue