[x86] split more 256/512-bit shuffles in lowering

This is intentionally a small step because it's hard to know exactly 
where we might introduce a conflicting transform with the code that 
tries to form wider shuffles. But I think this is safe - if we have 
a wide shuffle with 2 operands, then we should do better with an 
extract + narrow shuffle.
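
As a minimal sketch of the targeted pattern (the function name is made up; the expected instructions are taken from the unpckh_v4f64 test updated below): a 256-bit two-operand shuffle whose upper result half is undef should now lower to a 128-bit extract plus a narrow unpck instead of a wide shuffle plus a cross-lane permute.

define <4 x double> @wide_two_operand_low_half(<4 x double> %x, <4 x double> %y) {
  ; expected AVX lowering with this patch (per the updated ALL checks below), roughly:
  ;   vextractf128 $1, %ymm1, %xmm1
  ;   vunpckhpd    {{.*#+}} xmm0 = xmm0[1],xmm1[1]
  %unpckh = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
  ret <4 x double> %unpckh
}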

Differential Revision: https://reviews.llvm.org/D57867

llvm-svn: 353427
commit a5c4a5e958 (parent 4b12236f7d)
Author: Sanjay Patel
Date:   2019-02-07 17:10:49 +00:00

6 changed files with 51 additions and 104 deletions

@@ -14600,7 +14600,11 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
if (EltWidth == 32 && NumLowerHalves &&
HalfVT.is128BitVector() && !is128BitUnpackShuffleMask(HalfMask))
return SDValue();
if (EltWidth == 64)
// If this is a unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
// are better off extracting the upper half of 1 operand and using a
// narrow shuffle.
if (EltWidth == 64 && V2.isUndef())
return SDValue();
}
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.

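A hedged illustration of the guard added above (hypothetical test, not part of this commit): when the second operand is undef, the 64-bit-element path still returns SDValue() here, so a unary lane-crossing shuffle is expected to stay 256-bit wide and use a single vpermpd rather than an extract plus narrow shuffle.

define <4 x double> @unary_low_half(<4 x double> %x) {
  ; V2 is undef, so lowerShuffleWithUndefHalf declines to split; on AVX2 the
  ; expected lowering is a single lane-crossing permute, roughly:
  ;   vpermpd $imm, %ymm0, %ymm0
  %s = shufflevector <4 x double> %x, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef>
  ret <4 x double> %s
}
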
@@ -3981,10 +3981,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double>
define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,6,2,6]
; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
@@ -3994,11 +3993,11 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,6,2,6]
; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqpd %xmm0, %xmm2, %k1
; CHECK-NEXT: vblendmpd %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
@@ -4011,11 +4010,10 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double>
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,6,2,6]
; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm3
; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqpd %xmm0, %xmm1, %k1
; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
@@ -4367,11 +4365,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x doub
define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %ymm1
; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,3,6]
; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: vmovapd (%rdi), %xmm0
; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],mem[0]
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -4380,13 +4375,10 @@ define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp)
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %ymm2
; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,3,6]
; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: vmovapd (%rdi), %xmm2
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],mem[0]
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -4398,13 +4390,10 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %ymm1
; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,6,3,6]
; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm2
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: vmovapd (%rdi), %xmm1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[0]
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>

@@ -10,7 +10,7 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $352, %rsp # imm = 0x160
; CHECK-NEXT: subq $320, %rsp # imm = 0x140
; CHECK-NEXT: vmovaps 240(%rbp), %ymm8
; CHECK-NEXT: vmovaps 208(%rbp), %ymm9
; CHECK-NEXT: vmovaps 176(%rbp), %ymm10
@@ -24,8 +24,6 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
; CHECK-NEXT: vmovdqa %xmm6, %xmm9
; CHECK-NEXT: # kill: def $ymm9 killed $xmm9
; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: # implicit-def: $ymm0
; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
@@ -34,9 +32,10 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
; CHECK-NEXT: vmovaps %xmm2, %xmm6
; CHECK-NEXT: # implicit-def: $ymm2
; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; CHECK-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7]
; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm6
; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm6[0],zero
; CHECK-NEXT: # implicit-def: $ymm11
; CHECK-NEXT: vmovaps %xmm6, %xmm11
; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; CHECK-NEXT: vmovaps %xmm7, %xmm6
; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7]

@@ -108,11 +108,9 @@ define <2 x i32> @test5(<8 x i32> %v) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -228,11 +226,9 @@ define <2 x i32> @test10(<8 x i32> %v) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;

@@ -1904,62 +1904,22 @@ define <8 x i32> @shuffle_v8i32_0zzzzzzz_optsize(<8 x i32> %a) optsize {
ret <8 x i32> %b
}
; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) {
; AVX1-LABEL: unpckh_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: unpckh_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: unpckh_v4i64:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: unpckh_v4i64:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,7,3,7]
; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
; ALL-LABEL: unpckh_v4i64:
; ALL: # %bb.0:
; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; ALL-NEXT: retq
%unpckh = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
ret <4 x i64> %unpckh
}
; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
; AVX1-LABEL: unpckh_v4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: unpckh_v4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: unpckh_v4f64:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: unpckh_v4f64:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,7,3,7]
; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT: retq
; ALL-LABEL: unpckh_v4f64:
; ALL: # %bb.0:
; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; ALL-NEXT: retq
%unpckh = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
ret <4 x double> %unpckh
}

@@ -2290,10 +2290,9 @@ define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
; ALL-LABEL: test_v8i64_2_5:
; ALL: # %bb.0:
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm1
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; ALL-NEXT: vzeroupper
; ALL-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 2, i32 5>