[x86] split more 256/512-bit shuffles in lowering
This is intentionally a small step because it's hard to know exactly where we might introduce a conflicting transform with the code that tries to form wider shuffles. But I think this is safe - if we have a wide shuffle with 2 operands, then we should do better with an extract + narrow shuffle.

Differential Revision: https://reviews.llvm.org/D57867

llvm-svn: 353427
commit a5c4a5e958
parent 4b12236f7d
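For example, the unpckh_v4i64 test updated below exercises exactly this case: a two-operand <4 x i64> shuffle whose defined elements all fit in a 128-bit result.

define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) {
  ; Only the low two result elements are defined (x[1] and y[3]), so the
  ; whole shuffle fits in an xmm register once the upper half of %y is
  ; extracted.
  %unpckh = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
  ret <4 x i64> %unpckh
}

Per the updated checks below, the AVX2 lowering for this goes from a 256-bit vunpckhpd plus a cross-lane vpermpd to a vextractf128 plus a 128-bit vunpckhpd.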
@@ -14600,7 +14600,11 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
     if (EltWidth == 32 && NumLowerHalves &&
         HalfVT.is128BitVector() && !is128BitUnpackShuffleMask(HalfMask))
       return SDValue();
-    if (EltWidth == 64)
+    // If this is a unary shuffle (assume that the 2nd operand is
+    // canonicalized to undef), then we can use vpermpd. Otherwise, we
+    // are better off extracting the upper half of 1 operand and using a
+    // narrow shuffle.
+    if (EltWidth == 64 && V2.isUndef())
       return SDValue();
   }
   // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
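As an illustrative IR contrast for the comment above (hypothetical functions, not part of this patch): the first shuffle is unary, so the new V2.isUndef() check keeps the wide vpermpd-style lowering; the second uses both operands, so it is the kind of case that now takes the extract + narrow shuffle route checked by the tests below.

; Unary 64-bit-element shuffle: the second operand is undef, so per the
; comment above the wide permute lowering is kept and the split is skipped.
define <4 x double> @wide_unary_example(<4 x double> %x) {
  %s = shufflevector <4 x double> %x, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  ret <4 x double> %s
}

; Two-operand 64-bit-element shuffle: now preferred as an extract of the
; upper half of %y plus a narrow (128-bit) shuffle.
define <4 x double> @narrow_binary_example(<4 x double> %x, <4 x double> %y) {
  %s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
  ret <4 x double> %s
}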
@@ -3981,10 +3981,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double>
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,6,2,6]
-; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
@@ -3994,11 +3993,11 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,6,2,6]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
-; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqpd %xmm0, %xmm2, %k1
-; CHECK-NEXT: vblendmpd %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
@@ -4011,11 +4010,10 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double>
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,6,2,6]
-; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm3
-; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqpd %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
@@ -4367,11 +4365,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x doub
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,3,6]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vmovapd (%rdi), %xmm0
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],mem[0]
 ; CHECK-NEXT: retq
   %vec = load <8 x double>, <8 x double>* %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -4380,13 +4375,10 @@ define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp)
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,3,6]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vmovapd (%rdi), %xmm2
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],mem[0]
 ; CHECK-NEXT: retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -4398,13 +4390,10 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,6,3,6]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm2
-; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1} {z}
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vmovapd (%rdi), %xmm1
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[0]
 ; CHECK-NEXT: retq
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
@@ -10,7 +10,7 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-NEXT: movq %rsp, %rbp
 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
 ; CHECK-NEXT: andq $-32, %rsp
-; CHECK-NEXT: subq $352, %rsp # imm = 0x160
+; CHECK-NEXT: subq $320, %rsp # imm = 0x140
 ; CHECK-NEXT: vmovaps 240(%rbp), %ymm8
 ; CHECK-NEXT: vmovaps 208(%rbp), %ymm9
 ; CHECK-NEXT: vmovaps 176(%rbp), %ymm10
@@ -24,8 +24,6 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-NEXT: vmovdqa %xmm6, %xmm9
 ; CHECK-NEXT: # kill: def $ymm9 killed $xmm9
 ; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT: # implicit-def: $ymm0
 ; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
 ; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
@@ -34,9 +32,10 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-NEXT: vmovaps %xmm2, %xmm6
 ; CHECK-NEXT: # implicit-def: $ymm2
 ; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; CHECK-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
+; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm6
+; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm6[0],zero
+; CHECK-NEXT: # implicit-def: $ymm11
+; CHECK-NEXT: vmovaps %xmm6, %xmm11
 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
 ; CHECK-NEXT: vmovaps %xmm7, %xmm6
 ; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7]
@@ -108,11 +108,9 @@ define <2 x i32> @test5(<8 x i32> %v) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -228,11 +226,9 @@ define <2 x i32> @test10(<8 x i32> %v) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -1904,62 +1904,22 @@ define <8 x i32> @shuffle_v8i32_0zzzzzzz_optsize(<8 x i32> %a) optsize {
   ret <8 x i32> %b
 }

-; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
-
 define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) {
-; AVX1-LABEL: unpckh_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: unpckh_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-SLOW-LABEL: unpckh_v4i64:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: unpckh_v4i64:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,7,3,7]
-; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; ALL-LABEL: unpckh_v4i64:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
+; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT: retq
   %unpckh = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
   ret <4 x i64> %unpckh
 }

-; FIXME: AVX1 lowering is better than AVX2 (and AVX512?)
-
 define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
-; AVX1-LABEL: unpckh_v4f64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: unpckh_v4f64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-SLOW-LABEL: unpckh_v4f64:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: unpckh_v4f64:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,7,3,7]
-; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; ALL-LABEL: unpckh_v4f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
+; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT: retq
   %unpckh = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
   ret <4 x double> %unpckh
 }
@@ -2290,10 +2290,9 @@ define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
 define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
 ; ALL-LABEL: test_v8i64_2_5:
 ; ALL: # %bb.0:
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm1
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; ALL-NEXT: vzeroupper
 ; ALL-NEXT: ret{{[l|q]}}
   %res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 2, i32 5>