[X86][AVX] Extend combineCommutableSHUFP to handle v8f32 and v16f32 commutable shufps patterns

This commit is contained in:
Simon Pilgrim 2020-01-26 14:59:53 +00:00
parent 5043962dd3
commit fa19d67a2a
3 changed files with 8 additions and 11 deletions

View File

@ -34555,8 +34555,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
SelectionDAG &DAG) {
// TODO: Add general vXf32 + vXf64 support.
if (VT != MVT::v4f32)
// TODO: Add vXf64 support.
if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
return SDValue();
// SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.

View File

@ -1308,16 +1308,14 @@ define <8 x float> @shuffle_v8f32_32107654_v4f32(<4 x float> %a, <4 x float> %b)
define <8 x float> @shuffle_mem_v8f32_8BA0CFE4(<8 x float> %a0, <8 x float>* %a1) {
; AVX1OR2-LABEL: shuffle_mem_v8f32_8BA0CFE4:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovaps (%rdi), %ymm1
; AVX1OR2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[2,0],ymm0[4,7],ymm1[6,4]
; AVX1OR2-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,0],mem[0,0],ymm0[6,4],mem[4,4]
; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
; AVX1OR2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_mem_v8f32_8BA0CFE4:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vmovaps (%rdi), %ymm1
; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[2,0],ymm0[4,7],ymm1[6,4]
; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,0],mem[0,0],ymm0[6,4],mem[4,4]
; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_mem_v8f32_8BA0CFE4:

View File

@ -274,9 +274,8 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a0, <16 x float>* %a1) {
; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps (%rdi), %zmm1
; ALL-NEXT: vshufps {{.*#+}} zmm1 = zmm1[0,0],zmm0[2,0],zmm1[4,4],zmm0[6,4],zmm1[8,8],zmm0[10,8],zmm1[12,12],zmm0[14,12]
; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[2,0],zmm0[4,7],zmm1[6,4],zmm0[8,11],zmm1[10,8],zmm0[12,15],zmm1[14,12]
; ALL-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
; ALL-NEXT: retq
%1 = load <16 x float>, <16 x float>* %a1
%2 = shufflevector <16 x float> %1, <16 x float> %a0, <16 x i32> <i32 16, i32 19, i32 18, i32 0, i32 20, i32 23, i32 22, i32 4, i32 24, i32 27, i32 26, i32 8, i32 28, i32 31, i32 30, i32 12>