From b85df2e185f9e4bb69eb419bc2c6f0158475d17f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 16 Feb 2020 13:54:17 +0000
Subject: [PATCH] [X86] combineX86ShuffleChain - add support for combining
 512-bit shuffles to PALIGNR
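
The shuffle masks being combined here are per-128-bit-lane byte rotates,
e.g. the [1,2,...,15,0,17,...] patterns in the vpalignr operands in the
test diffs below. As a rough, standalone illustration of the single-input
form of the check that matchShuffleAsByteRotate performs (the helper name
matchByteRotateSketch and all of the code in this snippet are illustrative
assumptions, not LLVM's implementation):

  #include <cstdio>
  #include <vector>

  // Return the per-128-bit-lane byte rotation implied by a byte shuffle
  // mask, or -1 if the mask is not a uniform in-lane rotate. Undef bytes
  // are encoded as -1 and are allowed to match any rotation.
  static int matchByteRotateSketch(const std::vector<int> &Mask) {
    const int LaneBytes = 16; // PALIGNR rotates within 128-bit lanes.
    if (Mask.empty() || Mask.size() % LaneBytes != 0)
      return -1;
    int Rotation = -1;
    for (int I = 0, E = (int)Mask.size(); I != E; ++I) {
      int M = Mask[I];
      if (M < 0)
        continue; // undef byte.
      if (M / LaneBytes != I / LaneBytes)
        return -1; // crosses a 128-bit lane; PALIGNR cannot do that.
      int R = ((M % LaneBytes) - (I % LaneBytes) + LaneBytes) % LaneBytes;
      if (Rotation < 0)
        Rotation = R;
      else if (Rotation != R)
        return -1; // inconsistent rotation amount.
    }
    return Rotation;
  }

  int main() {
    // Mask of the 512-bit vpalignr in vector-idiv-udiv-512.ll below:
    // each of the four 128-bit lanes rotates its bytes by 1.
    std::vector<int> Mask;
    for (int Lane = 0; Lane < 4; ++Lane)
      for (int I = 0; I < 16; ++I)
        Mask.push_back(Lane * 16 + (I + 1) % 16);
    std::printf("rotation = %d\n", matchByteRotateSketch(Mask)); // prints 1
    return 0;
  }

With the MaskVT.is512BitVector() && Subtarget.hasBWI() case added in this
patch, such a mask can now also be reported as X86ISD::PALIGNR for 512-bit
types, which is what produces the zmm vpalignr instructions in the updated
tests.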
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  3 +-
 llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll |  4 +-
 .../CodeGen/X86/x86-interleaved-access.ll     | 38 +++++++++----------
 3 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 729ab7af4af6..e398a4f2fad9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33646,7 +33646,8 @@ static bool matchBinaryPermuteShuffle(
 
   // Attempt to match against PALIGNR byte rotate.
   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
-                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+                         (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
     int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
     if (0 < ByteRotation) {
       Shuffle = X86ISD::PALIGNR;
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index c750dcb77cab..44898fc951e0 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -653,14 +653,14 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm2 = zmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,32,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,48]
 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm2, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[1,u,3,u,5,u,7,u,9,u,11,u,13,u,15,u,17,u,19,u,21,u,23,u,25,u,27,u,29,u,31,u,33,u,35,u,37,u,39,u,41,u,43,u,45,u,47,u,49,u,51,u,53,u,55,u,57,u,59,u,61,u,63,u]
+; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,32,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,48]
 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpandq %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 7ea220c59a85..4c81fae4ca92 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1280,35 +1280,35 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX512-LABEL: interleaved_store_vf64_i8_stride3:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53]
-; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58]
-; AVX512-NEXT:    vpalignr {{.*#+}} zmm3 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
-; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
-; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
-; AVX512-NEXT:    vpalignr {{.*#+}} zmm2 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
-; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
-; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm3
+; AVX512-NEXT:    vpalignr {{.*#+}} zmm3 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58]
+; AVX512-NEXT:    vpalignr {{.*#+}} zmm4 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
+; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
+; AVX512-NEXT:    vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52]
+; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm4[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm4[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm4[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm4[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
+; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
+; AVX512-NEXT:    vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm3
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
 ; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3]
 ; AVX512-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
-; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm7
-; AVX512-NEXT:    vpshufb %ymm4, %ymm7, %ymm7
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm7
+; AVX512-NEXT:    vpshufb %ymm4, %ymm7, %ymm7
+; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm2
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
 ; AVX512-NEXT:    vmovdqu64 %zmm3, 64(%rdi)
-; AVX512-NEXT:    vmovdqu64 %zmm1, (%rdi)
+; AVX512-NEXT:    vmovdqu64 %zmm2, (%rdi)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32>