[X86][SSE] Add shuffle combine tests for OR(PSHUFB,PSHUFB) style patterns.

We generate these shuffle patterns but we fail to combine them.

llvm-svn: 339684
This commit is contained in:
Simon Pilgrim 2018-08-14 15:21:26 +00:00
parent c8e3943e89
commit 52c88a7c0e
2 changed files with 166 additions and 0 deletions

View File

@ -888,6 +888,73 @@ define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
ret <8 x i32> %3
}
define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1) {
; X32-LABEL: combine_pshufb_pshufb_or_as_blend:
; X32: # %bb.0:
; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; X32-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_pshufb_or_as_blend:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; X64-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
%3 = or <32 x i8> %1, %2
ret <32 x i8> %3
}
define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
; X32-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; X32: # %bb.0:
; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; X32-NEXT: vpor %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; X64: # %bb.0:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
%2 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)
%3 = or <32 x i8> %1, %2
ret <32 x i8> %3
}
define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_pshufb_or_pshufb:
; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19]
; X32-NEXT: vpor %ymm0, %ymm1, %ymm0
; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_pshufb_or_pshufb:
; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19]
; X64-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
; X64-NEXT: retq
%1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
%3 = or <32 x i8> %1, %2
%4 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
ret <32 x i8> %4
}
define <8 x i32> @constant_fold_permd() {
; X32-LABEL: constant_fold_permd:
; X32: # %bb.0:

View File

@ -666,6 +666,105 @@ define <16 x i8> @shuffle_combine_packuswb_pshufb(<8 x i16> %a0, <8 x i16> %a1)
}
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @combine_pshufb_pshufb_or_as_blend(<16 x i8> %a0, <16 x i8> %a1) {
; SSSE3-LABEL: combine_pshufb_pshufb_or_as_blend:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[8,9,10,11,12,13,14,15]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_pshufb_or_as_blend:
; SSE41: # %bb.0:
; SSE41-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_pshufb_pshufb_or_as_blend:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufb_pshufb_or_as_blend:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_pshufb_pshufb_or_as_blend:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
%3 = or <16 x i8> %1, %2
ret <16 x i8> %3
}
define <16 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
; SSSE3-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)
%3 = or <16 x i8> %1, %2
ret <16 x i8> %3
}
define <16 x i8> @combine_pshufb_pshufb_or_pshufb(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_pshufb_or_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero
; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_pshufb_or_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3]
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
%3 = or <16 x i8> %1, %2
%4 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
ret <16 x i8> %4
}
define <16 x i8> @constant_fold_pshufb() {
; SSE-LABEL: constant_fold_pshufb:
; SSE: # %bb.0: