; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
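
; PR45808: select between %0 and %1 on a signed greater-than compare whose
; result has only its first two lanes inverted
; (xor with <i1 true, i1 true, i1 false, i1 false>).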
define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
; SSE2-LABEL: PR45808:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm9
; SSE2-NEXT: pxor %xmm4, %xmm9
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm8
; SSE2-NEXT: pcmpgtd %xmm9, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: pxor %xmm4, %xmm7
; SSE2-NEXT: pxor %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm10
; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
; SSE2-NEXT: pcmpeqd %xmm7, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm6[1,3]
; SSE2-NEXT: andps %xmm10, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm8[1,3]
; SSE2-NEXT: orps %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,3,3]
; SSE2-NEXT: psllq $63, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,1,3]
; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: psllq $63, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: PR45808:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
; SSE4-NEXT: movdqa %xmm4, %xmm5
; SSE4-NEXT: pcmpgtq %xmm2, %xmm5
; SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE4-NEXT: pcmpeqd %xmm6, %xmm6
; SSE4-NEXT: pxor %xmm5, %xmm6
; SSE4-NEXT: psllq $63, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero
; SSE4-NEXT: psllq $63, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE4-NEXT: movapd %xmm2, %xmm0
; SSE4-NEXT: movapd %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: PR45808:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR45808:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%3 = icmp sgt <4 x i64> %0, %1
%4 = xor <4 x i1> %3, <i1 true, i1 true, i1 false, i1 false>
%5 = select <4 x i1> %4, <4 x i64> %0, <4 x i64> %1
ret <4 x i64> %5
}