llvm-project/llvm/test/CodeGen/X86/vector-trunc-packus.ll

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

3125 lines
124 KiB
LLVM
Raw Normal View History

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
;
; PACKUS saturation truncation to vXi32
;
define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v4i64_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v4i64_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm3
; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm5, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pandn %xmm8, %xmm4
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i64_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647]
; SSE41-NEXT: movdqa %xmm6, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: movdqa %xmm6, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: movdqa %xmm6, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT: xorpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm4, %xmm2
; SSE41-NEXT: xorpd %xmm3, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm6
; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
; SSE41-NEXT: movapd %xmm5, %xmm4
; SSE41-NEXT: xorpd %xmm3, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm6
; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_packus_v4i64_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v4i64_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v4i64_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v4i64_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovusqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%3 = icmp sgt <4 x i64> %2, zeroinitializer
%4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
%5 = trunc <4 x i64> %4 to <4 x i32>
ret <4 x i32> %5
}
define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v8i64_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pxor %xmm10, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
; SSE2-NEXT: movdqa %xmm9, %xmm6
; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm5
; SSE2-NEXT: por %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pandn %xmm8, %xmm6
; SSE2-NEXT: por %xmm2, %xmm6
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm9, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm1
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i64_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: pxor %xmm10, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
; SSSE3-NEXT: movdqa %xmm9, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm5
; SSSE3-NEXT: por %xmm0, %xmm5
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: pandn %xmm8, %xmm0
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm6
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pandn %xmm8, %xmm6
; SSSE3-NEXT: por %xmm2, %xmm6
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm9, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa %xmm6, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm1
; SSSE3-NEXT: pand %xmm6, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pxor %xmm10, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: pand %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm5, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm9
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
; SSE41-NEXT: movdqa %xmm5, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: movdqa %xmm5, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm8
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm5, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: movdqa %xmm5, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm9
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm5, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm5, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm5, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
; SSE41-NEXT: xorpd %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm7, %xmm1
; SSE41-NEXT: xorpd %xmm10, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm3
; SSE41-NEXT: pcmpeqd %xmm10, %xmm3
; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movapd %xmm4, %xmm1
; SSE41-NEXT: xorpd %xmm10, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
; SSE41-NEXT: movapd %xmm9, %xmm3
; SSE41-NEXT: xorpd %xmm10, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm4
; SSE41-NEXT: pcmpeqd %xmm10, %xmm4
; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3
; SSE41-NEXT: movapd %xmm8, %xmm4
; SSE41-NEXT: xorpd %xmm10, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i64_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2
; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
; AVX2-FAST-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v8i64_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovusqd %zmm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%3 = icmp sgt <8 x i64> %2, zeroinitializer
%4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
%5 = trunc <8 x i64> %4 to <8 x i32>
ret <8 x i32> %5
}
;
; PACKUS saturation truncation to vXi16
;
define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v8i64_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm10, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183]
; SSE2-NEXT: movdqa %xmm9, %xmm6
; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm5
; SSE2-NEXT: por %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm6
; SSE2-NEXT: por %xmm3, %xmm6
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm9, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pandn %xmm8, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm6, %xmm2
; SSE2-NEXT: pxor %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm10, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i64_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: pxor %xmm10, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183]
; SSSE3-NEXT: movdqa %xmm9, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pandn %xmm8, %xmm5
; SSSE3-NEXT: por %xmm1, %xmm5
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm1
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm6
; SSSE3-NEXT: pand %xmm6, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm6
; SSSE3-NEXT: por %xmm3, %xmm6
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm9, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: pandn %xmm8, %xmm3
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm0
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm6, %xmm2
; SSSE3-NEXT: pxor %xmm10, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm2
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm10, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm3, %xmm4
; SSSE3-NEXT: pand %xmm1, %xmm4
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm5, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm9
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm9, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: movapd %xmm7, %xmm1
; SSE41-NEXT: xorpd %xmm10, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: pcmpeqd %xmm10, %xmm4
; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4
; SSE41-NEXT: movapd %xmm6, %xmm1
; SSE41-NEXT: xorpd %xmm10, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
; SSE41-NEXT: packusdw %xmm4, %xmm1
; SSE41-NEXT: movapd %xmm2, %xmm4
; SSE41-NEXT: xorpd %xmm10, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE41-NEXT: movapd %xmm8, %xmm2
; SSE41-NEXT: xorpd %xmm10, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
; SSE41-NEXT: packusdw %xmm4, %xmm3
; SSE41-NEXT: packusdw %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2
; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovusqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp slt <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
%2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
%3 = icmp sgt <8 x i64> %2, zeroinitializer
%4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
%5 = trunc <8 x i64> %4 to <8 x i16>
ret <8 x i16> %5
}
define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
; SSE2-LABEL: trunc_packus_v8i32_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i32_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pandn %xmm2, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pandn %xmm2, %xmm1
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i32_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v8i32_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v8i32_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v8i32_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
%2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
%3 = icmp sgt <8 x i32> %2, zeroinitializer
%4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
%5 = trunc <8 x i32> %4 to <8 x i16>
ret <8 x i16> %5
}
define <16 x i16> @trunc_packus_v16i32_v16i16(<16 x i32> %a0) {
; SSE2-LABEL: trunc_packus_v16i32_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm6, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pandn %xmm6, %xmm5
; SSE2-NEXT: por %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm6, %xmm0
; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: pandn %xmm6, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm6, %xmm3
; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pandn %xmm6, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm5, %xmm0
; SSE2-NEXT: pslld $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: packssdw %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v16i32_v16i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
; SSSE3-NEXT: movdqa %xmm6, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pandn %xmm6, %xmm4
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: movdqa %xmm6, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pandn %xmm6, %xmm5
; SSSE3-NEXT: por %xmm0, %xmm5
; SSSE3-NEXT: movdqa %xmm6, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm3
; SSSE3-NEXT: pandn %xmm6, %xmm0
; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm6, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: pandn %xmm6, %xmm3
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
; SSSE3-NEXT: pand %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm5, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
; SSSE3-NEXT: pand %xmm4, %xmm5
; SSSE3-NEXT: pslld $16, %xmm5
; SSSE3-NEXT: psrad $16, %xmm5
; SSSE3-NEXT: pslld $16, %xmm0
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm5, %xmm0
; SSSE3-NEXT: pslld $16, %xmm3
; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: pslld $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: packssdw %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v16i32_v16i16:
; SSE41: # %bb.0:
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i32_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i32_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v16i32_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovusdw %zmm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp slt <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
%2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
%3 = icmp sgt <16 x i32> %2, zeroinitializer
%4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
%5 = trunc <16 x i32> %4 to <16 x i16>
ret <16 x i16> %5
}
;
; PACKUS saturation truncation to v16i8
;
define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v8i64_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm10, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
; SSE2-NEXT: movdqa %xmm9, %xmm7
; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pandn %xmm8, %xmm5
; SSE2-NEXT: por %xmm2, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm10, %xmm3
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pxor %xmm10, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
; SSSE3-NEXT: movdqa %xmm9, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: pandn %xmm8, %xmm5
; SSSE3-NEXT: por %xmm2, %xmm5
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm10, %xmm2
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm10, %xmm3
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm3
; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pandn %xmm8, %xmm4
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm0
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm9
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [255,255]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm9, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: movapd %xmm7, %xmm1
; SSE41-NEXT: xorpd %xmm10, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: pcmpeqd %xmm10, %xmm4
; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4
; SSE41-NEXT: movapd %xmm6, %xmm1
; SSE41-NEXT: xorpd %xmm10, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
; SSE41-NEXT: packusdw %xmm4, %xmm1
; SSE41-NEXT: movapd %xmm2, %xmm4
; SSE41-NEXT: xorpd %xmm10, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE41-NEXT: movapd %xmm8, %xmm2
; SSE41-NEXT: xorpd %xmm10, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
; SSE41-NEXT: packusdw %xmm4, %xmm3
; SSE41-NEXT: packusdw %xmm3, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i64_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2
; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i64_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v8i64_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%3 = icmp sgt <8 x i64> %2, zeroinitializer
%4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
%5 = trunc <8 x i64> %4 to <8 x i8>
ret <8 x i8> %5
}
define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
; SSE2-LABEL: trunc_packus_v8i64_v8i8_store:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pxor %xmm10, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
; SSE2-NEXT: movdqa %xmm9, %xmm7
; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pandn %xmm8, %xmm5
; SSE2-NEXT: por %xmm2, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm8, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm10, %xmm3
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm2, %xmm5
; SSSE3-NEXT: pxor %xmm10, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
; SSSE3-NEXT: movdqa %xmm9, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: pandn %xmm8, %xmm5
; SSSE3-NEXT: por %xmm2, %xmm5
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm10, %xmm2
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm8, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm10, %xmm3
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm3
; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm9, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pandn %xmm8, %xmm4
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm10, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm0
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: movq %xmm0, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i8_store:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm9
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm8
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm9, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm7, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
; SSE41-NEXT: xorpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm7, %xmm4
; SSE41-NEXT: xorpd %xmm10, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm5
; SSE41-NEXT: movapd %xmm3, %xmm4
; SSE41-NEXT: xorpd %xmm10, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm6
; SSE41-NEXT: pcmpeqd %xmm10, %xmm6
; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
; SSE41-NEXT: packusdw %xmm5, %xmm4
; SSE41-NEXT: movapd %xmm2, %xmm3
; SSE41-NEXT: xorpd %xmm10, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: movapd %xmm8, %xmm2
; SSE41-NEXT: xorpd %xmm10, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm5
; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
; SSE41-NEXT: pcmpgtd %xmm10, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
; SSE41-NEXT: packusdw %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm1, %xmm4
; SSE41-NEXT: packuswb %xmm0, %xmm4
; SSE41-NEXT: movq %xmm4, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i64_v8i8_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255]
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2
; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i64_v8i8_store:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v8i64_v8i8_store:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovusqb %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%3 = icmp sgt <8 x i64> %2, zeroinitializer
%4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
%5 = trunc <8 x i64> %4 to <8 x i8>
store <8 x i8> %5, <8 x i8> *%p1
ret void
}
define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v16i64_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm6, %xmm9
; SSE2-NEXT: pxor %xmm8, %xmm9
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903]
; SSE2-NEXT: movdqa %xmm11, %xmm12
; SSE2-NEXT: pcmpgtd %xmm9, %xmm12
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3]
; SSE2-NEXT: pand %xmm13, %xmm14
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3]
; SSE2-NEXT: por %xmm14, %xmm9
; SSE2-NEXT: pand %xmm9, %xmm6
; SSE2-NEXT: pandn %xmm10, %xmm9
; SSE2-NEXT: por %xmm6, %xmm9
; SSE2-NEXT: movdqa %xmm7, %xmm6
; SSE2-NEXT: pxor %xmm8, %xmm6
; SSE2-NEXT: movdqa %xmm11, %xmm12
; SSE2-NEXT: pcmpgtd %xmm6, %xmm12
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2-NEXT: pand %xmm13, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm12
; SSE2-NEXT: pand %xmm12, %xmm7
; SSE2-NEXT: pandn %xmm10, %xmm12
; SSE2-NEXT: por %xmm7, %xmm12
; SSE2-NEXT: movdqa %xmm4, %xmm6
; SSE2-NEXT: pxor %xmm8, %xmm6
; SSE2-NEXT: movdqa %xmm11, %xmm7
; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2-NEXT: pand %xmm13, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm13
; SSE2-NEXT: pand %xmm13, %xmm4
; SSE2-NEXT: pandn %xmm10, %xmm13
; SSE2-NEXT: por %xmm4, %xmm13
; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm11, %xmm6
; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm14
; SSE2-NEXT: pand %xmm14, %xmm5
; SSE2-NEXT: pandn %xmm10, %xmm14
; SSE2-NEXT: por %xmm5, %xmm14
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm11, %xmm5
; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pandn %xmm10, %xmm5
; SSE2-NEXT: por %xmm2, %xmm5
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm8, %xmm2
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm10, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm8, %xmm3
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn %xmm10, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm10, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pcmpgtd %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm14, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm14, %xmm2
; SSE2-NEXT: movdqa %xmm13, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: pand %xmm13, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm12, %xmm2
; SSE2-NEXT: pxor %xmm8, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm12, %xmm3
; SSE2-NEXT: movdqa %xmm9, %xmm2
; SSE2-NEXT: pxor %xmm8, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm4
; SSE2-NEXT: packuswb %xmm3, %xmm4
; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v16i64_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [255,255]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm6, %xmm9
; SSSE3-NEXT: pxor %xmm8, %xmm9
; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903]
; SSSE3-NEXT: movdqa %xmm11, %xmm12
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12
; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm9
; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3]
; SSSE3-NEXT: pand %xmm13, %xmm14
; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3]
; SSSE3-NEXT: por %xmm14, %xmm9
; SSSE3-NEXT: pand %xmm9, %xmm6
; SSSE3-NEXT: pandn %xmm10, %xmm9
; SSSE3-NEXT: por %xmm6, %xmm9
; SSSE3-NEXT: movdqa %xmm7, %xmm6
; SSSE3-NEXT: pxor %xmm8, %xmm6
; SSSE3-NEXT: movdqa %xmm11, %xmm12
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm12
; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSSE3-NEXT: pand %xmm13, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
; SSSE3-NEXT: por %xmm6, %xmm12
; SSSE3-NEXT: pand %xmm12, %xmm7
; SSSE3-NEXT: pandn %xmm10, %xmm12
; SSSE3-NEXT: por %xmm7, %xmm12
; SSSE3-NEXT: movdqa %xmm4, %xmm6
; SSSE3-NEXT: pxor %xmm8, %xmm6
; SSSE3-NEXT: movdqa %xmm11, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSSE3-NEXT: pand %xmm13, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm6, %xmm13
; SSSE3-NEXT: pand %xmm13, %xmm4
; SSSE3-NEXT: pandn %xmm10, %xmm13
; SSSE3-NEXT: por %xmm4, %xmm13
; SSSE3-NEXT: movdqa %xmm5, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
; SSSE3-NEXT: movdqa %xmm11, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm14
; SSSE3-NEXT: pand %xmm14, %xmm5
; SSSE3-NEXT: pandn %xmm10, %xmm14
; SSSE3-NEXT: por %xmm5, %xmm14
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
; SSSE3-NEXT: movdqa %xmm11, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSSE3-NEXT: por %xmm4, %xmm5
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: pandn %xmm10, %xmm5
; SSSE3-NEXT: por %xmm2, %xmm5
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pxor %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm11, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm10, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm8, %xmm3
; SSSE3-NEXT: movdqa %xmm11, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: pandn %xmm10, %xmm3
; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm8, %xmm0
; SSSE3-NEXT: movdqa %xmm11, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pandn %xmm10, %xmm4
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: movdqa %xmm4, %xmm0
; SSSE3-NEXT: pxor %xmm8, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm8, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm6, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm0
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm8, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa %xmm5, %xmm1
; SSSE3-NEXT: pxor %xmm8, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm14, %xmm1
; SSSE3-NEXT: pxor %xmm8, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm14, %xmm2
; SSSE3-NEXT: movdqa %xmm13, %xmm1
; SSSE3-NEXT: pxor %xmm8, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm13, %xmm1
; SSSE3-NEXT: packuswb %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm12, %xmm2
; SSSE3-NEXT: pxor %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: pand %xmm12, %xmm3
; SSSE3-NEXT: movdqa %xmm9, %xmm2
; SSSE3-NEXT: pxor %xmm8, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm4
; SSSE3-NEXT: pand %xmm9, %xmm4
; SSSE3-NEXT: packuswb %xmm3, %xmm4
; SSSE3-NEXT: packuswb %xmm4, %xmm1
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v16i64_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movapd {{.*#+}} xmm11 = [255,255]
[X86] When lowering unsigned v2i64 setcc without SSE42, flip the sign bits in the v2i64 type then bitcast to v4i32. This may give slightly better opportunities for DAG combine to simplify with the operations before the setcc. It also matches the type the xors will eventually be promoted to anyway so it saves a legalization step. Almost all of the test changes are because our constant pool entry is now v2i64 instead of v4i32 on 64-bit targets. On 32-bit targets getConstant should be emitting a v4i32 build_vector and a v4i32->v2i64 bitcast. There are a couple test cases where it appears we now combine a bitwise not with one of these xors which caused a new constant vector to be generated. This prevented a constant pool entry from being shared. But if that's an issue we're concerned about, it seems we need to address it another way that just relying a bitcast to hide it. This came about from experiments I've been trying with pushing the promotion of and/or/xor to vXi64 later than LegalizeVectorOps where it is today. We run LegalizeVectorOps in a bottom up order. So the and/or/xor are promoted before their users are legalized. The bitcasts added for the promotion act as a barrier to computeKnownBits if we try to use it during vector legalization of a later operation. So by moving the promotion out we can hopefully get better results from computeKnownBits/computeNumSignBits like in LowerTruncate on AVX512. I've also looked at running LegalizeVectorOps in a top down order like LegalizeDAG, but thats showing some other issues. llvm-svn: 344071
2018-10-10 03:05:50 +08:00
; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm12, %xmm10
; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
; SSE41-NEXT: movdqa %xmm12, %xmm13
; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
; SSE41-NEXT: pand %xmm10, %xmm0
; SSE41-NEXT: por %xmm13, %xmm0
; SSE41-NEXT: movapd %xmm11, %xmm10
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: movdqa %xmm12, %xmm13
; SSE41-NEXT: pcmpeqd %xmm0, %xmm13
; SSE41-NEXT: movdqa %xmm12, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm13, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm11, %xmm13
; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm13
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: movdqa %xmm12, %xmm6
; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
; SSE41-NEXT: movdqa %xmm12, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: movapd %xmm11, %xmm14
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: movdqa %xmm12, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: movdqa %xmm12, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm11, %xmm15
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm15
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: movdqa %xmm12, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: movdqa %xmm12, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm11, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: movdqa %xmm12, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm12, %xmm6
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movapd %xmm11, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: movdqa %xmm12, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm12, %xmm3
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: movapd %xmm11, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm9, %xmm0
; SSE41-NEXT: movdqa %xmm12, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpgtd %xmm0, %xmm12
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: por %xmm12, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm11, %xmm1
; SSE41-NEXT: xorpd %xmm9, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm7
; SSE41-NEXT: pcmpeqd %xmm9, %xmm7
; SSE41-NEXT: pcmpgtd %xmm9, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm7, %xmm7
; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm7
; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: xorpd %xmm9, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm4
; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
; SSE41-NEXT: pcmpgtd %xmm9, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: packusdw %xmm7, %xmm1
; SSE41-NEXT: movapd %xmm6, %xmm3
; SSE41-NEXT: xorpd %xmm9, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm4
; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
; SSE41-NEXT: movapd %xmm5, %xmm4
; SSE41-NEXT: xorpd %xmm9, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm6
; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
; SSE41-NEXT: pcmpgtd %xmm9, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
; SSE41-NEXT: packusdw %xmm3, %xmm4
; SSE41-NEXT: packusdw %xmm4, %xmm1
; SSE41-NEXT: movapd %xmm15, %xmm3
; SSE41-NEXT: xorpd %xmm9, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm4
; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm4
; SSE41-NEXT: movapd %xmm14, %xmm3
; SSE41-NEXT: xorpd %xmm9, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm5
; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm3
; SSE41-NEXT: packusdw %xmm4, %xmm3
; SSE41-NEXT: movapd %xmm13, %xmm4
; SSE41-NEXT: xorpd %xmm9, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
; SSE41-NEXT: pcmpgtd %xmm9, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm4
; SSE41-NEXT: movapd %xmm10, %xmm5
; SSE41-NEXT: xorpd %xmm9, %xmm5
; SSE41-NEXT: movapd %xmm5, %xmm6
; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
; SSE41-NEXT: pcmpgtd %xmm9, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2
; SSE41-NEXT: packusdw %xmm4, %xmm2
; SSE41-NEXT: packusdw %xmm2, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255]
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm5, %xmm10
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm11
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm7
; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm9, %xmm5, %xmm7
; AVX1-NEXT: vblendvpd %xmm7, %xmm9, %xmm5, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm0
; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm5, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm8, %xmm5, %xmm3
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm4
; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm4
; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm2
; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm3
; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm11, %xmm2
; AVX1-NEXT: vpand %xmm11, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm10, %xmm3
; AVX1-NEXT: vpand %xmm10, %xmm3, %xmm3
; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
; AVX2-NEXT: vpand %ymm0, %ymm5, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3
; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v16i64_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpminsq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v16i64_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1
; AVX512VL-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v16i64_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpminsq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpminsq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v16i64_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%3 = icmp sgt <16 x i64> %2, zeroinitializer
%4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer
%5 = trunc <16 x i64> %4 to <16 x i8>
ret <16 x i8> %5
}
define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
; SSE-LABEL: trunc_packus_v8i32_v8i8:
; SSE: # %bb.0:
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v8i32_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v8i32_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovusdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v8i32_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%3 = icmp sgt <8 x i32> %2, zeroinitializer
%4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
%5 = trunc <8 x i32> %4 to <8 x i8>
ret <8 x i8> %5
}
define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
; SSE-LABEL: trunc_packus_v8i32_v8i8_store:
; SSE: # %bb.0:
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: movq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%3 = icmp sgt <8 x i32> %2, zeroinitializer
%4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
%5 = trunc <8 x i32> %4 to <8 x i8>
store <8 x i8> %5, <8 x i8> *%p1
ret void
}
define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32> %a0) {
; SSE-LABEL: trunc_packus_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovusdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp slt <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%3 = icmp sgt <16 x i32> %2, zeroinitializer
%4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
%5 = trunc <16 x i32> %4 to <16 x i8>
ret <16 x i8> %5
}
define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
; SSE-LABEL: trunc_packus_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v16i16_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%1 = icmp slt <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%3 = icmp sgt <16 x i16> %2, zeroinitializer
%4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
%5 = trunc <16 x i16> %4 to <16 x i8>
ret <16 x i8> %5
}
define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) {
; SSE-LABEL: trunc_packus_v32i16_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v32i16_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v32i16_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
%1 = icmp slt <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%3 = icmp sgt <32 x i16> %2, zeroinitializer
%4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
%5 = trunc <32 x i16> %4 to <32 x i8>
ret <32 x i8> %5
}
define <32 x i8> @trunc_packus_v32i32_v32i8(<32 x i32> %a0) {
; SSE-LABEL: trunc_packus_v32i32_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm7, %xmm6
; SSE-NEXT: packssdw %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v32i32_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v32i32_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v32i32_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovusdb %zmm0, %xmm0
; AVX512-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vpmovusdb %zmm1, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp slt <32 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%2 = select <32 x i1> %1, <32 x i32> %a0, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%3 = icmp sgt <32 x i32> %2, zeroinitializer
%4 = select <32 x i1> %3, <32 x i32> %2, <32 x i32> zeroinitializer
%5 = trunc <32 x i32> %4 to <32 x i8>
ret <32 x i8> %5
}