; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
; (x < 0) ? (x ^ 0x8000) : 0  ==  usubsat(x, 0x8000); should lower to psubusw.
define <8 x i16> @test1(<8 x i16> %x) nounwind {
; SSE-LABEL: test1:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0: # %vector.ph
; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
vector.ph:
  %0 = icmp slt <8 x i16> %x, zeroinitializer
  %1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
; (x > 32766) ? (x + -32767) : 0  ==  usubsat(x, 32767); should lower to psubusw.
define <8 x i16> @test2(<8 x i16> %x) nounwind {
; SSE-LABEL: test2:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0: # %vector.ph
; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
vector.ph:
  %0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
  ret <8 x i16> %res
}
; usubsat of a variable splat: (x < w) ? 0 : (x - w); should lower to a
; broadcast of %w followed by psubusw.
define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test3:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    movd %edi, %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT:    psubusw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test3:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpbroadcastw %edi, %xmm1
; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
vector.ph:
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = icmp ult <8 x i16> %x, %broadcast15
  %2 = sub <8 x i16> %x, %broadcast15
  %res = select <8 x i1> %1, <8 x i16> zeroinitializer, <8 x i16> %2
  ret <8 x i16> %res
}
; Byte variant of test1: (x < 0) ? (x ^ 0x80) : 0; should lower to psubusb.
define <16 x i8> @test4(<16 x i8> %x) nounwind {
; SSE-LABEL: test4:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0: # %vector.ph
; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
vector.ph:
  %0 = icmp slt <16 x i8> %x, zeroinitializer
  %1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
; Byte variant of test2: (x > 126) ? (x + -127) : 0; should lower to psubusb.
define <16 x i8> @test5(<16 x i8> %x) nounwind {
; SSE-LABEL: test5:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0: # %vector.ph
; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
vector.ph:
  %0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
; Byte variant of test3: usubsat against a variable byte splat; should lower
; to a broadcast of %w followed by psubusb.
define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-LABEL: test6:
; SSE2:       # %bb.0: # %vector.ph
; SSE2-NEXT:    movd %edi, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT:    psubusb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test6:
; SSSE3:       # %bb.0: # %vector.ph
; SSSE3-NEXT:    movd %edi, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    psubusb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: test6:
; SSE41:       # %bb.0: # %vector.ph
; SSE41-NEXT:    movd %edi, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    psubusb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test6:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpbroadcastb %edi, %xmm1
; AVX512-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
vector.ph:
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = icmp ult <16 x i8> %x, %broadcast15
  %2 = sub <16 x i8> %x, %broadcast15
  %res = select <16 x i1> %1, <16 x i8> zeroinitializer, <16 x i8> %2
  ret <16 x i8> %res
}
; 256-bit variant of test1; SSE splits into two psubusw ops, AVX1 splits via
; extract/insert, AVX2/AVX512 use a single ymm psubusw.
define <16 x i16> @test7(<16 x i16> %x) nounwind {
; SSE-LABEL: test7:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT:    psubusw %xmm2, %xmm0
; SSE-NEXT:    psubusw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT:    vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test7:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
vector.ph:
  %0 = icmp slt <16 x i16> %x, zeroinitializer
  %1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; 256-bit variant of test2: (x > 32766) ? (x + -32767) : 0 == usubsat(x, 32767).
define <16 x i16> @test8(<16 x i16> %x) nounwind {
; SSE-LABEL: test8:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT:    psubusw %xmm2, %xmm0
; SSE-NEXT:    psubusw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX1-NEXT:    vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
vector.ph:
  %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Non-splat variant of test8: per-lane constants still form a valid usubsat
; (each add constant is the negation of its compare constant plus one).
define <16 x i16> @test8a(<16 x i16> %x) nounwind {
; SSE-LABEL: test8a:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8a:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8a:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8a:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
vector.ph:
  %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32765, i16 32764, i16 32763, i16 32762, i16 32761, i16 32760, i16 32759, i16 32758, i16 32757, i16 32756, i16 32755, i16 32754, i16 32753, i16 32752, i16 32751>
  %1 = add <16 x i16> %x, <i16 -32767, i16 -32766, i16 -32765, i16 -32764, i16 -32763, i16 -32762, i16 -32761, i16 -32760, i16 -32759, i16 -32758, i16 -32757, i16 -32756, i16 -32755, i16 -32754, i16 -32753, i16 -32752>
  %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; 256-bit variant of test3: usubsat against a variable word splat.
define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test9:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    movd %edi, %xmm2
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE-NEXT:    psubusw %xmm2, %xmm0
; SSE-NEXT:    psubusw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test9:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovd %edi, %xmm2
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT:    vpsubusw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
; AVX2-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test9:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpbroadcastw %edi, %ymm1
; AVX512-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
vector.ph:
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  %1 = icmp ult <16 x i16> %x, %broadcast15
  %2 = sub <16 x i16> %x, %broadcast15
  %res = select <16 x i1> %1, <16 x i16> zeroinitializer, <16 x i16> %2
  ret <16 x i16> %res
}
; 256-bit byte variant of test1: (x < 0) ? (x ^ 0x80) : 0 == usubsat(x, 0x80).
define <32 x i8> @test10(<32 x i8> %x) nounwind {
; SSE-LABEL: test10:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT:    psubusb %xmm2, %xmm0
; SSE-NEXT:    psubusb %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test10:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT:    vpsubusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test10:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
vector.ph:
  %0 = icmp slt <32 x i8> %x, zeroinitializer
  %1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}
; 256-bit byte variant of test2: (x > 126) ? (x + -127) : 0 == usubsat(x, 127).
define <32 x i8> @test11(<32 x i8> %x) nounwind {
; SSE-LABEL: test11:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT:    psubusb %xmm2, %xmm0
; SSE-NEXT:    psubusb %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpsubusb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubusb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test11:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
vector.ph:
  %0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}
; Non-splat variant of test11: per-lane byte constants still form a valid
; usubsat (each add constant is the negation of its compare constant plus one).
define <32 x i8> @test11a(<32 x i8> %x) nounwind {
; SSE-LABEL: test11a:
; SSE:       # %bb.0: # %vector.ph
; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11a:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11a:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test11a:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
vector.ph:
  %0 = icmp ugt <32 x i8> %x, <i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 117, i8 116, i8 115, i8 114, i8 113, i8 112, i8 111, i8 110, i8 109, i8 108, i8 107, i8 106, i8 105, i8 104, i8 103, i8 102, i8 101, i8 100, i8 99, i8 98, i8 97, i8 96, i8 95>
  %1 = add <32 x i8> %x, <i8 -127, i8 -126, i8 -125, i8 -124, i8 -123, i8 -122, i8 -121, i8 -120, i8 -119, i8 -118, i8 -117, i8 -116, i8 -115, i8 -114, i8 -113, i8 -112, i8 -111, i8 -110, i8 -109, i8 -108, i8 -107, i8 -106, i8 -105, i8 -104, i8 -103, i8 -102, i8 -101, i8 -100, i8 -99, i8 -98, i8 -97, i8 -96>
  %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}
; 256-bit byte variant of test6: usubsat against a variable byte splat.
define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-LABEL: test12:
; SSE2:       # %bb.0: # %vector.ph
; SSE2-NEXT:    movd %edi, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT:    psubusb %xmm2, %xmm0
; SSE2-NEXT:    psubusb %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test12:
; SSSE3:       # %bb.0: # %vector.ph
; SSSE3-NEXT:    movd %edi, %xmm2
; SSSE3-NEXT:    pxor %xmm3, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm2
; SSSE3-NEXT:    psubusb %xmm2, %xmm0
; SSSE3-NEXT:    psubusb %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: test12:
; SSE41:       # %bb.0: # %vector.ph
; SSE41-NEXT:    movd %edi, %xmm2
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pshufb %xmm3, %xmm2
; SSE41-NEXT:    psubusb %xmm2, %xmm0
; SSE41-NEXT:    psubusb %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsubusb %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test12:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test12:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpbroadcastb %edi, %ymm1
; AVX512-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
vector.ph:
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  %1 = icmp ult <32 x i8> %x, %broadcast15
  %2 = sub <32 x i8> %x, %broadcast15
  %res = select <32 x i1> %1, <32 x i8> zeroinitializer, <32 x i8> %2
  ret <32 x i8> %res
}
define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-LABEL: test13:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0: # %vector.ph
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm4, %xmm4
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm2, %xmm0
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm2, %xmm6
|
|
|
|
; SSE2-NEXT: pxor %xmm4, %xmm6
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSE2-NEXT: por %xmm4, %xmm5
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm2
|
|
|
|
; SSE2-NEXT: pxor %xmm4, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSE2-NEXT: por %xmm3, %xmm4
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSE2-NEXT: packssdw %xmm6, %xmm2
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm1, %xmm3
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm0
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pslld $16, %xmm3
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm3
|
|
|
|
; SSE2-NEXT: packssdw %xmm0, %xmm3
|
|
|
|
; SSE2-NEXT: pandn %xmm3, %xmm2
|
|
|
|
; SSE2-NEXT: movdqa %xmm2, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: test13:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSSE3: # %bb.0: # %vector.ph
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSSE3-NEXT: pxor %xmm3, %xmm3
|
|
|
|
; SSSE3-NEXT: movdqa %xmm0, %xmm4
|
|
|
|
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
|
|
|
|
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm0, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: psubd %xmm2, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm2, %xmm6
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSSE3-NEXT: pxor %xmm3, %xmm6
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSSE3-NEXT: por %xmm3, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
|
|
|
|
; SSSE3-NEXT: movdqa %xmm1, %xmm2
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSSE3-NEXT: pxor %xmm3, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSSE3-NEXT: por %xmm4, %xmm3
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
|
|
|
|
; SSSE3-NEXT: packssdw %xmm6, %xmm2
|
|
|
|
; SSSE3-NEXT: psubd %xmm1, %xmm4
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; SSSE3-NEXT: pshufb %xmm1, %xmm0
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSSE3-NEXT: pshufb %xmm1, %xmm4
|
|
|
|
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
|
|
|
|
; SSSE3-NEXT: pandn %xmm4, %xmm2
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm2, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-LABEL: test13:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE41: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
|
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: movdqa %xmm4, %xmm0
|
|
|
|
; SSE41-NEXT: pmaxud %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: pxor %xmm5, %xmm0
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm0
|
|
|
|
; SSE41-NEXT: movdqa %xmm3, %xmm7
|
|
|
|
; SSE41-NEXT: pmaxud %xmm2, %xmm7
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
|
|
|
|
; SSE41-NEXT: pxor %xmm5, %xmm7
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm7
|
|
|
|
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSE41-NEXT: psubd %xmm2, %xmm3
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: psubd %xmm1, %xmm4
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm4
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm3
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
|
|
|
|
; SSE41-NEXT: pandn %xmm4, %xmm0
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-LABEL: test13:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX1-NEXT: vpmaxud %xmm5, %xmm2, %xmm6
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm2, %xmm6
|
|
|
|
; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4
|
|
|
|
; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-11-22 19:29:19 +08:00
|
|
|
; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: test13:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm2
|
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
|
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
|
|
|
|
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
|
2017-10-24 23:38:16 +08:00
|
|
|
; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2016-11-22 19:29:19 +08:00
|
|
|
; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
2017-09-27 22:38:05 +08:00
|
|
|
;
|
|
|
|
; AVX512-LABEL: test13:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
|
|
|
; AVX512-NEXT: vpcmpnltud %ymm1, %ymm0, %k1
|
|
|
|
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
2016-11-16 21:59:03 +08:00
|
|
|
vector.ph:
|
2017-05-17 21:39:16 +08:00
|
|
|
%lhs = zext <8 x i16> %x to <8 x i32>
|
|
|
|
%cond = icmp ult <8 x i32> %lhs, %y
|
|
|
|
%sub = sub <8 x i32> %lhs, %y
|
|
|
|
%trunc = trunc <8 x i32> %sub to <8 x i16>
|
|
|
|
%res = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %trunc
|
|
|
|
ret <8 x i16> %res
|
2016-11-16 21:59:03 +08:00
|
|
|
}
|
|
|
|
|
2017-05-17 21:39:16 +08:00
|
|
|
define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-LABEL: test14:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0: # %vector.ph
|
2017-03-16 15:17:12 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm0, %xmm0
|
|
|
|
; SSE2-NEXT: movdqa %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm6, %xmm8
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm5, %xmm10
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
|
|
|
|
; SSE2-NEXT: movdqa %xmm4, %xmm9
|
|
|
|
; SSE2-NEXT: pxor %xmm0, %xmm9
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm4
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSE2-NEXT: por %xmm0, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pcmpgtd %xmm9, %xmm5
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255]
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm5
|
|
|
|
; SSE2-NEXT: movdqa %xmm3, %xmm7
|
|
|
|
; SSE2-NEXT: pxor %xmm0, %xmm7
|
|
|
|
; SSE2-NEXT: psubd %xmm10, %xmm3
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSE2-NEXT: por %xmm0, %xmm10
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm10
|
|
|
|
; SSE2-NEXT: packuswb %xmm5, %xmm10
|
|
|
|
; SSE2-NEXT: movdqa %xmm2, %xmm5
|
|
|
|
; SSE2-NEXT: pxor %xmm0, %xmm5
|
|
|
|
; SSE2-NEXT: psubd %xmm6, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSE2-NEXT: por %xmm0, %xmm6
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm6
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm5
|
|
|
|
; SSE2-NEXT: pxor %xmm0, %xmm5
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSE2-NEXT: por %xmm8, %xmm0
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm0
|
|
|
|
; SSE2-NEXT: packuswb %xmm6, %xmm0
|
|
|
|
; SSE2-NEXT: packuswb %xmm10, %xmm0
|
|
|
|
; SSE2-NEXT: psubd %xmm8, %xmm1
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm4
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm3
|
|
|
|
; SSE2-NEXT: packuswb %xmm4, %xmm3
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm2
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm1
|
|
|
|
; SSE2-NEXT: packuswb %xmm2, %xmm1
|
|
|
|
; SSE2-NEXT: packuswb %xmm3, %xmm1
|
|
|
|
; SSE2-NEXT: pandn %xmm1, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: test14:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSSE3: # %bb.0: # %vector.ph
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm0, %xmm5
|
|
|
|
; SSSE3-NEXT: pxor %xmm0, %xmm0
|
|
|
|
; SSSE3-NEXT: movdqa %xmm5, %xmm7
|
|
|
|
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm7, %xmm8
|
|
|
|
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
|
|
|
|
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
|
|
|
|
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm5, %xmm10
|
|
|
|
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
|
|
|
|
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm2, %xmm9
|
|
|
|
; SSSE3-NEXT: pxor %xmm0, %xmm9
|
|
|
|
; SSSE3-NEXT: psubd %xmm5, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSSE3-NEXT: por %xmm0, %xmm5
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
|
|
|
; SSSE3-NEXT: pshufb %xmm9, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm1, %xmm6
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSSE3-NEXT: pxor %xmm0, %xmm6
|
|
|
|
; SSSE3-NEXT: psubd %xmm10, %xmm1
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSSE3-NEXT: por %xmm0, %xmm10
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm10
|
|
|
|
; SSSE3-NEXT: pshufb %xmm9, %xmm10
|
|
|
|
; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm4, %xmm5
|
|
|
|
; SSSE3-NEXT: pxor %xmm0, %xmm5
|
|
|
|
; SSSE3-NEXT: psubd %xmm7, %xmm4
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSSE3-NEXT: por %xmm0, %xmm7
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
|
2017-03-16 15:17:12 +08:00
|
|
|
; SSSE3-NEXT: pshufb %xmm5, %xmm7
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm3, %xmm6
|
|
|
|
; SSSE3-NEXT: pxor %xmm0, %xmm6
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSSE3-NEXT: por %xmm8, %xmm0
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
|
|
|
|
; SSSE3-NEXT: pshufb %xmm5, %xmm0
|
|
|
|
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
|
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
|
|
|
|
; SSSE3-NEXT: psubd %xmm8, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; SSSE3-NEXT: pand %xmm5, %xmm4
|
|
|
|
; SSSE3-NEXT: pand %xmm5, %xmm3
|
|
|
|
; SSSE3-NEXT: packuswb %xmm4, %xmm3
|
|
|
|
; SSSE3-NEXT: pand %xmm5, %xmm2
|
|
|
|
; SSSE3-NEXT: pand %xmm5, %xmm1
|
|
|
|
; SSSE3-NEXT: packuswb %xmm2, %xmm1
|
|
|
|
; SSSE3-NEXT: packuswb %xmm3, %xmm1
|
|
|
|
; SSSE3-NEXT: andnpd %xmm1, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-LABEL: test14:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE41: # %bb.0: # %vector.ph
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
|
|
|
|
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
|
|
|
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
|
|
|
; SSE41-NEXT: movdqa %xmm4, %xmm0
|
|
|
|
; SSE41-NEXT: pmaxud %xmm10, %xmm0
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
|
|
|
|
; SSE41-NEXT: pxor %xmm6, %xmm0
|
|
|
|
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
|
|
|
|
; SSE41-NEXT: pshufb %xmm7, %xmm0
|
|
|
|
; SSE41-NEXT: movdqa %xmm3, %xmm5
|
|
|
|
; SSE41-NEXT: pmaxud %xmm9, %xmm5
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
|
|
|
|
; SSE41-NEXT: pxor %xmm6, %xmm5
|
|
|
|
; SSE41-NEXT: pshufb %xmm7, %xmm5
|
|
|
|
; SSE41-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
|
|
|
|
; SSE41-NEXT: movdqa %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: pmaxud %xmm8, %xmm0
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: pxor %xmm6, %xmm0
|
|
|
|
; SSE41-NEXT: movdqa {{.*#+}} xmm12 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
|
|
|
; SSE41-NEXT: pshufb %xmm12, %xmm0
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: movdqa %xmm2, %xmm7
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: pmaxud %xmm11, %xmm7
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
|
|
|
|
; SSE41-NEXT: pxor %xmm6, %xmm7
|
|
|
|
; SSE41-NEXT: pshufb %xmm12, %xmm7
|
|
|
|
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
|
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
|
|
|
|
; SSE41-NEXT: psubd %xmm11, %xmm2
|
|
|
|
; SSE41-NEXT: psubd %xmm8, %xmm1
|
|
|
|
; SSE41-NEXT: psubd %xmm9, %xmm3
|
|
|
|
; SSE41-NEXT: psubd %xmm10, %xmm4
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; SSE41-NEXT: pand %xmm5, %xmm4
|
|
|
|
; SSE41-NEXT: pand %xmm5, %xmm3
|
2018-06-08 21:59:11 +08:00
|
|
|
; SSE41-NEXT: packusdw %xmm4, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: pand %xmm5, %xmm1
|
|
|
|
; SSE41-NEXT: pand %xmm5, %xmm2
|
2018-06-08 21:59:11 +08:00
|
|
|
; SSE41-NEXT: packusdw %xmm2, %xmm1
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: packuswb %xmm3, %xmm1
|
2017-09-18 22:23:23 +08:00
|
|
|
; SSE41-NEXT: pandn %xmm1, %xmm0
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-LABEL: test14:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
|
|
|
|
; AVX1-NEXT: vpmaxud %xmm0, %xmm6, %xmm7
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm7
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7
|
|
|
|
; AVX1-NEXT: vpmaxud %xmm11, %xmm2, %xmm4
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
|
|
|
|
; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm10
|
|
|
|
; AVX1-NEXT: vpmaxud %xmm9, %xmm1, %xmm7
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm1, %xmm7
|
|
|
|
; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
|
|
|
|
; AVX1-NEXT: vpmaxud %xmm8, %xmm4, %xmm5
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5
|
|
|
|
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm3
|
|
|
|
; AVX1-NEXT: vpackssdw %xmm3, %xmm7, %xmm3
|
|
|
|
; AVX1-NEXT: vpacksswb %xmm10, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpsubd %xmm8, %xmm4, %xmm4
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpsubd %xmm9, %xmm1, %xmm1
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX1-NEXT: vpsubd %xmm11, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
|
2018-06-08 21:59:11 +08:00
|
|
|
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm2
|
2018-06-08 21:59:11 +08:00
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
|
2017-09-18 22:23:23 +08:00
|
|
|
; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: test14:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm4
|
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4
|
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
|
|
|
|
; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
|
2017-03-16 15:17:12 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
|
2017-10-24 23:38:16 +08:00
|
|
|
; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm6
|
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm2, %ymm6
|
|
|
|
; AVX2-NEXT: vpxor %ymm5, %ymm6, %ymm5
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
|
|
|
|
; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
|
|
|
|
; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
|
|
|
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
2017-03-16 15:17:12 +08:00
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2017-09-18 22:23:23 +08:00
|
|
|
; AVX2-NEXT: vpandn %xmm0, %xmm4, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
2017-09-27 22:38:05 +08:00
|
|
|
;
|
|
|
|
; AVX512-LABEL: test14:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
|
|
|
|
; AVX512-NEXT: vpcmpnltud %zmm0, %zmm1, %k1
|
|
|
|
; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
2016-11-16 21:59:03 +08:00
|
|
|
vector.ph:
|
2017-05-17 21:39:16 +08:00
|
|
|
%rhs = zext <16 x i8> %x to <16 x i32>
|
|
|
|
%cond = icmp ult <16 x i32> %y, %rhs
|
|
|
|
%sub = sub <16 x i32> %y, %rhs
|
|
|
|
%truncsub = trunc <16 x i32> %sub to <16 x i8>
|
|
|
|
%res = select <16 x i1> %cond, <16 x i8> zeroinitializer, <16 x i8> %truncsub
|
|
|
|
ret <16 x i8> %res
|
2016-11-16 21:59:03 +08:00
|
|
|
}
|
|
|
|
|
2017-05-17 21:39:16 +08:00
|
|
|
define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-LABEL: test15:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm4, %xmm4
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm3, %xmm5
|
|
|
|
; SSE2-NEXT: psubd %xmm2, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm4, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSE2-NEXT: por %xmm4, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm2
|
|
|
|
; SSE2-NEXT: pxor %xmm4, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSE2-NEXT: por %xmm0, %xmm4
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSE2-NEXT: packssdw %xmm5, %xmm4
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm1, %xmm0
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pslld $16, %xmm3
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm3
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm0
|
|
|
|
; SSE2-NEXT: packssdw %xmm3, %xmm0
|
|
|
|
; SSE2-NEXT: pand %xmm4, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: test15:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSSE3: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: pxor %xmm4, %xmm4
|
|
|
|
; SSSE3-NEXT: movdqa %xmm0, %xmm3
|
|
|
|
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
|
|
|
|
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm0, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: psubd %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: pxor %xmm4, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSSE3-NEXT: por %xmm4, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm1, %xmm2
|
|
|
|
; SSSE3-NEXT: pxor %xmm4, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSSE3-NEXT: por %xmm3, %xmm4
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSSE3-NEXT: packssdw %xmm5, %xmm4
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: psubd %xmm1, %xmm3
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; SSSE3-NEXT: pshufb %xmm1, %xmm0
|
|
|
|
; SSSE3-NEXT: pshufb %xmm1, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
|
|
|
|
; SSSE3-NEXT: pand %xmm4, %xmm3
|
|
|
|
; SSSE3-NEXT: movdqa %xmm3, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-LABEL: test15:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE41: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
|
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
|
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: movdqa %xmm0, %xmm4
|
|
|
|
; SSE41-NEXT: pminud %xmm1, %xmm4
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
|
|
|
|
; SSE41-NEXT: pxor %xmm5, %xmm4
|
|
|
|
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm4
|
|
|
|
; SSE41-NEXT: movdqa %xmm3, %xmm7
|
|
|
|
; SSE41-NEXT: pminud %xmm2, %xmm7
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
|
|
|
|
; SSE41-NEXT: pxor %xmm5, %xmm7
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm7
|
|
|
|
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: psubd %xmm2, %xmm3
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: psubd %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm0
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: pand %xmm4, %xmm0
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-LABEL: test15:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX1-NEXT: vpminud %xmm5, %xmm2, %xmm6
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm2, %xmm6
|
|
|
|
; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4
|
|
|
|
; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: test15:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm2
|
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
|
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
|
|
|
|
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
|
2017-10-24 23:38:16 +08:00
|
|
|
; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
2017-09-27 22:38:05 +08:00
|
|
|
;
|
|
|
|
; AVX512-LABEL: test15:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
|
|
|
; AVX512-NEXT: vpcmpnleud %ymm1, %ymm0, %k1
|
|
|
|
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
2016-11-16 21:59:03 +08:00
|
|
|
vector.ph:
|
2017-05-17 21:39:16 +08:00
|
|
|
%lhs = zext <8 x i16> %x to <8 x i32>
|
|
|
|
%cond = icmp ugt <8 x i32> %lhs, %y
|
|
|
|
%sub = sub <8 x i32> %lhs, %y
|
|
|
|
%truncsub = trunc <8 x i32> %sub to <8 x i16>
|
|
|
|
%res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
|
|
|
|
ret <8 x i16> %res
|
2016-11-16 21:59:03 +08:00
|
|
|
}
|
|
|
|
|
2017-05-17 21:39:16 +08:00
|
|
|
define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-LABEL: test16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm4, %xmm4
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm3, %xmm5
|
|
|
|
; SSE2-NEXT: psubd %xmm2, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm4, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSE2-NEXT: por %xmm4, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm2
|
|
|
|
; SSE2-NEXT: pxor %xmm4, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSE2-NEXT: por %xmm0, %xmm4
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSE2-NEXT: packssdw %xmm5, %xmm4
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm1, %xmm0
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE2-NEXT: pslld $16, %xmm3
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm3
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSE2-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm0
|
|
|
|
; SSE2-NEXT: packssdw %xmm3, %xmm0
|
|
|
|
; SSE2-NEXT: pand %xmm4, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: test16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSSE3: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: pxor %xmm4, %xmm4
|
|
|
|
; SSSE3-NEXT: movdqa %xmm0, %xmm3
|
|
|
|
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
|
|
|
|
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm0, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: psubd %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: pxor %xmm4, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSSE3-NEXT: por %xmm4, %xmm5
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm1, %xmm2
|
|
|
|
; SSSE3-NEXT: pxor %xmm4, %xmm2
|
2018-01-08 03:09:40 +08:00
|
|
|
; SSSE3-NEXT: por %xmm3, %xmm4
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
|
2017-10-05 01:31:28 +08:00
|
|
|
; SSSE3-NEXT: packssdw %xmm5, %xmm4
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: psubd %xmm1, %xmm3
|
2017-10-03 20:01:31 +08:00
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; SSSE3-NEXT: pshufb %xmm1, %xmm0
|
|
|
|
; SSSE3-NEXT: pshufb %xmm1, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
|
|
|
|
; SSSE3-NEXT: pand %xmm4, %xmm3
|
|
|
|
; SSSE3-NEXT: movdqa %xmm3, %xmm0
|
2016-11-16 21:59:03 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-LABEL: test16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE41: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
|
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
|
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: movdqa %xmm1, %xmm4
|
|
|
|
; SSE41-NEXT: pmaxud %xmm0, %xmm4
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm1, %xmm4
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
|
|
|
|
; SSE41-NEXT: pxor %xmm5, %xmm4
|
|
|
|
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm4
|
|
|
|
; SSE41-NEXT: movdqa %xmm2, %xmm7
|
|
|
|
; SSE41-NEXT: pmaxud %xmm3, %xmm7
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
|
|
|
|
; SSE41-NEXT: pxor %xmm5, %xmm7
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm7
|
|
|
|
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: psubd %xmm2, %xmm3
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: psubd %xmm1, %xmm0
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm0
|
|
|
|
; SSE41-NEXT: pshufb %xmm6, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
|
2018-02-12 01:11:40 +08:00
|
|
|
; SSE41-NEXT: pand %xmm4, %xmm0
|
2017-05-17 21:39:16 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-LABEL: test16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm3
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX1-NEXT: vpmaxud %xmm2, %xmm5, %xmm6
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6
|
|
|
|
; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4
|
|
|
|
; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: test16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
2018-02-12 01:11:40 +08:00
|
|
|
; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm2
|
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2
|
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
|
|
|
|
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
|
2017-10-24 23:38:16 +08:00
|
|
|
; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
|
2017-05-17 21:39:16 +08:00
|
|
|
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
|
2017-02-06 02:33:14 +08:00
|
|
|
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
2016-11-16 21:59:03 +08:00
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
2017-09-27 22:38:05 +08:00
|
|
|
;
|
|
|
|
; AVX512-LABEL: test16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
|
|
|
; AVX512-NEXT: vpcmpltud %ymm0, %ymm1, %k1
|
|
|
|
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
2016-11-16 21:59:03 +08:00
|
|
|
vector.ph:
|
2017-05-17 21:39:16 +08:00
|
|
|
%lhs = zext <8 x i16> %x to <8 x i32>
|
|
|
|
%cond = icmp ult <8 x i32> %y, %lhs
|
|
|
|
%sub = sub <8 x i32> %lhs, %y
|
|
|
|
%truncsub = trunc <8 x i32> %sub to <8 x i16>
|
|
|
|
%res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
|
|
|
|
ret <8 x i16> %res
|
2016-11-16 21:59:03 +08:00
|
|
|
}
|
2017-09-27 22:38:05 +08:00
|
|
|
|
|
|
|
define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind {
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE-LABEL: psubus_8i16_max:
|
|
|
|
; SSE: # %bb.0: # %vector.ph
|
|
|
|
; SSE-NEXT: psubusw %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
2017-09-27 22:38:05 +08:00
|
|
|
;
|
|
|
|
; AVX-LABEL: psubus_8i16_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX-NEXT: retq
|
|
|
|
vector.ph:
|
|
|
|
%cmp = icmp ult <8 x i16> %x, %y
|
|
|
|
%max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x
|
|
|
|
%res = sub <8 x i16> %max, %y
|
|
|
|
ret <8 x i16> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind {
|
|
|
|
; SSE-LABEL: psubus_16i8_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; SSE-NEXT: psubusb %xmm1, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: psubus_16i8_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX-NEXT: retq
|
|
|
|
vector.ph:
|
|
|
|
%cmp = icmp ult <16 x i8> %x, %y
|
|
|
|
%max = select <16 x i1> %cmp, <16 x i8> %y, <16 x i8> %x
|
|
|
|
%res = sub <16 x i8> %max, %y
|
|
|
|
ret <16 x i8> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind {
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE-LABEL: psubus_16i16_max:
|
|
|
|
; SSE: # %bb.0: # %vector.ph
|
|
|
|
; SSE-NEXT: psubusw %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: psubusw %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: retq
|
2017-09-27 22:38:05 +08:00
|
|
|
;
|
|
|
|
; AVX1-LABEL: psubus_16i16_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: psubus_16i16_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: psubus_16i16_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
vector.ph:
|
|
|
|
%cmp = icmp ult <16 x i16> %x, %y
|
|
|
|
%max = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> %x
|
|
|
|
%res = sub <16 x i16> %max, %y
|
|
|
|
ret <16 x i16> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <32 x i16> @psubus_32i16_max(<32 x i16> %x, <32 x i16> %y) nounwind {
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE-LABEL: psubus_32i16_max:
|
|
|
|
; SSE: # %bb.0: # %vector.ph
|
|
|
|
; SSE-NEXT: psubusw %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: psubusw %xmm5, %xmm1
|
|
|
|
; SSE-NEXT: psubusw %xmm6, %xmm2
|
|
|
|
; SSE-NEXT: psubusw %xmm7, %xmm3
|
|
|
|
; SSE-NEXT: retq
|
2017-09-27 22:38:05 +08:00
|
|
|
;
|
|
|
|
; AVX1-LABEL: psubus_32i16_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
2018-02-24 20:44:12 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX1-NEXT: vpsubusw %xmm4, %xmm5, %xmm4
|
|
|
|
; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
|
2018-02-24 20:44:12 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
|
|
|
|
; AVX1-NEXT: vpsubusw %xmm2, %xmm4, %xmm2
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
|
2018-02-24 20:44:12 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: psubus_32i16_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpsubusw %ymm3, %ymm1, %ymm1
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: psubus_32i16_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
vector.ph:
|
|
|
|
%cmp = icmp ult <32 x i16> %x, %y
|
|
|
|
%max = select <32 x i1> %cmp, <32 x i16> %y, <32 x i16> %x
|
|
|
|
%res = sub <32 x i16> %max, %y
|
|
|
|
ret <32 x i16> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind {
|
|
|
|
; SSE-LABEL: psubus_64i8_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; SSE-NEXT: psubusb %xmm4, %xmm0
|
|
|
|
; SSE-NEXT: psubusb %xmm5, %xmm1
|
|
|
|
; SSE-NEXT: psubusb %xmm6, %xmm2
|
|
|
|
; SSE-NEXT: psubusb %xmm7, %xmm3
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: psubus_64i8_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
2018-02-24 20:44:12 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX1-NEXT: vpsubusb %xmm4, %xmm5, %xmm4
|
|
|
|
; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
|
2018-02-24 20:44:12 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
|
|
|
|
; AVX1-NEXT: vpsubusb %xmm2, %xmm4, %xmm2
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1
|
2018-02-24 20:44:12 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: psubus_64i8_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpsubusb %ymm3, %ymm1, %ymm1
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: psubus_64i8_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
vector.ph:
|
|
|
|
%cmp = icmp ult <64 x i8> %x, %y
|
|
|
|
%max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
|
|
|
|
%res = sub <64 x i8> %max, %y
|
|
|
|
ret <64 x i8> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind {
|
|
|
|
; SSE-LABEL: psubus_32i8_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; SSE-NEXT: psubusb %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: psubusb %xmm3, %xmm1
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: psubus_32i8_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: psubus_32i8_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: psubus_32i8_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
vector.ph:
|
|
|
|
%cmp = icmp ult <32 x i8> %x, %y
|
|
|
|
%max = select <32 x i1> %cmp, <32 x i8> %y, <32 x i8> %x
|
|
|
|
%res = sub <32 x i8> %max, %y
|
|
|
|
ret <32 x i8> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
|
|
|
|
; SSE2-LABEL: psubus_8i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0: # %vector.ph
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm4, %xmm4
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
|
|
|
|
; SSE2-NEXT: movdqa %xmm2, %xmm6
|
|
|
|
; SSE2-NEXT: pxor %xmm5, %xmm6
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm3, %xmm4
|
|
|
|
; SSE2-NEXT: por %xmm5, %xmm4
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
|
|
|
|
; SSE2-NEXT: pand %xmm4, %xmm3
|
|
|
|
; SSE2-NEXT: pandn %xmm2, %xmm4
|
|
|
|
; SSE2-NEXT: por %xmm3, %xmm4
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm3
|
|
|
|
; SSE2-NEXT: pxor %xmm5, %xmm3
|
|
|
|
; SSE2-NEXT: por %xmm0, %xmm5
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
|
|
|
|
; SSE2-NEXT: pand %xmm5, %xmm0
|
|
|
|
; SSE2-NEXT: pandn %xmm1, %xmm5
|
|
|
|
; SSE2-NEXT: por %xmm5, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm1, %xmm0
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm2, %xmm4
|
|
|
|
; SSE2-NEXT: pslld $16, %xmm4
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm4
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm0
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: packssdw %xmm4, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: psubus_8i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSSE3: # %bb.0: # %vector.ph
|
2018-02-24 21:39:13 +08:00
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm2, %xmm5
|
|
|
|
; SSSE3-NEXT: pxor %xmm4, %xmm5
|
2018-02-24 21:39:13 +08:00
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm6, %xmm7
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
|
|
|
|
; SSSE3-NEXT: pand %xmm7, %xmm2
|
|
|
|
; SSSE3-NEXT: pandn %xmm5, %xmm7
|
|
|
|
; SSSE3-NEXT: por %xmm2, %xmm7
|
|
|
|
; SSSE3-NEXT: pshufb %xmm3, %xmm7
|
|
|
|
; SSSE3-NEXT: pxor %xmm1, %xmm4
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
|
|
|
|
; SSSE3-NEXT: pand %xmm6, %xmm1
|
|
|
|
; SSSE3-NEXT: pandn %xmm5, %xmm6
|
|
|
|
; SSSE3-NEXT: por %xmm1, %xmm6
|
|
|
|
; SSSE3-NEXT: pshufb %xmm3, %xmm6
|
|
|
|
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
|
|
|
|
; SSSE3-NEXT: psubusw %xmm6, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: psubus_8i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE41: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
|
|
|
|
; SSE41-NEXT: pminud %xmm3, %xmm2
|
|
|
|
; SSE41-NEXT: pminud %xmm3, %xmm1
|
|
|
|
; SSE41-NEXT: packusdw %xmm2, %xmm1
|
|
|
|
; SSE41-NEXT: psubusw %xmm1, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: psubus_8i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
2017-11-02 05:52:29 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
|
|
|
|
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: psubus_8i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
|
|
|
|
; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
|
2017-11-02 05:52:29 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: psubus_8i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
|
|
|
|
; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
|
|
|
vector.ph:
|
|
|
|
%lhs = zext <8 x i16> %x to <8 x i32>
|
|
|
|
%cond = icmp ult <8 x i32> %lhs, %y
|
|
|
|
%max = select <8 x i1> %cond, <8 x i32> %y, <8 x i32> %lhs
|
|
|
|
%sub = sub <8 x i32> %max, %y
|
|
|
|
%res = trunc <8 x i32> %sub to <8 x i16>
|
|
|
|
ret <8 x i16> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
|
|
|
|
; SSE2-LABEL: psubus_8i64_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0: # %vector.ph
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm5, %xmm5
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm10
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm10, %xmm8
|
|
|
|
; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
|
|
|
|
; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm9
|
|
|
|
; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
|
|
|
|
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
|
|
|
|
; SSE2-NEXT: movdqa %xmm2, %xmm6
|
|
|
|
; SSE2-NEXT: pxor %xmm11, %xmm6
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm7
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: por %xmm11, %xmm7
|
|
|
|
; SSE2-NEXT: movdqa %xmm7, %xmm5
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,2,2]
|
|
|
|
; SSE2-NEXT: pcmpeqd %xmm6, %xmm7
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
|
|
|
|
; SSE2-NEXT: pand %xmm12, %xmm7
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3]
|
|
|
|
; SSE2-NEXT: por %xmm7, %xmm13
|
|
|
|
; SSE2-NEXT: pand %xmm13, %xmm0
|
|
|
|
; SSE2-NEXT: pandn %xmm2, %xmm13
|
|
|
|
; SSE2-NEXT: por %xmm0, %xmm13
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm0
|
|
|
|
; SSE2-NEXT: pxor %xmm11, %xmm0
|
|
|
|
; SSE2-NEXT: movdqa %xmm9, %xmm5
|
|
|
|
; SSE2-NEXT: por %xmm11, %xmm5
|
|
|
|
; SSE2-NEXT: movdqa %xmm5, %xmm7
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,2,2]
|
|
|
|
; SSE2-NEXT: pcmpeqd %xmm0, %xmm5
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: pand %xmm12, %xmm5
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
|
|
|
|
; SSE2-NEXT: por %xmm5, %xmm0
|
|
|
|
; SSE2-NEXT: pand %xmm0, %xmm9
|
|
|
|
; SSE2-NEXT: pandn %xmm1, %xmm0
|
|
|
|
; SSE2-NEXT: por %xmm9, %xmm0
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm4, %xmm5
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm11, %xmm5
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm10, %xmm7
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: por %xmm11, %xmm7
|
|
|
|
; SSE2-NEXT: movdqa %xmm7, %xmm6
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm5
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: por %xmm5, %xmm7
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm10
|
|
|
|
; SSE2-NEXT: pandn %xmm4, %xmm7
|
|
|
|
; SSE2-NEXT: por %xmm10, %xmm7
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm3, %xmm5
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm11, %xmm5
|
|
|
|
; SSE2-NEXT: por %xmm8, %xmm11
|
|
|
|
; SSE2-NEXT: movdqa %xmm11, %xmm6
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
|
|
|
|
; SSE2-NEXT: pcmpeqd %xmm5, %xmm11
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,3,3]
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm5
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
|
|
|
|
; SSE2-NEXT: por %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: pand %xmm6, %xmm8
|
|
|
|
; SSE2-NEXT: pandn %xmm3, %xmm6
|
|
|
|
; SSE2-NEXT: por %xmm8, %xmm6
|
|
|
|
; SSE2-NEXT: psubq %xmm3, %xmm6
|
|
|
|
; SSE2-NEXT: psubq %xmm4, %xmm7
|
|
|
|
; SSE2-NEXT: psubq %xmm1, %xmm0
|
|
|
|
; SSE2-NEXT: psubq %xmm2, %xmm13
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,2,3]
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
|
|
|
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
|
|
|
|
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
|
|
|
|
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
|
2017-10-05 01:20:12 +08:00
|
|
|
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: psubus_8i64_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSSE3: # %bb.0: # %vector.ph
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm2, %xmm7
|
|
|
|
; SSSE3-NEXT: pxor %xmm5, %xmm7
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm8, %xmm6
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
|
|
|
|
; SSSE3-NEXT: pand %xmm9, %xmm7
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
|
|
|
|
; SSSE3-NEXT: por %xmm7, %xmm6
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535]
|
|
|
|
; SSSE3-NEXT: pand %xmm6, %xmm2
|
|
|
|
; SSSE3-NEXT: pandn %xmm9, %xmm6
|
|
|
|
; SSSE3-NEXT: por %xmm2, %xmm6
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
|
|
|
|
; SSSE3-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm1, %xmm6
|
|
|
|
; SSSE3-NEXT: pxor %xmm5, %xmm6
|
|
|
|
; SSSE3-NEXT: movdqa %xmm8, %xmm7
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,2,2]
|
|
|
|
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm6
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSSE3-NEXT: pand %xmm2, %xmm6
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3]
|
|
|
|
; SSSE3-NEXT: por %xmm6, %xmm2
|
|
|
|
; SSSE3-NEXT: pand %xmm2, %xmm1
|
|
|
|
; SSSE3-NEXT: pandn %xmm9, %xmm2
|
|
|
|
; SSSE3-NEXT: por %xmm1, %xmm2
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm4, %xmm2
|
|
|
|
; SSSE3-NEXT: pxor %xmm5, %xmm2
|
|
|
|
; SSSE3-NEXT: movdqa %xmm8, %xmm6
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
|
|
|
|
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
|
|
|
; SSSE3-NEXT: pand %xmm7, %xmm2
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
|
|
|
|
; SSSE3-NEXT: por %xmm2, %xmm6
|
|
|
|
; SSSE3-NEXT: pand %xmm6, %xmm4
|
|
|
|
; SSSE3-NEXT: pandn %xmm9, %xmm6
|
|
|
|
; SSSE3-NEXT: por %xmm4, %xmm6
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
|
|
|
|
; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
|
|
|
|
; SSSE3-NEXT: pxor %xmm3, %xmm5
|
|
|
|
; SSSE3-NEXT: movdqa %xmm8, %xmm4
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
|
|
|
|
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
|
|
|
|
; SSSE3-NEXT: pand %xmm6, %xmm5
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
|
|
|
|
; SSSE3-NEXT: por %xmm5, %xmm4
|
|
|
|
; SSSE3-NEXT: pand %xmm4, %xmm3
|
|
|
|
; SSSE3-NEXT: pandn %xmm9, %xmm4
|
|
|
|
; SSSE3-NEXT: por %xmm3, %xmm4
|
|
|
|
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
|
|
|
|
; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
|
|
|
|
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
|
|
|
|
; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
|
|
|
|
; SSSE3-NEXT: psubusw %xmm3, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: psubus_8i64_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE41: # %bb.0: # %vector.ph
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSE41-NEXT: movdqa %xmm0, %xmm10
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
|
|
|
|
; SSE41-NEXT: movdqa %xmm4, %xmm0
|
|
|
|
; SSE41-NEXT: pxor %xmm6, %xmm0
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991]
|
|
|
|
; SSE41-NEXT: movdqa %xmm8, %xmm7
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
|
|
|
|
; SSE41-NEXT: pand %xmm9, %xmm5
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
|
|
|
|
; SSE41-NEXT: por %xmm5, %xmm0
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
|
|
|
|
; SSE41-NEXT: movapd %xmm7, %xmm11
|
|
|
|
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm11
|
|
|
|
; SSE41-NEXT: movdqa %xmm3, %xmm0
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE41-NEXT: pxor %xmm6, %xmm0
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSE41-NEXT: movdqa %xmm8, %xmm4
|
|
|
|
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2]
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
|
|
|
|
; SSE41-NEXT: pand %xmm9, %xmm5
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE41-NEXT: por %xmm5, %xmm0
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSE41-NEXT: movapd %xmm7, %xmm4
|
|
|
|
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
|
|
|
|
; SSE41-NEXT: packusdw %xmm11, %xmm4
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE41-NEXT: movdqa %xmm2, %xmm0
|
|
|
|
; SSE41-NEXT: pxor %xmm6, %xmm0
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSE41-NEXT: movdqa %xmm8, %xmm3
|
|
|
|
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,2,2]
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm8, %xmm0
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
|
|
|
|
; SSE41-NEXT: pand %xmm9, %xmm5
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
|
|
|
|
; SSE41-NEXT: por %xmm5, %xmm0
|
|
|
|
; SSE41-NEXT: movapd %xmm7, %xmm3
|
|
|
|
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
|
|
|
|
; SSE41-NEXT: pxor %xmm1, %xmm6
|
|
|
|
; SSE41-NEXT: movdqa %xmm8, %xmm0
|
|
|
|
; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
|
|
|
|
; SSE41-NEXT: pcmpeqd %xmm8, %xmm6
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
|
|
|
|
; SSE41-NEXT: pand %xmm2, %xmm5
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
|
|
|
|
; SSE41-NEXT: por %xmm5, %xmm0
|
|
|
|
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
|
|
|
|
; SSE41-NEXT: packusdw %xmm3, %xmm7
|
|
|
|
; SSE41-NEXT: packusdw %xmm4, %xmm7
|
|
|
|
; SSE41-NEXT: psubusw %xmm7, %xmm10
|
2018-06-05 18:52:29 +08:00
|
|
|
; SSE41-NEXT: pxor %xmm1, %xmm1
|
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
|
|
|
|
; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
|
2018-02-24 22:06:39 +08:00
|
|
|
; SSE41-NEXT: packusdw %xmm10, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: psubus_8i64_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
2018-02-24 22:06:39 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
|
|
|
|
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
|
|
|
|
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
|
|
|
|
; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6
|
|
|
|
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm6
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
|
|
|
|
; AVX1-NEXT: vmovapd {{.*#+}} ymm6 = [65535,65535,65535,65535]
|
|
|
|
; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
|
|
|
|
; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
|
|
|
|
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4
|
|
|
|
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
|
|
|
|
; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
2018-06-05 18:52:29 +08:00
|
|
|
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2018-02-24 22:06:39 +08:00
|
|
|
; AVX2-LABEL: psubus_8i64_max:
|
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
|
|
|
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
|
|
|
|
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4
|
|
|
|
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
|
|
|
|
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
|
|
|
|
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535]
|
|
|
|
; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
|
|
|
|
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
|
|
|
|
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
|
|
|
|
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
|
|
|
|
; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
2017-09-27 22:38:05 +08:00
|
|
|
;
|
|
|
|
; AVX512-LABEL: psubus_8i64_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX512-NEXT: vpmovusqw %zmm1, %xmm1
|
|
|
|
; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
|
|
|
vector.ph:
|
|
|
|
%lhs = zext <8 x i16> %x to <8 x i64>
|
|
|
|
%cond = icmp ult <8 x i64> %lhs, %y
|
|
|
|
%max = select <8 x i1> %cond, <8 x i64> %y, <8 x i64> %lhs
|
|
|
|
%sub = sub <8 x i64> %max, %y
|
|
|
|
%res = trunc <8 x i64> %sub to <8 x i16>
|
|
|
|
ret <8 x i16> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
|
|
|
|
; SSE2-LABEL: psubus_16i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0: # %vector.ph
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm8
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm7, %xmm7
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm10
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm3, %xmm6
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm7, %xmm6
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm9
|
|
|
|
; SSE2-NEXT: por %xmm7, %xmm9
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
|
|
|
|
; SSE2-NEXT: pand %xmm9, %xmm0
|
|
|
|
; SSE2-NEXT: pandn %xmm3, %xmm9
|
|
|
|
; SSE2-NEXT: por %xmm0, %xmm9
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm2, %xmm6
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm7, %xmm6
|
|
|
|
; SSE2-NEXT: movdqa %xmm10, %xmm0
|
|
|
|
; SSE2-NEXT: por %xmm7, %xmm0
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
|
|
|
|
; SSE2-NEXT: pand %xmm0, %xmm10
|
|
|
|
; SSE2-NEXT: pandn %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: por %xmm10, %xmm0
|
|
|
|
; SSE2-NEXT: movdqa %xmm5, %xmm10
|
|
|
|
; SSE2-NEXT: pxor %xmm7, %xmm10
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, %xmm6
|
|
|
|
; SSE2-NEXT: por %xmm7, %xmm6
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm10, %xmm6
|
|
|
|
; SSE2-NEXT: pand %xmm6, %xmm8
|
|
|
|
; SSE2-NEXT: pandn %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: por %xmm8, %xmm6
|
|
|
|
; SSE2-NEXT: movdqa %xmm4, %xmm8
|
|
|
|
; SSE2-NEXT: pxor %xmm7, %xmm8
|
|
|
|
; SSE2-NEXT: por %xmm1, %xmm7
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm1
|
|
|
|
; SSE2-NEXT: pandn %xmm4, %xmm7
|
|
|
|
; SSE2-NEXT: por %xmm7, %xmm1
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm4, %xmm1
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm6
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm2, %xmm0
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm3, %xmm9
|
|
|
|
; SSE2-NEXT: pslld $16, %xmm9
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm9
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm0
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: packssdw %xmm9, %xmm0
|
|
|
|
; SSE2-NEXT: pslld $16, %xmm6
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm6
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: pslld $16, %xmm1
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm1
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: packssdw %xmm6, %xmm1
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: psubus_16i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSSE3: # %bb.0: # %vector.ph
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm1, %xmm8
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSSE3-NEXT: pxor %xmm7, %xmm7
|
|
|
|
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
|
|
|
|
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm0, %xmm10
|
|
|
|
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
|
|
|
|
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm3, %xmm6
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSSE3-NEXT: pxor %xmm7, %xmm6
|
|
|
|
; SSSE3-NEXT: movdqa %xmm0, %xmm9
|
|
|
|
; SSSE3-NEXT: por %xmm7, %xmm9
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm9
|
|
|
|
; SSSE3-NEXT: pand %xmm9, %xmm0
|
|
|
|
; SSSE3-NEXT: pandn %xmm3, %xmm9
|
|
|
|
; SSSE3-NEXT: por %xmm0, %xmm9
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: movdqa %xmm2, %xmm6
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSSE3-NEXT: pxor %xmm7, %xmm6
|
|
|
|
; SSSE3-NEXT: movdqa %xmm10, %xmm0
|
|
|
|
; SSSE3-NEXT: por %xmm7, %xmm0
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
|
|
|
|
; SSSE3-NEXT: pand %xmm0, %xmm10
|
|
|
|
; SSSE3-NEXT: pandn %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: por %xmm10, %xmm0
|
|
|
|
; SSSE3-NEXT: movdqa %xmm5, %xmm10
|
|
|
|
; SSSE3-NEXT: pxor %xmm7, %xmm10
|
|
|
|
; SSSE3-NEXT: movdqa %xmm8, %xmm6
|
|
|
|
; SSSE3-NEXT: por %xmm7, %xmm6
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6
|
|
|
|
; SSSE3-NEXT: pand %xmm6, %xmm8
|
|
|
|
; SSSE3-NEXT: pandn %xmm5, %xmm6
|
|
|
|
; SSSE3-NEXT: por %xmm8, %xmm6
|
|
|
|
; SSSE3-NEXT: movdqa %xmm4, %xmm8
|
|
|
|
; SSSE3-NEXT: pxor %xmm7, %xmm8
|
|
|
|
; SSSE3-NEXT: por %xmm1, %xmm7
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm7
|
|
|
|
; SSSE3-NEXT: pand %xmm7, %xmm1
|
|
|
|
; SSSE3-NEXT: pandn %xmm4, %xmm7
|
|
|
|
; SSSE3-NEXT: por %xmm7, %xmm1
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: psubd %xmm4, %xmm1
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSSE3-NEXT: psubd %xmm5, %xmm6
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: psubd %xmm2, %xmm0
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSSE3-NEXT: psubd %xmm3, %xmm9
|
|
|
|
; SSSE3-NEXT: pslld $16, %xmm9
|
|
|
|
; SSSE3-NEXT: psrad $16, %xmm9
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: pslld $16, %xmm0
|
|
|
|
; SSSE3-NEXT: psrad $16, %xmm0
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSSE3-NEXT: packssdw %xmm9, %xmm0
|
|
|
|
; SSSE3-NEXT: pslld $16, %xmm6
|
|
|
|
; SSSE3-NEXT: psrad $16, %xmm6
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: pslld $16, %xmm1
|
|
|
|
; SSSE3-NEXT: psrad $16, %xmm1
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSSE3-NEXT: packssdw %xmm6, %xmm1
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: psubus_16i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE41: # %bb.0: # %vector.ph
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
|
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
|
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
|
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
|
|
|
|
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; SSE41-NEXT: pmaxud %xmm2, %xmm0
|
|
|
|
; SSE41-NEXT: pmaxud %xmm3, %xmm7
|
|
|
|
; SSE41-NEXT: pmaxud %xmm4, %xmm1
|
|
|
|
; SSE41-NEXT: pmaxud %xmm5, %xmm6
|
|
|
|
; SSE41-NEXT: psubd %xmm5, %xmm6
|
|
|
|
; SSE41-NEXT: psubd %xmm4, %xmm1
|
|
|
|
; SSE41-NEXT: psubd %xmm3, %xmm7
|
|
|
|
; SSE41-NEXT: psubd %xmm2, %xmm0
|
|
|
|
; SSE41-NEXT: pxor %xmm2, %xmm2
|
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
|
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2],xmm2[3],xmm7[4],xmm2[5],xmm7[6],xmm2[7]
|
|
|
|
; SSE41-NEXT: packusdw %xmm7, %xmm0
|
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
|
|
|
|
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4],xmm2[5],xmm6[6],xmm2[7]
|
|
|
|
; SSE41-NEXT: packusdw %xmm6, %xmm1
|
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: psubus_16i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
2018-01-12 08:17:38 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
|
2018-01-22 20:05:17 +08:00
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
|
2018-01-12 08:17:38 +08:00
|
|
|
; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
|
2018-01-22 20:05:17 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
|
|
|
|
; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: psubus_16i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
2017-11-02 05:52:29 +08:00
|
|
|
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
|
|
|
|
; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
2017-11-02 05:52:29 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
|
|
|
|
; AVX2-NEXT: vpsubusw %xmm1, %xmm3, %xmm1
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
2017-11-02 05:52:29 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: psubus_16i32_max:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX512-NEXT: vpmovusdw %zmm1, %ymm1
|
|
|
|
; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: retq
|
|
|
|
vector.ph:
|
|
|
|
%lhs = zext <16 x i16> %x to <16 x i32>
|
|
|
|
%cond = icmp ult <16 x i32> %lhs, %y
|
|
|
|
%max = select <16 x i1> %cond, <16 x i32> %y, <16 x i32> %lhs
|
|
|
|
%sub = sub <16 x i32> %max, %y
|
|
|
|
%res = trunc <16 x i32> %sub to <16 x i16>
|
|
|
|
ret <16 x i16> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwind {
|
|
|
|
; SSE2-LABEL: psubus_i16_i32_max_swapped:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0: # %vector.ph
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pxor %xmm3, %xmm3
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm4
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
|
|
|
|
; SSE2-NEXT: movdqa %xmm2, %xmm3
|
|
|
|
; SSE2-NEXT: pxor %xmm5, %xmm3
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm6
|
|
|
|
; SSE2-NEXT: por %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm6, %xmm3
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: movdqa %xmm2, %xmm6
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: pand %xmm3, %xmm6
|
|
|
|
; SSE2-NEXT: pandn %xmm0, %xmm3
|
|
|
|
; SSE2-NEXT: por %xmm6, %xmm3
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm0
|
|
|
|
; SSE2-NEXT: pxor %xmm5, %xmm0
|
|
|
|
; SSE2-NEXT: por %xmm4, %xmm5
|
|
|
|
; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm5
|
|
|
|
; SSE2-NEXT: pand %xmm0, %xmm5
|
|
|
|
; SSE2-NEXT: pandn %xmm4, %xmm0
|
|
|
|
; SSE2-NEXT: por %xmm5, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm1, %xmm0
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: psubd %xmm2, %xmm3
|
|
|
|
; SSE2-NEXT: pslld $16, %xmm3
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm3
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: pslld $16, %xmm0
|
|
|
|
; SSE2-NEXT: psrad $16, %xmm0
|
2018-02-11 18:52:37 +08:00
|
|
|
; SSE2-NEXT: packssdw %xmm3, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: psubus_i16_i32_max_swapped:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSSE3: # %bb.0: # %vector.ph
|
2018-02-24 21:39:13 +08:00
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm2, %xmm5
|
|
|
|
; SSSE3-NEXT: pxor %xmm4, %xmm5
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
|
|
|
|
; SSSE3-NEXT: movdqa %xmm6, %xmm7
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
|
|
|
|
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
|
|
|
|
; SSSE3-NEXT: pand %xmm7, %xmm2
|
|
|
|
; SSSE3-NEXT: pandn %xmm5, %xmm7
|
|
|
|
; SSSE3-NEXT: por %xmm2, %xmm7
|
|
|
|
; SSSE3-NEXT: pshufb %xmm3, %xmm7
|
|
|
|
; SSSE3-NEXT: pxor %xmm1, %xmm4
|
|
|
|
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
|
|
|
|
; SSSE3-NEXT: pand %xmm6, %xmm1
|
|
|
|
; SSSE3-NEXT: pandn %xmm5, %xmm6
|
|
|
|
; SSSE3-NEXT: por %xmm1, %xmm6
|
|
|
|
; SSSE3-NEXT: pshufb %xmm3, %xmm6
|
|
|
|
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
|
|
|
|
; SSSE3-NEXT: psubusw %xmm6, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSE41-LABEL: psubus_i16_i32_max_swapped:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE41: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
|
|
|
|
; SSE41-NEXT: pminud %xmm3, %xmm2
|
|
|
|
; SSE41-NEXT: pminud %xmm3, %xmm1
|
|
|
|
; SSE41-NEXT: packusdw %xmm2, %xmm1
|
|
|
|
; SSE41-NEXT: psubusw %xmm1, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; SSE41-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: psubus_i16_i32_max_swapped:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0: # %vector.ph
|
2017-11-02 05:52:29 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
|
|
|
|
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: psubus_i16_i32_max_swapped:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
|
|
|
|
; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
|
2017-11-02 05:52:29 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: psubus_i16_i32_max_swapped:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0: # %vector.ph
|
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is a valid transformation, because if max > max_short, the result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
2017-10-10 04:01:10 +08:00
|
|
|
; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
|
|
|
|
; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
|
2017-09-27 22:38:05 +08:00
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
|
|
|
vector.ph:
|
|
|
|
%lhs = zext <8 x i16> %x to <8 x i32>
|
|
|
|
%cond = icmp ult <8 x i32> %y, %lhs
|
|
|
|
%max = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
|
|
|
|
%sub = sub <8 x i32> %max, %y
|
|
|
|
%res = trunc <8 x i32> %sub to <8 x i16>
|
|
|
|
ret <8 x i16> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
; Test that a - umin(a, b) with the LHS zero-extended from i16 to i32 is
; recognized as an unsigned saturating subtract and lowered to psubus(w)
; on targets that have it. The scalar IR pattern (zext + icmp ult + select
; + sub + trunc) is the canonical umin form produced by the middle end.
; CHECK lines below were autogenerated by update_llc_test_checks.py; the
; block was reconstructed here after stray VCS-blame noise corrupted the
; original text (no CHECK line or IR instruction was altered).
define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: psubus_i16_i32_min:
; SSE2:       # %bb.0: # %vector.ph
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    pxor %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    por %xmm4, %xmm6
; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm2, %xmm5
; SSE2-NEXT:    por %xmm6, %xmm5
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pxor %xmm4, %xmm2
; SSE2-NEXT:    por %xmm3, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm2, %xmm4
; SSE2-NEXT:    pandn %xmm1, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    psubd %xmm2, %xmm3
; SSE2-NEXT:    psubd %xmm5, %xmm0
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    pslld $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    packssdw %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: psubus_i16_i32_min:
; SSSE3:       # %bb.0: # %vector.ph
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pxor %xmm4, %xmm5
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
; SSSE3-NEXT:    pand %xmm7, %xmm2
; SSSE3-NEXT:    pandn %xmm5, %xmm7
; SSSE3-NEXT:    por %xmm2, %xmm7
; SSSE3-NEXT:    pshufb %xmm3, %xmm7
; SSSE3-NEXT:    pxor %xmm1, %xmm4
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT:    pand %xmm6, %xmm1
; SSSE3-NEXT:    pandn %xmm5, %xmm6
; SSSE3-NEXT:    por %xmm1, %xmm6
; SSSE3-NEXT:    pshufb %xmm3, %xmm6
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; SSSE3-NEXT:    psubusw %xmm6, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: psubus_i16_i32_min:
; SSE41:       # %bb.0: # %vector.ph
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
; SSE41-NEXT:    pminud %xmm3, %xmm2
; SSE41-NEXT:    pminud %xmm3, %xmm1
; SSE41-NEXT:    packusdw %xmm2, %xmm1
; SSE41-NEXT:    psubusw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: psubus_i16_i32_min:
; AVX1:       # %bb.0: # %vector.ph
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: psubus_i16_i32_min:
; AVX2:       # %bb.0: # %vector.ph
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX2-NEXT:    vpminud %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: psubus_i16_i32_min:
; AVX512:       # %bb.0: # %vector.ph
; AVX512-NEXT:    vpmovusdw %ymm1, %xmm1
; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
vector.ph:
  %lhs = zext <8 x i16> %x to <8 x i32>
  %cond = icmp ult <8 x i32> %lhs, %y
  %min = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
  %sub = sub <8 x i32> %lhs, %min
  %res = trunc <8 x i32> %sub to <8 x i16>
  ret <8 x i16> %res
}
|