; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; Verify that the following shifts are lowered into a sequence of two shifts plus
; a blend. On pre-AVX2 targets, instead of scalarizing a logical or arithmetic
; packed shift right by a constant build_vector, the backend should always try to
; emit a simpler sequence of two shifts + a blend when possible.
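;
; For example, a v4i32 logical shift right by the constant build_vector <X, Y, Y, Y>:
;   (v4i32 (srl A, (build_vector <X, Y, Y, Y>)))
; is expected to be rewritten as:
;   (v4i32 (MOVSS (srl A, <Y, Y, Y, Y>), (srl A, <X, X, X, X>)))
; so that both uniform-immediate shifts lower to cheap X86ISD::VSRLI nodes, which is
; cheaper than scalarizing into four shifts plus four insert/extract pairs.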
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $3, %xmm1
; SSE-NEXT: psrlw $2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm1
; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}
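; test2: the same logical v8i16 shift as test1, but the two shift amounts split the
; vector into equal low/high halves, so the SSE lowering can blend with movsd (a
; 64-bit blend) instead of movss.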
define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $3, %xmm1
; SSE-NEXT: psrlw $2, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test2:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test2:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}
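; test3/test4: the v4i32 logical-shift variants. Pre-AVX2 targets still get two
; immediate shifts plus a blend, while AVX2 folds the whole operation into a single
; variable shift (vpsrlvd) whose shift amounts come from the constant pool.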
define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $3, %xmm1
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}
define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $3, %xmm1
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test4:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}
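; test5-test8: arithmetic shift right (ashr) counterparts of test1-test4. The same
; two-shifts-plus-blend lowering is expected, using psraw/psrad, with AVX2 folding
; the v4i32 cases into a single vpsravd with a constant-pool operand.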
define <8 x i16> @test5(<8 x i16> %a) {
; SSE-LABEL: test5:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $3, %xmm1
; SSE-NEXT: psraw $2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test5:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test5:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $2, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: retq
  %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}
define <8 x i16> @test6(<8 x i16> %a) {
; SSE-LABEL: test6:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $3, %xmm1
; SSE-NEXT: psraw $2, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test6:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
  %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}
define <4 x i32> @test7(<4 x i32> %a) {
; SSE-LABEL: test7:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $3, %xmm1
; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test7:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test7:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
  %lshr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}
define <4 x i32> @test8(<4 x i32> %a) {
; SSE-LABEL: test8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $3, %xmm1
; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
  %lshr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}
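; test9: an interleaved v8i16 shift-amount pattern <1,3,1,1,1,3,3,3>. The ssse3 run
; has no pblendw (an SSE4.1 instruction), so the SSE lowering selects between the two
; shifted results with a pand/pandn/por mask; AVX targets can use a single vpblendw.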
define <8 x i16> @test9(<8 x i16> %a) {
; SSE-LABEL: test9:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
; SSE-NEXT: psraw $1, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test9:
; AVX: # %bb.0:
; AVX-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
; AVX-NEXT: retq
  %lshr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <8 x i16> %lshr
}
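; test10: only the first shift amount is defined and the rest are undef, so no
; blend should be needed; an immediate shift by 1 of the defined lane is enough.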
define <8 x i32> @test10(<8 x i32>* %a) {
; SSE-LABEL: test10:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: psrad %xmm0, %xmm1
; SSE-NEXT: psrad $1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test10:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpsrad $1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpsrad $1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %ld = load <8 x i32>, <8 x i32>* %a, align 32
  %ashr = ashr <8 x i32> %ld, <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %ashr
}
; test11 vs test12 - show the difference between v16i16 shift amounts that are
; repeated vs. non-repeated at the v8i16 level (this determines the PBLENDW masks).
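; PBLENDW takes an 8-bit immediate that selects words within each 128-bit lane, so a
; single 256-bit VPBLENDW is only usable when the blend pattern repeats in both halves
; of the v16i16 shift-amount vector; when it does not (test11), AVX2 falls back to a
; vpmullw by a constant instead.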
define <16 x i16> @test11(<16 x i16> %a) {
; SSE-LABEL: test11:
; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test11:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
  %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 1, i16 1, i16 1, i16 3, i16 1>
  ret <16 x i16> %lshr
}
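; test12: the same v8i16 amount pattern <1,3,1,1,1,3,3,3> repeated in both halves, so
; AVX2 can use a single 256-bit vpblendw. The pre-SSE4.1 SSE run still prefers pmullw
; by <2,8,2,2,2,8,8,8> rather than introducing extra blend-mask constants.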
define <16 x i16> @test12(<16 x i16> %a) {
; SSE-LABEL: test12:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test12:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $3, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
; AVX2-NEXT: retq
  %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <16 x i16> %lshr
}