2016-08-26 01:17:46 +08:00
|
|
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
2017-06-23 22:16:50 +08:00
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
|
2017-06-23 22:38:00 +08:00
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; avg_v4i8: loads <4 x i8> from %a and %b, zero-extends both to <4 x i32>,
; computes (a + 1 + b) >> 1 per lane, and truncates the result back to
; <4 x i8>. The SSE2 and AVX check lines below expect this to become a single
; pavgb/vpavgb between two movd loads and a movd store.
define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v4i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movd %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v4i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX-NEXT: vmovd %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
; IR under test: the i32-widened form of the rounding average.
%1 = load <4 x i8>, <4 x i8>* %a
|
|
|
|
%2 = load <4 x i8>, <4 x i8>* %b
|
|
|
|
%3 = zext <4 x i8> %1 to <4 x i32>
|
|
|
|
%4 = zext <4 x i8> %2 to <4 x i32>
|
|
|
|
%5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <4 x i32> %5, %4
|
|
|
|
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <4 x i32> %7 to <4 x i8>
|
|
|
|
; The result is stored through an undef pointer; the check lines above show
; the store going to (%rax).
store <4 x i8> %8, <4 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; avg_v8i8: loads <8 x i8> from %a and %b, zero-extends both to <8 x i32>,
; computes (a + 1 + b) >> 1 per lane, and truncates the result back to
; <8 x i8>. The SSE2 and AVX check lines below expect this to become a single
; pavgb/vpavgb between two movq loads and a movq store.
define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v8i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movq %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v8i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX-NEXT: vmovq %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
; IR under test: the i32-widened form of the rounding average.
%1 = load <8 x i8>, <8 x i8>* %a
|
|
|
|
%2 = load <8 x i8>, <8 x i8>* %b
|
|
|
|
%3 = zext <8 x i8> %1 to <8 x i32>
|
|
|
|
%4 = zext <8 x i8> %2 to <8 x i32>
|
|
|
|
%5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <8 x i32> %5, %4
|
|
|
|
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <8 x i32> %7 to <8 x i8>
|
|
|
|
; The result is stored through an undef pointer; the check lines above show
; the store going to (%rax).
store <8 x i8> %8, <8 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; avg_v16i8: loads <16 x i8> from %a and %b, zero-extends both to <16 x i32>,
; computes (a + 1 + b) >> 1 per lane, and truncates the result back to
; <16 x i8>. The check lines below expect a movdqa/vmovdqa load from (%rsi),
; one pavgb/vpavgb taking its other operand directly from (%rdi), and a
; movdqu/vmovdqu store.
define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v16i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgb (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v16i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovdqa (%rsi), %xmm0
|
|
|
|
; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
; IR under test: the i32-widened form of the rounding average.
%1 = load <16 x i8>, <16 x i8>* %a
|
|
|
|
%2 = load <16 x i8>, <16 x i8>* %b
|
|
|
|
%3 = zext <16 x i8> %1 to <16 x i32>
|
|
|
|
%4 = zext <16 x i8> %2 to <16 x i32>
|
|
|
|
%5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <16 x i32> %5, %4
|
|
|
|
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <16 x i32> %7 to <16 x i8>
|
|
|
|
; The store is only align 4 and goes through an undef pointer; the check lines
; above show an unaligned movdqu/vmovdqu to (%rax).
store <16 x i8> %8, <16 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; avg_v32i8: loads <32 x i8> from %a and %b, zero-extends both to <32 x i32>,
; computes (a + 1 + b) >> 1 per lane, and truncates the result back to
; <32 x i8>. Expected code per the check lines below:
;   SSE2   - two 128-bit pavgb halves (offsets 0 and 16), each stored with movdqu.
;   AVX1   - two 128-bit vpavgb halves recombined with vinsertf128, one vmovups.
;   AVX2   - one 256-bit vpavgb on ymm0.
;   AVX512 - one 256-bit vpavgb on ymm0.
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgb (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgb 16(%rdi), %xmm1
|
2018-03-18 03:24:54 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
; IR under test: the i32-widened form of the rounding average.
%1 = load <32 x i8>, <32 x i8>* %a
|
|
|
|
%2 = load <32 x i8>, <32 x i8>* %b
|
|
|
|
%3 = zext <32 x i8> %1 to <32 x i32>
|
|
|
|
%4 = zext <32 x i8> %2 to <32 x i32>
|
|
|
|
%5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <32 x i32> %5, %4
|
|
|
|
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <32 x i32> %7 to <32 x i8>
|
|
|
|
; The store is only align 4 and goes through an undef pointer; the check lines
; above show unaligned stores (movdqu / vmovups / vmovdqu) to (%rax).
store <32 x i8> %8, <32 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-12-21 21:18:19 +08:00
|
|
|
define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
|
|
|
|
; SSE2-LABEL: avg_v48i8:
|
|
|
|
; SSE2: # %bb.0:
|
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm6
|
|
|
|
; SSE2-NEXT: movdqa 32(%rdi), %xmm11
|
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm12
|
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm13
|
|
|
|
; SSE2-NEXT: movdqa 32(%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: pxor %xmm7, %xmm7
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm4
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm4, %xmm2
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm10
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: movdqa %xmm6, %xmm5
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm5, %xmm15
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm6, %xmm14
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: movdqa %xmm12, %xmm3
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm3, %xmm8
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm2, %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm11, %xmm2
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm4, %xmm3
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm12, %xmm9
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm10, %xmm9
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm1, %xmm12
|
|
|
|
; SSE2-NEXT: movdqa %xmm13, %xmm4
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm4, %xmm10
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm15, %xmm10
|
|
|
|
; SSE2-NEXT: movdqa %xmm2, %xmm15
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm5, %xmm4
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm13, %xmm1
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm14, %xmm1
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm6, %xmm13
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm6
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm6, %xmm14
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm15, %xmm14
|
|
|
|
; SSE2-NEXT: movdqa %xmm11, %xmm5
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm2, %xmm6
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm2
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm5, %xmm2
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm11, %xmm0
|
|
|
|
; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm8
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm3
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm9
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm12
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm10
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm4
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm1
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm13
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm14
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm2
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm0
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm3
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm8
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255]
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm8
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm3
|
|
|
|
; SSE2-NEXT: packuswb %xmm8, %xmm3
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm12
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm9
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm9
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm12
|
|
|
|
; SSE2-NEXT: packuswb %xmm9, %xmm12
|
|
|
|
; SSE2-NEXT: packuswb %xmm3, %xmm12
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm4
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm10
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm10
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm4
|
|
|
|
; SSE2-NEXT: packuswb %xmm10, %xmm4
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm13
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm1
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm1
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm13
|
|
|
|
; SSE2-NEXT: packuswb %xmm1, %xmm13
|
|
|
|
; SSE2-NEXT: packuswb %xmm4, %xmm13
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm6
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm14
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm14
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm6
|
|
|
|
; SSE2-NEXT: packuswb %xmm14, %xmm6
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm0
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm2
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm2
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm0
|
|
|
|
; SSE2-NEXT: packuswb %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: packuswb %xmm6, %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm13, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm12, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avg_v48i8:
|
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm4
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,0,1]
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm13
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm11
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9
|
2018-03-13 09:17:40 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm8
|
2018-03-13 09:17:40 +08:00
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
|
2017-12-21 21:18:19 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpaddd %xmm4, %xmm15, %xmm15
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1]
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpaddd %xmm7, %xmm10, %xmm7
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm0, %xmm12, %xmm12
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,0,1]
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm13, %xmm10
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm11, %xmm11
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm9, %xmm9
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm8, %xmm8
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm15, %xmm13
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm7
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm14, %xmm0
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm12, %xmm2
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm5
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm6, %xmm6
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
|
2018-03-13 09:17:40 +08:00
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm5, %xmm4
|
2018-06-08 18:29:00 +08:00
|
|
|
; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm7, %xmm2
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm13, %xmm4
|
|
|
|
; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm8, %xmm4
|
2018-03-13 09:17:40 +08:00
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm11, %xmm5
|
2017-12-21 21:18:19 +08:00
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
|
2018-06-08 18:29:00 +08:00
|
|
|
; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
|
2018-11-19 01:59:28 +08:00
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
|
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
2017-12-21 21:18:19 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avg_v48i8:
|
|
|
|
; AVX2: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpbroadcastq 24(%rdi), %ymm1
|
2017-12-21 21:18:19 +08:00
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %xmm2
|
|
|
|
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
|
2017-12-21 21:18:19 +08:00
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpbroadcastq 24(%rsi), %ymm6
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpaddd %ymm6, %ymm1, %ymm1
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %xmm6
|
|
|
|
; AVX2-NEXT: vmovdqa 32(%rsi), %xmm7
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpaddd %ymm6, %ymm5, %ymm5
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,3,0,1]
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpaddd %ymm6, %ymm3, %ymm3
|
2017-12-21 21:18:19 +08:00
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
|
2018-10-09 02:40:50 +08:00
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm7
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
|
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm0
|
2017-12-21 21:18:19 +08:00
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm3
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
|
2018-10-09 02:40:50 +08:00
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm7, %ymm5
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
|
2018-11-19 01:59:28 +08:00
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
|
|
|
|
; AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
|
2018-11-19 01:59:28 +08:00
|
|
|
; AVX2-NEXT: vpand %xmm6, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpackuswb %xmm1, %xmm5, %xmm1
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX2-NEXT: vpand %xmm6, %xmm4, %xmm4
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
|
2018-11-19 01:59:28 +08:00
|
|
|
; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX2-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
|
2019-01-25 23:37:42 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
|
|
|
|
; AVX2-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
|
2018-11-19 01:59:28 +08:00
|
|
|
; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2
|
2018-03-13 09:17:40 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
|
2018-06-08 18:29:00 +08:00
|
|
|
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
|
2018-11-19 01:59:28 +08:00
|
|
|
; AVX2-NEXT: vpand %xmm6, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
|
2017-12-21 21:18:19 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v48i8:
|
|
|
|
; AVX512F: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
|
|
|
|
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
|
|
|
|
; AVX512F-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
|
|
|
|
; AVX512F-NEXT: vpavgb (%rsi), %xmm0, %xmm0
|
|
|
|
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpavgb 32(%rsi), %xmm2, %xmm1
|
2017-12-21 21:18:19 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %xmm1, (%rax)
|
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512F-NEXT: vzeroupper
|
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: avg_v48i8:
|
|
|
|
; AVX512BW: # %bb.0:
|
2018-11-23 06:56:52 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
|
|
|
|
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
|
|
|
|
; AVX512BW-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
|
|
|
|
; AVX512BW-NEXT: vpavgb (%rsi), %xmm0, %xmm0
|
|
|
|
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX512BW-NEXT: vpavgb 32(%rsi), %xmm2, %xmm1
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
|
|
|
|
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, (%rax)
|
2017-12-21 21:18:19 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
|
|
|
; AVX512BW-NEXT: retq
|
|
|
|
%1 = load <48 x i8>, <48 x i8>* %a
|
|
|
|
%2 = load <48 x i8>, <48 x i8>* %b
|
|
|
|
%3 = zext <48 x i8> %1 to <48 x i32>
|
|
|
|
%4 = zext <48 x i8> %2 to <48 x i32>
|
|
|
|
%5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <48 x i32> %5, %4
|
|
|
|
%7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <48 x i32> %7 to <48 x i8>
|
|
|
|
store <48 x i8> %8, <48 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
|
|
|
|
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: pavgb (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgb 16(%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgb 32(%rdi), %xmm2
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgb 48(%rdi), %xmm3
|
|
|
|
; SSE2-NEXT: movdqu %xmm3, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm2, (%rax)
|
2017-10-04 00:59:13 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
|
|
|
|
; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vpavgb 48(%rdi), %xmm3, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-LABEL: avg_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
|
|
|
|
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
|
2018-03-18 03:24:54 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
|
2018-03-18 03:24:54 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-LABEL: avg_v64i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-08-01 01:35:44 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
|
2018-01-18 15:44:09 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <64 x i8>, <64 x i8>* %a
|
|
|
|
%2 = load <64 x i8>, <64 x i8>* %b
|
|
|
|
%3 = zext <64 x i8> %1 to <64 x i32>
|
|
|
|
%4 = zext <64 x i8> %2 to <64 x i32>
|
|
|
|
%5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <64 x i32> %5, %4
|
|
|
|
%7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <64 x i32> %7 to <64 x i8>
|
|
|
|
store <64 x i8> %8, <64 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v4i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movq %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v4i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX-NEXT: vmovq %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <4 x i16>, <4 x i16>* %a
|
|
|
|
%2 = load <4 x i16>, <4 x i16>* %b
|
|
|
|
%3 = zext <4 x i16> %1 to <4 x i32>
|
|
|
|
%4 = zext <4 x i16> %2 to <4 x i32>
|
|
|
|
%5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <4 x i32> %5, %4
|
|
|
|
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <4 x i32> %7 to <4 x i16>
|
|
|
|
store <4 x i16> %8, <4 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v8i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgw (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v8i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovdqa (%rsi), %xmm0
|
|
|
|
; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <8 x i16>, <8 x i16>* %a
|
|
|
|
%2 = load <8 x i16>, <8 x i16>* %b
|
|
|
|
%3 = zext <8 x i16> %1 to <8 x i32>
|
|
|
|
%4 = zext <8 x i16> %2 to <8 x i32>
|
|
|
|
%5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <8 x i32> %5, %4
|
|
|
|
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <8 x i32> %7 to <8 x i16>
|
|
|
|
store <8 x i16> %8, <8 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgw (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgw 16(%rdi), %xmm1
|
2018-03-18 03:24:54 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <16 x i16>, <16 x i16>* %a
|
|
|
|
%2 = load <16 x i16>, <16 x i16>* %b
|
|
|
|
%3 = zext <16 x i16> %1 to <16 x i32>
|
|
|
|
%4 = zext <16 x i16> %2 to <16 x i32>
|
|
|
|
%5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <16 x i32> %5, %4
|
|
|
|
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <16 x i32> %7 to <16 x i16>
|
|
|
|
store <16 x i16> %8, <16 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
|
|
|
|
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: pavgw (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgw 16(%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgw 32(%rdi), %xmm2
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgw 48(%rdi), %xmm3
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm3, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm2, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
|
|
|
|
; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vpavgw 48(%rdi), %xmm3, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw 32(%rdi), %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-LABEL: avg_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
|
|
|
|
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
|
2018-03-18 03:24:54 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
|
2018-03-18 03:24:54 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-LABEL: avg_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-08-01 01:35:44 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
|
2018-01-18 15:44:09 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <32 x i16>, <32 x i16>* %a
|
|
|
|
%2 = load <32 x i16>, <32 x i16>* %b
|
|
|
|
%3 = zext <32 x i16> %1 to <32 x i32>
|
|
|
|
%4 = zext <32 x i16> %2 to <32 x i32>
|
|
|
|
%5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <32 x i32> %5, %4
|
|
|
|
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <32 x i32> %7 to <32 x i16>
|
|
|
|
store <32 x i16> %8, <32 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v4i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movd %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v4i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovd %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <4 x i8>, <4 x i8>* %a
|
|
|
|
%2 = load <4 x i8>, <4 x i8>* %b
|
|
|
|
%3 = zext <4 x i8> %1 to <4 x i32>
|
|
|
|
%4 = zext <4 x i8> %2 to <4 x i32>
|
|
|
|
%5 = add nuw nsw <4 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <4 x i32> %7 to <4 x i8>
|
|
|
|
store <4 x i8> %8, <4 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v8i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movq %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v8i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovq %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <8 x i8>, <8 x i8>* %a
|
|
|
|
%2 = load <8 x i8>, <8 x i8>* %b
|
|
|
|
%3 = zext <8 x i8> %1 to <8 x i32>
|
|
|
|
%4 = zext <8 x i8> %2 to <8 x i32>
|
|
|
|
%5 = add nuw nsw <8 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <8 x i32> %7 to <8 x i8>
|
|
|
|
store <8 x i8> %8, <8 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v16i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgb (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v16i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <16 x i8>, <16 x i8>* %a
|
|
|
|
%2 = load <16 x i8>, <16 x i8>* %b
|
|
|
|
%3 = zext <16 x i8> %1 to <16 x i32>
|
|
|
|
%4 = zext <16 x i8> %2 to <16 x i32>
|
|
|
|
%5 = add nuw nsw <16 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <16 x i32> %7 to <16 x i8>
|
|
|
|
store <16 x i8> %8, <16 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgb (%rsi), %xmm0
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: pavgb 16(%rsi), %xmm1
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v32i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v32i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 element
types at various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <32 x i8>, <32 x i8>* %a
|
|
|
|
%2 = load <32 x i8>, <32 x i8>* %b
|
|
|
|
%3 = zext <32 x i8> %1 to <32 x i32>
|
|
|
|
%4 = zext <32 x i8> %2 to <32 x i32>
|
|
|
|
%5 = add nuw nsw <32 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <32 x i32> %7 to <32 x i8>
|
|
|
|
store <32 x i8> %8, <32 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v64i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm0
|
|
|
|
; SSE2-NEXT: pavgb %xmm1, %xmm1
|
|
|
|
; SSE2-NEXT: pavgb %xmm2, %xmm2
|
|
|
|
; SSE2-NEXT: pavgb %xmm3, %xmm3
|
|
|
|
; SSE2-NEXT: movdqu %xmm3, (%rax)
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm2, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v64i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm2, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm3, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-LABEL: avg_v64i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
|
|
|
|
; AVX2-NEXT: vpavgb %ymm0, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb %ymm1, %ymm1, %ymm1
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v64i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgb %ymm0, %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpavgb %ymm1, %ymm1, %ymm1
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-LABEL: avg_v64i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-08-01 01:35:44 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
|
2018-01-18 15:44:09 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <64 x i8>, <64 x i8>* %a
|
|
|
|
%2 = load <64 x i8>, <64 x i8>* %b
|
|
|
|
%3 = zext <64 x i8> %1 to <64 x i32>
|
|
|
|
%4 = zext <64 x i8> %2 to <64 x i32>
|
|
|
|
%5 = add nuw nsw <64 x i32> %4, %4
|
|
|
|
%6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <64 x i32> %7 to <64 x i8>
|
|
|
|
store <64 x i8> %8, <64 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; Test: rounding average of two <4 x i16> vectors loaded from %a and %b.
; Both operands are zero-extended to i32, summed, incremented by 1, shifted
; right by 1 and truncated back to i16. The CHECK lines below expect a single
; pavgw (SSE2) / vpavgw (AVX) fed by two 64-bit movq/vmovq loads. The result is
; stored through an undef pointer.
define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v4i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movq %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v4i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovq %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <4 x i16>, <4 x i16>* %a
|
|
|
|
%2 = load <4 x i16>, <4 x i16>* %b
|
|
|
|
%3 = zext <4 x i16> %1 to <4 x i32>
|
|
|
|
%4 = zext <4 x i16> %2 to <4 x i32>
|
|
|
|
; (a + b) is formed first; the rounding +1 is added by the following instruction.
%5 = add nuw nsw <4 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <4 x i32> %7 to <4 x i16>
|
|
|
|
store <4 x i16> %8, <4 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; Test: rounding average of two <8 x i16> vectors loaded from %a and %b.
; Both operands are zero-extended to i32, summed, incremented by 1, shifted
; right by 1 and truncated back to i16. The CHECK lines below expect one
; 128-bit load from (%rdi) and a single pavgw (SSE2) / vpavgw (AVX) taking its
; second operand directly from (%rsi). The result is stored through an undef
; pointer.
define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v8i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgw (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v8i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <8 x i16>, <8 x i16>* %a
|
|
|
|
%2 = load <8 x i16>, <8 x i16>* %b
|
|
|
|
%3 = zext <8 x i16> %1 to <8 x i32>
|
|
|
|
%4 = zext <8 x i16> %2 to <8 x i32>
|
|
|
|
; (a + b) is formed first; the rounding +1 is added by the following instruction.
%5 = add nuw nsw <8 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <8 x i32> %7 to <8 x i16>
|
|
|
|
store <8 x i16> %8, <8 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; Test: rounding average of two <16 x i16> vectors loaded from %a and %b.
; Both operands are zero-extended to i32, summed, incremented by 1, shifted
; right by 1 and truncated back to i16. Expected lowering per the CHECK lines:
;   SSE2:        two 128-bit pavgw, one per 16-byte half
;   AVX1:        two 128-bit vpavgw recombined with vinsertf128
;   AVX2/AVX512: one 256-bit vpavgw
; The result is stored through an undef pointer.
define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v16i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgw (%rsi), %xmm0
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v16i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v16i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v16i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <16 x i16>, <16 x i16>* %a
|
|
|
|
%2 = load <16 x i16>, <16 x i16>* %b
|
|
|
|
%3 = zext <16 x i16> %1 to <16 x i32>
|
|
|
|
%4 = zext <16 x i16> %2 to <16 x i32>
|
|
|
|
; (a + b) is formed first; the rounding +1 is added by the following instruction.
%5 = add nuw nsw <16 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <16 x i32> %7 to <16 x i16>
|
|
|
|
store <16 x i16> %8, <16 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; Test: rounding average of two <32 x i16> vectors loaded from %a and %b.
; Both operands are zero-extended to i32, summed, incremented by 1, shifted
; right by 1 and truncated back to i16. Expected lowering per the CHECK lines:
;   SSE2:         four 128-bit pavgw, one per 16-byte quarter
;   AVX1:         four 128-bit vpavgw recombined pairwise with vinsertf128
;   AVX2/AVX512F: two 256-bit vpavgw
;   AVX512BW:     one 512-bit vpavgw
; The result is stored through an undef pointer.
define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
|
|
|
|
; SSE2-NEXT: movdqa 48(%rdi), %xmm3
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgw (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: pavgw 32(%rsi), %xmm2
|
|
|
|
; SSE2-NEXT: pavgw 48(%rsi), %xmm3
|
2018-03-18 03:24:54 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm3, (%rax)
|
2018-03-20 04:19:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm2, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
|
|
|
|
; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-LABEL: avg_v32i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v32i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
|
2018-03-20 04:19:46 +08:00
|
|
|
; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-LABEL: avg_v32i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-08-01 01:35:44 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
|
2018-01-18 15:44:09 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <32 x i16>, <32 x i16>* %a
|
|
|
|
%2 = load <32 x i16>, <32 x i16>* %b
|
|
|
|
%3 = zext <32 x i16> %1 to <32 x i32>
|
|
|
|
%4 = zext <32 x i16> %2 to <32 x i32>
|
|
|
|
; (a + b) is formed first; the rounding +1 is added by the following instruction.
%5 = add nuw nsw <32 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <32 x i32> %7 to <32 x i16>
|
|
|
|
store <32 x i16> %8, <32 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v4i8_const(<4 x i8>* %a) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v4i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
|
|
|
|
; SSE2-NEXT: movd %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v4i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovd %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <4 x i8>, <4 x i8>* %a
|
|
|
|
%2 = zext <4 x i8> %1 to <4 x i32>
|
|
|
|
%3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
|
|
|
|
%4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%5 = trunc <4 x i32> %4 to <4 x i8>
|
|
|
|
store <4 x i8> %5, <4 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v8i8_const(<8 x i8>* %a) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v8i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
|
|
|
|
; SSE2-NEXT: movq %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v8i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovq %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <8 x i8>, <8 x i8>* %a
|
|
|
|
%2 = zext <8 x i8> %1 to <8 x i32>
|
|
|
|
%3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
|
|
|
|
%4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%5 = trunc <8 x i32> %4 to <8 x i8>
|
|
|
|
store <8 x i8> %5, <8 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v16i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v16i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <16 x i8>, <16 x i8>* %a
|
|
|
|
%2 = zext <16 x i8> %1 to <16 x i32>
|
|
|
|
%3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
|
|
|
|
%4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%5 = trunc <16 x i32> %4 to <16 x i8>
|
|
|
|
store <16 x i8> %5, <16 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
|
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: pavgb 16(%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2017-11-03 19:33:48 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275]
|
|
|
|
; AVX1-NEXT: # xmm0 = mem[0,0]
|
|
|
|
; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
|
2017-11-03 19:33:48 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v32i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v32i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <32 x i8>, <32 x i8>* %a
|
|
|
|
%2 = zext <32 x i8> %1 to <32 x i32>
|
|
|
|
%3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
|
2015-12-01 05:46:08 +08:00
|
|
|
%4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%5 = trunc <32 x i32> %4 to <32 x i8>
|
|
|
|
store <32 x i8> %5, <32 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v64i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
|
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm2
|
|
|
|
; SSE2-NEXT: movdqa 32(%rdi), %xmm3
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm3
|
|
|
|
; SSE2-NEXT: pavgb 48(%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm3, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm2, (%rax)
|
2017-02-15 19:46:15 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v64i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275]
|
|
|
|
; AVX1-NEXT: # xmm0 = mem[0,0]
|
|
|
|
; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
|
|
|
|
; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm0
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-LABEL: avg_v64i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
|
|
|
|
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1
|
|
|
|
; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v64i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
|
|
|
|
; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-LABEL: avg_v64i8_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-08-01 01:35:44 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
|
2018-01-18 15:44:09 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <64 x i8>, <64 x i8>* %a
|
|
|
|
%2 = zext <64 x i8> %1 to <64 x i32>
|
|
|
|
%3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
|
2015-12-01 05:46:08 +08:00
|
|
|
%4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%5 = trunc <64 x i32> %4 to <64 x i8>
|
|
|
|
store <64 x i8> %5, <64 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v4i16_const(<4 x i16>* %a) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v4i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
|
|
|
|
; SSE2-NEXT: movq %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to an X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW, covering both i8 and i16 types at
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v4i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovq %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
various vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <4 x i16>, <4 x i16>* %a
|
|
|
|
%2 = zext <4 x i16> %1 to <4 x i32>
|
|
|
|
%3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
|
|
|
|
%4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%5 = trunc <4 x i32> %4 to <4 x i16>
|
|
|
|
store <4 x i16> %5, <4 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; Rounding-average test with a constant operand, <8 x i16> variant.
; The IR loads a <8 x i16> from %a, zero-extends it to i32, adds the constant
; <1,2,...,8>, shifts right by one and truncates back to i16. The result is
; stored through an undef pointer, so only the arithmetic is of interest.
; Both the SSE2 and the AVX check blocks expect one pavgw/vpavgw whose second
; operand is a RIP-relative constant-pool load.
; NOTE(review): the added constant presumably already contains the "+1"
; rounding term of avg(a, c) = (a + c + 1) >> 1, i.e. c = <0,1,...,7> -- the
; constant-pool contents are not visible in these checks; confirm.
define void @avg_v8i16_const(<8 x i16>* %a) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v8i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v8i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <8 x i16>, <8 x i16>* %a
|
|
|
|
%2 = zext <8 x i16> %1 to <8 x i32>
|
|
|
|
%3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
|
|
|
|
%4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%5 = trunc <8 x i32> %4 to <8 x i16>
|
|
|
|
store <8 x i16> %5, <8 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; Rounding-average test with a constant operand, <16 x i16> variant.
; The IR loads a <16 x i16> from %a, zero-extends it to i32, adds the constant
; <1,...,8,1,...,8>, shifts right by one and truncates back to i16; the result
; is stored through an undef pointer.
; Expected lowering per check block:
;  - SSE2 block: the constant [0,1,2,3,4,5,6,7] is materialised once and used
;    by two 128-bit pavgw, one per half of the input.
;  - AVX1 block: two 128-bit vpavgw with folded loads, recombined with
;    vinsertf128 into one 256-bit store.
;  - AVX2 and AVX512 blocks: a single 256-bit vpavgw against a constant-pool
;    operand.
; The [0..7] constant in the checks is the IR constant <1..8> minus the
; rounding "+1" that the average instruction supplies itself.
define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v16i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
|
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: pavgw 16(%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v16i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
|
|
|
|
; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0
|
2017-10-29 04:51:27 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v16i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v16i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <16 x i16>, <16 x i16>* %a
|
|
|
|
%2 = zext <16 x i16> %1 to <16 x i32>
|
|
|
|
%3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
|
|
|
|
%4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%5 = trunc <16 x i32> %4 to <16 x i16>
|
|
|
|
store <16 x i16> %5, <16 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; Rounding-average test with a constant operand, <32 x i16> variant.
; The IR loads a <32 x i16> from %a, zero-extends it to i32, adds the constant
; <1,...,8> repeated four times, shifts right by one and truncates back to
; i16; the result is stored through an undef pointer.
; Expected lowering per check block:
;  - SSE2 block: constant [0,1,2,3,4,5,6,7] in xmm0 and four 128-bit pavgw.
;  - AVX1 block: four 128-bit vpavgw with folded loads, paired up with two
;    vinsertf128 into two 256-bit stores.
;  - AVX2 and AVX512F blocks: the 128-bit constant is broadcast to a ymm
;    register and used by two 256-bit vpavgw.
;  - AVX512BW block: a single 512-bit vpavgw against a constant-pool operand.
; The [0..7] constant in the checks is the IR constant <1..8> minus the
; rounding "+1" that the average instruction supplies itself.
define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
|
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm2
|
|
|
|
; SSE2-NEXT: movdqa 32(%rdi), %xmm3
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm3
|
|
|
|
; SSE2-NEXT: pavgw 48(%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm3, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm2, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
|
|
|
|
; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
|
|
|
|
; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm0
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-LABEL: avg_v32i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
|
|
|
|
; AVX2-NEXT: # ymm0 = mem[0,1,0,1]
|
|
|
|
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1
|
|
|
|
; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v32i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
|
|
|
|
; AVX512F-NEXT: # ymm0 = mem[0,1,0,1]
|
|
|
|
; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-LABEL: avg_v32i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-08-01 01:35:44 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
|
2018-01-18 15:44:09 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <32 x i16>, <32 x i16>* %a
|
|
|
|
%2 = zext <32 x i16> %1 to <32 x i32>
|
|
|
|
%3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
|
2015-12-01 05:46:08 +08:00
|
|
|
%4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%5 = trunc <32 x i32> %4 to <32 x i16>
|
|
|
|
store <32 x i16> %5, <32 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
2017-09-12 15:50:35 +08:00
|
|
|
|
|
|
|
; Rounding-average test on register operands, <16 x i8> variant.
; Both inputs are zero-extended only to i16 (not i32), summed first, and the
; rounding 1 is added afterwards; the result is shifted right by one and
; truncated back to i8.
; Both the SSE2 and the AVX check blocks expect the whole sequence to become a
; single pavgb/vpavgb on the incoming argument registers.
define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
|
|
|
|
; SSE2-LABEL: avg_v16i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-09-12 15:50:35 +08:00
|
|
|
; SSE2-NEXT: pavgb %xmm1, %xmm0
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: avg_v16i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
%za = zext <16 x i8> %a to <16 x i16>
|
|
|
|
%zb = zext <16 x i8> %b to <16 x i16>
|
|
|
|
%add = add nuw nsw <16 x i16> %za, %zb
|
|
|
|
%add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%res = trunc <16 x i16> %lshr to <16 x i8>
|
|
|
|
ret <16 x i8> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
; Rounding-average test on register operands, <32 x i8> variant.
; Both inputs are zero-extended to i16, summed, then 1 is added; the result is
; shifted right by one and truncated back to i8.
; Expected lowering per check block:
;  - SSE2 block: two 128-bit pavgb, one per pair of argument registers.
;  - AVX1 block: the upper 128-bit halves are extracted, averaged separately
;    from the lower halves, and reinserted with vinsertf128.
;  - AVX2 and AVX512 blocks: a single 256-bit vpavgb.
define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
|
|
|
|
; SSE2-LABEL: avg_v32i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgb %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: pavgb %xmm3, %xmm1
|
2017-09-12 15:50:35 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avg_v32i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avg_v32i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: avg_v32i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%za = zext <32 x i8> %a to <32 x i16>
|
|
|
|
%zb = zext <32 x i8> %b to <32 x i16>
|
|
|
|
%add = add nuw nsw <32 x i16> %za, %zb
|
|
|
|
%add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%res = trunc <32 x i16> %lshr to <32 x i8>
|
|
|
|
ret <32 x i8> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
|
|
|
|
; SSE2-LABEL: avg_v64i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgb %xmm4, %xmm0
|
|
|
|
; SSE2-NEXT: pavgb %xmm5, %xmm1
|
|
|
|
; SSE2-NEXT: pavgb %xmm6, %xmm2
|
|
|
|
; SSE2-NEXT: pavgb %xmm7, %xmm3
|
2017-09-12 15:50:35 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avg_v64i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
|
|
|
|
; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avg_v64i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb %ymm3, %ymm1, %ymm1
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v64i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: avg_v64i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
|
|
|
|
; AVX512BW-NEXT: retq
|
|
|
|
%za = zext <64 x i8> %a to <64 x i16>
|
|
|
|
%zb = zext <64 x i8> %b to <64 x i16>
|
|
|
|
%add = add nuw nsw <64 x i16> %za, %zb
|
|
|
|
%add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%res = trunc <64 x i16> %lshr to <64 x i8>
|
|
|
|
ret <64 x i8> %res
|
|
|
|
}
|
2018-02-26 10:16:31 +08:00
|
|
|
|
|
|
|
define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
|
|
|
|
; SSE2-LABEL: avg_v512i8_3:
|
|
|
|
; SSE2: # %bb.0:
|
2018-09-20 02:59:08 +08:00
|
|
|
; SSE2-NEXT: movq %rdi, %rax
|
2018-02-26 10:16:31 +08:00
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 496(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 480(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 464(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 448(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 432(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 416(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 400(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 384(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 368(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 352(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 336(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 320(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 304(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 288(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 272(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 256(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 224(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 208(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 192(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 176(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 160(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 144(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 128(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7
|
|
|
|
; SSE2-NEXT: movdqa %xmm7, 112(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm6
|
|
|
|
; SSE2-NEXT: movdqa %xmm6, 96(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm5
|
|
|
|
; SSE2-NEXT: movdqa %xmm5, 80(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm4
|
|
|
|
; SSE2-NEXT: movdqa %xmm4, 64(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3
|
|
|
|
; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm2
|
|
|
|
; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm1
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, (%rdi)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avg_v512i8_3:
|
|
|
|
; AVX1: # %bb.0:
|
|
|
|
; AVX1-NEXT: pushq %rbp
|
|
|
|
; AVX1-NEXT: movq %rsp, %rbp
|
|
|
|
; AVX1-NEXT: andq $-32, %rsp
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: subq $96, %rsp
|
2018-09-20 02:59:08 +08:00
|
|
|
; AVX1-NEXT: movq %rdi, %rax
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpavgb 272(%rbp), %xmm0, %xmm8
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpavgb 288(%rbp), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
|
2018-04-25 03:20:18 +08:00
|
|
|
; AVX1-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpavgb 304(%rbp), %xmm1, %xmm8
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb 320(%rbp), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm0
|
2018-02-26 10:16:31 +08:00
|
|
|
; AVX1-NEXT: vmovaps %ymm0, (%rsp) # 32-byte Spill
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vpavgb 336(%rbp), %xmm2, %xmm8
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb 352(%rbp), %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm13
|
|
|
|
; AVX1-NEXT: vpavgb 368(%rbp), %xmm3, %xmm8
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpavgb 384(%rbp), %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm14
|
|
|
|
; AVX1-NEXT: vpavgb 400(%rbp), %xmm4, %xmm8
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpavgb 416(%rbp), %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm15
|
|
|
|
; AVX1-NEXT: vpavgb 432(%rbp), %xmm5, %xmm8
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
|
|
|
|
; AVX1-NEXT: vpavgb 448(%rbp), %xmm5, %xmm5
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm12
|
|
|
|
; AVX1-NEXT: vpavgb 464(%rbp), %xmm6, %xmm8
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
|
|
|
|
; AVX1-NEXT: vpavgb 480(%rbp), %xmm6, %xmm6
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6
|
|
|
|
; AVX1-NEXT: vpavgb 496(%rbp), %xmm7, %xmm8
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
|
|
|
|
; AVX1-NEXT: vpavgb 512(%rbp), %xmm7, %xmm7
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rbp), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rbp), %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb 528(%rbp), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpavgb 544(%rbp), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8
|
|
|
|
; AVX1-NEXT: vmovdqa 48(%rbp), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 64(%rbp), %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb 560(%rbp), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpavgb 576(%rbp), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9
|
|
|
|
; AVX1-NEXT: vmovdqa 80(%rbp), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 96(%rbp), %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb 592(%rbp), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpavgb 608(%rbp), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10
|
|
|
|
; AVX1-NEXT: vmovdqa 112(%rbp), %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 128(%rbp), %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb 624(%rbp), %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpavgb 640(%rbp), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vmovdqa 144(%rbp), %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa 160(%rbp), %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb 656(%rbp), %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb 672(%rbp), %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
|
|
|
; AVX1-NEXT: vmovdqa 176(%rbp), %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa 192(%rbp), %xmm3
|
|
|
|
; AVX1-NEXT: vpavgb 688(%rbp), %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb 704(%rbp), %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
|
|
|
|
; AVX1-NEXT: vmovdqa 208(%rbp), %xmm3
|
|
|
|
; AVX1-NEXT: vmovdqa 224(%rbp), %xmm4
|
|
|
|
; AVX1-NEXT: vpavgb 720(%rbp), %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpavgb 736(%rbp), %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
|
|
|
|
; AVX1-NEXT: vmovdqa 240(%rbp), %xmm4
|
|
|
|
; AVX1-NEXT: vpavgb 752(%rbp), %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vmovdqa 256(%rbp), %xmm11
|
|
|
|
; AVX1-NEXT: vpavgb 768(%rbp), %xmm11, %xmm5
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
|
|
|
|
; AVX1-NEXT: vmovaps %ymm4, 480(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm3, 448(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm2, 416(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm1, 384(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm0, 352(%rdi)
|
2018-02-26 10:16:31 +08:00
|
|
|
; AVX1-NEXT: vmovaps %ymm10, 320(%rdi)
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovaps %ymm9, 288(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm8, 256(%rdi)
|
2018-02-26 10:16:31 +08:00
|
|
|
; AVX1-NEXT: vmovaps %ymm7, 224(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm6, 192(%rdi)
|
[x86] allow vector load narrowing with multi-use values
This is a long-awaited follow-up suggested in D33578. Since then, we've picked up even more
opportunities for vector narrowing from changes like D53784, so there are a lot of test diffs.
Apart from 2-3 strange cases, these are all wins.
I've structured this to be no-functional-change-intended for any target except for x86
because I couldn't tell if AArch64, ARM, and AMDGPU would improve or not. All of those
targets have existing regression tests (4, 4, 10 files respectively) that would be
affected. Also, Hexagon overrides the shouldReduceLoadWidth() hook, but doesn't show
any regression test diffs. The trade-off is deciding if an extra vector load is better
than a single wide load + extract_subvector.
For x86, this is almost always better (on paper at least) because we often can fold
loads into subsequent ops and not increase the official instruction count. There's also
some unknown -- but potentially large -- benefit from using narrower vector ops if wide
ops are implemented with multiple uops and/or frequency throttling is avoided.
Differential Revision: https://reviews.llvm.org/D54073
llvm-svn: 346595
2018-11-11 04:05:31 +08:00
|
|
|
; AVX1-NEXT: vmovaps %ymm12, 160(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm15, 128(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm14, 96(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm13, 64(%rdi)
|
2018-02-26 10:16:31 +08:00
|
|
|
; AVX1-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload
|
|
|
|
; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
|
2018-04-25 03:20:18 +08:00
|
|
|
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
|
2018-02-26 10:16:31 +08:00
|
|
|
; AVX1-NEXT: vmovaps %ymm0, (%rdi)
|
|
|
|
; AVX1-NEXT: movq %rbp, %rsp
|
|
|
|
; AVX1-NEXT: popq %rbp
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avg_v512i8_3:
|
|
|
|
; AVX2: # %bb.0:
|
|
|
|
; AVX2-NEXT: pushq %rbp
|
|
|
|
; AVX2-NEXT: movq %rsp, %rbp
|
|
|
|
; AVX2-NEXT: andq $-32, %rsp
|
|
|
|
; AVX2-NEXT: subq $32, %rsp
|
2018-09-20 02:59:08 +08:00
|
|
|
; AVX2-NEXT: movq %rdi, %rax
|
2018-02-26 10:16:31 +08:00
|
|
|
; AVX2-NEXT: vmovdqa 240(%rbp), %ymm8
|
|
|
|
; AVX2-NEXT: vmovdqa 208(%rbp), %ymm9
|
|
|
|
; AVX2-NEXT: vmovdqa 176(%rbp), %ymm10
|
|
|
|
; AVX2-NEXT: vmovdqa 144(%rbp), %ymm11
|
|
|
|
; AVX2-NEXT: vmovdqa 112(%rbp), %ymm12
|
|
|
|
; AVX2-NEXT: vmovdqa 80(%rbp), %ymm13
|
|
|
|
; AVX2-NEXT: vmovdqa 48(%rbp), %ymm14
|
|
|
|
; AVX2-NEXT: vmovdqa 16(%rbp), %ymm15
|
|
|
|
; AVX2-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
|
|
|
|
; AVX2-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
|
|
|
|
; AVX2-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
|
|
|
|
; AVX2-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
|
|
|
|
; AVX2-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
|
|
|
|
; AVX2-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
|
|
|
|
; AVX2-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
|
|
|
|
; AVX2-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
|
|
|
|
; AVX2-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
|
|
|
|
; AVX2-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
|
|
|
|
; AVX2-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
|
|
|
|
; AVX2-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
|
|
|
|
; AVX2-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm8, 480(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm9, 448(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm10, 416(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm11, 384(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm12, 352(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm13, 320(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm14, 288(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm15, 256(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm7, 224(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm6, 192(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm5, 160(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm4, 128(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm3, 96(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
|
|
|
|
; AVX2-NEXT: movq %rbp, %rsp
|
|
|
|
; AVX2-NEXT: popq %rbp
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v512i8_3:
|
|
|
|
; AVX512F: # %bb.0:
|
|
|
|
; AVX512F-NEXT: pushq %rbp
|
|
|
|
; AVX512F-NEXT: movq %rsp, %rbp
|
|
|
|
; AVX512F-NEXT: andq $-32, %rsp
|
|
|
|
; AVX512F-NEXT: subq $32, %rsp
|
2018-09-20 02:59:08 +08:00
|
|
|
; AVX512F-NEXT: movq %rdi, %rax
|
2018-02-26 10:16:31 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa 240(%rbp), %ymm8
|
|
|
|
; AVX512F-NEXT: vmovdqa 208(%rbp), %ymm9
|
|
|
|
; AVX512F-NEXT: vmovdqa 176(%rbp), %ymm10
|
|
|
|
; AVX512F-NEXT: vmovdqa 144(%rbp), %ymm11
|
|
|
|
; AVX512F-NEXT: vmovdqa 112(%rbp), %ymm12
|
|
|
|
; AVX512F-NEXT: vmovdqa 80(%rbp), %ymm13
|
|
|
|
; AVX512F-NEXT: vmovdqa 48(%rbp), %ymm14
|
|
|
|
; AVX512F-NEXT: vmovdqa 16(%rbp), %ymm15
|
|
|
|
; AVX512F-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
|
|
|
|
; AVX512F-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
|
|
|
|
; AVX512F-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
|
|
|
|
; AVX512F-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
|
|
|
|
; AVX512F-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
|
|
|
|
; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
|
|
|
|
; AVX512F-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
|
|
|
|
; AVX512F-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
|
|
|
|
; AVX512F-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
|
|
|
|
; AVX512F-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
|
|
|
|
; AVX512F-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
|
|
|
|
; AVX512F-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
|
|
|
|
; AVX512F-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
|
|
|
|
; AVX512F-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm8, 480(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm9, 448(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm10, 416(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm11, 384(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm12, 352(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm14, 288(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm15, 256(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm7, 224(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm6, 192(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm5, 160(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm4, 128(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm3, 96(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm2, 64(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm0, (%rdi)
|
|
|
|
; AVX512F-NEXT: movq %rbp, %rsp
|
|
|
|
; AVX512F-NEXT: popq %rbp
|
|
|
|
; AVX512F-NEXT: vzeroupper
|
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: avg_v512i8_3:
|
|
|
|
; AVX512BW: # %bb.0:
|
|
|
|
; AVX512BW-NEXT: pushq %rbp
|
|
|
|
; AVX512BW-NEXT: movq %rsp, %rbp
|
|
|
|
; AVX512BW-NEXT: andq $-64, %rsp
|
|
|
|
; AVX512BW-NEXT: subq $64, %rsp
|
2018-09-20 02:59:08 +08:00
|
|
|
; AVX512BW-NEXT: movq %rdi, %rax
|
2018-02-26 10:16:31 +08:00
|
|
|
; AVX512BW-NEXT: vpavgb 16(%rbp), %zmm0, %zmm0
|
|
|
|
; AVX512BW-NEXT: vpavgb 80(%rbp), %zmm1, %zmm1
|
|
|
|
; AVX512BW-NEXT: vpavgb 144(%rbp), %zmm2, %zmm2
|
|
|
|
; AVX512BW-NEXT: vpavgb 208(%rbp), %zmm3, %zmm3
|
|
|
|
; AVX512BW-NEXT: vpavgb 272(%rbp), %zmm4, %zmm4
|
|
|
|
; AVX512BW-NEXT: vpavgb 336(%rbp), %zmm5, %zmm5
|
|
|
|
; AVX512BW-NEXT: vpavgb 400(%rbp), %zmm6, %zmm6
|
|
|
|
; AVX512BW-NEXT: vpavgb 464(%rbp), %zmm7, %zmm7
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi)
|
|
|
|
; AVX512BW-NEXT: movq %rbp, %rsp
|
|
|
|
; AVX512BW-NEXT: popq %rbp
|
|
|
|
; AVX512BW-NEXT: vzeroupper
|
|
|
|
; AVX512BW-NEXT: retq
|
|
|
|
%za = zext <512 x i8> %a to <512 x i16>
|
|
|
|
%zb = zext <512 x i8> %b to <512 x i16>
|
|
|
|
%add = add nuw nsw <512 x i16> %za, %zb
|
|
|
|
%add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%res = trunc <512 x i16> %lshr to <512 x i8>
|
|
|
|
ret <512 x i8> %res
|
|
|
|
}
|
2018-02-26 12:43:24 +08:00
|
|
|
|
|
|
|
; This is not an avg, but it's structurally similar and previously caused a crash
|
|
|
|
; because the constants can't be read with APInt::getZExtValue.
|
|
|
|
define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind {
|
|
|
|
; SSE2-LABEL: not_avg_v16i8_wide_constants:
|
|
|
|
; SSE2: # %bb.0:
|
|
|
|
; SSE2-NEXT: pushq %rbp
|
|
|
|
; SSE2-NEXT: pushq %r15
|
|
|
|
; SSE2-NEXT: pushq %r14
|
|
|
|
; SSE2-NEXT: pushq %r13
|
|
|
|
; SSE2-NEXT: pushq %r12
|
|
|
|
; SSE2-NEXT: pushq %rbx
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movaps (%rsi), %xmm1
|
|
|
|
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
|
2018-02-26 12:43:24 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
2018-04-25 03:20:18 +08:00
|
|
|
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
|
2018-02-26 12:43:24 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
2018-04-25 03:20:18 +08:00
|
|
|
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
2018-02-26 12:43:24 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
2018-04-25 03:20:18 +08:00
|
|
|
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
|
|
|
|
; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: leal -1(%rdx,%rsi), %edx
|
|
|
|
; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: leal -1(%rbx,%rdx), %edx
|
|
|
|
; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
|
|
|
|
; SSE2-NEXT: leal -1(%rbp,%rdx), %edx
|
|
|
|
; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
|
|
|
|
; SSE2-NEXT: leal -1(%rdi,%rdx), %r8d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
|
|
|
|
; SSE2-NEXT: leal -1(%rax,%rdx), %edi
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: leal -1(%rcx,%rax), %edx
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: leal -1(%r9,%rax), %ecx
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
|
|
|
|
; SSE2-NEXT: leal -1(%r10,%rsi), %eax
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
|
|
|
|
; SSE2-NEXT: leaq -1(%r11,%rsi), %rsi
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
|
|
|
|
; SSE2-NEXT: leaq -1(%r12,%rbx), %r12
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
|
|
|
|
; SSE2-NEXT: leaq -1(%r15,%rbx), %r15
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
|
|
|
|
; SSE2-NEXT: leaq -1(%r14,%rbx), %r14
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
|
|
|
|
; SSE2-NEXT: leaq -1(%rbp,%rbx), %r11
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
|
|
|
|
; SSE2-NEXT: leaq -1(%rbp,%rbx), %r10
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: leaq -1(%r13,%rbx), %r9
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
|
|
|
|
; SSE2-NEXT: leaq -1(%r13,%rbx), %rbx
|
|
|
|
; SSE2-NEXT: shrl %eax
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movd %eax, %xmm8
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: shrl %ecx
|
|
|
|
; SSE2-NEXT: movd %ecx, %xmm15
|
|
|
|
; SSE2-NEXT: shrl %edx
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: movd %edx, %xmm9
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: shrl %edi
|
|
|
|
; SSE2-NEXT: movd %edi, %xmm2
|
|
|
|
; SSE2-NEXT: shrl %r8d
|
|
|
|
; SSE2-NEXT: movd %r8d, %xmm10
|
|
|
|
; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; SSE2-NEXT: shrl %eax
|
|
|
|
; SSE2-NEXT: movd %eax, %xmm6
|
|
|
|
; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; SSE2-NEXT: shrl %eax
|
|
|
|
; SSE2-NEXT: movd %eax, %xmm11
|
|
|
|
; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; SSE2-NEXT: shrl %eax
|
|
|
|
; SSE2-NEXT: movd %eax, %xmm4
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: shrq %rsi
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: movd %esi, %xmm12
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: shrq %r12
|
|
|
|
; SSE2-NEXT: movd %r12d, %xmm3
|
|
|
|
; SSE2-NEXT: shrq %r15
|
|
|
|
; SSE2-NEXT: movd %r15d, %xmm13
|
|
|
|
; SSE2-NEXT: shrq %r14
|
|
|
|
; SSE2-NEXT: movd %r14d, %xmm7
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: shrq %r11
|
|
|
|
; SSE2-NEXT: movd %r11d, %xmm14
|
|
|
|
; SSE2-NEXT: shrq %r10
|
|
|
|
; SSE2-NEXT: movd %r10d, %xmm5
|
|
|
|
; SSE2-NEXT: shrq %r9
|
|
|
|
; SSE2-NEXT: movd %r9d, %xmm0
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: shrq %rbx
|
|
|
|
; SSE2-NEXT: movd %ebx, %xmm1
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
|
|
|
|
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
|
2018-03-13 09:17:40 +08:00
|
|
|
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
|
2018-11-23 10:32:13 +08:00
|
|
|
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
|
|
|
|
; SSE2-NEXT: movdqu %xmm4, (%rax)
|
2018-02-26 12:43:24 +08:00
|
|
|
; SSE2-NEXT: popq %rbx
|
|
|
|
; SSE2-NEXT: popq %r12
|
|
|
|
; SSE2-NEXT: popq %r13
|
|
|
|
; SSE2-NEXT: popq %r14
|
|
|
|
; SSE2-NEXT: popq %r15
|
|
|
|
; SSE2-NEXT: popq %rbp
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: not_avg_v16i8_wide_constants:
|
|
|
|
; AVX1: # %bb.0:
|
|
|
|
; AVX1-NEXT: pushq %rbp
|
|
|
|
; AVX1-NEXT: pushq %r15
|
|
|
|
; AVX1-NEXT: pushq %r14
|
|
|
|
; AVX1-NEXT: pushq %r13
|
|
|
|
; AVX1-NEXT: pushq %r12
|
|
|
|
; AVX1-NEXT: pushq %rbx
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
|
2018-11-03 05:09:49 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
|
2019-04-04 13:00:18 +08:00
|
|
|
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
|
|
|
|
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
|
|
|
|
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
|
|
|
|
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
|
|
|
|
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm7, %r15
|
|
|
|
; AVX1-NEXT: vmovq %xmm7, %r14
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm4, %r11
|
|
|
|
; AVX1-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
|
|
|
|
; AVX1-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
|
2019-03-25 00:30:35 +08:00
|
|
|
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
|
2019-04-04 13:00:18 +08:00
|
|
|
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
|
|
|
|
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
|
|
|
|
; AVX1-NEXT: vmovd %xmm6, %ecx
|
|
|
|
; AVX1-NEXT: vpextrd $1, %xmm6, %edx
|
|
|
|
; AVX1-NEXT: vpextrd $2, %xmm6, %r13d
|
|
|
|
; AVX1-NEXT: vpextrd $3, %xmm6, %r12d
|
|
|
|
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
|
|
|
|
; AVX1-NEXT: vmovd %xmm1, %ebx
|
|
|
|
; AVX1-NEXT: vpextrd $1, %xmm1, %ebp
|
|
|
|
; AVX1-NEXT: vpextrd $2, %xmm1, %esi
|
|
|
|
; AVX1-NEXT: vpextrd $3, %xmm1, %edi
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vmovd %xmm7, %r8d
|
|
|
|
; AVX1-NEXT: leal -1(%r12,%rdi), %eax
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
2019-04-04 13:00:18 +08:00
|
|
|
; AVX1-NEXT: vpextrd $2, %xmm7, %eax
|
|
|
|
; AVX1-NEXT: leal -1(%r13,%rsi), %esi
|
|
|
|
; AVX1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
2019-03-25 00:30:35 +08:00
|
|
|
; AVX1-NEXT: vpextrd $2, %xmm4, %edi
|
2019-04-04 13:00:18 +08:00
|
|
|
; AVX1-NEXT: leal -1(%rdx,%rbp), %edx
|
|
|
|
; AVX1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
|
|
|
; AVX1-NEXT: vpextrd $3, %xmm4, %edx
|
|
|
|
; AVX1-NEXT: leal -1(%rcx,%rbx), %r10d
|
2019-03-25 00:30:35 +08:00
|
|
|
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
|
2019-04-04 13:00:18 +08:00
|
|
|
; AVX1-NEXT: leal -1(%rdx,%rcx), %r9d
|
2019-03-25 00:30:35 +08:00
|
|
|
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
|
2019-04-04 13:00:18 +08:00
|
|
|
; AVX1-NEXT: leal -1(%rdi,%rcx), %edi
|
|
|
|
; AVX1-NEXT: vpextrd $2, %xmm5, %ecx
|
|
|
|
; AVX1-NEXT: leal -1(%rax,%rcx), %eax
|
|
|
|
; AVX1-NEXT: vmovd %xmm5, %ecx
|
|
|
|
; AVX1-NEXT: leal -1(%r8,%rcx), %r8d
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm6, %rdx
|
|
|
|
; AVX1-NEXT: leal -1(%r15,%rdx), %r15d
|
|
|
|
; AVX1-NEXT: vmovq %xmm6, %rdx
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
|
|
|
|
; AVX1-NEXT: leal -1(%r14,%rdx), %r14d
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
|
|
|
|
; AVX1-NEXT: leal -1(%r11,%rdx), %edx
|
|
|
|
; AVX1-NEXT: vmovq %xmm1, %rcx
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
|
|
|
|
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
|
|
|
|
; AVX1-NEXT: leal -1(%rsi,%rcx), %ecx
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
|
|
|
|
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
|
|
|
|
; AVX1-NEXT: leal -1(%rbp,%rsi), %esi
|
|
|
|
; AVX1-NEXT: vmovq %xmm1, %rbx
|
|
|
|
; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
|
|
|
|
; AVX1-NEXT: leal -1(%rbp,%rbx), %ebx
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm8, %r11
|
2019-03-25 00:30:35 +08:00
|
|
|
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
|
2019-04-04 13:00:18 +08:00
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm0, %r12
|
|
|
|
; AVX1-NEXT: leal -1(%r11,%r12), %r11d
|
|
|
|
; AVX1-NEXT: vmovq %xmm8, %r12
|
|
|
|
; AVX1-NEXT: vmovq %xmm0, %r13
|
|
|
|
; AVX1-NEXT: leal -1(%r12,%r13), %ebp
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX1-NEXT: shrl %ebp
|
|
|
|
; AVX1-NEXT: vmovd %ebp, %xmm0
|
|
|
|
; AVX1-NEXT: shrl %r11d
|
2019-04-04 13:00:18 +08:00
|
|
|
; AVX1-NEXT: vpinsrb $1, %r11d, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: shrl %ebx
|
|
|
|
; AVX1-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: shrl %esi
|
|
|
|
; AVX1-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: shrl %ecx
|
|
|
|
; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX1-NEXT: shrl %edx
|
2019-04-04 13:00:18 +08:00
|
|
|
; AVX1-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: shrl %r14d
|
|
|
|
; AVX1-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX1-NEXT: shrl %r15d
|
|
|
|
; AVX1-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: shrl %r8d
|
2019-04-04 13:00:18 +08:00
|
|
|
; AVX1-NEXT: vpinsrb $8, %r8d, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: shrl %eax
|
|
|
|
; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: shrl %edi
|
|
|
|
; AVX1-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: shrl %r9d
|
|
|
|
; AVX1-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: shrl %r10d
|
|
|
|
; AVX1-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; AVX1-NEXT: shrl %eax
|
|
|
|
; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; AVX1-NEXT: shrl %eax
|
|
|
|
; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; AVX1-NEXT: shrl %eax
|
|
|
|
; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
|
2018-02-26 12:43:24 +08:00
|
|
|
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX1-NEXT: popq %rbx
|
|
|
|
; AVX1-NEXT: popq %r12
|
|
|
|
; AVX1-NEXT: popq %r13
|
|
|
|
; AVX1-NEXT: popq %r14
|
|
|
|
; AVX1-NEXT: popq %r15
|
|
|
|
; AVX1-NEXT: popq %rbp
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: not_avg_v16i8_wide_constants:
|
|
|
|
; AVX2: # %bb.0:
|
|
|
|
; AVX2-NEXT: pushq %rbp
|
|
|
|
; AVX2-NEXT: pushq %r15
|
|
|
|
; AVX2-NEXT: pushq %r14
|
|
|
|
; AVX2-NEXT: pushq %r13
|
|
|
|
; AVX2-NEXT: pushq %r12
|
|
|
|
; AVX2-NEXT: pushq %rbx
|
2018-05-02 03:26:15 +08:00
|
|
|
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
2018-02-26 12:43:24 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm4
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm7
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm2, %r15
|
|
|
|
; AVX2-NEXT: vmovq %xmm2, %r14
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm1
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm1, %r13
|
|
|
|
; AVX2-NEXT: vmovq %xmm1, %r11
|
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm1
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX2-NEXT: vmovd %xmm9, %r12d
|
|
|
|
; AVX2-NEXT: vpextrd $2, %xmm9, %r9d
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: vmovd %xmm7, %ecx
|
|
|
|
; AVX2-NEXT: vpextrd $2, %xmm7, %edi
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vmovd %xmm5, %ebx
|
|
|
|
; AVX2-NEXT: vpextrd $2, %xmm5, %esi
|
|
|
|
; AVX2-NEXT: vmovd %xmm4, %edx
|
|
|
|
; AVX2-NEXT: vpextrd $2, %xmm4, %ebp
|
|
|
|
; AVX2-NEXT: vpextrd $2, %xmm1, %eax
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: leal -1(%rbp,%rax), %eax
|
|
|
|
; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vmovd %xmm1, %eax
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: leal -1(%rdx,%rax), %eax
|
|
|
|
; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vpextrd $2, %xmm8, %eax
|
|
|
|
; AVX2-NEXT: leal -1(%rsi,%rax), %eax
|
|
|
|
; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
|
|
|
; AVX2-NEXT: vmovd %xmm8, %eax
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: leal -1(%rbx,%rax), %r10d
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vpextrd $2, %xmm6, %eax
|
|
|
|
; AVX2-NEXT: leal -1(%rdi,%rax), %r8d
|
|
|
|
; AVX2-NEXT: vmovd %xmm6, %eax
|
|
|
|
; AVX2-NEXT: leal -1(%rcx,%rax), %edi
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: vpextrd $2, %xmm3, %eax
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: leal -1(%r9,%rax), %r9d
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: vmovd %xmm3, %ecx
|
|
|
|
; AVX2-NEXT: leal -1(%r12,%rcx), %r12d
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
|
|
|
|
; AVX2-NEXT: leal -1(%r15,%rcx), %r15d
|
|
|
|
; AVX2-NEXT: vmovq %xmm0, %rcx
|
|
|
|
; AVX2-NEXT: leal -1(%r14,%rcx), %r14d
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm2, %rdx
|
2018-04-25 03:20:18 +08:00
|
|
|
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: leal -1(%rax,%rdx), %edx
|
|
|
|
; AVX2-NEXT: vmovq %xmm2, %rax
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm0
|
|
|
|
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: leal -1(%rcx,%rax), %eax
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
|
|
|
|
; AVX2-NEXT: leal -1(%r13,%rsi), %esi
|
|
|
|
; AVX2-NEXT: vmovq %xmm0, %rbx
|
|
|
|
; AVX2-NEXT: leal -1(%r11,%rbx), %ebx
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm10, %rcx
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm11, %r13
|
|
|
|
; AVX2-NEXT: leal -1(%rcx,%r13), %ecx
|
|
|
|
; AVX2-NEXT: vmovq %xmm10, %r13
|
|
|
|
; AVX2-NEXT: vmovq %xmm11, %r11
|
|
|
|
; AVX2-NEXT: leaq -1(%r13,%r11), %rbp
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: shrq %rbp
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vmovd %ebp, %xmm0
|
|
|
|
; AVX2-NEXT: shrl %ecx
|
|
|
|
; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: shrl %ebx
|
|
|
|
; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: shrl %esi
|
|
|
|
; AVX2-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: shrl %eax
|
|
|
|
; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: shrl %edx
|
|
|
|
; AVX2-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: shrl %r14d
|
|
|
|
; AVX2-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: shrl %r15d
|
|
|
|
; AVX2-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: shrl %r12d
|
|
|
|
; AVX2-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: shrl %r9d
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: shrl %edi
|
|
|
|
; AVX2-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: shrl %r8d
|
|
|
|
; AVX2-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: shrl %r10d
|
|
|
|
; AVX2-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; AVX2-NEXT: shrl %eax
|
|
|
|
; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; AVX2-NEXT: shrl %eax
|
|
|
|
; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; AVX2-NEXT: shrl %eax
|
|
|
|
; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
|
2018-02-26 12:43:24 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX2-NEXT: popq %rbx
|
|
|
|
; AVX2-NEXT: popq %r12
|
|
|
|
; AVX2-NEXT: popq %r13
|
|
|
|
; AVX2-NEXT: popq %r14
|
|
|
|
; AVX2-NEXT: popq %r15
|
|
|
|
; AVX2-NEXT: popq %rbp
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: not_avg_v16i8_wide_constants:
|
|
|
|
; AVX512: # %bb.0:
|
|
|
|
; AVX512-NEXT: pushq %rbp
|
|
|
|
; AVX512-NEXT: pushq %r15
|
|
|
|
; AVX512-NEXT: pushq %r14
|
|
|
|
; AVX512-NEXT: pushq %r13
|
|
|
|
; AVX512-NEXT: pushq %r12
|
|
|
|
; AVX512-NEXT: pushq %rbx
|
2018-05-02 03:26:15 +08:00
|
|
|
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
2018-02-26 12:43:24 +08:00
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
|
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm4
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm7
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1
|
2018-02-26 12:43:24 +08:00
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm2, %r15
|
|
|
|
; AVX512-NEXT: vmovq %xmm2, %r14
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
|
|
|
|
; AVX512-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm1
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm1, %r13
|
|
|
|
; AVX512-NEXT: vmovq %xmm1, %r11
|
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm11 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
|
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm1
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX512-NEXT: vmovd %xmm9, %r12d
|
|
|
|
; AVX512-NEXT: vpextrd $2, %xmm9, %r9d
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: vmovd %xmm7, %ecx
|
|
|
|
; AVX512-NEXT: vpextrd $2, %xmm7, %edi
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vmovd %xmm5, %ebx
|
|
|
|
; AVX512-NEXT: vpextrd $2, %xmm5, %esi
|
|
|
|
; AVX512-NEXT: vmovd %xmm4, %edx
|
|
|
|
; AVX512-NEXT: vpextrd $2, %xmm4, %ebp
|
|
|
|
; AVX512-NEXT: vpextrd $2, %xmm1, %eax
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: leal -1(%rbp,%rax), %eax
|
|
|
|
; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vmovd %xmm1, %eax
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: leal -1(%rdx,%rax), %eax
|
|
|
|
; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vpextrd $2, %xmm8, %eax
|
|
|
|
; AVX512-NEXT: leal -1(%rsi,%rax), %eax
|
|
|
|
; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
|
|
|
|
; AVX512-NEXT: vmovd %xmm8, %eax
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: leal -1(%rbx,%rax), %r10d
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vpextrd $2, %xmm6, %eax
|
|
|
|
; AVX512-NEXT: leal -1(%rdi,%rax), %r8d
|
|
|
|
; AVX512-NEXT: vmovd %xmm6, %eax
|
|
|
|
; AVX512-NEXT: leal -1(%rcx,%rax), %edi
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: vpextrd $2, %xmm3, %eax
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: leal -1(%r9,%rax), %r9d
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: vmovd %xmm3, %ecx
|
|
|
|
; AVX512-NEXT: leal -1(%r12,%rcx), %r12d
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
|
|
|
|
; AVX512-NEXT: leal -1(%r15,%rcx), %r15d
|
|
|
|
; AVX512-NEXT: vmovq %xmm0, %rcx
|
|
|
|
; AVX512-NEXT: leal -1(%r14,%rcx), %r14d
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
|
2018-04-25 03:20:18 +08:00
|
|
|
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: leal -1(%rax,%rdx), %edx
|
|
|
|
; AVX512-NEXT: vmovq %xmm2, %rax
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm0
|
|
|
|
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
|
|
|
|
; AVX512-NEXT: leal -1(%rcx,%rax), %eax
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
|
|
|
|
; AVX512-NEXT: leal -1(%r13,%rsi), %esi
|
|
|
|
; AVX512-NEXT: vmovq %xmm0, %rbx
|
|
|
|
; AVX512-NEXT: leal -1(%r11,%rbx), %ebx
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm10, %rcx
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm11, %r13
|
|
|
|
; AVX512-NEXT: leal -1(%rcx,%r13), %ecx
|
|
|
|
; AVX512-NEXT: vmovq %xmm10, %r13
|
|
|
|
; AVX512-NEXT: vmovq %xmm11, %r11
|
|
|
|
; AVX512-NEXT: leaq -1(%r13,%r11), %rbp
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: shrq %rbp
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vmovd %ebp, %xmm0
|
|
|
|
; AVX512-NEXT: shrl %ecx
|
|
|
|
; AVX512-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: shrl %ebx
|
|
|
|
; AVX512-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: shrl %esi
|
|
|
|
; AVX512-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: shrl %eax
|
|
|
|
; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: shrl %edx
|
|
|
|
; AVX512-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: shrl %r14d
|
|
|
|
; AVX512-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: shrl %r15d
|
|
|
|
; AVX512-NEXT: vpinsrb $7, %r15d, %xmm0, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: shrl %r12d
|
|
|
|
; AVX512-NEXT: vpinsrb $8, %r12d, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: shrl %r9d
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: shrl %edi
|
|
|
|
; AVX512-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: shrl %r8d
|
|
|
|
; AVX512-NEXT: vpinsrb $11, %r8d, %xmm0, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: shrl %r10d
|
|
|
|
; AVX512-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
|
2019-03-16 00:16:49 +08:00
|
|
|
; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; AVX512-NEXT: shrl %eax
|
|
|
|
; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
|
2018-11-23 10:32:13 +08:00
|
|
|
; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; AVX512-NEXT: shrl %eax
|
|
|
|
; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
|
|
|
|
; AVX512-NEXT: shrl %eax
|
|
|
|
; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
|
|
|
|
; AVX512-NEXT: vmovdqu %xmm0, (%rax)
|
2018-02-26 12:43:24 +08:00
|
|
|
; AVX512-NEXT: popq %rbx
|
|
|
|
; AVX512-NEXT: popq %r12
|
|
|
|
; AVX512-NEXT: popq %r13
|
|
|
|
; AVX512-NEXT: popq %r14
|
|
|
|
; AVX512-NEXT: popq %r15
|
|
|
|
; AVX512-NEXT: popq %rbp
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = load <16 x i8>, <16 x i8>* %a
|
|
|
|
%2 = load <16 x i8>, <16 x i8>* %b
|
|
|
|
%3 = zext <16 x i8> %1 to <16 x i128>
|
|
|
|
%4 = zext <16 x i8> %2 to <16 x i128>
|
|
|
|
%5 = add nuw nsw <16 x i128> %3, <i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1>
|
|
|
|
%6 = add nuw nsw <16 x i128> %5, %4
|
|
|
|
%7 = lshr <16 x i128> %6, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
|
|
|
|
%8 = trunc <16 x i128> %7 to <16 x i8>
|
|
|
|
store <16 x i8> %8, <16 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
2018-09-08 04:56:01 +08:00
|
|
|
|
|
|
|
; Make sure we don't fail on single element vectors.
; The body below computes the rounding-up unsigned average
; trunc((zext(x) + zext(y) + 1) >> 1) on a <1 x i8> vector, widening to i16
; for the intermediate arithmetic.
; The autogenerated assertions for both the SSE2 and the AVX run lines expect
; plain scalar code (movzbl / leal / shrl) rather than a vector average
; instruction.
|
|
|
|
define <1 x i8> @avg_v1i8(<1 x i8> %x, <1 x i8> %y) {
|
|
|
|
; SSE2-LABEL: avg_v1i8:
|
|
|
|
; SSE2: # %bb.0:
|
2018-09-08 04:56:03 +08:00
|
|
|
; SSE2-NEXT: movzbl %dil, %eax
|
|
|
|
; SSE2-NEXT: movzbl %sil, %ecx
|
2018-09-08 04:56:01 +08:00
|
|
|
; SSE2-NEXT: leal 1(%rax,%rcx), %eax
|
|
|
|
; SSE2-NEXT: shrl %eax
|
|
|
|
; SSE2-NEXT: # kill: def $al killed $al killed $eax
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: avg_v1i8:
|
|
|
|
; AVX: # %bb.0:
|
2018-09-08 04:56:03 +08:00
|
|
|
; AVX-NEXT: movzbl %dil, %eax
|
|
|
|
; AVX-NEXT: movzbl %sil, %ecx
|
2018-09-08 04:56:01 +08:00
|
|
|
; AVX-NEXT: leal 1(%rax,%rcx), %eax
|
|
|
|
; AVX-NEXT: shrl %eax
|
|
|
|
; AVX-NEXT: # kill: def $al killed $al killed $eax
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
; Widen both operands to i16 so the sum plus the rounding bias fits.
%a = zext <1 x i8> %x to <1 x i16>
|
|
|
|
%b = zext <1 x i8> %y to <1 x i16>
|
|
|
|
%c = add <1 x i16> %a, %b
|
|
|
|
; Add the rounding bias of 1 before halving.
%d = add <1 x i16> %c, <i16 1>
|
|
|
|
%e = lshr <1 x i16> %d, <i16 1>
|
|
|
|
; Narrow the halved sum back to the i8 element type.
%f = trunc <1 x i16> %e to <1 x i8>
|
|
|
|
ret <1 x i8> %f
|
|
|
|
}
|
|
|
|
|
2019-03-30 21:53:11 +08:00
|
|
|
; _mm_avg_epu16( _mm_slli_epi16(a, 2), _mm_slli_epi16(b, 2))
; Both <2 x i64> arguments are reinterpreted as <8 x i16> and shifted left
; by 2 before being averaged. The "+1" of the averaging pattern appears in
; the IR as an `or` with 1 on the first shifted operand; because that operand
; was just shifted left by 2 its low bit is zero, so the `or` produces the
; same value as an add of 1.
; The autogenerated assertions expect the two shifts followed by a single
; pavgw (SSE2) or vpavgw (AVX).
; NOTE(review): the name presumably refers to LLVM bug report 41316 - confirm.
|
|
|
|
define <2 x i64> @PR41316(<2 x i64>, <2 x i64>) {
|
|
|
|
; SSE2-LABEL: PR41316:
|
|
|
|
; SSE2: # %bb.0:
|
|
|
|
; SSE2-NEXT: psllw $2, %xmm0
|
|
|
|
; SSE2-NEXT: psllw $2, %xmm1
|
2019-03-31 01:12:29 +08:00
|
|
|
; SSE2-NEXT: pavgw %xmm1, %xmm0
|
2019-03-30 21:53:11 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2019-03-31 01:12:29 +08:00
|
|
|
; AVX-LABEL: PR41316:
|
|
|
|
; AVX: # %bb.0:
|
|
|
|
; AVX-NEXT: vpsllw $2, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vpsllw $2, %xmm1, %xmm1
|
|
|
|
; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX-NEXT: retq
|
2019-03-30 21:53:11 +08:00
|
|
|
; View the first argument as eight i16 lanes and shift each left by 2.
%3 = bitcast <2 x i64> %0 to <8 x i16>
|
|
|
|
%4 = shl <8 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
|
|
|
|
; Same treatment for the second argument.
%5 = bitcast <2 x i64> %1 to <8 x i16>
|
|
|
|
%6 = shl <8 x i16> %5, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
|
|
|
|
%7 = zext <8 x i16> %6 to <8 x i32>
|
|
|
|
; Rounding bias: the low bit of %4 is zero after the shift, so or-ing in 1
; gives the same result as adding 1.
%8 = or <8 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%9 = zext <8 x i16> %8 to <8 x i32>
|
|
|
|
; Sum in i32, halve, and narrow back to i16 lanes.
%10 = add nuw nsw <8 x i32> %9, %7
|
|
|
|
%11 = lshr <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%12 = trunc <8 x i32> %11 to <8 x i16>
|
|
|
|
; Return the result in the original <2 x i64> type.
%13 = bitcast <8 x i16> %12 to <2 x i64>
|
|
|
|
ret <2 x i64> %13
|
|
|
|
}
|