2016-08-26 01:17:46 +08:00
|
|
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
2017-06-23 22:16:50 +08:00
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
|
2017-06-23 22:38:00 +08:00
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v4i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movd %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v4i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX-NEXT: vmovd %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <4 x i8>, <4 x i8>* %a
|
|
|
|
%2 = load <4 x i8>, <4 x i8>* %b
|
|
|
|
%3 = zext <4 x i8> %1 to <4 x i32>
|
|
|
|
%4 = zext <4 x i8> %2 to <4 x i32>
|
|
|
|
%5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <4 x i32> %5, %4
|
|
|
|
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <4 x i32> %7 to <4 x i8>
|
|
|
|
store <4 x i8> %8, <4 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v8i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movq %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v8i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX-NEXT: vmovq %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <8 x i8>, <8 x i8>* %a
|
|
|
|
%2 = load <8 x i8>, <8 x i8>* %b
|
|
|
|
%3 = zext <8 x i8> %1 to <8 x i32>
|
|
|
|
%4 = zext <8 x i8> %2 to <8 x i32>
|
|
|
|
%5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <8 x i32> %5, %4
|
|
|
|
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <8 x i32> %7 to <8 x i8>
|
|
|
|
store <8 x i8> %8, <8 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v16i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgb (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v16i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovdqa (%rsi), %xmm0
|
|
|
|
; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <16 x i8>, <16 x i8>* %a
|
|
|
|
%2 = load <16 x i8>, <16 x i8>* %b
|
|
|
|
%3 = zext <16 x i8> %1 to <16 x i32>
|
|
|
|
%4 = zext <16 x i8> %2 to <16 x i32>
|
|
|
|
%5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <16 x i32> %5, %4
|
|
|
|
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <16 x i32> %7 to <16 x i8>
|
|
|
|
store <16 x i8> %8, <16 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgb (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgb 16(%rsi), %xmm0
|
2018-02-07 00:14:29 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v32i8:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <32 x i8>, <32 x i8>* %a
|
|
|
|
%2 = load <32 x i8>, <32 x i8>* %b
|
|
|
|
%3 = zext <32 x i8> %1 to <32 x i32>
|
|
|
|
%4 = zext <32 x i8> %2 to <32 x i32>
|
|
|
|
%5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <32 x i32> %5, %4
|
|
|
|
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <32 x i32> %7 to <32 x i8>
|
|
|
|
store <32 x i8> %8, <32 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-12-21 21:18:19 +08:00
|
|
|
define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
|
|
|
|
; SSE2-LABEL: avg_v48i8:
|
|
|
|
; SSE2: # %bb.0:
|
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm6
|
|
|
|
; SSE2-NEXT: movdqa 32(%rdi), %xmm11
|
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm12
|
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm13
|
|
|
|
; SSE2-NEXT: movdqa 32(%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: pxor %xmm7, %xmm7
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm4
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm4, %xmm2
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, %xmm10
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: movdqa %xmm6, %xmm5
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm5, %xmm15
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm6, %xmm14
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: movdqa %xmm12, %xmm3
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm3, %xmm8
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm2, %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm11, %xmm2
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm4, %xmm3
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm12, %xmm9
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm10, %xmm9
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm1, %xmm12
|
|
|
|
; SSE2-NEXT: movdqa %xmm13, %xmm4
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm4, %xmm10
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm15, %xmm10
|
|
|
|
; SSE2-NEXT: movdqa %xmm2, %xmm15
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm5, %xmm4
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm13, %xmm1
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm14, %xmm1
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm6, %xmm13
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm6
|
|
|
|
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
|
|
|
|
; SSE2-NEXT: movdqa %xmm6, %xmm14
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm15, %xmm14
|
|
|
|
; SSE2-NEXT: movdqa %xmm11, %xmm5
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm2, %xmm6
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, %xmm2
|
|
|
|
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
|
|
|
|
; SSE2-NEXT: paddd %xmm5, %xmm2
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
|
|
|
|
; SSE2-NEXT: paddd %xmm11, %xmm0
|
|
|
|
; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm8
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm3
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm9
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm12
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm10
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm4
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm1
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm13
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm14
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm2
|
|
|
|
; SSE2-NEXT: psubd %xmm5, %xmm0
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm3
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm8
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255]
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm8
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm3
|
|
|
|
; SSE2-NEXT: packuswb %xmm8, %xmm3
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm12
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm9
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm9
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm12
|
|
|
|
; SSE2-NEXT: packuswb %xmm9, %xmm12
|
|
|
|
; SSE2-NEXT: packuswb %xmm3, %xmm12
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm4
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm10
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm10
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm4
|
|
|
|
; SSE2-NEXT: packuswb %xmm10, %xmm4
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm13
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm1
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm1
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm13
|
|
|
|
; SSE2-NEXT: packuswb %xmm1, %xmm13
|
|
|
|
; SSE2-NEXT: packuswb %xmm4, %xmm13
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm6
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm14
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm14
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm6
|
|
|
|
; SSE2-NEXT: packuswb %xmm14, %xmm6
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm0
|
|
|
|
; SSE2-NEXT: psrld $1, %xmm2
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm2
|
|
|
|
; SSE2-NEXT: pand %xmm7, %xmm0
|
|
|
|
; SSE2-NEXT: packuswb %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: packuswb %xmm6, %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm13, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm12, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avg_v48i8:
|
|
|
|
; AVX1: # %bb.0:
|
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm2
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm5
|
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %ymm1
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rsi), %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5
|
|
|
|
; AVX1-NEXT: vmovdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm9
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm8
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm2, %xmm11, %xmm11
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[3,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm7, %xmm12, %xmm12
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm5, %xmm13, %xmm13
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm15
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm14
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm6
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
|
|
|
|
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
|
|
|
|
; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
|
|
|
|
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7
|
|
|
|
; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm10
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm9
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm8, %xmm8
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm11, %xmm11
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm12, %xmm12
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm13, %xmm4
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm15, %xmm5
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm14, %xmm1
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm2, %xmm14
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm6, %xmm15
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm13
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm12, %xmm12
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm11, %xmm11
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm9, %xmm2
|
|
|
|
; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm6
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm7
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm3
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm4
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm2
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm3
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm3
|
|
|
|
; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
|
|
|
|
; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
|
|
|
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avg_v48i8:
|
|
|
|
; AVX2: # %bb.0:
|
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
|
|
|
|
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
|
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %ymm3
|
|
|
|
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm0
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
|
|
|
|
; AVX2-NEXT: vpand %ymm9, %ymm5, %ymm5
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,0,1]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpand %ymm9, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpaddd %ymm2, %ymm5, %ymm2
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpaddd %ymm4, %ymm7, %ymm4
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpaddd %ymm3, %ymm11, %ymm3
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
|
|
|
|
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
|
|
|
|
; AVX2-NEXT: vpaddd %ymm0, %ymm10, %ymm0
|
|
|
|
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6
|
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4
|
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3
|
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5
|
|
|
|
; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
|
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
|
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
|
|
|
; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm7, %xmm4, %xmm4
|
|
|
|
; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
|
|
|
|
; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
|
|
|
|
; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
|
|
|
|
; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
|
|
|
|
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v48i8:
|
|
|
|
; AVX512F: # %bb.0:
|
|
|
|
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
|
|
|
|
; AVX512F-NEXT: vmovdqa (%rsi), %ymm2
|
|
|
|
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3
|
|
|
|
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
|
|
|
|
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm5
|
|
|
|
; AVX512F-NEXT: vpavgb %xmm5, %xmm4, %xmm4
|
|
|
|
; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX512F-NEXT: vmovdqu %xmm1, (%rax)
|
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512F-NEXT: vzeroupper
|
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: avg_v48i8:
|
|
|
|
; AVX512BW: # %bb.0:
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
|
|
|
|
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
|
|
|
|
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
|
|
|
|
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
|
|
|
|
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
|
|
|
|
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
|
|
|
|
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
|
|
|
|
; AVX512BW-NEXT: vpaddd %zmm4, %zmm2, %zmm2
|
|
|
|
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm4
|
|
|
|
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
|
|
|
|
; AVX512BW-NEXT: vpaddd %zmm4, %zmm3, %zmm3
|
|
|
|
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
|
|
|
|
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
|
|
|
|
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
|
|
|
|
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
|
|
|
|
; AVX512BW-NEXT: vpsubd %zmm1, %zmm2, %zmm2
|
|
|
|
; AVX512BW-NEXT: vpsubd %zmm1, %zmm3, %zmm3
|
|
|
|
; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0
|
|
|
|
; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm0
|
|
|
|
; AVX512BW-NEXT: vpsrld $1, %zmm3, %zmm1
|
|
|
|
; AVX512BW-NEXT: vpsrld $1, %zmm2, %zmm2
|
|
|
|
; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2
|
|
|
|
; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
|
|
|
|
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
|
|
|
|
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
|
|
|
|
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
|
|
|
|
; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
|
|
|
|
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
|
|
|
|
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
|
|
|
|
; AVX512BW-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax)
|
|
|
|
; AVX512BW-NEXT: vzeroupper
|
|
|
|
; AVX512BW-NEXT: retq
|
|
|
|
%1 = load <48 x i8>, <48 x i8>* %a
|
|
|
|
%2 = load <48 x i8>, <48 x i8>* %b
|
|
|
|
%3 = zext <48 x i8> %1 to <48 x i32>
|
|
|
|
%4 = zext <48 x i8> %2 to <48 x i32>
|
|
|
|
%5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <48 x i32> %5, %4
|
|
|
|
%7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <48 x i32> %7 to <48 x i8>
|
|
|
|
store <48 x i8> %8, <48 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; avg_v64i8: rounding average of two <64 x i8> vectors, c = (a + b + 1) / 2.
; The IR widens both inputs to i32 (zext), adds 1, adds, shifts right by 1 and
; truncates back to i8 — the pattern the backend is expected to recognize and
; lower to pavgb/vpavgb (see the CHECK lines below for each subtarget).
; NOTE(review): the result is stored to `undef`, so the store address in the
; generated code is an unspecified register (shows up as (%rax) in the checks).
define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v64i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa 32(%rdi), %xmm0
; SSE2-NEXT:    movdqa (%rsi), %xmm1
; SSE2-NEXT:    movdqa 16(%rsi), %xmm2
; SSE2-NEXT:    movdqa 48(%rsi), %xmm3
; SSE2-NEXT:    pavgb (%rdi), %xmm1
; SSE2-NEXT:    pavgb 16(%rdi), %xmm2
; SSE2-NEXT:    pavgb 32(%rsi), %xmm0
; SSE2-NEXT:    pavgb 48(%rdi), %xmm3
; SSE2-NEXT:    movdqu %xmm3, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: avg_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT:    vmovdqa (%rsi), %ymm2
; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT:    vpavgb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpavgb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    vmovups %ymm1, (%rax)
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avg_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-NEXT:    vpavgb (%rdi), %ymm1, %ymm1
; AVX2-NEXT:    vpavgb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: avg_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa (%rsi), %ymm1
; AVX512F-NEXT:    vpavgb (%rdi), %ymm1, %ymm1
; AVX512F-NEXT:    vpavgb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: avg_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; avg_v4i16: rounding average of two <4 x i16> vectors, c = (a + b + 1) / 2,
; widened to i32 for the arithmetic and truncated back to i16; expected to
; lower to a single pavgw/vpavgw (see CHECK lines).
; NOTE(review): the store target is `undef`, hence the unspecified (%rax)
; destination in the generated code.
define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    pavgw %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vmovq %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
; avg_v8i16: rounding average of two <8 x i16> vectors, c = (a + b + 1) / 2,
; via the zext/add/lshr/trunc idiom; expected to lower to a single memory-
; operand pavgw/vpavgw (see CHECK lines).
; NOTE(review): the store target is `undef`, hence the unspecified (%rax)
; destination in the generated code.
define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rsi), %xmm0
; SSE2-NEXT:    pavgw (%rdi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpavgw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgw (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgw 16(%rsi), %xmm0
|
2018-02-07 00:14:29 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
|
|
|
|
; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgw %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v16i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <16 x i16>, <16 x i16>* %a
|
|
|
|
%2 = load <16 x i16>, <16 x i16>* %b
|
|
|
|
%3 = zext <16 x i16> %1 to <16 x i32>
|
|
|
|
%4 = zext <16 x i16> %2 to <16 x i32>
|
|
|
|
%5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <16 x i32> %5, %4
|
|
|
|
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <16 x i32> %7 to <16 x i16>
|
|
|
|
store <16 x i16> %8, <16 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: movdqa 32(%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm1
|
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm2
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: pavgw (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgw 16(%rdi), %xmm2
|
|
|
|
; SSE2-NEXT: pavgw 32(%rsi), %xmm0
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgw 48(%rdi), %xmm3
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm3, (%rax)
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm2, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
|
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %ymm2
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
|
|
|
|
; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4
|
|
|
|
; AVX1-NEXT: vpavgw %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
|
|
|
|
; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgw %xmm1, %xmm3, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-LABEL: avg_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2018-02-17 10:26:25 +08:00
|
|
|
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
|
|
|
|
; AVX2-NEXT: vpavgw (%rdi), %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0
|
2018-02-07 00:14:29 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
2018-02-17 10:26:25 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2018-02-17 10:26:25 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
|
|
|
|
; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0
|
2018-02-07 00:14:29 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
2018-02-17 10:26:25 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-LABEL: avg_v32i16:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-08-01 01:35:44 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
|
2018-01-18 15:44:09 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <32 x i16>, <32 x i16>* %a
|
|
|
|
%2 = load <32 x i16>, <32 x i16>* %b
|
|
|
|
%3 = zext <32 x i16> %1 to <32 x i32>
|
|
|
|
%4 = zext <32 x i16> %2 to <32 x i32>
|
|
|
|
%5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%6 = add nuw nsw <32 x i32> %5, %4
|
|
|
|
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <32 x i32> %7 to <32 x i16>
|
|
|
|
store <32 x i16> %8, <32 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v4i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movd %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v4i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
|
|
|
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
|
|
|
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovd %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <4 x i8>, <4 x i8>* %a
|
|
|
|
%2 = load <4 x i8>, <4 x i8>* %b
|
|
|
|
%3 = zext <4 x i8> %1 to <4 x i32>
|
|
|
|
%4 = zext <4 x i8> %2 to <4 x i32>
|
|
|
|
%5 = add nuw nsw <4 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <4 x i32> %7 to <4 x i8>
|
|
|
|
store <4 x i8> %8, <4 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v8i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movq %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v8i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovq %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <8 x i8>, <8 x i8>* %a
|
|
|
|
%2 = load <8 x i8>, <8 x i8>* %b
|
|
|
|
%3 = zext <8 x i8> %1 to <8 x i32>
|
|
|
|
%4 = zext <8 x i8> %2 to <8 x i32>
|
|
|
|
%5 = add nuw nsw <8 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <8 x i32> %7 to <8 x i8>
|
|
|
|
store <8 x i8> %8, <8 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v16i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgb (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v16i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <16 x i8>, <16 x i8>* %a
|
|
|
|
%2 = load <16 x i8>, <16 x i8>* %b
|
|
|
|
%3 = zext <16 x i8> %1 to <16 x i32>
|
|
|
|
%4 = zext <16 x i8> %2 to <16 x i32>
|
|
|
|
%5 = add nuw nsw <16 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <16 x i32> %7 to <16 x i8>
|
|
|
|
store <16 x i8> %8, <16 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgb (%rsi), %xmm0
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: pavgb 16(%rdi), %xmm1
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v32i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v32i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <32 x i8>, <32 x i8>* %a
|
|
|
|
%2 = load <32 x i8>, <32 x i8>* %b
|
|
|
|
%3 = zext <32 x i8> %1 to <32 x i32>
|
|
|
|
%4 = zext <32 x i8> %2 to <32 x i32>
|
|
|
|
%5 = add nuw nsw <32 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <32 x i32> %7 to <32 x i8>
|
|
|
|
store <32 x i8> %8, <32 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v64i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
|
|
|
|
; SSE2-NEXT: pavgb %xmm0, %xmm0
|
|
|
|
; SSE2-NEXT: pavgb %xmm1, %xmm1
|
|
|
|
; SSE2-NEXT: pavgb %xmm2, %xmm2
|
|
|
|
; SSE2-NEXT: pavgb %xmm3, %xmm3
|
|
|
|
; SSE2-NEXT: movdqu %xmm3, (%rax)
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm2, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v64i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rsi), %ymm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
|
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-LABEL: avg_v64i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
|
|
|
|
; AVX2-NEXT: vpavgb %ymm0, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb %ymm1, %ymm1, %ymm1
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v64i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
|
|
|
|
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgb %ymm0, %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpavgb %ymm1, %ymm1, %ymm1
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-LABEL: avg_v64i8_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-08-01 01:35:44 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
|
2018-01-18 15:44:09 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <64 x i8>, <64 x i8>* %a
|
|
|
|
%2 = load <64 x i8>, <64 x i8>* %b
|
|
|
|
%3 = zext <64 x i8> %1 to <64 x i32>
|
|
|
|
%4 = zext <64 x i8> %2 to <64 x i32>
|
|
|
|
%5 = add nuw nsw <64 x i32> %4, %4
|
|
|
|
%6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <64 x i32> %7 to <64 x i8>
|
|
|
|
store <64 x i8> %8, <64 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v4i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movq %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v4i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
|
|
|
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
|
|
|
; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovq %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <4 x i16>, <4 x i16>* %a
|
|
|
|
%2 = load <4 x i16>, <4 x i16>* %b
|
|
|
|
%3 = zext <4 x i16> %1 to <4 x i32>
|
|
|
|
%4 = zext <4 x i16> %2 to <4 x i32>
|
|
|
|
%5 = add nuw nsw <4 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <4 x i32> %7 to <4 x i16>
|
|
|
|
store <4 x i16> %8, <4 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-LABEL: avg_v8i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgw (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-LABEL: avg_v8i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX-NEXT: vmovdqa (%rdi), %xmm0
|
|
|
|
; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <8 x i16>, <8 x i16>* %a
|
|
|
|
%2 = load <8 x i16>, <8 x i16>* %b
|
|
|
|
%3 = zext <8 x i16> %1 to <8 x i32>
|
|
|
|
%4 = zext <8 x i16> %2 to <8 x i32>
|
|
|
|
%5 = add nuw nsw <8 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <8 x i32> %7 to <8 x i16>
|
|
|
|
store <8 x i16> %8, <8 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v16i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgw (%rsi), %xmm0
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: pavgw 16(%rdi), %xmm1
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v16i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v16i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v16i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <16 x i16>, <16 x i16>* %a
|
|
|
|
%2 = load <16 x i16>, <16 x i16>* %b
|
|
|
|
%3 = zext <16 x i16> %1 to <16 x i32>
|
|
|
|
%4 = zext <16 x i16> %2 to <16 x i32>
|
|
|
|
%5 = add nuw nsw <16 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <16 x i32> %7 to <16 x i16>
|
|
|
|
store <16 x i16> %8, <16 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: movdqa 48(%rdi), %xmm2
|
|
|
|
; SSE2-NEXT: movdqa 32(%rsi), %xmm3
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgw (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: pavgw 32(%rdi), %xmm3
|
|
|
|
; SSE2-NEXT: pavgw 48(%rsi), %xmm2
|
2018-02-07 00:14:29 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm2, (%rax)
|
2018-02-17 10:26:25 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm3, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
|
|
|
|
; AVX1-NEXT: vmovdqa (%rsi), %ymm2
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
|
|
|
|
; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4
|
|
|
|
; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
|
|
|
|
; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:
add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>
The all-ones vector constant can be materialized using a pcmpeq instruction that is
commonly recognized as an idiom (has no register dependency), so that's better than
loading a splat 1 constant.
AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables,
so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real
perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we
eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting
a scalar load (which might itself be a bug), then we're replacing a scalar constant
load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1
and psub x, -1, so we should use that form for +1 too because we can. If there's some
reason to favor a constant load on some CPU, let's make the reverse transform for all
of these cases (either here in the DAG or in a later machine pass).
This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483
Differential Revision: https://reviews.llvm.org/D34336
llvm-svn: 306289
2017-06-26 22:19:26 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-LABEL: avg_v32i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
|
2018-02-17 10:26:25 +08:00
|
|
|
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
|
2018-02-17 10:26:25 +08:00
|
|
|
; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v32i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
|
2018-02-17 10:26:25 +08:00
|
|
|
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
|
2018-02-17 10:26:25 +08:00
|
|
|
; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
|
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-LABEL: avg_v32i16_2:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-08-01 01:35:44 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
|
2018-01-18 15:44:09 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <32 x i16>, <32 x i16>* %a
|
|
|
|
%2 = load <32 x i16>, <32 x i16>* %b
|
|
|
|
%3 = zext <32 x i16> %1 to <32 x i32>
|
|
|
|
%4 = zext <32 x i16> %2 to <32 x i32>
|
|
|
|
%5 = add nuw nsw <32 x i32> %3, %4
|
|
|
|
%6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%8 = trunc <32 x i32> %7 to <32 x i16>
|
|
|
|
store <32 x i16> %8, <32 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v4i8_const(<4 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v4i8_const:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v4i8_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i8>
  store <4 x i8> %5, <4 x i8>* undef, align 4
  ret void
}
define void @avg_v8i8_const(<8 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v8i8_const:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i8_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i8>
  store <8 x i8> %5, <8 x i8>* undef, align 4
  ret void
}
define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v16i8_const:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v16i8_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i8>
  store <16 x i8> %5, <16 x i8>* undef, align 4
  ret void
}
define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v32i8_const:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; SSE2-NEXT:    movdqa (%rdi), %xmm1
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    pavgb 16(%rdi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    movdqu %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: avg_v32i8_const:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX1-NEXT:    vpavgb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avg_v32i8_const:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: avg_v32i8_const:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = zext <32 x i8> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i8>
  store <32 x i8> %5, <32 x i8>* undef, align 4
  ret void
}
define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v64i8_const:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; SSE2-NEXT:    movdqa (%rdi), %xmm1
; SSE2-NEXT:    pavgb %xmm0, %xmm1
; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
; SSE2-NEXT:    pavgb %xmm0, %xmm2
; SSE2-NEXT:    movdqa 32(%rdi), %xmm3
; SSE2-NEXT:    pavgb %xmm0, %xmm3
; SSE2-NEXT:    pavgb 48(%rdi), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    movdqu %xmm3, (%rax)
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: avg_v64i8_const:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX1-NEXT:    vpavgb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpavgb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpavgb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    vmovups %ymm1, (%rax)
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avg_v64i8_const:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX2-NEXT:    vpavgb (%rdi), %ymm0, %ymm1
; AVX2-NEXT:    vpavgb 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: avg_v64i8_const:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX512F-NEXT:    vpavgb (%rdi), %ymm0, %ymm1
; AVX512F-NEXT:    vpavgb 32(%rdi), %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: avg_v64i8_const:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = zext <64 x i8> %1 to <64 x i32>
  %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <64 x i32> %4 to <64 x i8>
  store <64 x i8> %5, <64 x i8>* undef, align 4
  ret void
}
define void @avg_v4i16_const(<4 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v4i16_const:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v4i16_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i16>
  store <4 x i16> %5, <4 x i16>* undef, align 4
  ret void
}
define void @avg_v8i16_const(<8 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v8i16_const:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; AVX-LABEL: avg_v8i16_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rax)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i16>
  store <8 x i16> %5, <8 x i16>* undef, align 4
  ret void
}
define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v16i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
|
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: pavgw 16(%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v16i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
|
|
|
|
; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0
|
2017-10-29 04:51:27 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-LABEL: avg_v16i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-LABEL: avg_v16i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX512-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <16 x i16>, <16 x i16>* %a
|
|
|
|
%2 = zext <16 x i16> %1 to <16 x i32>
|
|
|
|
%3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
|
|
|
|
%4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%5 = trunc <16 x i32> %4 to <16 x i16>
|
|
|
|
store <16 x i16> %5, <16 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-09-12 15:50:35 +08:00
|
|
|
define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-LABEL: avg_v32i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7]
|
|
|
|
; SSE2-NEXT: movdqa (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm1
|
|
|
|
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm2
|
|
|
|
; SSE2-NEXT: movdqa 32(%rdi), %xmm3
|
|
|
|
; SSE2-NEXT: pavgw %xmm0, %xmm3
|
|
|
|
; SSE2-NEXT: pavgw 48(%rdi), %xmm0
|
|
|
|
; SSE2-NEXT: movdqu %xmm0, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm3, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: movdqu %xmm2, (%rax)
|
|
|
|
; SSE2-NEXT: movdqu %xmm1, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-LABEL: avg_v32i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
|
|
|
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
|
|
|
|
; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgw %xmm3, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
|
|
|
; AVX1-NEXT: vmovups %ymm1, (%rax)
|
2017-06-23 22:38:00 +08:00
|
|
|
; AVX1-NEXT: vmovups %ymm0, (%rax)
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-LABEL: avg_v32i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
|
|
|
|
; AVX2-NEXT: # ymm0 = mem[0,1,0,1]
|
|
|
|
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1
|
|
|
|
; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v32i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
|
|
|
|
; AVX512F-NEXT: # ymm0 = mem[0,1,0,1]
|
|
|
|
; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
|
|
|
|
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512F-NEXT: vzeroupper
|
2016-08-26 01:17:46 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-LABEL: avg_v32i16_const:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-08-01 01:35:44 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
|
2018-01-18 15:44:09 +08:00
|
|
|
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
|
2017-03-03 17:03:24 +08:00
|
|
|
; AVX512BW-NEXT: vzeroupper
|
2015-12-01 05:46:08 +08:00
|
|
|
; AVX512BW-NEXT: retq
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%1 = load <32 x i16>, <32 x i16>* %a
|
|
|
|
%2 = zext <32 x i16> %1 to <32 x i32>
|
|
|
|
%3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
|
2015-12-01 05:46:08 +08:00
|
|
|
%4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
[X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.
This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:
%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %N, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>
and with this patch it will be converted to a X86ISD::AVG instruction.
The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.
Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
variant vector sizes.
Differential revision: http://reviews.llvm.org/D14761
llvm-svn: 253952
2015-11-24 13:44:19 +08:00
|
|
|
%5 = trunc <32 x i32> %4 to <32 x i16>
|
|
|
|
store <32 x i16> %5, <32 x i16>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|
2017-09-12 15:50:35 +08:00
|
|
|
|
|
|
|
define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
|
|
|
|
; SSE2-LABEL: avg_v16i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-09-12 15:50:35 +08:00
|
|
|
; SSE2-NEXT: pavgb %xmm1, %xmm0
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: avg_v16i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0:
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: retq
|
|
|
|
%za = zext <16 x i8> %a to <16 x i16>
|
|
|
|
%zb = zext <16 x i8> %b to <16 x i16>
|
|
|
|
%add = add nuw nsw <16 x i16> %za, %zb
|
|
|
|
%add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%res = trunc <16 x i16> %lshr to <16 x i8>
|
|
|
|
ret <16 x i8> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
|
|
|
|
; SSE2-LABEL: avg_v32i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgb %xmm2, %xmm0
|
|
|
|
; SSE2-NEXT: pavgb %xmm3, %xmm1
|
2017-09-12 15:50:35 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avg_v32i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avg_v32i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: avg_v32i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512: # %bb.0:
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%za = zext <32 x i8> %a to <32 x i16>
|
|
|
|
%zb = zext <32 x i8> %b to <32 x i16>
|
|
|
|
%add = add nuw nsw <32 x i16> %za, %zb
|
|
|
|
%add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%res = trunc <32 x i16> %lshr to <32 x i8>
|
|
|
|
ret <32 x i8> %res
|
|
|
|
}
|
|
|
|
|
|
|
|
define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
|
|
|
|
; SSE2-LABEL: avg_v64i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; SSE2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; SSE2-NEXT: pavgb %xmm4, %xmm0
|
|
|
|
; SSE2-NEXT: pavgb %xmm5, %xmm1
|
|
|
|
; SSE2-NEXT: pavgb %xmm6, %xmm2
|
|
|
|
; SSE2-NEXT: pavgb %xmm7, %xmm3
|
2017-09-12 15:50:35 +08:00
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avg_v64i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX1: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
|
|
|
|
; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avg_v64i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX2: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb %ymm3, %ymm1, %ymm1
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v64i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512F: # %bb.0:
|
2017-12-22 02:12:31 +08:00
|
|
|
; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: avg_v64i8_3:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX512BW: # %bb.0:
|
2017-09-12 15:50:35 +08:00
|
|
|
; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
|
|
|
|
; AVX512BW-NEXT: retq
|
|
|
|
%za = zext <64 x i8> %a to <64 x i16>
|
|
|
|
%zb = zext <64 x i8> %b to <64 x i16>
|
|
|
|
%add = add nuw nsw <64 x i16> %za, %zb
|
|
|
|
%add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%res = trunc <64 x i16> %lshr to <64 x i8>
|
|
|
|
ret <64 x i8> %res
|
|
|
|
}
|
2018-02-26 10:16:31 +08:00
|
|
|
|
|
|
|
define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
|
|
|
|
; SSE2-LABEL: avg_v512i8_3:
|
|
|
|
; SSE2: # %bb.0:
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 496(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 480(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 464(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 448(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 432(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 416(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 400(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 384(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 368(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 352(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 336(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 320(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 304(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 288(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 272(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 256(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 224(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 208(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 192(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 176(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 160(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 144(%rdi)
|
|
|
|
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
|
|
|
|
; SSE2-NEXT: movdqa %xmm8, 128(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7
|
|
|
|
; SSE2-NEXT: movdqa %xmm7, 112(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm6
|
|
|
|
; SSE2-NEXT: movdqa %xmm6, 96(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm5
|
|
|
|
; SSE2-NEXT: movdqa %xmm5, 80(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm4
|
|
|
|
; SSE2-NEXT: movdqa %xmm4, 64(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3
|
|
|
|
; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm2
|
|
|
|
; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm1
|
|
|
|
; SSE2-NEXT: movdqa %xmm1, 16(%rdi)
|
|
|
|
; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm0
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, (%rdi)
|
|
|
|
; SSE2-NEXT: movq %rdi, %rax
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avg_v512i8_3:
|
|
|
|
; AVX1: # %bb.0:
|
|
|
|
; AVX1-NEXT: pushq %rbp
|
|
|
|
; AVX1-NEXT: movq %rsp, %rbp
|
|
|
|
; AVX1-NEXT: andq $-32, %rsp
|
|
|
|
; AVX1-NEXT: subq $128, %rsp
|
|
|
|
; AVX1-NEXT: vmovdqa 144(%rbp), %ymm8
|
|
|
|
; AVX1-NEXT: vmovdqa 112(%rbp), %ymm9
|
|
|
|
; AVX1-NEXT: vmovdqa 80(%rbp), %ymm10
|
|
|
|
; AVX1-NEXT: vmovdqa 48(%rbp), %ymm11
|
|
|
|
; AVX1-NEXT: vmovdqa 16(%rbp), %ymm12
|
|
|
|
; AVX1-NEXT: vmovdqa 272(%rbp), %ymm13
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm14
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm15
|
|
|
|
; AVX1-NEXT: vpavgb %xmm14, %xmm15, %xmm14
|
|
|
|
; AVX1-NEXT: vmovdqa 304(%rbp), %ymm15
|
|
|
|
; AVX1-NEXT: vpavgb %xmm13, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm14
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
|
|
|
|
; AVX1-NEXT: vpavgb %xmm14, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 336(%rbp), %ymm14
|
|
|
|
; AVX1-NEXT: vpavgb %xmm15, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
|
|
|
; AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 368(%rbp), %ymm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm14, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
|
|
|
|
; AVX1-NEXT: vmovaps %ymm0, (%rsp) # 32-byte Spill
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 400(%rbp), %ymm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm3, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 432(%rbp), %ymm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm4
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 464(%rbp), %ymm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm5, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm5
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 496(%rbp), %ymm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm6, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm6
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 528(%rbp), %ymm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm7, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 560(%rbp), %ymm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm12, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm12
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 592(%rbp), %ymm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm11, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm11
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 624(%rbp), %ymm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm10, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm10
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 656(%rbp), %ymm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm9, %xmm1
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
|
|
|
|
; AVX1-NEXT: vmovdqa 176(%rbp), %ymm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm8, %xmm2
|
|
|
|
; AVX1-NEXT: vmovdqa 688(%rbp), %ymm8
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm13
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm13, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm8, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vmovdqa 208(%rbp), %ymm8
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm13
|
|
|
|
; AVX1-NEXT: vmovdqa 720(%rbp), %ymm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm15
|
|
|
|
; AVX1-NEXT: vpavgb %xmm1, %xmm15, %xmm1
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm8, %xmm2
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
|
|
|
|
; AVX1-NEXT: vmovdqa 240(%rbp), %ymm15
|
|
|
|
; AVX1-NEXT: vmovdqa 752(%rbp), %ymm8
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm14
|
|
|
|
; AVX1-NEXT: vpavgb %xmm2, %xmm14, %xmm2
|
|
|
|
; AVX1-NEXT: vpavgb %xmm8, %xmm15, %xmm8
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2
|
|
|
|
; AVX1-NEXT: vmovaps %ymm2, 480(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm1, 448(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm13, 416(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm0, 384(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm9, 352(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm10, 320(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm11, 288(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm12, 256(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm7, 224(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm6, 192(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm5, 160(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm4, 128(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps %ymm3, 96(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload
|
|
|
|
; AVX1-NEXT: vmovaps %ymm0, 64(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
|
|
|
|
; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
|
|
|
|
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
|
|
|
|
; AVX1-NEXT: vmovaps %ymm0, (%rdi)
|
|
|
|
; AVX1-NEXT: movq %rdi, %rax
|
|
|
|
; AVX1-NEXT: movq %rbp, %rsp
|
|
|
|
; AVX1-NEXT: popq %rbp
|
|
|
|
; AVX1-NEXT: vzeroupper
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avg_v512i8_3:
|
|
|
|
; AVX2: # %bb.0:
|
|
|
|
; AVX2-NEXT: pushq %rbp
|
|
|
|
; AVX2-NEXT: movq %rsp, %rbp
|
|
|
|
; AVX2-NEXT: andq $-32, %rsp
|
|
|
|
; AVX2-NEXT: subq $32, %rsp
|
|
|
|
; AVX2-NEXT: vmovdqa 240(%rbp), %ymm8
|
|
|
|
; AVX2-NEXT: vmovdqa 208(%rbp), %ymm9
|
|
|
|
; AVX2-NEXT: vmovdqa 176(%rbp), %ymm10
|
|
|
|
; AVX2-NEXT: vmovdqa 144(%rbp), %ymm11
|
|
|
|
; AVX2-NEXT: vmovdqa 112(%rbp), %ymm12
|
|
|
|
; AVX2-NEXT: vmovdqa 80(%rbp), %ymm13
|
|
|
|
; AVX2-NEXT: vmovdqa 48(%rbp), %ymm14
|
|
|
|
; AVX2-NEXT: vmovdqa 16(%rbp), %ymm15
|
|
|
|
; AVX2-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
|
|
|
|
; AVX2-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
|
|
|
|
; AVX2-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
|
|
|
|
; AVX2-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
|
|
|
|
; AVX2-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
|
|
|
|
; AVX2-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
|
|
|
|
; AVX2-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
|
|
|
|
; AVX2-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
|
|
|
|
; AVX2-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
|
|
|
|
; AVX2-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
|
|
|
|
; AVX2-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
|
|
|
|
; AVX2-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
|
|
|
|
; AVX2-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
|
|
|
|
; AVX2-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
|
|
|
|
; AVX2-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm8, 480(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm9, 448(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm10, 416(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm11, 384(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm12, 352(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm13, 320(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm14, 288(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm15, 256(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm7, 224(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm6, 192(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm5, 160(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm4, 128(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm3, 96(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi)
|
|
|
|
; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
|
|
|
|
; AVX2-NEXT: movq %rdi, %rax
|
|
|
|
; AVX2-NEXT: movq %rbp, %rsp
|
|
|
|
; AVX2-NEXT: popq %rbp
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512F-LABEL: avg_v512i8_3:
|
|
|
|
; AVX512F: # %bb.0:
|
|
|
|
; AVX512F-NEXT: pushq %rbp
|
|
|
|
; AVX512F-NEXT: movq %rsp, %rbp
|
|
|
|
; AVX512F-NEXT: andq $-32, %rsp
|
|
|
|
; AVX512F-NEXT: subq $32, %rsp
|
|
|
|
; AVX512F-NEXT: vmovdqa 240(%rbp), %ymm8
|
|
|
|
; AVX512F-NEXT: vmovdqa 208(%rbp), %ymm9
|
|
|
|
; AVX512F-NEXT: vmovdqa 176(%rbp), %ymm10
|
|
|
|
; AVX512F-NEXT: vmovdqa 144(%rbp), %ymm11
|
|
|
|
; AVX512F-NEXT: vmovdqa 112(%rbp), %ymm12
|
|
|
|
; AVX512F-NEXT: vmovdqa 80(%rbp), %ymm13
|
|
|
|
; AVX512F-NEXT: vmovdqa 48(%rbp), %ymm14
|
|
|
|
; AVX512F-NEXT: vmovdqa 16(%rbp), %ymm15
|
|
|
|
; AVX512F-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0
|
|
|
|
; AVX512F-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1
|
|
|
|
; AVX512F-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2
|
|
|
|
; AVX512F-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3
|
|
|
|
; AVX512F-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4
|
|
|
|
; AVX512F-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5
|
|
|
|
; AVX512F-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6
|
|
|
|
; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7
|
|
|
|
; AVX512F-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15
|
|
|
|
; AVX512F-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14
|
|
|
|
; AVX512F-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13
|
|
|
|
; AVX512F-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12
|
|
|
|
; AVX512F-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11
|
|
|
|
; AVX512F-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10
|
|
|
|
; AVX512F-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9
|
|
|
|
; AVX512F-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm8, 480(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm9, 448(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm10, 416(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm11, 384(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm12, 352(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm14, 288(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm15, 256(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm7, 224(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm6, 192(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm5, 160(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm4, 128(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm3, 96(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm2, 64(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi)
|
|
|
|
; AVX512F-NEXT: vmovdqa %ymm0, (%rdi)
|
|
|
|
; AVX512F-NEXT: movq %rdi, %rax
|
|
|
|
; AVX512F-NEXT: movq %rbp, %rsp
|
|
|
|
; AVX512F-NEXT: popq %rbp
|
|
|
|
; AVX512F-NEXT: vzeroupper
|
|
|
|
; AVX512F-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512BW-LABEL: avg_v512i8_3:
|
|
|
|
; AVX512BW: # %bb.0:
|
|
|
|
; AVX512BW-NEXT: pushq %rbp
|
|
|
|
; AVX512BW-NEXT: movq %rsp, %rbp
|
|
|
|
; AVX512BW-NEXT: andq $-64, %rsp
|
|
|
|
; AVX512BW-NEXT: subq $64, %rsp
|
|
|
|
; AVX512BW-NEXT: vpavgb 16(%rbp), %zmm0, %zmm0
|
|
|
|
; AVX512BW-NEXT: vpavgb 80(%rbp), %zmm1, %zmm1
|
|
|
|
; AVX512BW-NEXT: vpavgb 144(%rbp), %zmm2, %zmm2
|
|
|
|
; AVX512BW-NEXT: vpavgb 208(%rbp), %zmm3, %zmm3
|
|
|
|
; AVX512BW-NEXT: vpavgb 272(%rbp), %zmm4, %zmm4
|
|
|
|
; AVX512BW-NEXT: vpavgb 336(%rbp), %zmm5, %zmm5
|
|
|
|
; AVX512BW-NEXT: vpavgb 400(%rbp), %zmm6, %zmm6
|
|
|
|
; AVX512BW-NEXT: vpavgb 464(%rbp), %zmm7, %zmm7
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi)
|
|
|
|
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi)
|
|
|
|
; AVX512BW-NEXT: movq %rdi, %rax
|
|
|
|
; AVX512BW-NEXT: movq %rbp, %rsp
|
|
|
|
; AVX512BW-NEXT: popq %rbp
|
|
|
|
; AVX512BW-NEXT: vzeroupper
|
|
|
|
; AVX512BW-NEXT: retq
|
|
|
|
%za = zext <512 x i8> %a to <512 x i16>
|
|
|
|
%zb = zext <512 x i8> %b to <512 x i16>
|
|
|
|
%add = add nuw nsw <512 x i16> %za, %zb
|
|
|
|
%add1 = add nuw nsw <512 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%lshr = lshr <512 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, 
i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
|
|
|
|
%res = trunc <512 x i16> %lshr to <512 x i8>
|
|
|
|
ret <512 x i8> %res
|
|
|
|
}
|
2018-02-26 12:43:24 +08:00
|
|
|
|
|
|
|
; This is not an avg, but it's structurally similar and previously caused a crash
|
|
|
|
; because the constants can't be read with APInt::getZExtValue.
|
|
|
|
define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind {
|
|
|
|
; SSE2-LABEL: not_avg_v16i8_wide_constants:
|
|
|
|
; SSE2: # %bb.0:
|
|
|
|
; SSE2-NEXT: pushq %rbp
|
|
|
|
; SSE2-NEXT: pushq %r15
|
|
|
|
; SSE2-NEXT: pushq %r14
|
|
|
|
; SSE2-NEXT: pushq %r13
|
|
|
|
; SSE2-NEXT: pushq %r12
|
|
|
|
; SSE2-NEXT: pushq %rbx
|
|
|
|
; SSE2-NEXT: subq $56, %rsp
|
|
|
|
; SSE2-NEXT: movaps (%rdi), %xmm1
|
|
|
|
; SSE2-NEXT: movaps (%rsi), %xmm0
|
|
|
|
; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
|
|
|
|
; SSE2-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
|
|
|
|
; SSE2-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
|
|
|
|
; SSE2-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
|
|
|
|
; SSE2-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
|
|
|
|
; SSE2-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
|
|
|
|
; SSE2-NEXT: addq %rax, %r11
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: addq %rdi, %rax
|
|
|
|
; SSE2-NEXT: movq %rax, %rdi
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
|
|
|
|
; SSE2-NEXT: addq %r15, %r14
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: addq %rsi, %rax
|
|
|
|
; SSE2-NEXT: movq %rax, %r15
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
|
|
|
|
; SSE2-NEXT: addq %rdx, %rsi
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
|
|
|
|
; SSE2-NEXT: addq %r13, %r8
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: addq %r10, %rax
|
|
|
|
; SSE2-NEXT: movq %rax, %r10
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: addq %rcx, %rax
|
|
|
|
; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
|
|
|
|
; SSE2-NEXT: addq %r9, %r13
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
|
|
|
|
; SSE2-NEXT: addq %rbx, %rcx
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: addq %r12, %rax
|
|
|
|
; SSE2-NEXT: movq %rax, %r9
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
|
|
|
|
; SSE2-NEXT: movq %rax, %rbp
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
|
|
|
|
; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
|
|
|
|
; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
|
|
|
|
; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
|
|
|
|
; SSE2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
|
|
|
|
; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: xorl %eax, %eax
|
|
|
|
; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: addq $-1, %r11
|
|
|
|
; SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movl $0, %r12d
|
|
|
|
; SSE2-NEXT: adcq $-1, %r12
|
|
|
|
; SSE2-NEXT: addq $-1, %rdi
|
|
|
|
; SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movl $0, %edx
|
|
|
|
; SSE2-NEXT: adcq $-1, %rdx
|
|
|
|
; SSE2-NEXT: addq $-1, %r14
|
|
|
|
; SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movl $0, %edi
|
|
|
|
; SSE2-NEXT: adcq $-1, %rdi
|
|
|
|
; SSE2-NEXT: addq $-1, %r15
|
|
|
|
; SSE2-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movl $0, %eax
|
|
|
|
; SSE2-NEXT: adcq $-1, %rax
|
|
|
|
; SSE2-NEXT: addq $-1, %rsi
|
|
|
|
; SSE2-NEXT: movq %rsi, (%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movl $0, %r15d
|
|
|
|
; SSE2-NEXT: adcq $-1, %r15
|
|
|
|
; SSE2-NEXT: addq $-1, %r8
|
|
|
|
; SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movl $0, %r14d
|
|
|
|
; SSE2-NEXT: adcq $-1, %r14
|
|
|
|
; SSE2-NEXT: addq $-1, %r10
|
|
|
|
; SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movl $0, %esi
|
|
|
|
; SSE2-NEXT: adcq $-1, %rsi
|
|
|
|
; SSE2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %r10 # 8-byte Reload
|
|
|
|
; SSE2-NEXT: addq $-1, %r10
|
|
|
|
; SSE2-NEXT: movl $0, %esi
|
|
|
|
; SSE2-NEXT: adcq $-1, %rsi
|
|
|
|
; SSE2-NEXT: movq %rsi, %r8
|
|
|
|
; SSE2-NEXT: addq $-1, %r13
|
|
|
|
; SSE2-NEXT: movl $0, %esi
|
|
|
|
; SSE2-NEXT: adcq $-1, %rsi
|
|
|
|
; SSE2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: addq $-1, %rcx
|
|
|
|
; SSE2-NEXT: movl $0, %esi
|
|
|
|
; SSE2-NEXT: adcq $-1, %rsi
|
|
|
|
; SSE2-NEXT: addq $-1, %r9
|
|
|
|
; SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movl $0, %r9d
|
|
|
|
; SSE2-NEXT: adcq $-1, %r9
|
|
|
|
; SSE2-NEXT: addq $-1, %rbp
|
|
|
|
; SSE2-NEXT: movq %rbp, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: movl $0, %r11d
|
|
|
|
; SSE2-NEXT: adcq $-1, %r11
|
|
|
|
; SSE2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; SSE2-NEXT: movl $0, %ebx
|
|
|
|
; SSE2-NEXT: adcq $-1, %rbx
|
|
|
|
; SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; SSE2-NEXT: movl $0, %ebp
|
|
|
|
; SSE2-NEXT: adcq $-1, %rbp
|
|
|
|
; SSE2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; SSE2-NEXT: movl $0, %ebx
|
|
|
|
; SSE2-NEXT: adcq $-1, %rbx
|
|
|
|
; SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; SSE2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; SSE2-NEXT: adcq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; SSE2-NEXT: shldq $63, %rcx, %rsi
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shldq $63, %r13, %rbx
|
|
|
|
; SSE2-NEXT: shldq $63, %r10, %r8
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %r10 # 8-byte Reload
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shldq $63, %rcx, %r10
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shldq $63, %rcx, %r14
|
|
|
|
; SSE2-NEXT: movq (%rsp), %rcx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shldq $63, %rcx, %r15
|
|
|
|
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shldq $63, %rcx, %rdi
|
|
|
|
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shldq $63, %rcx, %rdx
|
|
|
|
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shldq $63, %rcx, %r12
|
|
|
|
; SSE2-NEXT: movq %r12, %xmm11
|
|
|
|
; SSE2-NEXT: movq %rdx, %xmm5
|
|
|
|
; SSE2-NEXT: movq %rdi, %xmm13
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shrdq $1, %rax, %rcx
|
|
|
|
; SSE2-NEXT: movq %rcx, %xmm15
|
|
|
|
; SSE2-NEXT: shrq %rax
|
|
|
|
; SSE2-NEXT: movq %rax, %xmm8
|
|
|
|
; SSE2-NEXT: movq %r15, %xmm9
|
|
|
|
; SSE2-NEXT: movq %r14, %xmm6
|
|
|
|
; SSE2-NEXT: movq %r10, %xmm7
|
|
|
|
; SSE2-NEXT: movq %r8, %xmm0
|
|
|
|
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
|
|
|
|
; SSE2-NEXT: movq %rbx, %xmm10
|
|
|
|
; SSE2-NEXT: movq %rsi, %xmm4
|
|
|
|
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shrdq $1, %r9, %rax
|
|
|
|
; SSE2-NEXT: movq %rax, %xmm1
|
|
|
|
; SSE2-NEXT: shrq %r9
|
|
|
|
; SSE2-NEXT: movq %r9, %xmm12
|
|
|
|
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shrdq $1, %r11, %rax
|
|
|
|
; SSE2-NEXT: movq %rax, %xmm2
|
|
|
|
; SSE2-NEXT: shrq %r11
|
|
|
|
; SSE2-NEXT: movq %r11, %xmm14
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shrdq $1, %rcx, %rax
|
|
|
|
; SSE2-NEXT: movq %rax, %xmm3
|
|
|
|
; SSE2-NEXT: movq %rcx, %rax
|
|
|
|
; SSE2-NEXT: shrq %rax
|
|
|
|
; SSE2-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0]
|
|
|
|
; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1]
|
|
|
|
; SSE2-NEXT: pand {{.*}}(%rip), %xmm5
|
|
|
|
; SSE2-NEXT: por %xmm11, %xmm5
|
|
|
|
; SSE2-NEXT: movq %rax, %xmm11
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; SSE2-NEXT: movq %rbp, %rcx
|
2018-02-28 00:59:10 +08:00
|
|
|
; SSE2-NEXT: shrdq $1, %rbp, %rax
|
2018-02-26 12:43:24 +08:00
|
|
|
; SSE2-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2]
|
|
|
|
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm8[0]
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
|
|
|
|
; SSE2-NEXT: pand %xmm0, %xmm15
|
|
|
|
; SSE2-NEXT: pandn %xmm13, %xmm0
|
|
|
|
; SSE2-NEXT: movq %rax, %xmm8
|
|
|
|
; SSE2-NEXT: shrq %rcx
|
|
|
|
; SSE2-NEXT: por %xmm15, %xmm0
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0]
|
|
|
|
; SSE2-NEXT: pand %xmm13, %xmm0
|
|
|
|
; SSE2-NEXT: pandn %xmm5, %xmm13
|
|
|
|
; SSE2-NEXT: movq %rcx, %xmm15
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shrdq $1, %rcx, %rax
|
|
|
|
; SSE2-NEXT: por %xmm0, %xmm13
|
|
|
|
; SSE2-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4]
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
|
|
|
|
; SSE2-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5]
|
|
|
|
; SSE2-NEXT: pand %xmm0, %xmm6
|
|
|
|
; SSE2-NEXT: pandn %xmm9, %xmm0
|
|
|
|
; SSE2-NEXT: movq %rax, %xmm9
|
|
|
|
; SSE2-NEXT: shrq %rcx
|
|
|
|
; SSE2-NEXT: por %xmm6, %xmm0
|
|
|
|
; SSE2-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6]
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
|
|
|
|
; SSE2-NEXT: pshufd $68, -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
|
|
|
|
; SSE2-NEXT: # xmm5 = mem[0,1,0,1]
|
|
|
|
; SSE2-NEXT: pand %xmm6, %xmm5
|
|
|
|
; SSE2-NEXT: pandn %xmm7, %xmm6
|
|
|
|
; SSE2-NEXT: movq %rcx, %xmm7
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; SSE2-NEXT: shrdq $1, %rax, %rcx
|
|
|
|
; SSE2-NEXT: por %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535]
|
|
|
|
; SSE2-NEXT: pand %xmm5, %xmm6
|
|
|
|
; SSE2-NEXT: pandn %xmm0, %xmm5
|
|
|
|
; SSE2-NEXT: movq %rcx, %xmm0
|
|
|
|
; SSE2-NEXT: shrq %rax
|
|
|
|
; SSE2-NEXT: por %xmm6, %xmm5
|
|
|
|
; SSE2-NEXT: movq %rax, %xmm6
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,2]
|
|
|
|
; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3]
|
|
|
|
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
|
|
|
|
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm12[0]
|
|
|
|
; SSE2-NEXT: pslld $24, %xmm1
|
|
|
|
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm14[0]
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
|
|
|
|
; SSE2-NEXT: pslld $16, %xmm2
|
|
|
|
; SSE2-NEXT: pand %xmm10, %xmm2
|
|
|
|
; SSE2-NEXT: pandn %xmm1, %xmm10
|
|
|
|
; SSE2-NEXT: por %xmm2, %xmm10
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
|
|
|
|
; SSE2-NEXT: pand %xmm1, %xmm4
|
|
|
|
; SSE2-NEXT: pandn %xmm10, %xmm1
|
|
|
|
; SSE2-NEXT: por %xmm4, %xmm1
|
|
|
|
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm11[0]
|
|
|
|
; SSE2-NEXT: psllq $56, %xmm3
|
|
|
|
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm15[0]
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
|
|
|
|
; SSE2-NEXT: psllq $48, %xmm8
|
|
|
|
; SSE2-NEXT: pand %xmm2, %xmm8
|
|
|
|
; SSE2-NEXT: pandn %xmm3, %xmm2
|
|
|
|
; SSE2-NEXT: por %xmm8, %xmm2
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535]
|
|
|
|
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm7[0]
|
|
|
|
; SSE2-NEXT: psllq $40, %xmm9
|
|
|
|
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
|
|
|
|
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
|
|
|
|
; SSE2-NEXT: pand %xmm4, %xmm0
|
|
|
|
; SSE2-NEXT: pandn %xmm9, %xmm4
|
|
|
|
; SSE2-NEXT: por %xmm0, %xmm4
|
|
|
|
; SSE2-NEXT: pand %xmm3, %xmm4
|
|
|
|
; SSE2-NEXT: pandn %xmm2, %xmm3
|
|
|
|
; SSE2-NEXT: por %xmm3, %xmm4
|
|
|
|
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
|
|
|
|
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
|
|
|
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
|
|
|
|
; SSE2-NEXT: movupd %xmm5, (%rax)
|
|
|
|
; SSE2-NEXT: addq $56, %rsp
|
|
|
|
; SSE2-NEXT: popq %rbx
|
|
|
|
; SSE2-NEXT: popq %r12
|
|
|
|
; SSE2-NEXT: popq %r13
|
|
|
|
; SSE2-NEXT: popq %r14
|
|
|
|
; SSE2-NEXT: popq %r15
|
|
|
|
; SSE2-NEXT: popq %rbp
|
|
|
|
; SSE2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: not_avg_v16i8_wide_constants:
|
|
|
|
; AVX1: # %bb.0:
|
|
|
|
; AVX1-NEXT: pushq %rbp
|
|
|
|
; AVX1-NEXT: pushq %r15
|
|
|
|
; AVX1-NEXT: pushq %r14
|
|
|
|
; AVX1-NEXT: pushq %r13
|
|
|
|
; AVX1-NEXT: pushq %r12
|
|
|
|
; AVX1-NEXT: pushq %rbx
|
|
|
|
; AVX1-NEXT: subq $24, %rsp
|
|
|
|
; AVX1-NEXT: movq %rsi, %r8
|
|
|
|
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm2, %rcx
|
|
|
|
; AVX1-NEXT: vmovq %xmm2, %rax
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm1, %rbx
|
|
|
|
; AVX1-NEXT: vmovq %xmm1, %rbp
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm1, %r10
|
|
|
|
; AVX1-NEXT: vmovq %xmm1, %r12
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm0, %r15
|
|
|
|
; AVX1-NEXT: vmovq %xmm0, %r14
|
|
|
|
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm2, %rdx
|
|
|
|
; AVX1-NEXT: vmovq %xmm2, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX1-NEXT: vmovq %xmm1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm4, %r9
|
|
|
|
; AVX1-NEXT: addq %rcx, %r9
|
|
|
|
; AVX1-NEXT: vmovq %xmm4, %r13
|
|
|
|
; AVX1-NEXT: addq %rax, %r13
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm3, %rcx
|
|
|
|
; AVX1-NEXT: addq %rbx, %rcx
|
|
|
|
; AVX1-NEXT: vmovq %xmm3, %r11
|
|
|
|
; AVX1-NEXT: addq %rbp, %r11
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm3, %rax
|
|
|
|
; AVX1-NEXT: addq %r10, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, %rsi
|
|
|
|
; AVX1-NEXT: vmovq %xmm3, %rax
|
|
|
|
; AVX1-NEXT: addq %r12, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, %rbx
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
|
|
|
|
; AVX1-NEXT: addq %r15, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, %r15
|
|
|
|
; AVX1-NEXT: vmovq %xmm2, %rax
|
|
|
|
; AVX1-NEXT: addq %r14, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, %r14
|
|
|
|
; AVX1-NEXT: vmovq %xmm1, %rax
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
|
|
|
|
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm3, %rbp
|
|
|
|
; AVX1-NEXT: addq %rdx, %rbp
|
|
|
|
; AVX1-NEXT: movq %rbp, %r8
|
|
|
|
; AVX1-NEXT: vmovq %xmm3, %rbp
|
|
|
|
; AVX1-NEXT: addq -{{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload
|
|
|
|
; AVX1-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm2, %rdx
|
|
|
|
; AVX1-NEXT: addq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
|
|
|
|
; AVX1-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: vmovq %xmm2, %rdx
|
|
|
|
; AVX1-NEXT: addq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
|
|
|
|
; AVX1-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm2, %rdx
|
|
|
|
; AVX1-NEXT: addq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
|
|
|
|
; AVX1-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: vmovq %xmm2, %r12
|
|
|
|
; AVX1-NEXT: addq %rax, %r12
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
|
|
|
|
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
|
|
|
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
|
|
|
|
; AVX1-NEXT: vpextrq $1, %xmm1, %r10
|
|
|
|
; AVX1-NEXT: addq %rax, %r10
|
|
|
|
; AVX1-NEXT: vmovq %xmm0, %rax
|
|
|
|
; AVX1-NEXT: vmovq %xmm1, %rdi
|
|
|
|
; AVX1-NEXT: addq %rax, %rdi
|
|
|
|
; AVX1-NEXT: addq $-1, %r9
|
|
|
|
; AVX1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: movl $0, %eax
|
|
|
|
; AVX1-NEXT: adcq $-1, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: addq $-1, %r13
|
|
|
|
; AVX1-NEXT: movq %r13, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: movl $0, %eax
|
|
|
|
; AVX1-NEXT: adcq $-1, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: addq $-1, %rcx
|
|
|
|
; AVX1-NEXT: movq %rcx, (%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: movl $0, %eax
|
|
|
|
; AVX1-NEXT: adcq $-1, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: addq $-1, %r11
|
|
|
|
; AVX1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: movl $0, %eax
|
|
|
|
; AVX1-NEXT: adcq $-1, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: addq $-1, %rsi
|
|
|
|
; AVX1-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: movl $0, %eax
|
|
|
|
; AVX1-NEXT: adcq $-1, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: addq $-1, %rbx
|
|
|
|
; AVX1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: movl $0, %eax
|
|
|
|
; AVX1-NEXT: adcq $-1, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: addq $-1, %r15
|
|
|
|
; AVX1-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: movl $0, %ebp
|
|
|
|
; AVX1-NEXT: adcq $-1, %rbp
|
|
|
|
; AVX1-NEXT: addq $-1, %r14
|
|
|
|
; AVX1-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: movl $0, %r15d
|
|
|
|
; AVX1-NEXT: adcq $-1, %r15
|
|
|
|
; AVX1-NEXT: addq $-1, %r8
|
|
|
|
; AVX1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: movl $0, %eax
|
|
|
|
; AVX1-NEXT: adcq $-1, %rax
|
|
|
|
; AVX1-NEXT: movq %rax, %rsi
|
|
|
|
; AVX1-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX1-NEXT: movl $0, %r13d
|
|
|
|
; AVX1-NEXT: adcq $-1, %r13
|
|
|
|
; AVX1-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX1-NEXT: movl $0, %r14d
|
|
|
|
; AVX1-NEXT: adcq $-1, %r14
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX1-NEXT: addq $-1, %rdx
|
|
|
|
; AVX1-NEXT: movl $0, %r11d
|
|
|
|
; AVX1-NEXT: adcq $-1, %r11
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX1-NEXT: addq $-1, %rax
|
|
|
|
; AVX1-NEXT: movl $0, %ebx
|
|
|
|
; AVX1-NEXT: adcq $-1, %rbx
|
|
|
|
; AVX1-NEXT: addq $-1, %r12
|
|
|
|
; AVX1-NEXT: movl $0, %r9d
|
|
|
|
; AVX1-NEXT: adcq $-1, %r9
|
|
|
|
; AVX1-NEXT: addq $-1, %r10
|
|
|
|
; AVX1-NEXT: movl $0, %r8d
|
|
|
|
; AVX1-NEXT: adcq $-1, %r8
|
|
|
|
; AVX1-NEXT: addq $-1, %rdi
|
|
|
|
; AVX1-NEXT: movl $0, %ecx
|
|
|
|
; AVX1-NEXT: adcq $-1, %rcx
|
|
|
|
; AVX1-NEXT: shldq $63, %rdi, %rcx
|
|
|
|
; AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: shldq $63, %r10, %r8
|
|
|
|
; AVX1-NEXT: shldq $63, %r12, %r9
|
|
|
|
; AVX1-NEXT: shldq $63, %rax, %rbx
|
|
|
|
; AVX1-NEXT: shldq $63, %rdx, %r11
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rdx, %r14
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rdx, %r13
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rax, %rsi
|
|
|
|
; AVX1-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rax, %r15
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rax, %rbp
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rax, %rsi
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rax, %rcx
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rax, %rdi
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r12 # 8-byte Reload
|
|
|
|
; AVX1-NEXT: movq (%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rax, %r12
|
|
|
|
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 # 8-byte Reload
|
|
|
|
; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rax, %r10
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX1-NEXT: shldq $63, %rdx, %rax
|
|
|
|
; AVX1-NEXT: vmovq %rax, %xmm8
|
|
|
|
; AVX1-NEXT: vmovq %r10, %xmm0
|
|
|
|
; AVX1-NEXT: vmovq %r12, %xmm1
|
|
|
|
; AVX1-NEXT: vmovq %rdi, %xmm11
|
|
|
|
; AVX1-NEXT: vmovq %rcx, %xmm2
|
|
|
|
; AVX1-NEXT: vmovq %rsi, %xmm13
|
|
|
|
; AVX1-NEXT: vmovq %rbp, %xmm14
|
|
|
|
; AVX1-NEXT: vmovq %r15, %xmm15
|
|
|
|
; AVX1-NEXT: vmovq -{{[0-9]+}}(%rsp), %xmm9 # 8-byte Folded Reload
|
|
|
|
; AVX1-NEXT: # xmm9 = mem[0],zero
|
|
|
|
; AVX1-NEXT: vmovq %r13, %xmm10
|
|
|
|
; AVX1-NEXT: vmovq %r14, %xmm12
|
|
|
|
; AVX1-NEXT: vmovq %r11, %xmm3
|
|
|
|
; AVX1-NEXT: vmovq %rbx, %xmm4
|
|
|
|
; AVX1-NEXT: vmovq %r9, %xmm5
|
|
|
|
; AVX1-NEXT: vmovq %r8, %xmm6
|
|
|
|
; AVX1-NEXT: vmovq -{{[0-9]+}}(%rsp), %xmm7 # 8-byte Folded Reload
|
|
|
|
; AVX1-NEXT: # xmm7 = mem[0],zero
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0]
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm11[0],xmm1[0]
|
|
|
|
; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2]
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm13[0],xmm2[0]
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm15[0],xmm14[0]
|
|
|
|
; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,2],xmm1[0,2]
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm8, %xmm0
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm11, %xmm2
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm10[0],xmm9[0]
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0]
|
|
|
|
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0]
|
|
|
|
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0]
|
|
|
|
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
|
|
|
|
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
|
|
|
|
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX1-NEXT: addq $24, %rsp
|
|
|
|
; AVX1-NEXT: popq %rbx
|
|
|
|
; AVX1-NEXT: popq %r12
|
|
|
|
; AVX1-NEXT: popq %r13
|
|
|
|
; AVX1-NEXT: popq %r14
|
|
|
|
; AVX1-NEXT: popq %r15
|
|
|
|
; AVX1-NEXT: popq %rbp
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: not_avg_v16i8_wide_constants:
|
|
|
|
; AVX2: # %bb.0:
|
|
|
|
; AVX2-NEXT: pushq %rbp
|
|
|
|
; AVX2-NEXT: pushq %r15
|
|
|
|
; AVX2-NEXT: pushq %r14
|
|
|
|
; AVX2-NEXT: pushq %r13
|
|
|
|
; AVX2-NEXT: pushq %r12
|
|
|
|
; AVX2-NEXT: pushq %rbx
|
|
|
|
; AVX2-NEXT: subq $16, %rsp
|
|
|
|
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
|
|
|
|
; AVX2-NEXT: vmovq %xmm3, %rax
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm2, %rbx
|
|
|
|
; AVX2-NEXT: vmovq %xmm2, %rdx
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm2, %rdi
|
|
|
|
; AVX2-NEXT: vmovq %xmm2, %r11
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm1, %r13
|
|
|
|
; AVX2-NEXT: vmovq %xmm1, %r12
|
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm2, %rbp
|
|
|
|
; AVX2-NEXT: vmovq %xmm2, %r10
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: vmovq %xmm1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: vmovq %xmm1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm4, %r15
|
|
|
|
; AVX2-NEXT: addq %rcx, %r15
|
|
|
|
; AVX2-NEXT: vmovq %xmm4, %r9
|
|
|
|
; AVX2-NEXT: addq %rax, %r9
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm3, %rax
|
|
|
|
; AVX2-NEXT: addq %rbx, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, %rbx
|
|
|
|
; AVX2-NEXT: vmovq %xmm3, %rax
|
|
|
|
; AVX2-NEXT: addq %rdx, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, %r8
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm3, %rax
|
|
|
|
; AVX2-NEXT: addq %rdi, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, %rcx
|
|
|
|
; AVX2-NEXT: vmovq %xmm3, %rax
|
|
|
|
; AVX2-NEXT: addq %r11, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, %r11
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm2, %r14
|
|
|
|
; AVX2-NEXT: addq %r13, %r14
|
|
|
|
; AVX2-NEXT: vmovq %xmm2, %rax
|
|
|
|
; AVX2-NEXT: addq %r12, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm3, %rax
|
|
|
|
; AVX2-NEXT: addq %rbp, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: vmovq %xmm3, %rax
|
|
|
|
; AVX2-NEXT: addq %r10, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
|
|
|
|
; AVX2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: vmovq %xmm2, %rax
|
|
|
|
; AVX2-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
|
|
|
|
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm2, %rbp
|
|
|
|
; AVX2-NEXT: addq -{{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload
|
|
|
|
; AVX2-NEXT: vmovq %xmm2, %r10
|
|
|
|
; AVX2-NEXT: addq -{{[0-9]+}}(%rsp), %r10 # 8-byte Folded Reload
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
|
|
|
|
; AVX2-NEXT: vpextrq $1, %xmm1, %rdi
|
|
|
|
; AVX2-NEXT: addq %rax, %rdi
|
|
|
|
; AVX2-NEXT: vmovq %xmm0, %rdx
|
|
|
|
; AVX2-NEXT: vmovq %xmm1, %rsi
|
|
|
|
; AVX2-NEXT: addq %rdx, %rsi
|
|
|
|
; AVX2-NEXT: addq $-1, %r15
|
|
|
|
; AVX2-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: movl $0, %eax
|
|
|
|
; AVX2-NEXT: adcq $-1, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: addq $-1, %r9
|
|
|
|
; AVX2-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: movl $0, %eax
|
|
|
|
; AVX2-NEXT: adcq $-1, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, (%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: addq $-1, %rbx
|
|
|
|
; AVX2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: movl $0, %eax
|
|
|
|
; AVX2-NEXT: adcq $-1, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: addq $-1, %r8
|
|
|
|
; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: movl $0, %r13d
|
|
|
|
; AVX2-NEXT: adcq $-1, %r13
|
|
|
|
; AVX2-NEXT: addq $-1, %rcx
|
|
|
|
; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: movl $0, %eax
|
|
|
|
; AVX2-NEXT: adcq $-1, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: addq $-1, %r11
|
|
|
|
; AVX2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: movl $0, %r15d
|
|
|
|
; AVX2-NEXT: adcq $-1, %r15
|
|
|
|
; AVX2-NEXT: addq $-1, %r14
|
|
|
|
; AVX2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: movl $0, %ebx
|
|
|
|
; AVX2-NEXT: adcq $-1, %rbx
|
|
|
|
; AVX2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: movl $0, %r8d
|
|
|
|
; AVX2-NEXT: adcq $-1, %r8
|
|
|
|
; AVX2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: movl $0, %eax
|
|
|
|
; AVX2-NEXT: adcq $-1, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: movl $0, %eax
|
|
|
|
; AVX2-NEXT: adcq $-1, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: movl $0, %r12d
|
|
|
|
; AVX2-NEXT: adcq $-1, %r12
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: addq $-1, %rcx
|
|
|
|
; AVX2-NEXT: movl $0, %r11d
|
|
|
|
; AVX2-NEXT: adcq $-1, %r11
|
|
|
|
; AVX2-NEXT: addq $-1, %rbp
|
|
|
|
; AVX2-NEXT: movl $0, %r14d
|
|
|
|
; AVX2-NEXT: adcq $-1, %r14
|
|
|
|
; AVX2-NEXT: addq $-1, %r10
|
|
|
|
; AVX2-NEXT: movl $0, %r9d
|
|
|
|
; AVX2-NEXT: adcq $-1, %r9
|
|
|
|
; AVX2-NEXT: addq $-1, %rdi
|
|
|
|
; AVX2-NEXT: movl $0, %edx
|
|
|
|
; AVX2-NEXT: adcq $-1, %rdx
|
|
|
|
; AVX2-NEXT: addq $-1, %rsi
|
|
|
|
; AVX2-NEXT: movl $0, %eax
|
|
|
|
; AVX2-NEXT: adcq $-1, %rax
|
|
|
|
; AVX2-NEXT: shldq $63, %rsi, %rax
|
|
|
|
; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX2-NEXT: shldq $63, %rdi, %rdx
|
|
|
|
; AVX2-NEXT: shldq $63, %r10, %r9
|
|
|
|
; AVX2-NEXT: shldq $63, %rbp, %r14
|
|
|
|
; AVX2-NEXT: shldq $63, %rcx, %r11
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rcx, %r12
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rcx, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r10 # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rcx, %r10
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rcx, %r8
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rax, %rbx
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rax, %r15
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rcx, %rax
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rcx, %r13
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rcx, %rbp
|
|
|
|
; AVX2-NEXT: movq (%rsp), %rdi # 8-byte Reload
|
|
|
|
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rcx, %rdi
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
|
|
|
|
; AVX2-NEXT: shldq $63, %rcx, %rsi
|
|
|
|
; AVX2-NEXT: vmovq %rsi, %xmm8
|
|
|
|
; AVX2-NEXT: vmovq %rdi, %xmm9
|
|
|
|
; AVX2-NEXT: vmovq %rbp, %xmm10
|
|
|
|
; AVX2-NEXT: vmovq %r13, %xmm11
|
|
|
|
; AVX2-NEXT: vmovq %rax, %xmm12
|
|
|
|
; AVX2-NEXT: vmovq %r15, %xmm13
|
|
|
|
; AVX2-NEXT: vmovq %rbx, %xmm14
|
|
|
|
; AVX2-NEXT: vmovq %r8, %xmm15
|
|
|
|
; AVX2-NEXT: vmovq %r10, %xmm0
|
|
|
|
; AVX2-NEXT: vmovq -{{[0-9]+}}(%rsp), %xmm1 # 8-byte Folded Reload
|
|
|
|
; AVX2-NEXT: # xmm1 = mem[0],zero
|
|
|
|
; AVX2-NEXT: vmovq %r12, %xmm2
|
|
|
|
; AVX2-NEXT: vmovq %r11, %xmm3
|
|
|
|
; AVX2-NEXT: vmovq %r14, %xmm4
|
|
|
|
; AVX2-NEXT: vmovq %r9, %xmm5
|
|
|
|
; AVX2-NEXT: vmovq %rdx, %xmm6
|
|
|
|
; AVX2-NEXT: vmovq -{{[0-9]+}}(%rsp), %xmm7 # 8-byte Folded Reload
|
|
|
|
; AVX2-NEXT: # xmm7 = mem[0],zero
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm10[0]
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm13[0],xmm12[0]
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm15[0],xmm14[0]
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
|
|
|
; AVX2-NEXT: vpshufb %ymm1, %ymm8, %ymm2
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0]
|
|
|
|
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0]
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
|
|
|
|
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
|
|
|
|
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
|
|
|
|
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
|
|
|
|
; AVX2-NEXT: addq $16, %rsp
|
|
|
|
; AVX2-NEXT: popq %rbx
|
|
|
|
; AVX2-NEXT: popq %r12
|
|
|
|
; AVX2-NEXT: popq %r13
|
|
|
|
; AVX2-NEXT: popq %r14
|
|
|
|
; AVX2-NEXT: popq %r15
|
|
|
|
; AVX2-NEXT: popq %rbp
|
|
|
|
; AVX2-NEXT: vzeroupper
|
|
|
|
; AVX2-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX512-LABEL: not_avg_v16i8_wide_constants:
|
|
|
|
; AVX512: # %bb.0:
|
|
|
|
; AVX512-NEXT: pushq %rbp
|
|
|
|
; AVX512-NEXT: pushq %r15
|
|
|
|
; AVX512-NEXT: pushq %r14
|
|
|
|
; AVX512-NEXT: pushq %r13
|
|
|
|
; AVX512-NEXT: pushq %r12
|
|
|
|
; AVX512-NEXT: pushq %rbx
|
|
|
|
; AVX512-NEXT: subq $24, %rsp
|
|
|
|
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
|
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
|
|
|
|
; AVX512-NEXT: vmovq %xmm3, %rax
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm2, %rbx
|
|
|
|
; AVX512-NEXT: vmovq %xmm2, %rbp
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm2, %rdi
|
|
|
|
; AVX512-NEXT: vmovq %xmm2, %r8
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm1, %r13
|
|
|
|
; AVX512-NEXT: vmovq %xmm1, %r12
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
|
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm2, %r15
|
|
|
|
; AVX512-NEXT: vmovq %xmm2, %r14
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
|
|
|
|
; AVX512-NEXT: vmovq %xmm1, %r9
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX512-NEXT: vmovq %xmm1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
|
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm4, %rsi
|
|
|
|
; AVX512-NEXT: addq %rcx, %rsi
|
|
|
|
; AVX512-NEXT: vmovq %xmm4, %rcx
|
|
|
|
; AVX512-NEXT: addq %rax, %rcx
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
|
|
|
|
; AVX512-NEXT: addq %rbx, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, %rbx
|
|
|
|
; AVX512-NEXT: vmovq %xmm3, %rax
|
|
|
|
; AVX512-NEXT: addq %rbp, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, %r10
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
|
|
|
|
; AVX512-NEXT: addq %rdi, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, %rdi
|
|
|
|
; AVX512-NEXT: vmovq %xmm3, %rax
|
|
|
|
; AVX512-NEXT: addq %r8, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, %r8
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm2, %rbp
|
|
|
|
; AVX512-NEXT: addq %r13, %rbp
|
|
|
|
; AVX512-NEXT: vmovq %xmm2, %r11
|
|
|
|
; AVX512-NEXT: addq %r12, %r11
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
|
|
|
|
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
|
|
|
|
; AVX512-NEXT: addq %r15, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: vmovq %xmm3, %rax
|
|
|
|
; AVX512-NEXT: addq %r14, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm2, %rax
|
|
|
|
; AVX512-NEXT: addq %rdx, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: vmovq %xmm2, %rax
|
|
|
|
; AVX512-NEXT: addq %r9, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
|
|
|
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm2, %rax
|
|
|
|
; AVX512-NEXT: addq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
|
|
|
|
; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: vmovq %xmm2, %r14
|
|
|
|
; AVX512-NEXT: addq -{{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
|
|
|
|
; AVX512-NEXT: vpextrq $1, %xmm1, %r9
|
|
|
|
; AVX512-NEXT: addq %rax, %r9
|
|
|
|
; AVX512-NEXT: vmovq %xmm0, %rax
|
|
|
|
; AVX512-NEXT: vmovq %xmm1, %rdx
|
|
|
|
; AVX512-NEXT: addq %rax, %rdx
|
|
|
|
; AVX512-NEXT: addq $-1, %rsi
|
|
|
|
; AVX512-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: movl $0, %eax
|
|
|
|
; AVX512-NEXT: adcq $-1, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: addq $-1, %rcx
|
|
|
|
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: movl $0, %eax
|
|
|
|
; AVX512-NEXT: adcq $-1, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: addq $-1, %rbx
|
|
|
|
; AVX512-NEXT: movq %rbx, (%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: movl $0, %eax
|
|
|
|
; AVX512-NEXT: adcq $-1, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: addq $-1, %r10
|
|
|
|
; AVX512-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: movl $0, %eax
|
|
|
|
; AVX512-NEXT: adcq $-1, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: addq $-1, %rdi
|
|
|
|
; AVX512-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: movl $0, %eax
|
|
|
|
; AVX512-NEXT: adcq $-1, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: addq $-1, %r8
|
|
|
|
; AVX512-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: movl $0, %eax
|
|
|
|
; AVX512-NEXT: adcq $-1, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: addq $-1, %rbp
|
|
|
|
; AVX512-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: movl $0, %r13d
|
|
|
|
; AVX512-NEXT: adcq $-1, %r13
|
|
|
|
; AVX512-NEXT: addq $-1, %r11
|
|
|
|
; AVX512-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: movl $0, %r15d
|
|
|
|
; AVX512-NEXT: adcq $-1, %r15
|
|
|
|
; AVX512-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX512-NEXT: movl $0, %eax
|
|
|
|
; AVX512-NEXT: adcq $-1, %rax
|
|
|
|
; AVX512-NEXT: movq %rax, %rsi
|
|
|
|
; AVX512-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX512-NEXT: movl $0, %r12d
|
|
|
|
; AVX512-NEXT: adcq $-1, %r12
|
|
|
|
; AVX512-NEXT: addq $-1, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
|
|
|
|
; AVX512-NEXT: movl $0, %ebx
|
|
|
|
; AVX512-NEXT: adcq $-1, %rbx
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
|
|
|
|
; AVX512-NEXT: addq $-1, %rbp
|
|
|
|
; AVX512-NEXT: movl $0, %r11d
|
|
|
|
; AVX512-NEXT: adcq $-1, %r11
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX512-NEXT: addq $-1, %rax
|
|
|
|
; AVX512-NEXT: movl $0, %r10d
|
|
|
|
; AVX512-NEXT: adcq $-1, %r10
|
|
|
|
; AVX512-NEXT: addq $-1, %r14
|
|
|
|
; AVX512-NEXT: movl $0, %r8d
|
|
|
|
; AVX512-NEXT: adcq $-1, %r8
|
|
|
|
; AVX512-NEXT: addq $-1, %r9
|
|
|
|
; AVX512-NEXT: movl $0, %edi
|
|
|
|
; AVX512-NEXT: adcq $-1, %rdi
|
|
|
|
; AVX512-NEXT: addq $-1, %rdx
|
|
|
|
; AVX512-NEXT: movl $0, %ecx
|
|
|
|
; AVX512-NEXT: adcq $-1, %rcx
|
|
|
|
; AVX512-NEXT: shldq $63, %rdx, %rcx
|
|
|
|
; AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: shldq $63, %r9, %rdi
|
|
|
|
; AVX512-NEXT: shldq $63, %r14, %r8
|
|
|
|
; AVX512-NEXT: shldq $63, %rax, %r10
|
|
|
|
; AVX512-NEXT: shldq $63, %rbp, %r11
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rdx, %rbx
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rdx, %r12
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rdx, %rsi
|
|
|
|
; AVX512-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rax, %r15
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rax, %r13
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rax, %rsi
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rax, %rcx
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rdx, %rax
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %r14 # 8-byte Reload
|
|
|
|
; AVX512-NEXT: movq (%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rdx, %r14
|
|
|
|
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9 # 8-byte Reload
|
|
|
|
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rdx, %r9
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
|
|
|
|
; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
|
|
|
|
; AVX512-NEXT: shldq $63, %rdx, %rbp
|
|
|
|
; AVX512-NEXT: vmovq %rbp, %xmm8
|
|
|
|
; AVX512-NEXT: vmovq %r9, %xmm9
|
|
|
|
; AVX512-NEXT: vmovq %r14, %xmm10
|
|
|
|
; AVX512-NEXT: vmovq %rax, %xmm11
|
|
|
|
; AVX512-NEXT: vmovq %rcx, %xmm12
|
|
|
|
; AVX512-NEXT: vmovq %rsi, %xmm13
|
|
|
|
; AVX512-NEXT: vmovq %r13, %xmm14
|
|
|
|
; AVX512-NEXT: vmovq %r15, %xmm15
|
|
|
|
; AVX512-NEXT: vmovq -{{[0-9]+}}(%rsp), %xmm0 # 8-byte Folded Reload
|
|
|
|
; AVX512-NEXT: # xmm0 = mem[0],zero
|
|
|
|
; AVX512-NEXT: vmovq %r12, %xmm1
|
|
|
|
; AVX512-NEXT: vmovq %rbx, %xmm2
|
|
|
|
; AVX512-NEXT: vmovq %r11, %xmm3
|
|
|
|
; AVX512-NEXT: vmovq %r10, %xmm4
|
|
|
|
; AVX512-NEXT: vmovq %r8, %xmm5
|
|
|
|
; AVX512-NEXT: vmovq %rdi, %xmm6
|
|
|
|
; AVX512-NEXT: vmovq -{{[0-9]+}}(%rsp), %xmm7 # 8-byte Folded Reload
|
|
|
|
; AVX512-NEXT: # xmm7 = mem[0],zero
|
|
|
|
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0]
|
|
|
|
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm10[0]
|
|
|
|
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
|
|
|
|
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm13[0],xmm12[0]
|
|
|
|
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm15[0],xmm14[0]
|
|
|
|
; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
|
|
|
|
; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
|
|
|
|
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
|
|
|
|
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
|
|
|
|
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
|
|
|
|
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm8, %ymm2
|
|
|
|
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm7[0],xmm6[0]
|
|
|
|
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
|
|
|
|
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
|
|
|
|
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
|
|
|
|
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
|
|
|
|
; AVX512-NEXT: addq $24, %rsp
|
|
|
|
; AVX512-NEXT: popq %rbx
|
|
|
|
; AVX512-NEXT: popq %r12
|
|
|
|
; AVX512-NEXT: popq %r13
|
|
|
|
; AVX512-NEXT: popq %r14
|
|
|
|
; AVX512-NEXT: popq %r15
|
|
|
|
; AVX512-NEXT: popq %rbp
|
|
|
|
; AVX512-NEXT: vzeroupper
|
|
|
|
; AVX512-NEXT: retq
|
|
|
|
%1 = load <16 x i8>, <16 x i8>* %a
|
|
|
|
%2 = load <16 x i8>, <16 x i8>* %b
|
|
|
|
%3 = zext <16 x i8> %1 to <16 x i128>
|
|
|
|
%4 = zext <16 x i8> %2 to <16 x i128>
|
|
|
|
%5 = add nuw nsw <16 x i128> %3, <i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1, i128 -1>
|
|
|
|
%6 = add nuw nsw <16 x i128> %5, %4
|
|
|
|
%7 = lshr <16 x i128> %6, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
|
|
|
|
%8 = trunc <16 x i128> %7 to <16 x i8>
|
|
|
|
store <16 x i8> %8, <16 x i8>* undef, align 4
|
|
|
|
ret void
|
|
|
|
}
|