forked from OSchip/llvm-project
[X86] Use PSADBW for v8i8 addition reductions.
Improves the 8 byte case from PR42674.

Differential Revision: https://reviews.llvm.org/D66069

llvm-svn: 368864
This commit is contained in:
parent
bffa4a2b17
commit
3e44d96170
|
@ -35440,13 +35440,23 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
|
||||||
if (VecVT.getScalarType() != VT)
|
if (VecVT.getScalarType() != VT)
|
||||||
return SDValue();
|
return SDValue();
|
||||||
|
|
||||||
|
SDLoc DL(ExtElt);
|
||||||
|
|
||||||
|
if (VecVT == MVT::v8i8) {
|
||||||
|
// Pad with undef.
|
||||||
|
Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
|
||||||
|
DAG.getUNDEF(VecVT));
|
||||||
|
Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
|
||||||
|
DAG.getConstant(0, DL, MVT::v16i8));
|
||||||
|
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
|
||||||
|
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
|
||||||
|
}
|
||||||
|
|
||||||
// Must be a >=128-bit vector with pow2 elements.
|
// Must be a >=128-bit vector with pow2 elements.
|
||||||
if ((VecVT.getSizeInBits() % 128) != 0 ||
|
if ((VecVT.getSizeInBits() % 128) != 0 ||
|
||||||
!isPowerOf2_32(VecVT.getVectorNumElements()))
|
!isPowerOf2_32(VecVT.getVectorNumElements()))
|
||||||
return SDValue();
|
return SDValue();
|
||||||
|
|
||||||
SDLoc DL(ExtElt);
|
|
||||||
|
|
||||||
// vXi8 reduction - sum lo/hi halves then use PSADBW.
|
// vXi8 reduction - sum lo/hi halves then use PSADBW.
|
||||||
if (VT == MVT::i8) {
|
if (VT == MVT::i8) {
|
||||||
while (Rdx.getValueSizeInBits() > 128) {
|
while (Rdx.getValueSizeInBits() > 128) {
|
||||||
|
|
|
@ -1160,52 +1160,32 @@ define i8 @test_v4i8_load(<4 x i8>* %p) {
|
||||||
define i8 @test_v8i8(<8 x i8> %a0) {
|
define i8 @test_v8i8(<8 x i8> %a0) {
|
||||||
; SSE2-LABEL: test_v8i8:
|
; SSE2-LABEL: test_v8i8:
|
||||||
; SSE2: # %bb.0:
|
; SSE2: # %bb.0:
|
||||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
; SSE2-NEXT: pxor %xmm1, %xmm1
|
||||||
; SSE2-NEXT: paddb %xmm0, %xmm1
|
; SSE2-NEXT: psadbw %xmm0, %xmm1
|
||||||
; SSE2-NEXT: movdqa %xmm1, %xmm0
|
|
||||||
; SSE2-NEXT: psrld $16, %xmm0
|
|
||||||
; SSE2-NEXT: paddb %xmm1, %xmm0
|
|
||||||
; SSE2-NEXT: movdqa %xmm0, %xmm1
|
|
||||||
; SSE2-NEXT: psrlw $8, %xmm1
|
|
||||||
; SSE2-NEXT: paddb %xmm0, %xmm1
|
|
||||||
; SSE2-NEXT: movd %xmm1, %eax
|
; SSE2-NEXT: movd %xmm1, %eax
|
||||||
; SSE2-NEXT: # kill: def $al killed $al killed $eax
|
; SSE2-NEXT: # kill: def $al killed $al killed $eax
|
||||||
; SSE2-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; SSE41-LABEL: test_v8i8:
|
; SSE41-LABEL: test_v8i8:
|
||||||
; SSE41: # %bb.0:
|
; SSE41: # %bb.0:
|
||||||
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
; SSE41-NEXT: pxor %xmm1, %xmm1
|
||||||
; SSE41-NEXT: paddb %xmm0, %xmm1
|
; SSE41-NEXT: psadbw %xmm0, %xmm1
|
||||||
; SSE41-NEXT: movdqa %xmm1, %xmm0
|
|
||||||
; SSE41-NEXT: psrld $16, %xmm0
|
|
||||||
; SSE41-NEXT: paddb %xmm1, %xmm0
|
|
||||||
; SSE41-NEXT: movdqa %xmm0, %xmm1
|
|
||||||
; SSE41-NEXT: psrlw $8, %xmm1
|
|
||||||
; SSE41-NEXT: paddb %xmm0, %xmm1
|
|
||||||
; SSE41-NEXT: pextrb $0, %xmm1, %eax
|
; SSE41-NEXT: pextrb $0, %xmm1, %eax
|
||||||
; SSE41-NEXT: # kill: def $al killed $al killed $eax
|
; SSE41-NEXT: # kill: def $al killed $al killed $eax
|
||||||
; SSE41-NEXT: retq
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v8i8:
|
; AVX-LABEL: test_v8i8:
|
||||||
; AVX: # %bb.0:
|
; AVX: # %bb.0:
|
||||||
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||||
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||||
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
|
|
||||||
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
|
|
||||||
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX-NEXT: vpextrb $0, %xmm0, %eax
|
; AVX-NEXT: vpextrb $0, %xmm0, %eax
|
||||||
; AVX-NEXT: # kill: def $al killed $al killed $eax
|
; AVX-NEXT: # kill: def $al killed $al killed $eax
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_v8i8:
|
; AVX512-LABEL: test_v8i8:
|
||||||
; AVX512: # %bb.0:
|
; AVX512: # %bb.0:
|
||||||
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||||
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||||
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
|
|
||||||
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
|
|
||||||
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
|
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
|
||||||
; AVX512-NEXT: # kill: def $al killed $al killed $eax
|
; AVX512-NEXT: # kill: def $al killed $al killed $eax
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
|
@ -1217,14 +1197,8 @@ define i8 @test_v8i8_load(<8 x i8>* %p) {
|
||||||
; SSE2-LABEL: test_v8i8_load:
|
; SSE2-LABEL: test_v8i8_load:
|
||||||
; SSE2: # %bb.0:
|
; SSE2: # %bb.0:
|
||||||
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
||||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
; SSE2-NEXT: pxor %xmm1, %xmm1
|
||||||
; SSE2-NEXT: paddb %xmm0, %xmm1
|
; SSE2-NEXT: psadbw %xmm0, %xmm1
|
||||||
; SSE2-NEXT: movdqa %xmm1, %xmm0
|
|
||||||
; SSE2-NEXT: psrld $16, %xmm0
|
|
||||||
; SSE2-NEXT: paddb %xmm1, %xmm0
|
|
||||||
; SSE2-NEXT: movdqa %xmm0, %xmm1
|
|
||||||
; SSE2-NEXT: psrlw $8, %xmm1
|
|
||||||
; SSE2-NEXT: paddb %xmm0, %xmm1
|
|
||||||
; SSE2-NEXT: movd %xmm1, %eax
|
; SSE2-NEXT: movd %xmm1, %eax
|
||||||
; SSE2-NEXT: # kill: def $al killed $al killed $eax
|
; SSE2-NEXT: # kill: def $al killed $al killed $eax
|
||||||
; SSE2-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
|
@ -1232,77 +1206,29 @@ define i8 @test_v8i8_load(<8 x i8>* %p) {
|
||||||
; SSE41-LABEL: test_v8i8_load:
|
; SSE41-LABEL: test_v8i8_load:
|
||||||
; SSE41: # %bb.0:
|
; SSE41: # %bb.0:
|
||||||
; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
||||||
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
; SSE41-NEXT: pxor %xmm1, %xmm1
|
||||||
; SSE41-NEXT: paddb %xmm0, %xmm1
|
; SSE41-NEXT: psadbw %xmm0, %xmm1
|
||||||
; SSE41-NEXT: movdqa %xmm1, %xmm0
|
|
||||||
; SSE41-NEXT: psrld $16, %xmm0
|
|
||||||
; SSE41-NEXT: paddb %xmm1, %xmm0
|
|
||||||
; SSE41-NEXT: movdqa %xmm0, %xmm1
|
|
||||||
; SSE41-NEXT: psrlw $8, %xmm1
|
|
||||||
; SSE41-NEXT: paddb %xmm0, %xmm1
|
|
||||||
; SSE41-NEXT: pextrb $0, %xmm1, %eax
|
; SSE41-NEXT: pextrb $0, %xmm1, %eax
|
||||||
; SSE41-NEXT: # kill: def $al killed $al killed $eax
|
; SSE41-NEXT: # kill: def $al killed $al killed $eax
|
||||||
; SSE41-NEXT: retq
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX1-LABEL: test_v8i8_load:
|
; AVX-LABEL: test_v8i8_load:
|
||||||
; AVX1: # %bb.0:
|
; AVX: # %bb.0:
|
||||||
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
||||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||||
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||||
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
|
; AVX-NEXT: vpextrb $0, %xmm0, %eax
|
||||||
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
; AVX-NEXT: # kill: def $al killed $al killed $eax
|
||||||
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
|
; AVX-NEXT: retq
|
||||||
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
|
|
||||||
; AVX1-NEXT: # kill: def $al killed $al killed $eax
|
|
||||||
; AVX1-NEXT: retq
|
|
||||||
;
|
;
|
||||||
; AVX2-LABEL: test_v8i8_load:
|
; AVX512-LABEL: test_v8i8_load:
|
||||||
; AVX2: # %bb.0:
|
; AVX512: # %bb.0:
|
||||||
; AVX2-NEXT: movq (%rdi), %rax
|
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
||||||
; AVX2-NEXT: vmovq %rax, %xmm0
|
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||||
; AVX2-NEXT: shrq $32, %rax
|
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
|
||||||
; AVX2-NEXT: vmovd %eax, %xmm1
|
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
|
||||||
; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
|
; AVX512-NEXT: # kill: def $al killed $al killed $eax
|
||||||
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
; AVX512-NEXT: retq
|
||||||
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
|
|
||||||
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
|
|
||||||
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
|
|
||||||
; AVX2-NEXT: # kill: def $al killed $al killed $eax
|
|
||||||
; AVX2-NEXT: retq
|
|
||||||
;
|
|
||||||
; AVX512BW-LABEL: test_v8i8_load:
|
|
||||||
; AVX512BW: # %bb.0:
|
|
||||||
; AVX512BW-NEXT: movq (%rdi), %rax
|
|
||||||
; AVX512BW-NEXT: vmovq %rax, %xmm0
|
|
||||||
; AVX512BW-NEXT: shrq $32, %rax
|
|
||||||
; AVX512BW-NEXT: vmovd %eax, %xmm1
|
|
||||||
; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
|
|
||||||
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
|
|
||||||
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
|
|
||||||
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
|
|
||||||
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
|
|
||||||
; AVX512BW-NEXT: retq
|
|
||||||
;
|
|
||||||
; AVX512VL-LABEL: test_v8i8_load:
|
|
||||||
; AVX512VL: # %bb.0:
|
|
||||||
; AVX512VL-NEXT: movq (%rdi), %rax
|
|
||||||
; AVX512VL-NEXT: vmovq %rax, %xmm0
|
|
||||||
; AVX512VL-NEXT: shrq $32, %rax
|
|
||||||
; AVX512VL-NEXT: vpbroadcastd %eax, %xmm1
|
|
||||||
; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
|
|
||||||
; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
|
|
||||||
; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
|
||||||
; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
|
|
||||||
; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
|
|
||||||
; AVX512VL-NEXT: retq
|
|
||||||
%a0 = load <8 x i8>, <8 x i8>* %p
|
%a0 = load <8 x i8>, <8 x i8>* %p
|
||||||
%1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
|
%1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
|
||||||
ret i8 %1
|
ret i8 %1
|
||||||
|
|
Loading…
Reference in New Issue