[X86] In combineLoopSADPattern, pad result with zeros and use full size add instead of using a smaller add and inserting.

In some cases the psadbw result is narrower than the type of the add that started the match. Currently, in these cases we extract a subvector from the reduction vector, use a smaller add, and insert the result back into it.

If we instead pad the psadbw result with zeros and use the full-size add, we can take advantage of the implicit zeroing of the upper bits that we get when we emit a narrower move before the add.

In a future patch, I want to make isel aware that the psadbw itself already zeroed the upper bits and remove the move entirely.

Differential Revision: https://reviews.llvm.org/D37453

llvm-svn: 314331
This commit is contained in:
Craig Topper 2017-09-27 18:36:45 +00:00
parent 102c333d9a
commit 05f71dd036
2 changed files with 17 additions and 20 deletions

View File

@ -35536,16 +35536,13 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
// Update part of elements of the reduction vector. This is done by first
// extracting a sub-vector from it, updating this sub-vector, and inserting
// it back.
SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
DAG.getIntPtrConstant(0, DL));
SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
DAG.getIntPtrConstant(0, DL));
} else
return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
// Fill the upper elements with zero to match the add width.
SDValue Zero = DAG.getConstant(0, DL, VT);
Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
DAG.getIntPtrConstant(0, DL));
}
return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
/// Convert vector increment or decrement to sub/add with an all-ones constant:

View File

@ -43,8 +43,8 @@ define i32 @sad_16i8() nounwind {
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2
; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqa %xmm2, %xmm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB0_1
; AVX2-NEXT: # BB#2: # %middle.block
@ -67,8 +67,8 @@ define i32 @sad_16i8() nounwind {
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512F-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovdqa %xmm1, %xmm1
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB0_1
; AVX512F-NEXT: # BB#2: # %middle.block
@ -93,8 +93,8 @@ define i32 @sad_16i8() nounwind {
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512BW-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm1
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB0_1
; AVX512BW-NEXT: # BB#2: # %middle.block
@ -315,8 +315,8 @@ define i32 @sad_32i8() nounwind {
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512F-NEXT: vpaddd %ymm1, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa %ymm2, %ymm2
; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB1_1
; AVX512F-NEXT: # BB#2: # %middle.block
@ -343,8 +343,8 @@ define i32 @sad_32i8() nounwind {
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512BW-NEXT: vpaddd %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqa %ymm2, %ymm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB1_1
; AVX512BW-NEXT: # BB#2: # %middle.block