From 05f71dd0367195ed3cd319104d8efc9a9128e174 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 27 Sep 2017 18:36:45 +0000 Subject: [PATCH] [X86] In combineLoopSADPattern, pad result with zeros and use full size add instead of using a smaller add and inserting. In some cases the result psadbw is smaller than the type of the add that started the match. Currently in these cases we are using a smaller add and inserting the result. If we instead combine the psadbw with zeros and use the full size add we can take advantage of implicit zeroing we get if we emit a narrower move before the add. In a future patch, I want to make isel aware that the psadbw itself already zeroed the upper bits and remove the move entirely. Differential Revision: https://reviews.llvm.org/D37453 llvm-svn: 314331 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 17 +++++++---------- llvm/test/CodeGen/X86/sad.ll | 20 ++++++++++---------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0852765333d4..1cd3af98dc98 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -35536,16 +35536,13 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); if (VT.getSizeInBits() > ResVT.getSizeInBits()) { - // Update part of elements of the reduction vector. This is done by first - // extracting a sub-vector from it, updating this sub-vector, and inserting - // it back. - SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi, - DAG.getIntPtrConstant(0, DL)); - SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res, - DAG.getIntPtrConstant(0, DL)); - } else - return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); + // Fill the upper elements with zero to match the add width. + SDValue Zero = DAG.getConstant(0, DL, VT); + Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad, + DAG.getIntPtrConstant(0, DL)); + } + + return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); } /// Convert vector increment or decrement to sub/add with an all-ones constant: diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index 78e232917565..e8a55215dc8d 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -43,8 +43,8 @@ define i32 @sad_16i8() nounwind { ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2 ; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 -; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqa %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB0_1 ; AVX2-NEXT: # BB#2: # %middle.block @@ -67,8 +67,8 @@ define i32 @sad_16i8() nounwind { ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX512F-NEXT: vmovdqu a+1024(%rax), %xmm1 ; AVX512F-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa %xmm1, %xmm1 +; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB0_1 ; AVX512F-NEXT: # BB#2: # %middle.block @@ -93,8 +93,8 @@ define i32 @sad_16i8() nounwind { ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX512BW-NEXT: vmovdqu a+1024(%rax), %xmm1 ; AVX512BW-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: addq $4, %rax ; AVX512BW-NEXT: jne .LBB0_1 ; AVX512BW-NEXT: # BB#2: # %middle.block @@ -315,8 +315,8 @@ define i32 @sad_32i8() nounwind { ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2 ; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2 -; AVX512F-NEXT: vpaddd %ymm1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa %ymm2, %ymm2 +; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB1_1 ; AVX512F-NEXT: # BB#2: # %middle.block @@ -343,8 +343,8 @@ define i32 @sad_32i8() nounwind { ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2 ; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2 -; AVX512BW-NEXT: vpaddd %ymm1, %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa %ymm2, %ymm2 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: addq $4, %rax ; AVX512BW-NEXT: jne .LBB1_1 ; AVX512BW-NEXT: # BB#2: # %middle.block