forked from OSchip/llvm-project
[X86] Improve lowering of v2i32 SAD handling in combineLoopSADPattern.
For v2i32 we only feed 2 i8 elements into the psadbw instructions with 0s in the other 14 bytes. The resulting psadbw instruction will produce zeros in bits [127:16] of the output. We need to take the result and feed it to a v2i32 add where the first element includes bits [15:0] of the sad result. The other element should be zero. Prior to this patch we were using a truncate to take 0 from bits 95:64 of the psadbw. This results in a pshufd to move those bits to 63:32. But since we also have zeroes in bits 63:32 of the psadbw output, we should just take those bits. The previous code probably worked better with promoting legalization, but now we use widening legalization. I've preserved the old behavior if -x86-experimental-vector-widening-legalization=false until we get that option removed. llvm-svn: 369733
This commit is contained in:
parent
8798c8de9a
commit
bdceb9fb14
|
@ -43647,10 +43647,13 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
|
||||||
// The output of PSADBW is a vector of i64.
|
// The output of PSADBW is a vector of i64.
|
||||||
// We need to turn the vector of i64 into a vector of i32.
|
// We need to turn the vector of i64 into a vector of i32.
|
||||||
// If the reduction vector is at least as wide as the psadbw result, just
|
// If the reduction vector is at least as wide as the psadbw result, just
|
||||||
// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
|
// bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of
|
||||||
// anyway.
|
// the PSADBW will be zero. If we promote/narrow vectors, truncate the v2i64
|
||||||
|
// result to v2i32 which will be removed by type legalization. If we widen
|
||||||
|
// narrow vectors then we bitcast to v4i32 and extract v2i32.
|
||||||
MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
|
MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
|
||||||
if (VT.getSizeInBits() >= ResVT.getSizeInBits())
|
if (ExperimentalVectorWideningLegalization ||
|
||||||
|
VT.getSizeInBits() >= ResVT.getSizeInBits())
|
||||||
Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
|
Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
|
||||||
else
|
else
|
||||||
Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
|
Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
|
||||||
|
@ -43660,6 +43663,10 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
|
||||||
SDValue Zero = DAG.getConstant(0, DL, VT);
|
SDValue Zero = DAG.getConstant(0, DL, VT);
|
||||||
Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
|
Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
|
||||||
DAG.getIntPtrConstant(0, DL));
|
DAG.getIntPtrConstant(0, DL));
|
||||||
|
} else if (ExperimentalVectorWideningLegalization &&
|
||||||
|
VT.getSizeInBits() < ResVT.getSizeInBits()) {
|
||||||
|
Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
|
||||||
|
DAG.getIntPtrConstant(0, DL));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Preserve the reduction flag on the ADD. We may need to revisit for the
|
// Preserve the reduction flag on the ADD. We may need to revisit for the
|
||||||
|
|
|
@ -1074,7 +1074,6 @@ define i32 @sad_2i8() nounwind {
|
||||||
; SSE2-NEXT: pand %xmm1, %xmm3
|
; SSE2-NEXT: pand %xmm1, %xmm3
|
||||||
; SSE2-NEXT: pand %xmm1, %xmm2
|
; SSE2-NEXT: pand %xmm1, %xmm2
|
||||||
; SSE2-NEXT: psadbw %xmm3, %xmm2
|
; SSE2-NEXT: psadbw %xmm3, %xmm2
|
||||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
|
||||||
; SSE2-NEXT: paddd %xmm2, %xmm0
|
; SSE2-NEXT: paddd %xmm2, %xmm0
|
||||||
; SSE2-NEXT: addq $4, %rax
|
; SSE2-NEXT: addq $4, %rax
|
||||||
; SSE2-NEXT: jne .LBB3_1
|
; SSE2-NEXT: jne .LBB3_1
|
||||||
|
@ -1097,7 +1096,6 @@ define i32 @sad_2i8() nounwind {
|
||||||
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
|
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
|
||||||
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
|
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
|
||||||
; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
|
; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
|
||||||
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
|
||||||
; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1
|
; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1
|
||||||
; AVX-NEXT: addq $4, %rax
|
; AVX-NEXT: addq $4, %rax
|
||||||
; AVX-NEXT: jne .LBB3_1
|
; AVX-NEXT: jne .LBB3_1
|
||||||
|
|
Loading…
Reference in New Issue