forked from OSchip/llvm-project
[InstCombine] reassociate FP diff of sums into sum of diffs
(a[0] + a[1] + a[2] + a[3]) - (b[0] + b[1] + b[2] + b[3]) --> (a[0] - b[0]) + (a[1] - b[1]) + (a[2] - b[2]) + (a[3] - b[3]) This should be the last step in solving PR43953: https://bugs.llvm.org/show_bug.cgi?id=43953 We started emitting reduction intrinsics with: D80867 / rGe50059f6b6b3 So it's a relatively easy pattern match now to re-order those ops. Also, I have not seen any complaints for the switch to intrinsics yet, so I'll propose to remove the "experimental" tag from the intrinsics soon. Differential Revision: https://reviews.llvm.org/D81491
This commit is contained in:
parent
aeb5044801
commit
b5fb26951a
|
@ -2228,6 +2228,23 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
|
|||
return BinaryOperator::CreateFSubFMF(XZ, YW, &I);
|
||||
}
|
||||
|
||||
auto m_FaddRdx = [](Value *&Sum, Value *&Vec) {
|
||||
return m_OneUse(
|
||||
m_Intrinsic<Intrinsic::experimental_vector_reduce_v2_fadd>(
|
||||
m_Value(Sum), m_Value(Vec)));
|
||||
};
|
||||
Value *A0, *A1, *V0, *V1;
|
||||
if (match(Op0, m_FaddRdx(A0, V0)) && match(Op1, m_FaddRdx(A1, V1)) &&
|
||||
V0->getType() == V1->getType()) {
|
||||
// Difference of sums is sum of differences:
|
||||
// add_rdx(A0, V0) - add_rdx(A1, V1) --> add_rdx(A0, V0 - V1) - A1
|
||||
Value *Sub = Builder.CreateFSubFMF(V0, V1, &I);
|
||||
Value *Rdx = Builder.CreateIntrinsic(
|
||||
Intrinsic::experimental_vector_reduce_v2_fadd,
|
||||
{A0->getType(), Sub->getType()}, {A0, Sub}, &I);
|
||||
return BinaryOperator::CreateFSubFMF(Rdx, A1, &I);
|
||||
}
|
||||
|
||||
if (Instruction *F = factorizeFAddFSub(I, Builder))
|
||||
return F;
|
||||
|
||||
|
|
|
@ -7,9 +7,9 @@ declare void @use_f32(float)
|
|||
|
||||
define float @diff_of_sums_v4f32(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
|
||||
; CHECK-LABEL: @diff_of_sums_v4f32(
|
||||
; CHECK-NEXT: [[R0:%.*]] = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
|
||||
; CHECK-NEXT: [[R1:%.*]] = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]])
|
||||
; CHECK-NEXT: [[R:%.*]] = fsub reassoc nsz float [[R0]], [[R1]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = fsub reassoc nsz <4 x float> [[V0:%.*]], [[V1:%.*]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = call reassoc nsz float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[TMP1]])
|
||||
; CHECK-NEXT: [[R:%.*]] = fsub reassoc nsz float [[TMP2]], [[A1:%.*]]
|
||||
; CHECK-NEXT: ret float [[R]]
|
||||
;
|
||||
%r0 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0)
|
||||
|
@ -18,6 +18,8 @@ define float @diff_of_sums_v4f32(float %a0, <4 x float> %v0, float %a1, <4 x flo
|
|||
ret float %r
|
||||
}
|
||||
|
||||
; negative test - fsub must allow reassociation
|
||||
|
||||
define float @diff_of_sums_v4f32_fmf(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
|
||||
; CHECK-LABEL: @diff_of_sums_v4f32_fmf(
|
||||
; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
|
||||
|
@ -31,6 +33,8 @@ define float @diff_of_sums_v4f32_fmf(float %a0, <4 x float> %v0, float %a1, <4 x
|
|||
ret float %r
|
||||
}
|
||||
|
||||
; negative test - extra uses could create extra instructions
|
||||
|
||||
define float @diff_of_sums_extra_use1(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
|
||||
; CHECK-LABEL: @diff_of_sums_extra_use1(
|
||||
; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
|
||||
|
@ -46,6 +50,8 @@ define float @diff_of_sums_extra_use1(float %a0, <4 x float> %v0, float %a1, <4
|
|||
ret float %r
|
||||
}
|
||||
|
||||
; negative test - extra uses could create extra instructions
|
||||
|
||||
define float @diff_of_sums_extra_use2(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) {
|
||||
; CHECK-LABEL: @diff_of_sums_extra_use2(
|
||||
; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
|
||||
|
@ -61,6 +67,8 @@ define float @diff_of_sums_extra_use2(float %a0, <4 x float> %v0, float %a1, <4
|
|||
ret float %r
|
||||
}
|
||||
|
||||
; negative test - can't reassociate different vector types
|
||||
|
||||
define float @diff_of_sums_type_mismatch(float %a0, <4 x float> %v0, float %a1, <8 x float> %v1) {
|
||||
; CHECK-LABEL: @diff_of_sums_type_mismatch(
|
||||
; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]])
|
||||
|
|
|
@ -240,10 +240,9 @@ define i32 @TestVectorsEqualFP_alt(float* noalias %Vec0, float* noalias %Vec1, f
|
|||
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>*
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP1]])
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
|
||||
; CHECK-NEXT: [[ADD_3:%.*]] = fsub fast float [[TMP4]], [[TMP5]]
|
||||
; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[ADD_3]], [[TOLERANCE:%.*]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP4]])
|
||||
; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]]
|
||||
; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32
|
||||
; CHECK-NEXT: ret i32 [[COND]]
|
||||
;
|
||||
|
|
Loading…
Reference in New Issue