From fe6c11c5710e95eedbb16c2aac58a5d992e55434 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 21 May 2021 18:12:01 +0100
Subject: [PATCH] [CostModel][X86] Improve f64/v2f64/v4f64 FMUL costs on AVX1
 targets to account for slower btver2

BTVER2 has a weaker f64 multiplier than other AVX1-era targets, so we need
to bump the worst-case cost slightly - llvm-mca reports the new
vectorization in simplebb is beneficial on btver2, bdver2 and sandybridge
AVX1 targets
---
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp     |  6 ++++++
 llvm/test/Analysis/CostModel/X86/arith-fp.ll       |  8 ++++----
 llvm/test/Analysis/CostModel/X86/reduce-fmul.ll    |  8 ++++----
 llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll | 12 +++++++-----
 4 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 0cfe8d745763..83ee47691736 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -785,6 +785,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
     { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
     { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+    { ISD::FMUL, MVT::f64,   1 }, // Haswell from http://www.agner.org/
+    { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
     { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
     { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
 
@@ -828,6 +830,10 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
     { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
 
+    { ISD::FMUL, MVT::f64,   2 }, // BTVER2 from http://www.agner.org/
+    { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
+    { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
+
     { ISD::FDIV, MVT::f32,   14 }, // SNB from http://www.agner.org/
     { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
     { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp.ll b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
index 0d2d94c813db..89059c0681da 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
@@ -461,10 +461,10 @@ define i32 @fmul(i32 %arg) {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fmul <4 x float> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fmul <8 x float> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fmul <16 x float> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = fmul double undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fmul <2 x double> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fmul <4 x double> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fmul <8 x double> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fmul <2 x double> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fmul <4 x double> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fmul <8 x double> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'fmul'
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
index e74f7ff02b2b..63424bcb3276 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
@@ -44,10 +44,10 @@ define void @reduce_f64(double %arg) {
 ;
 ; AVX1-LABEL: 'reduce_f64'
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'reduce_f64'
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
index 80a60dd4a3da..2a18d50b3893 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
@@ -64,15 +64,17 @@ define void @test_volatile_load(double* %a, double* %b, double* %c) {
 ; CHECK-LABEL: @test_volatile_load(
 ; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
 ; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
-; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]]
 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
 ; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
 ; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
-; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
-; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
-; CHECK-NEXT: store double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[I0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[I4]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
 ; CHECK-NEXT: ret void
 ;
   %i0 = load volatile double, double* %a, align 8