forked from OSchip/llvm-project
[CostModel][X86] Improve f64/v2f64/v4f64 FMUL costs on AVX1 targets to account for slower btver2
BTVER2 has a weaker f64 multiplier than other AVX1-era targets, so we need to bump the worst-case cost slightly. llvm-mca reports that the new vectorization in simplebb is beneficial on btver2, bdver2 and sandybridge AVX1 targets.
This commit is contained in:
parent
355114a753
commit
fe6c11c571
|
@ -785,6 +785,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
|
||||||
{ ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
|
{ ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
|
||||||
{ ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
|
{ ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
|
||||||
{ ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
|
{ ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
|
||||||
|
{ ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
|
||||||
|
{ ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
|
||||||
{ ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
|
{ ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
|
||||||
{ ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
|
{ ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
|
||||||
|
|
||||||
|
@ -828,6 +830,10 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
|
||||||
{ ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
|
{ ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
|
||||||
{ ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
|
{ ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
|
||||||
|
|
||||||
|
{ ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
|
||||||
|
{ ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
|
||||||
|
{ ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
|
||||||
|
|
||||||
{ ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
|
{ ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
|
||||||
{ ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
|
{ ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
|
||||||
{ ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
|
{ ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
|
||||||
|
|
|
@ -461,10 +461,10 @@ define i32 @fmul(i32 %arg) {
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fmul <4 x float> undef, undef
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fmul <4 x float> undef, undef
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fmul <8 x float> undef, undef
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fmul <8 x float> undef, undef
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fmul <16 x float> undef, undef
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fmul <16 x float> undef, undef
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = fmul double undef, undef
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double undef, undef
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fmul <2 x double> undef, undef
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fmul <2 x double> undef, undef
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fmul <4 x double> undef, undef
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fmul <4 x double> undef, undef
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fmul <8 x double> undef, undef
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fmul <8 x double> undef, undef
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
|
||||||
;
|
;
|
||||||
; AVX2-LABEL: 'fmul'
|
; AVX2-LABEL: 'fmul'
|
||||||
|
|
|
@ -44,10 +44,10 @@ define void @reduce_f64(double %arg) {
|
||||||
;
|
;
|
||||||
; AVX1-LABEL: 'reduce_f64'
|
; AVX1-LABEL: 'reduce_f64'
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
|
||||||
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||||
;
|
;
|
||||||
; AVX2-LABEL: 'reduce_f64'
|
; AVX2-LABEL: 'reduce_f64'
|
||||||
|
|
|
@ -64,15 +64,17 @@ define void @test_volatile_load(double* %a, double* %b, double* %c) {
|
||||||
; CHECK-LABEL: @test_volatile_load(
|
; CHECK-LABEL: @test_volatile_load(
|
||||||
; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
|
; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
|
||||||
; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
|
; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
|
||||||
; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]]
|
|
||||||
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
|
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
|
||||||
; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
|
; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
|
||||||
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
|
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
|
||||||
; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
|
; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
|
||||||
; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
|
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[I0]], i32 0
|
||||||
; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8
|
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1
|
||||||
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
|
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I1]], i32 0
|
||||||
; CHECK-NEXT: store double [[MUL5]], double* [[ARRAYIDX5]], align 8
|
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[I4]], i32 1
|
||||||
|
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
|
||||||
|
; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
|
||||||
|
; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
|
||||||
; CHECK-NEXT: ret void
|
; CHECK-NEXT: ret void
|
||||||
;
|
;
|
||||||
%i0 = load volatile double, double* %a, align 8
|
%i0 = load volatile double, double* %a, align 8
|
||||||
|
|
Loading…
Reference in New Issue