[CostModel][X86] Adjust pre-SSE41 fp scalar select costs to account for vector ops

Based on the script from D103695, scalar float/double selects are now mainly lowered to vector BLENDV or OR(AND,ANDN) sequences, so their costs are adjusted to match.
This commit is contained in:
Simon Pilgrim 2022-05-06 11:41:45 +01:00
parent 7cc8377f2c
commit d21bf51494
3 changed files with 44 additions and 80 deletions

View File

@ -2753,7 +2753,9 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
static const CostTblEntry SSE41CostTbl[] = {
{ ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
{ ISD::SELECT, MVT::f64, 1 }, // blendvpd
{ ISD::SELECT, MVT::v4f32, 1 }, // blendvps
{ ISD::SELECT, MVT::f32 , 1 }, // blendvps
{ ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
{ ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
{ ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
@ -2769,6 +2771,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v16i8, 1 },
{ ISD::SELECT, MVT::v2f64, 2 }, // andpd + andnpd + orpd
{ ISD::SELECT, MVT::f64, 2 }, // andpd + andnpd + orpd
{ ISD::SELECT, MVT::v2i64, 2 }, // pand + pandn + por
{ ISD::SELECT, MVT::v4i32, 2 }, // pand + pandn + por
{ ISD::SELECT, MVT::v8i16, 2 }, // pand + pandn + por
@ -2780,6 +2783,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::f32, 1 },
{ ISD::SELECT, MVT::v4f32, 2 }, // andps + andnps + orps
{ ISD::SELECT, MVT::f32, 2 }, // andps + andnps + orps
};
if (ST->useSLMArithCosts())

View File

@ -148,11 +148,11 @@ define i32 @test_select() {
define i32 @test_select_fp() {
; SSE2-LABEL: 'test_select_fp'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = select i1 undef, double undef, double undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = select i1 undef, double undef, double undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = select <4 x i1> undef, <4 x double> undef, <4 x double> undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = select <8 x i1> undef, <8 x double> undef, <8 x double> undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = select i1 undef, float undef, float undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = select i1 undef, float undef, float undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = select <8 x i1> undef, <8 x float> undef, <8 x float> undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = select <16 x i1> undef, <16 x float> undef, <16 x float> undef

View File

@ -1,87 +1,47 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s -check-prefix=SSE
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mattr=+avx | FileCheck %s -check-prefix=AVX
; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mattr=+avx | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) {
; SSE-LABEL: @testfunc(
; SSE-NEXT: entry:
; SSE-NEXT: br label [[FOR_BODY:%.*]]
; SSE: for.body:
; SSE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; SSE-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
; SSE-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ]
; SSE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
; SSE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
; SSE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
; SSE-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
; SSE-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP5]], i32 1
; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP1]], i32 1
; SSE-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[TMP6]], [[TMP8]]
; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP2]], [[TMP9]]
; SSE-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], <float 1.000000e+00, float 1.000000e+00>
; SSE-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
; SSE-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], <float -1.000000e+00, float -1.000000e+00>
; SSE-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
; SSE-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP14]]
; SSE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
; SSE-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
; SSE-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]]
; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i32 0
; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1
; SSE-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], <float 1.000000e+00, float 1.000000e+00>
; SSE-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
; SSE-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], <float -1.000000e+00, float -1.000000e+00>
; SSE-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP21]]
; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; SSE: for.end:
; SSE-NEXT: ret void
;
; AVX-LABEL: @testfunc(
; AVX-NEXT: entry:
; AVX-NEXT: br label [[FOR_BODY:%.*]]
; AVX: for.body:
; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]]
; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]]
; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], <float 1.000000e+00, float 1.000000e+00>
; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], <float -1.000000e+00, float -1.000000e+00>
; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer
; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP10]]
; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]]
; AVX-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0
; AVX-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1
; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], <float 1.000000e+00, float 1.000000e+00>
; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], <float -1.000000e+00, float -1.000000e+00>
; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP17]]
; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; AVX: for.end:
; AVX-NEXT: ret void
; CHECK-LABEL: @testfunc(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]]
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]]
; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], <float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], <float -1.000000e+00, float -1.000000e+00>
; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
; CHECK-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1
; CHECK-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], <float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], <float -1.000000e+00, float -1.000000e+00>
; CHECK-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP17]]
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body