[VectorCombine] fix cost calc for extract-cmp

getOperationCost() does not return the cost we want here: it is not
the throughput cost that the rest of the calculation uses.

We may want to switch everything in this code to use the
getInstructionThroughput() wrapper to avoid these kinds of
problems, but I'll look at that as a follow-up because that
can create other logical diffs via using optional parameters
(we'd need to speculatively create the vector instruction to
make a fair(er) comparison).
This commit is contained in:
Sanjay Patel 2020-02-16 10:40:28 -05:00
parent e48b536be6
commit 62dd44d76d
2 changed files with 31 additions and 15 deletions

View File

@ -58,8 +58,9 @@ static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) {
// ((2 * extract) + scalar cmp) < (vector cmp + extract) ? // ((2 * extract) + scalar cmp) < (vector cmp + extract) ?
int ExtractCost = TTI.getVectorInstrCost(Instruction::ExtractElement, int ExtractCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
VecTy, C->getZExtValue()); VecTy, C->getZExtValue());
int ScalarCmpCost = TTI.getOperationCost(CmpOpcode, ScalarTy); int ScalarCmpCost = TTI.getCmpSelInstrCost(CmpOpcode, ScalarTy, I.getType());
int VecCmpCost = TTI.getOperationCost(CmpOpcode, VecTy); int VecCmpCost = TTI.getCmpSelInstrCost(CmpOpcode, VecTy,
CmpInst::makeCmpResultType(VecTy));
int ScalarCost = 2 * ExtractCost + ScalarCmpCost; int ScalarCost = 2 * ExtractCost + ScalarCmpCost;
int VecCost = VecCmpCost + ExtractCost + int VecCost = VecCmpCost + ExtractCost +

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- | FileCheck %s ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
define i1 @cmp_v4i32(<4 x float> %arg, <4 x float> %arg1) { define i1 @cmp_v4i32(<4 x float> %arg, <4 x float> %arg1) {
; CHECK-LABEL: @cmp_v4i32( ; CHECK-LABEL: @cmp_v4i32(
@ -57,18 +58,32 @@ bb18:
} }
define i32 @cmp_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) { define i32 @cmp_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) {
; CHECK-LABEL: @cmp_v2f64( ; SSE-LABEL: @cmp_v2f64(
; CHECK-NEXT: entry: ; SSE-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x double> [[X:%.*]], [[Y:%.*]] ; SSE-NEXT: [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 ; SSE-NEXT: [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
; CHECK-NEXT: br i1 [[TMP1]], label [[T:%.*]], label [[F:%.*]] ; SSE-NEXT: [[CMP1:%.*]] = fcmp oeq double [[X1]], [[Y1]]
; CHECK: t: ; SSE-NEXT: br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x double> [[Y]], [[Z:%.*]] ; SSE: t:
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 ; SSE-NEXT: [[Z1:%.*]] = extractelement <2 x double> [[Z:%.*]], i32 1
; CHECK-NEXT: [[E:%.*]] = select i1 [[TMP3]], i32 42, i32 99 ; SSE-NEXT: [[CMP2:%.*]] = fcmp ogt double [[Y1]], [[Z1]]
; CHECK-NEXT: ret i32 [[E]] ; SSE-NEXT: [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99
; CHECK: f: ; SSE-NEXT: ret i32 [[E]]
; CHECK-NEXT: ret i32 0 ; SSE: f:
; SSE-NEXT: ret i32 0
;
; AVX-LABEL: @cmp_v2f64(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x double> [[X:%.*]], [[Y:%.*]]
; AVX-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
; AVX-NEXT: br i1 [[TMP1]], label [[T:%.*]], label [[F:%.*]]
; AVX: t:
; AVX-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x double> [[Y]], [[Z:%.*]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
; AVX-NEXT: [[E:%.*]] = select i1 [[TMP3]], i32 42, i32 99
; AVX-NEXT: ret i32 [[E]]
; AVX: f:
; AVX-NEXT: ret i32 0
; ;
entry: entry:
%x1 = extractelement <2 x double> %x, i32 1 %x1 = extractelement <2 x double> %x, i32 1