From 62dd44d76da9aa596fb199bda8b1e8768bb41033 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sun, 16 Feb 2020 10:40:28 -0500 Subject: [PATCH] [VectorCombine] fix cost calc for extract-cmp getOperationCost() is not the cost we wanted; that's not the throughput value that the rest of the calculation uses. We may want to switch everything in this code to use the getInstructionThroughput() wrapper to avoid these kinds of problems, but I'll look at that as a follow-up because that can create other logical diffs via using optional parameters (we'd need to speculatively create the vector instruction to make a fair(er) comparison). --- .../Transforms/Vectorize/VectorCombine.cpp | 5 ++- .../VectorCombine/X86/extract-cmp.ll | 41 +++++++++++++------ 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index f5a26d012de9..7b3697be0ae0 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -58,8 +58,9 @@ static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) { // ((2 * extract) + scalar cmp) < (vector cmp + extract) ? int ExtractCost = TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, C->getZExtValue()); - int ScalarCmpCost = TTI.getOperationCost(CmpOpcode, ScalarTy); - int VecCmpCost = TTI.getOperationCost(CmpOpcode, VecTy); + int ScalarCmpCost = TTI.getCmpSelInstrCost(CmpOpcode, ScalarTy, I.getType()); + int VecCmpCost = TTI.getCmpSelInstrCost(CmpOpcode, VecTy, + CmpInst::makeCmpResultType(VecTy)); int ScalarCost = 2 * ExtractCost + ScalarCmpCost; int VecCost = VecCmpCost + ExtractCost + diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll index d5d11df0ece0..8d04af3c8105 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- | FileCheck %s +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX define i1 @cmp_v4i32(<4 x float> %arg, <4 x float> %arg1) { ; CHECK-LABEL: @cmp_v4i32( @@ -57,18 +58,32 @@ bb18: } define i32 @cmp_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) { -; CHECK-LABEL: @cmp_v2f64( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x double> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 -; CHECK-NEXT: br i1 [[TMP1]], label [[T:%.*]], label [[F:%.*]] -; CHECK: t: -; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x double> [[Y]], [[Z:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 -; CHECK-NEXT: [[E:%.*]] = select i1 [[TMP3]], i32 42, i32 99 -; CHECK-NEXT: ret i32 [[E]] -; CHECK: f: -; CHECK-NEXT: ret i32 0 +; SSE-LABEL: @cmp_v2f64( +; SSE-NEXT: entry: +; SSE-NEXT: [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 +; SSE-NEXT: [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1 +; SSE-NEXT: [[CMP1:%.*]] = fcmp oeq double [[X1]], [[Y1]] +; SSE-NEXT: br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]] +; SSE: t: +; SSE-NEXT: [[Z1:%.*]] = extractelement <2 x double> [[Z:%.*]], i32 1 +; SSE-NEXT: [[CMP2:%.*]] = fcmp ogt double [[Y1]], [[Z1]] +; SSE-NEXT: [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99 +; SSE-NEXT: ret i32 [[E]] +; SSE: f: +; SSE-NEXT: ret i32 0 +; +; AVX-LABEL: @cmp_v2f64( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x double> [[X:%.*]], [[Y:%.*]] +; AVX-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; AVX-NEXT: br i1 [[TMP1]], label [[T:%.*]], label [[F:%.*]] +; AVX: t: +; AVX-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x double> [[Y]], [[Z:%.*]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; AVX-NEXT: [[E:%.*]] = select i1 [[TMP3]], i32 42, i32 99 +; AVX-NEXT: ret i32 [[E]] +; AVX: f: +; AVX-NEXT: ret i32 0 ; entry: %x1 = extractelement <2 x double> %x, i32 1