forked from OSchip/llvm-project
[VectorCombine] make cost calc consistent for binops and cmps
Code duplication (subsequently removed by refactoring) allowed a logic discrepancy to creep in here. We were being conservative about creating a vector binop -- but not a vector cmp -- in the case where a vector op has the same estimated cost as the scalar op. We want to be more aggressive here because that can allow other combines based on reduced instruction count/uses. We can reverse the transform in DAGCombiner (potentially with a more accurate cost model) if this causes regressions. AFAIK, this does not conflict with InstCombine. We have a scalarize transform there, but it relies on finding a constant operand or a matching insertelement, so that means it eliminates an extractelement from the sequence (so we won't have 2 extracts by the time we get here if InstCombine succeeds). Differential Revision: https://reviews.llvm.org/D75062
This commit is contained in:
parent
b8d638d337
commit
10ea01d80d
|
@ -88,10 +88,10 @@ static bool isExtractExtractCheap(Instruction *Ext0, Instruction *Ext1,
|
|||
NewCost = VectorOpCost + ExtractCost + !Ext0->hasOneUse() * ExtractCost +
|
||||
!Ext1->hasOneUse() * ExtractCost;
|
||||
}
|
||||
// TODO: The cost comparison should not differ based on opcode. Either we
|
||||
// want to be uniformly more or less aggressive in deciding if a vector
|
||||
// operation should replace the scalar operation.
|
||||
return IsBinOp ? OldCost <= NewCost : OldCost < NewCost;
|
||||
// Aggressively form a vector op if the cost is equal because the transform
|
||||
// may enable further optimization.
|
||||
// Codegen can reverse this transform (scalarize) if it was not profitable.
|
||||
return OldCost < NewCost;
|
||||
}
|
||||
|
||||
/// Try to reduce extract element costs by converting scalar compares to vector
|
||||
|
|
|
@ -74,14 +74,15 @@ define i8 @ext0_ext0_sdiv(<16 x i8> %x, <16 x i8> %y) {
|
|||
ret i8 %r
|
||||
}
|
||||
|
||||
; Negative test - extracts are free and vector op has same cost as scalar.
|
||||
; Extracts are free and vector op has same cost as scalar, but we
|
||||
; speculatively transform to vector to create more optimization
|
||||
; opportunities.
|
||||
|
||||
define double @ext0_ext0_fadd(<2 x double> %x, <2 x double> %y) {
|
||||
; CHECK-LABEL: @ext0_ext0_fadd(
|
||||
; CHECK-NEXT: [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
|
||||
; CHECK-NEXT: [[E1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
|
||||
; CHECK-NEXT: [[R:%.*]] = fadd double [[E0]], [[E1]]
|
||||
; CHECK-NEXT: ret double [[R]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[X:%.*]], [[Y:%.*]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
|
||||
; CHECK-NEXT: ret double [[TMP2]]
|
||||
;
|
||||
%e0 = extractelement <2 x double> %x, i32 0
|
||||
%e1 = extractelement <2 x double> %y, i32 0
|
||||
|
@ -118,14 +119,14 @@ define double @ext1_ext1_fadd_different_types(<2 x double> %x, <4 x double> %y)
|
|||
ret double %r
|
||||
}
|
||||
|
||||
; Negative test - disguised same vector operand; scalar code is cheaper than general case.
|
||||
; Disguised same vector operand; scalar code is not cheaper (with default
|
||||
; x86 target), so aggressively form vector binop.
|
||||
|
||||
define i32 @ext1_ext1_add_same_vec(<4 x i32> %x) {
|
||||
; CHECK-LABEL: @ext1_ext1_add_same_vec(
|
||||
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
|
||||
; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[X]], i32 1
|
||||
; CHECK-NEXT: [[R:%.*]] = add i32 [[E0]], [[E1]]
|
||||
; CHECK-NEXT: ret i32 [[R]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: ret i32 [[TMP2]]
|
||||
;
|
||||
%e0 = extractelement <4 x i32> %x, i32 1
|
||||
%e1 = extractelement <4 x i32> %x, i32 1
|
||||
|
@ -133,13 +134,13 @@ define i32 @ext1_ext1_add_same_vec(<4 x i32> %x) {
|
|||
ret i32 %r
|
||||
}
|
||||
|
||||
; Negative test - same vector operand; scalar code is cheaper than general case.
|
||||
; Functionally equivalent to above test; should transform as above.
|
||||
|
||||
define i32 @ext1_ext1_add_same_vec_cse(<4 x i32> %x) {
|
||||
; CHECK-LABEL: @ext1_ext1_add_same_vec_cse(
|
||||
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
|
||||
; CHECK-NEXT: [[R:%.*]] = add i32 [[E0]], [[E0]]
|
||||
; CHECK-NEXT: ret i32 [[R]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: ret i32 [[TMP2]]
|
||||
;
|
||||
%e0 = extractelement <4 x i32> %x, i32 1
|
||||
%r = add i32 %e0, %e0
|
||||
|
@ -200,15 +201,15 @@ define i8 @ext1_ext1_add_same_vec_cse_extra_use(<16 x i8> %x) {
|
|||
ret i8 %r
|
||||
}
|
||||
|
||||
; Negative test - vector code would not be cheaper.
|
||||
; Vector code costs the same as scalar, so aggressively form vector op.
|
||||
|
||||
define i8 @ext1_ext1_add_uses1(<16 x i8> %x, <16 x i8> %y) {
|
||||
; CHECK-LABEL: @ext1_ext1_add_uses1(
|
||||
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
|
||||
; CHECK-NEXT: call void @use_i8(i8 [[E0]])
|
||||
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
|
||||
; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
|
||||
; CHECK-NEXT: ret i8 [[R]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[X]], [[Y:%.*]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
|
||||
; CHECK-NEXT: ret i8 [[TMP2]]
|
||||
;
|
||||
%e0 = extractelement <16 x i8> %x, i32 0
|
||||
call void @use_i8(i8 %e0)
|
||||
|
@ -217,15 +218,15 @@ define i8 @ext1_ext1_add_uses1(<16 x i8> %x, <16 x i8> %y) {
|
|||
ret i8 %r
|
||||
}
|
||||
|
||||
; Negative test - vector code would not be cheaper.
|
||||
; Vector code costs the same as scalar, so aggressively form vector op.
|
||||
|
||||
define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
|
||||
; CHECK-LABEL: @ext1_ext1_add_uses2(
|
||||
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
|
||||
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
|
||||
; CHECK-NEXT: call void @use_i8(i8 [[E1]])
|
||||
; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
|
||||
; CHECK-NEXT: ret i8 [[R]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[X:%.*]], [[Y]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
|
||||
; CHECK-NEXT: ret i8 [[TMP2]]
|
||||
;
|
||||
%e0 = extractelement <16 x i8> %x, i32 0
|
||||
%e1 = extractelement <16 x i8> %y, i32 0
|
||||
|
|
Loading…
Reference in New Issue