forked from OSchip/llvm-project
Changed basic cost of store operation on X86
A store operation takes 2 UOps on X86 processors. The exact cost calculation affects several optimization passes, including loop unrolling. This change compensates for the performance degradation caused by https://reviews.llvm.org/D34458 and shows improvements on some benchmarks. Differential Revision: https://reviews.llvm.org/D35888 llvm-svn: 311285
This commit is contained in:
parent
d196930799
commit
f58f838495
|
@ -2113,6 +2113,21 @@ int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
|
|||
return X86TTIImpl::getIntImmCost(Imm, Ty);
|
||||
}
|
||||
|
||||
unsigned X86TTIImpl::getUserCost(const User *U,
|
||||
ArrayRef<const Value *> Operands) {
|
||||
if (isa<StoreInst>(U)) {
|
||||
Value *Ptr = U->getOperand(1);
|
||||
// Store instruction with index and scale costs 2 Uops.
|
||||
// Check the preceding GEP to identify non-const indices.
|
||||
if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
|
||||
if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
|
||||
return TTI::TCC_Basic * 2;
|
||||
}
|
||||
return TTI::TCC_Basic;
|
||||
}
|
||||
return BaseT::getUserCost(U, Operands);
|
||||
}
|
||||
|
||||
// Return an average cost of Gather / Scatter instruction, maybe improved later
|
||||
int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
|
||||
unsigned Alignment, unsigned AddressSpace) {
|
||||
|
|
|
@ -102,6 +102,8 @@ public:
|
|||
|
||||
int getIntImmCost(const APInt &Imm, Type *Ty);
|
||||
|
||||
/// Cost estimate for the user \p U. The X86 definition prices a store at
/// TTI::TCC_Basic, doubled when its address operand is a GEP with a
/// non-constant index, and defers all other users to the base implementation.
unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
|
||||
|
||||
int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
|
||||
int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
|
||||
Type *Ty);
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
; REQUIRES: asserts
|
||||
; RUN: opt -mcpu=core-avx2 -loop-unroll --debug-only=loop-unroll -S -unroll-allow-partial < %s 2>&1 | FileCheck %s
|
||||
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
; CHECK: Loop Unroll: F[foo] Loop %loop.2.header
|
||||
; CHECK: Loop Size = 27
|
||||
; CHECK-NOT: UNROLLING loop %loop.2.header
|
||||
; CHECK: Loop Unroll: F[foo] Loop %loop.header
|
||||
; CHECK: Loop Size = 25
|
||||
; CHECK: UNROLLING loop %loop.header by 2
|
||||
|
||||
; Two counted loops that store into stack arrays through GEPs indexed by the
; loop counter. The CHECK lines of this test expect the loop-unroll debug
; output to report Loop Size = 25 for %loop.header (unrolled by 2) and
; Loop Size = 27 for %loop.2.header (not unrolled).
define void @foo(i32 * %out) {
|
||||
entry:
|
||||
; Eight 1024-element i32 arrays on the stack; %x06 is only written by the
; second loop.
%0 = alloca [1024 x i32]
|
||||
%x0 = alloca [1024 x i32]
|
||||
%x01 = alloca [1024 x i32]
|
||||
%x02 = alloca [1024 x i32]
|
||||
%x03 = alloca [1024 x i32]
|
||||
%x04 = alloca [1024 x i32]
|
||||
%x05 = alloca [1024 x i32]
|
||||
%x06 = alloca [1024 x i32]
|
||||
br label %loop.header
|
||||
|
||||
; First loop: seven stores per iteration.
loop.header:
|
||||
%counter = phi i32 [0, %entry], [%inc, %loop.inc]
|
||||
br label %loop.body
|
||||
|
||||
loop.body:
|
||||
; Every store address below is a GEP whose last index is %counter, i.e. a
; non-constant index.
%ptr = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 %counter
|
||||
store i32 %counter, i32* %ptr
|
||||
%val = add i32 %counter, 5
|
||||
%xptr = getelementptr [1024 x i32], [1024 x i32]* %x0, i32 0, i32 %counter
|
||||
store i32 %val, i32* %xptr
|
||||
%val1 = add i32 %counter, 6
|
||||
%xptr1 = getelementptr [1024 x i32], [1024 x i32]* %x01, i32 0, i32 %counter
|
||||
store i32 %val1, i32* %xptr1
|
||||
%val2 = add i32 %counter, 7
|
||||
%xptr2 = getelementptr [1024 x i32], [1024 x i32]* %x02, i32 0, i32 %counter
|
||||
store i32 %val2, i32* %xptr2
|
||||
%val3 = add i32 %counter, 8
|
||||
%xptr3 = getelementptr [1024 x i32], [1024 x i32]* %x03, i32 0, i32 %counter
|
||||
store i32 %val3, i32* %xptr3
|
||||
%val4 = add i32 %counter, 9
|
||||
%xptr4 = getelementptr [1024 x i32], [1024 x i32]* %x04, i32 0, i32 %counter
|
||||
store i32 %val4, i32* %xptr4
|
||||
%val5 = add i32 %counter, 10
|
||||
%xptr5 = getelementptr [1024 x i32], [1024 x i32]* %x05, i32 0, i32 %counter
|
||||
store i32 %val5, i32* %xptr5
|
||||
br label %loop.inc
|
||||
|
||||
loop.inc:
|
||||
; Counter advances by 2; the loop exits once %inc is >= 1023 (signed compare).
%inc = add i32 %counter, 2
|
||||
%1 = icmp sge i32 %inc, 1023
|
||||
br i1 %1, label %exit.0, label %loop.header
|
||||
|
||||
exit.0:
|
||||
; Read element 5 of %0 back and publish it through %out.
%2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 5
|
||||
%3 = load i32, i32* %2
|
||||
store i32 %3, i32 * %out
|
||||
br label %loop.2.header
|
||||
|
||||
|
||||
; Second loop: same shape as the first, with one additional store (to %x06),
; for eight stores per iteration.
loop.2.header:
|
||||
%counter.2 = phi i32 [0, %exit.0], [%inc.2, %loop.2.inc]
|
||||
br label %loop.2.body
|
||||
|
||||
loop.2.body:
|
||||
%ptr.2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 %counter.2
|
||||
store i32 %counter.2, i32* %ptr.2
|
||||
%val.2 = add i32 %counter.2, 5
|
||||
%xptr.2 = getelementptr [1024 x i32], [1024 x i32]* %x0, i32 0, i32 %counter.2
|
||||
store i32 %val.2, i32* %xptr.2
|
||||
%val1.2 = add i32 %counter.2, 6
|
||||
%xptr1.2 = getelementptr [1024 x i32], [1024 x i32]* %x01, i32 0, i32 %counter.2
|
||||
; NOTE(review): this stores %val1 (defined in loop.body of the first loop),
; not %val1.2, which has no use in this function - confirm this is intended.
store i32 %val1, i32* %xptr1.2
|
||||
%val2.2 = add i32 %counter.2, 7
|
||||
%xptr2.2 = getelementptr [1024 x i32], [1024 x i32]* %x02, i32 0, i32 %counter.2
|
||||
; NOTE(review): same pattern - stores %val2 from the first loop while
; %val2.2 is left unused; confirm this is intended.
store i32 %val2, i32* %xptr2.2
|
||||
%val3.2 = add i32 %counter.2, 8
|
||||
%xptr3.2 = getelementptr [1024 x i32], [1024 x i32]* %x03, i32 0, i32 %counter.2
|
||||
store i32 %val3.2, i32* %xptr3.2
|
||||
%val4.2 = add i32 %counter.2, 9
|
||||
%xptr4.2 = getelementptr [1024 x i32], [1024 x i32]* %x04, i32 0, i32 %counter.2
|
||||
store i32 %val4.2, i32* %xptr4.2
|
||||
%val5.2 = add i32 %counter.2, 10
|
||||
%xptr5.2 = getelementptr [1024 x i32], [1024 x i32]* %x05, i32 0, i32 %counter.2
|
||||
store i32 %val5.2, i32* %xptr5.2
|
||||
; The extra store relative to the first loop; it reuses %val5.2.
%xptr6.2 = getelementptr [1024 x i32], [1024 x i32]* %x06, i32 0, i32 %counter.2
|
||||
store i32 %val5.2, i32* %xptr6.2
|
||||
br label %loop.2.inc
|
||||
|
||||
loop.2.inc:
|
||||
; Same stepping and exit condition as the first loop.
%inc.2 = add i32 %counter.2, 2
|
||||
%4 = icmp sge i32 %inc.2, 1023
|
||||
br i1 %4, label %exit.2, label %loop.2.header
|
||||
|
||||
exit.2:
|
||||
%x2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 6
|
||||
%x3 = load i32, i32* %x2
|
||||
%out2 = getelementptr i32, i32 * %out, i32 1
|
||||
; NOTE(review): stores %3 (loaded in exit.0) rather than %x3 loaded just
; above, which is otherwise unused - confirm this is intended.
store i32 %3, i32 * %out2
|
||||
ret void
|
||||
}
|
|
@ -172,7 +172,7 @@ for.body: ; preds = %for.body, %entry
|
|||
%arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
|
||||
store i32 %add, i32* %arrayidx2, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 64
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 48
|
||||
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
|
||||
|
||||
for.end: ; preds = %for.body
|
||||
|
|
Loading…
Reference in New Issue