forked from OSchip/llvm-project
[LoopStrengthReduce, x86] don't add cost for a cmp that will be macro-fused (PR35681)
In the motivating case from PR35681 (https://bugs.llvm.org/show_bug.cgi?id=35681), which is represented by the macro-fuse-cmp test, the loop shrinks from 37 to 31 bytes because we eliminate the big base address offsets. SPEC2017 on Ryzen shows no significant perf difference. Differential Revision: https://reviews.llvm.org/D42607 llvm-svn: 324289
This commit is contained in:
parent
40ddcb8133
commit
d7c702b451
|
@ -471,6 +471,11 @@ public:
|
|||
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
|
||||
TargetTransformInfo::LSRCost &C2) const;
|
||||
|
||||
/// Return true if the target can fuse a compare and branch.
|
||||
/// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
|
||||
/// calculation for the instructions in a loop.
|
||||
bool canMacroFuseCmp() const;
|
||||
|
||||
/// \brief Return true if the target supports masked load/store
|
||||
/// AVX2 and AVX-512 targets allow masks for consecutive load and store
|
||||
bool isLegalMaskedStore(Type *DataType) const;
|
||||
|
@ -978,6 +983,7 @@ public:
|
|||
Instruction *I) = 0;
|
||||
virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
|
||||
TargetTransformInfo::LSRCost &C2) = 0;
|
||||
virtual bool canMacroFuseCmp() = 0;
|
||||
virtual bool isLegalMaskedStore(Type *DataType) = 0;
|
||||
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
|
||||
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
|
||||
|
@ -1197,6 +1203,9 @@ public:
|
|||
TargetTransformInfo::LSRCost &C2) override {
|
||||
return Impl.isLSRCostLess(C1, C2);
|
||||
}
|
||||
/// Override that answers the compare-and-branch macro-fusion query by
/// forwarding it unchanged to the wrapped implementation object `Impl`.
bool canMacroFuseCmp() override {
|
||||
return Impl.canMacroFuseCmp();
|
||||
}
|
||||
/// Override that forwards the masked-store legality query for \p DataType
/// unchanged to the wrapped implementation object `Impl`.
bool isLegalMaskedStore(Type *DataType) override {
|
||||
return Impl.isLegalMaskedStore(DataType);
|
||||
}
|
||||
|
|
|
@ -246,6 +246,8 @@ public:
|
|||
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
|
||||
}
|
||||
|
||||
/// Always reports that a compare cannot be macro-fused with a branch.
/// NOTE(review): presumably the conservative default that targets shadow with
/// their own answer; the enclosing class is outside this hunk -- confirm.
bool canMacroFuseCmp() { return false; }
|
||||
|
||||
/// Always reports masked stores as not legal; \p DataType is ignored.
bool isLegalMaskedStore(Type *DataType) { return false; }
|
||||
|
||||
/// Always reports masked loads as not legal; \p DataType is ignored.
bool isLegalMaskedLoad(Type *DataType) { return false; }
|
||||
|
|
|
@ -155,6 +155,10 @@ bool TargetTransformInfo::isLSRCostLess(LSRCost &C1, LSRCost &C2) const {
|
|||
return TTIImpl->isLSRCostLess(C1, C2);
|
||||
}
|
||||
|
||||
// Public entry point for the compare-and-branch macro-fusion query: delegates
// to the implementation object held in TTIImpl and returns its answer as-is.
bool TargetTransformInfo::canMacroFuseCmp() const {
|
||||
return TTIImpl->canMacroFuseCmp();
|
||||
}
|
||||
|
||||
// Public entry point for the masked-store legality query: delegates to the
// implementation object held in TTIImpl, passing DataType through unchanged.
bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
|
||||
return TTIImpl->isLegalMaskedStore(DataType);
|
||||
}
|
||||
|
|
|
@ -2482,6 +2482,10 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
|
|||
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
|
||||
}
|
||||
|
||||
// X86 answer to the compare-and-branch macro-fusion query: returns whatever
// ST->hasMacroFusion() reports.
// NOTE(review): ST is presumably the X86 subtarget for the function being
// compiled; its declaration is outside this hunk -- confirm.
bool X86TTIImpl::canMacroFuseCmp() {
|
||||
return ST->hasMacroFusion();
|
||||
}
|
||||
|
||||
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
|
||||
// The backend can't handle a single element vector.
|
||||
if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
|
||||
|
|
|
@ -120,6 +120,7 @@ public:
|
|||
Type *Ty);
|
||||
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
|
||||
TargetTransformInfo::LSRCost &C2);
|
||||
bool canMacroFuseCmp();
|
||||
bool isLegalMaskedLoad(Type *DataType);
|
||||
bool isLegalMaskedStore(Type *DataType);
|
||||
bool isLegalMaskedGather(Type *DataType);
|
||||
|
|
|
@ -1343,14 +1343,15 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
|
|||
|
||||
// If ICmpZero formula ends with not 0, it could not be replaced by
|
||||
// just add or sub. We'll need to compare final result of AddRec.
|
||||
// That means we'll need an additional instruction.
|
||||
// That means we'll need an additional instruction. But if the target can
|
||||
// macro-fuse a compare with a branch, don't count this extra instruction.
|
||||
// For -10 + {0, +, 1}:
|
||||
// i = i + 1;
|
||||
// cmp i, 10
|
||||
//
|
||||
// For {-10, +, 1}:
|
||||
// i = i + 1;
|
||||
if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
|
||||
if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp())
|
||||
C.Insns++;
|
||||
// Each new AddRec adds 1 instruction to calculation.
|
||||
C.Insns += (C.AddRecCost - PrevAddRecCost);
|
||||
|
|
|
@ -82,35 +82,41 @@ define i32 @CSE() nounwind {
|
|||
define void @loop(i32* %p, i32 %n) nounwind {
|
||||
; X86-LABEL: loop:
|
||||
; X86: # %bb.0: # %entry
|
||||
; X86-NEXT: pushl %esi
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-NEXT: testl %eax, %eax
|
||||
; X86-NEXT: je .LBB3_3
|
||||
; X86-NEXT: # %bb.1: # %while.body.preheader
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X86-NEXT: xorl %edx, %edx
|
||||
; X86-NEXT: .p2align 4, 0x90
|
||||
; X86-NEXT: .LBB3_2: # %while.body
|
||||
; X86-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
; X86-NEXT: rdrandl %edx
|
||||
; X86-NEXT: movl %edx, (%ecx)
|
||||
; X86-NEXT: leal 4(%ecx), %ecx
|
||||
; X86-NEXT: addl $-1, %eax
|
||||
; X86-NEXT: rdrandl %esi
|
||||
; X86-NEXT: movl %esi, (%ecx,%edx,4)
|
||||
; X86-NEXT: addl $1, %edx
|
||||
; X86-NEXT: cmpl %edx, %eax
|
||||
; X86-NEXT: jne .LBB3_2
|
||||
; X86-NEXT: .LBB3_3: # %while.end
|
||||
; X86-NEXT: popl %esi
|
||||
; X86-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: loop:
|
||||
; X64: # %bb.0: # %entry
|
||||
; X64-NEXT: testl %esi, %esi
|
||||
; X64-NEXT: je .LBB3_2
|
||||
; X64-NEXT: je .LBB3_3
|
||||
; X64-NEXT: # %bb.1: # %while.body.preheader
|
||||
; X64-NEXT: movl %esi, %eax
|
||||
; X64-NEXT: xorl %ecx, %ecx
|
||||
; X64-NEXT: .p2align 4, 0x90
|
||||
; X64-NEXT: .LBB3_1: # %while.body
|
||||
; X64-NEXT: .LBB3_2: # %while.body
|
||||
; X64-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
; X64-NEXT: rdrandl %eax
|
||||
; X64-NEXT: movl %eax, (%rdi)
|
||||
; X64-NEXT: leaq 4(%rdi), %rdi
|
||||
; X64-NEXT: addl $-1, %esi
|
||||
; X64-NEXT: jne .LBB3_1
|
||||
; X64-NEXT: .LBB3_2: # %while.end
|
||||
; X64-NEXT: rdrandl %edx
|
||||
; X64-NEXT: movl %edx, (%rdi,%rcx,4)
|
||||
; X64-NEXT: addq $1, %rcx
|
||||
; X64-NEXT: cmpl %ecx, %eax
|
||||
; X64-NEXT: jne .LBB3_2
|
||||
; X64-NEXT: .LBB3_3: # %while.end
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%tobool1 = icmp eq i32 %n, 0
|
||||
|
|
|
@ -347,30 +347,31 @@ define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nou
|
|||
; X32-NEXT: pushl %ebx
|
||||
; X32-NEXT: pushl %edi
|
||||
; X32-NEXT: pushl %esi
|
||||
; X32-NEXT: movl $-400, %eax # imm = 0xFE70
|
||||
; X32-NEXT: movl $3, %eax
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
|
||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
|
||||
; X32-NEXT: .p2align 4, 0x90
|
||||
; X32-NEXT: .LBB3_1: # %for.body
|
||||
; X32-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
; X32-NEXT: movzbl 400(%esi,%eax), %edi
|
||||
; X32-NEXT: movzbl 400(%edx,%eax), %ebx
|
||||
; X32-NEXT: movzbl -3(%esi,%eax), %edi
|
||||
; X32-NEXT: movzbl -3(%edx,%eax), %ebx
|
||||
; X32-NEXT: addl %edi, %ebx
|
||||
; X32-NEXT: movb %bl, 400(%ecx,%eax)
|
||||
; X32-NEXT: movzbl 401(%esi,%eax), %edi
|
||||
; X32-NEXT: movzbl 401(%edx,%eax), %ebx
|
||||
; X32-NEXT: movb %bl, -3(%ecx,%eax)
|
||||
; X32-NEXT: movzbl -2(%esi,%eax), %edi
|
||||
; X32-NEXT: movzbl -2(%edx,%eax), %ebx
|
||||
; X32-NEXT: addl %edi, %ebx
|
||||
; X32-NEXT: movb %bl, 401(%ecx,%eax)
|
||||
; X32-NEXT: movzbl 402(%esi,%eax), %edi
|
||||
; X32-NEXT: movzbl 402(%edx,%eax), %ebx
|
||||
; X32-NEXT: movb %bl, -2(%ecx,%eax)
|
||||
; X32-NEXT: movzbl -1(%esi,%eax), %edi
|
||||
; X32-NEXT: movzbl -1(%edx,%eax), %ebx
|
||||
; X32-NEXT: addl %edi, %ebx
|
||||
; X32-NEXT: movb %bl, 402(%ecx,%eax)
|
||||
; X32-NEXT: movzbl 403(%esi,%eax), %edi
|
||||
; X32-NEXT: movzbl 403(%edx,%eax), %ebx
|
||||
; X32-NEXT: movb %bl, -1(%ecx,%eax)
|
||||
; X32-NEXT: movzbl (%esi,%eax), %edi
|
||||
; X32-NEXT: movzbl (%edx,%eax), %ebx
|
||||
; X32-NEXT: addl %edi, %ebx
|
||||
; X32-NEXT: movb %bl, 403(%ecx,%eax)
|
||||
; X32-NEXT: movb %bl, (%ecx,%eax)
|
||||
; X32-NEXT: addl $4, %eax
|
||||
; X32-NEXT: cmpl $403, %eax # imm = 0x193
|
||||
; X32-NEXT: jne .LBB3_1
|
||||
; X32-NEXT: # %bb.2: # %for.end
|
||||
; X32-NEXT: popl %esi
|
||||
|
|
|
@ -43,27 +43,22 @@ define void @maxArray(double* noalias nocapture %x, double* noalias nocapture re
|
|||
;
|
||||
; HSW-LABEL: @maxArray(
|
||||
; HSW-NEXT: entry:
|
||||
; HSW-NEXT: [[Y1:%.*]] = bitcast double* [[Y:%.*]] to i8*
|
||||
; HSW-NEXT: [[X3:%.*]] = bitcast double* [[X:%.*]] to i8*
|
||||
; HSW-NEXT: br label [[VECTOR_BODY:%.*]]
|
||||
; HSW: vector.body:
|
||||
; HSW-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ]
|
||||
; HSW-NEXT: [[UGLYGEP7:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
|
||||
; HSW-NEXT: [[UGLYGEP78:%.*]] = bitcast i8* [[UGLYGEP7]] to <2 x double>*
|
||||
; HSW-NEXT: [[SCEVGEP9:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP78]], i64 32768
|
||||
; HSW-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[Y1]], i64 [[LSR_IV]]
|
||||
; HSW-NEXT: [[UGLYGEP2:%.*]] = bitcast i8* [[UGLYGEP]] to <2 x double>*
|
||||
; HSW-NEXT: [[SCEVGEP:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP2]], i64 32768
|
||||
; HSW-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP9]], align 8
|
||||
; HSW-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP]], align 8
|
||||
; HSW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
||||
; HSW-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[INDEX]]
|
||||
; HSW-NEXT: [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to <2 x double>*
|
||||
; HSW-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[INDEX]]
|
||||
; HSW-NEXT: [[SCEVGEP1:%.*]] = bitcast double* [[SCEVGEP]] to <2 x double>*
|
||||
; HSW-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP45]], align 8
|
||||
; HSW-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP1]], align 8
|
||||
; HSW-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
|
||||
; HSW-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
|
||||
; HSW-NEXT: [[UGLYGEP4:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
|
||||
; HSW-NEXT: [[UGLYGEP45:%.*]] = bitcast i8* [[UGLYGEP4]] to <2 x double>*
|
||||
; HSW-NEXT: [[SCEVGEP6:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP45]], i64 32768
|
||||
; HSW-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP6]], align 8
|
||||
; HSW-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16
|
||||
; HSW-NEXT: [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
|
||||
; HSW-NEXT: [[SCEVGEP2:%.*]] = getelementptr double, double* [[X]], i64 [[INDEX]]
|
||||
; HSW-NEXT: [[SCEVGEP23:%.*]] = bitcast double* [[SCEVGEP2]] to <2 x double>*
|
||||
; HSW-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP23]], align 8
|
||||
; HSW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
|
||||
; HSW-NEXT: [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
|
||||
; HSW-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
|
||||
; HSW: exit:
|
||||
; HSW-NEXT: ret void
|
||||
|
@ -85,15 +80,16 @@ define void @maxArray(double* noalias nocapture %x, double* noalias nocapture re
|
|||
;
|
||||
; FUSE-LABEL: maxArray:
|
||||
; FUSE: # %bb.0: # %entry
|
||||
; FUSE-NEXT: movq $-524288, %rax # imm = 0xFFF80000
|
||||
; FUSE-NEXT: xorl %eax, %eax
|
||||
; FUSE-NEXT: .p2align 4, 0x90
|
||||
; FUSE-NEXT: .LBB0_1: # %vector.body
|
||||
; FUSE-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
; FUSE-NEXT: movupd 524288(%rdi,%rax), %xmm0
|
||||
; FUSE-NEXT: movupd 524288(%rsi,%rax), %xmm1
|
||||
; FUSE-NEXT: movupd (%rdi,%rax,8), %xmm0
|
||||
; FUSE-NEXT: movupd (%rsi,%rax,8), %xmm1
|
||||
; FUSE-NEXT: maxpd %xmm0, %xmm1
|
||||
; FUSE-NEXT: movupd %xmm1, 524288(%rdi,%rax)
|
||||
; FUSE-NEXT: addq $16, %rax
|
||||
; FUSE-NEXT: movupd %xmm1, (%rdi,%rax,8)
|
||||
; FUSE-NEXT: addq $2, %rax
|
||||
; FUSE-NEXT: cmpq $65536, %rax # imm = 0x10000
|
||||
; FUSE-NEXT: jne .LBB0_1
|
||||
; FUSE-NEXT: # %bb.2: # %exit
|
||||
; FUSE-NEXT: retq
|
||||
|
|
Loading…
Reference in New Issue