ARM: Enable MachineScheduler and disable PostRAScheduler for swift.
This is mostly done to disable the PostRAScheduler, which optimizes for instruction latencies and therefore isn't a good fit for out-of-order architectures. It also allows us to leave out the itinerary table for swift in favor of the SchedModel descriptions.

This change leads to performance improvements/regressions of as much as 10% in some benchmarks; overall we lose 0.4% performance over the llvm-testsuite, for reasons that appear to be unknown or outside the compiler's control. rdar://20803802 documents the investigation of these effects.

While it is probably a good idea to perform the same switch for the other ARM out-of-order CPUs, I limited this change to swift because I cannot perform the benchmark verification on the other CPUs.

Differential Revision: http://reviews.llvm.org/D10513

llvm-svn: 242500
parent fb2398d0c4
commit 2d8315f806
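Note, for background on the two passes involved: the PostRAScheduler runs after register allocation and orders instructions by their latencies, which pays off on in-order cores but not on an out-of-order core like swift, where the hardware reorders execution anyway; the MachineScheduler runs before register allocation and reads the SchedModel description instead of itinerary tables. Below is a minimal, self-contained sketch of the policy this commit installs; the types and values are illustrative stand-ins, not the LLVM API, and the real code additionally keeps PostRA scheduling away from Thumb1-only cores.

#include <cstdio>

// Illustrative stand-in: in the real MCSchedModel, "out of order" is
// derived from the micro-op buffer size declared by the CPU's SchedModel.
struct CPU {
  const char *Name;
  bool OutOfOrder;
  bool IsSwift; // the commit limits the switch to benchmark-verified swift
};

int main() {
  const CPU CPUs[] = {{"swift", true, true}, {"cortex-a8", false, false}};
  for (const CPU &C : CPUs) {
    bool MISched = C.OutOfOrder && C.IsSwift; // pre-RA MachineScheduler
    bool PostRA = !MISched;                   // latency-driven post-RA pass
    std::printf("%-9s MachineScheduler=%d PostRAScheduler=%d\n", C.Name,
                MISched, PostRA);
  }
  return 0;
}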
@@ -206,6 +206,9 @@ struct MCSchedModel {
   /// scheduling class (itinerary class or SchedRW list).
   bool isComplete() const { return CompleteModel; }
 
+  /// Return true if machine supports out of order execution.
+  bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
+
   unsigned getNumProcResourceKinds() const {
     return NumProcResourceKinds;
   }
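Note on the accessor added above: a micro-op buffer size of 0 or 1 is treated as in-order, and anything larger as out of order. A compilable stand-in restating just that threshold (not the LLVM header itself; the buffer sizes are chosen for illustration):

#include <cassert>

// Stand-in mirroring MCSchedModel::isOutOfOrder() as added above.
struct SchedModelSketch {
  unsigned MicroOpBufferSize;
  bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
};

int main() {
  assert(!SchedModelSketch{0}.isOutOfOrder());  // unbuffered, in-order
  assert(!SchedModelSketch{1}.isOutOfOrder());  // minimal buffer, still in-order
  assert(SchedModelSketch{45}.isOutOfOrder());  // sizeable buffer: out of order
  return 0;
}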
(File diff suppressed because it is too large.)
@@ -319,8 +319,19 @@ bool ARMSubtarget::hasSinCos() const {
   return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0);
 }
 
+bool ARMSubtarget::enableMachineScheduler() const {
+  // Enable the MachineScheduler before register allocation for out-of-order
+  // architectures where we do not use the PostRA scheduler anymore (for now
+  // restricted to swift).
+  return getSchedModel().isOutOfOrder() && isSwift();
+}
+
 // This overrides the PostRAScheduler bit in the SchedModel for any CPU.
 bool ARMSubtarget::enablePostRAScheduler() const {
+  // No need for PostRA scheduling on out of order CPUs (for now restricted to
+  // swift).
+  if (getSchedModel().isOutOfOrder() && isSwift())
+    return false;
   return (!isThumb() || hasThumb2());
 }
 
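Note: the two overrides above are complementary, so each CPU ends up with exactly one of the schedulers. A self-contained mock of the combined decision, paraphrasing the hunk above (field names and CPU configurations are illustrative, not the ARMSubtarget API):

#include <cassert>

// Mock of the two ARMSubtarget hooks from the hunk above.
struct SubtargetMock {
  bool OutOfOrder; // from the CPU's scheduling model
  bool Swift;      // the switch is restricted to swift for now
  bool Thumb;
  bool Thumb2;

  bool enableMachineScheduler() const { return OutOfOrder && Swift; }
  bool enablePostRAScheduler() const {
    if (OutOfOrder && Swift)
      return false; // swift now relies on the pre-RA MachineScheduler
    return !Thumb || Thumb2; // Thumb1-only cores skip PostRA scheduling
  }
};

int main() {
  const SubtargetMock SwiftCPU{true, true, true, true};
  const SubtargetMock CortexA8{false, false, false, false};
  assert(SwiftCPU.enableMachineScheduler() && !SwiftCPU.enablePostRAScheduler());
  assert(!CortexA8.enableMachineScheduler() && CortexA8.enablePostRAScheduler());
  return 0;
}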
@@ -433,6 +433,9 @@ public:
   /// compiler runtime or math libraries.
   bool hasSinCos() const;
 
+  /// Returns true if machine scheduler should be enabled.
+  bool enableMachineScheduler() const override;
+
   /// True for some subtargets at > -O0.
   bool enablePostRAScheduler() const override;
 
@@ -11,25 +11,25 @@
 ; r0 = r0 / r2
 ; r1 = r1 / r3
 ;
-; NOOPT: vmov [[B:d[0-9]+]], r2, r3
-; NOOPT-NEXT: vmov [[A:d[0-9]+]], r0, r1
+; NOOPT: vmov [[A:d[0-9]+]], r0, r1
+; NOOPT-NEXT: vmov [[B:d[0-9]+]], r2, r3
 ; Move the low part of B into a register.
 ; Unfortunately, we cannot express that the 's' register is the low
 ; part of B, i.e., sIdx == BIdx x 2. E.g., B = d1, B_low = s2.
 ; NOOPT-NEXT: vmov [[B_LOW:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
 ; NOOPT-NEXT: vmov [[B_HIGH:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
 ; NOOPT-NEXT: vmov [[A_HIGH:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
+; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
 ; NOOPT-NEXT: vmov.32 [[RES:d[0-9]+]][0], [[RES_LOW]]
+; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
 ; NOOPT-NEXT: vmov.32 [[RES]][1], [[RES_HIGH]]
 ; NOOPT-NEXT: vmov r0, r1, [[RES]]
 ; NOOPT-NEXT: bx lr
 ;
 ; OPT-NOT: vmov
-; OPT: udiv r0, r0, r2
-; OPT-NEXT: udiv r1, r1, r3
+; OPT: udiv r1, r1, r3
+; OPT-NEXT: udiv r0, r0, r2
 ; OPT-NEXT: bx lr
 define <2 x i32> @simpleVectorDiv(<2 x i32> %A, <2 x i32> %B) nounwind {
 entry:
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
 ; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
 ; dependency) when it isn't dependent on last CPSR defining instruction.
 ; rdar://8928208
@@ -7,8 +7,10 @@
 define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
 entry:
 ; CHECK-LABEL: t1:
-; CHECK: muls [[REG:(r[0-9]+)]], r3, r2
-; CHECK-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
+; CHECK-CORTEX: muls [[REG:(r[0-9]+)]], r3, r2
+; CHECK-CORTEX-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
+; CHECK-SWIFT: muls [[REG2:(r[0-9]+)]], r1, r0
+; CHECK-SWIFT-NEXT: mul [[REG:(r[0-9]+)]], r2, r3
 ; CHECK-NEXT: muls r0, [[REG]], [[REG2]]
   %0 = mul nsw i32 %a, %b
   %1 = mul nsw i32 %c, %d
@@ -21,8 +23,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
 define void @t2(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind {
 entry:
 ; CHECK-LABEL: t2:
-  %tobool7 = icmp eq i32* %ptr2, null
-  br i1 %tobool7, label %while.end, label %while.body
+  br label %while.body
 
 while.body:
 ; CHECK: while.body
@@ -55,8 +56,7 @@ while.end:
 define void @t3(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind minsize {
 entry:
 ; CHECK-LABEL: t3:
-  %tobool7 = icmp eq i32* %ptr2, null
-  br i1 %tobool7, label %while.end, label %while.body
+  br label %while.body
 
 while.body:
 ; CHECK: while.body
@@ -15,14 +15,14 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
 ; CHECK: bne [[LOOP]]
 
 ; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
-; CHECK: movs r0, #1
 ; CHECK: dmb ish
+; CHECK: movs r0, #1
 ; CHECK: bx lr
 
 ; CHECK: [[FAILED]]:
 ; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
-; CHECK: movs r0, #0
 ; CHECK: dmb ish
+; CHECK: movs r0, #0
 ; CHECK: bx lr
 
   %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
@@ -34,8 +34,8 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
 define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
 ; CHECK-LABEL: test_return_bool:
 
-; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
 ; CHECK: dmb ishst
+; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
 
 ; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
 ; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]
@@ -20,8 +20,8 @@ entry:
 
 for.body: ; preds = %entry, %for.body.3
 ; CHECK: %for.body
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.09
   %0 = load i8, i8* %arrayidx, align 1
@@ -42,8 +42,8 @@ for.end: ; preds = %for.body, %for.body
 
 for.body.1: ; preds = %for.body
 ; CHECK: %for.body.1
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %add5
   %2 = load i8, i8* %arrayidx.1, align 1
   %conv6.1 = zext i8 %2 to i32
@@ -60,8 +60,8 @@ for.body.1: ; preds = %for.body
 
 for.body.2: ; preds = %for.body.1
 ; CHECK: %for.body.2
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %add5.1
   %4 = load i8, i8* %arrayidx.2, align 1
   %conv6.2 = zext i8 %4 to i32
@@ -78,8 +78,8 @@ for.body.2: ; preds = %for.body.1
 
 for.body.3: ; preds = %for.body.2
 ; CHECK: %for.body.3
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
-; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
   %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %add5.2
   %6 = load i8, i8* %arrayidx.3, align 1
   %conv6.3 = zext i8 %6 to i32
@@ -238,12 +238,12 @@ define <4 x i32> @zextload_v8i8tov8i32(<4 x i8>** %ptr) {
 
 define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
 ;CHECK-LABEL: zextload_v8i8tov8i32_fake_update:
-;CHECK: ldr.w r[[PTRREG:[0-9]+]], [r0]
+;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32]
 ;CHECK: add.w r[[INCREG:[0-9]+]], r[[PTRREG]], #16
-;CHECK: str.w r[[INCREG]], [r0]
 ;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
 ;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
+;CHECK: str r[[INCREG]], [r0]
   %A = load <4 x i8>*, <4 x i8>** %ptr
   %lA = load <4 x i8>, <4 x i8>* %A, align 4
   %inc = getelementptr <4 x i8>, <4 x i8>* %A, i38 4
@@ -228,9 +228,9 @@ define void @truncstore_v4i32tov4i8(<4 x i8>** %ptr, <4 x i32> %val) {
 ;CHECK: ldr.w r9, [sp]
 ;CHECK: vmov {{d[0-9]+}}, r3, r9
 ;CHECK: vmov {{d[0-9]+}}, r1, r2
-;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
 ;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
+;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32]
   %A = load <4 x i8>*, <4 x i8>** %ptr
   %trunc = trunc <4 x i32> %val to <4 x i8>
@@ -243,10 +243,10 @@ define void @truncstore_v4i32tov4i8_fake_update(<4 x i8>** %ptr, <4 x i32> %val)
 ;CHECK: ldr.w r9, [sp]
 ;CHECK: vmov {{d[0-9]+}}, r3, r9
 ;CHECK: vmov {{d[0-9]+}}, r1, r2
-;CHECK: movs [[IMM16:r[0-9]+]], #16
-;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
 ;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
+;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
+;CHECK: movs [[IMM16:r[0-9]+]], #16
 ;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32], [[IMM16]]
 ;CHECK: str r[[PTRREG]], [r0]
   %A = load <4 x i8>*, <4 x i8>** %ptr