ARM: Enable MachineScheduler and disable PostRAScheduler for swift.

This is mostly done to disable the PostRAScheduler, which optimizes for
instruction latencies and is therefore not a good fit for out-of-order
architectures. It also allows us to leave out the itinerary table for
swift in favor of the SchedModel descriptions.
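
For context, a minimal C++ sketch of why the itinerary table becomes
redundant once a SchedModel is populated: the scheduler resolves latencies
through TargetSchedModel, which prefers per-operand SchedModel data and only
falls back to itineraries. The wiring below is an illustration against the
TargetSchedule.h API of this era, not code from this commit:

#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Target/TargetSubtargetInfo.h"

unsigned latencyOf(const llvm::MachineInstr *MI,
                   const llvm::TargetSubtargetInfo &STI) {
  llvm::TargetSchedModel TSM;
  // Initialize from the subtarget's MCSchedModel (SchedRW/WriteRes data).
  TSM.init(STI.getSchedModel(), &STI, STI.getInstrInfo());
  // Per-operand SchedModel information is used when present; itineraries
  // are only consulted as a fallback.
  return TSM.computeInstrLatency(MI);
}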

This change leads to performance improvements/regressions of as much as
10% in some benchmarks; overall, we lose 0.4% performance over the
llvm-testsuite for reasons that appear to be unknown or out of the
compiler's control. rdar://20803802 documents the investigation of
these effects.

While it is probably a good idea to perform the same switch for the
other ARM out-of-order CPUs, I limited this change to swift, as I cannot
perform the benchmark verification on the other CPUs; a possible
generalization is sketched below.
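
A hypothetical follow-up, if benchmarking later validates the remaining
out-of-order ARM cores, would be to drop the isSwift() restriction entirely.
This is a sketch only, not part of this commit:

bool ARMSubtarget::enableMachineScheduler() const {
  // Hypothetical generalization: trust the SchedModel's out-of-order flag
  // for every ARM core once each one has been benchmark-verified.
  return getSchedModel().isOutOfOrder();
}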

Differential Revision: http://reviews.llvm.org/D10513

llvm-svn: 242500
Author: Matthias Braun
Date: 2015-07-17 01:44:31 +00:00
commit 2d8315f806 (parent fb2398d0c4)
10 changed files with 48 additions and 1069 deletions


@@ -206,6 +206,9 @@ struct MCSchedModel {
/// scheduling class (itinerary class or SchedRW list).
bool isComplete() const { return CompleteModel; }
/// Return true if machine supports out of order execution.
bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
unsigned getNumProcResourceKinds() const {
return NumProcResourceKinds;
}
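
A minimal sketch of the new predicate's semantics, using the isOutOfOrder()
accessor added above; the value 45 is assumed here to be a swift-like
micro-op buffer size, and GetDefaultSchedModel() is only used to obtain a
valid baseline model:

#include "llvm/MC/MCSchedule.h"
#include <cassert>

int main() {
  llvm::MCSchedModel SM = llvm::MCSchedModel::GetDefaultSchedModel();
  SM.MicroOpBufferSize = 45;    // assumed swift-like out-of-order buffer
  assert(SM.isOutOfOrder());    // 45 > 1: reported as out-of-order
  SM.MicroOpBufferSize = 0;     // strictly in-order core
  assert(!SM.isOutOfOrder());
  return 0;
}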

File diff suppressed because it is too large.


@@ -319,8 +319,19 @@ bool ARMSubtarget::hasSinCos() const {
return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0);
}
bool ARMSubtarget::enableMachineScheduler() const {
// Enable the MachineScheduler before register allocation for out-of-order
// architectures where we do not use the PostRA scheduler anymore (for now
// restricted to swift).
return getSchedModel().isOutOfOrder() && isSwift();
}
// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
bool ARMSubtarget::enablePostRAScheduler() const {
// No need for PostRA scheduling on out of order CPUs (for now restricted to
// swift).
if (getSchedModel().isOutOfOrder() && isSwift())
return false;
return (!isThumb() || hasThumb2());
}
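
Read together, the two overrides implement one mutually exclusive policy per
subtarget. The standalone restatement below (pickScheduler and its enum are
illustrative helpers, not LLVM API) makes the three possible outcomes
explicit:

#include <cassert>

enum class Sched { PreRAMachine, PostRAList, None };

// Mirrors the overrides above: swift gets the pre-RA MachineScheduler and
// opts out of post-RA scheduling; every other core keeps the old behavior.
Sched pickScheduler(bool OutOfOrder, bool Swift, bool Thumb, bool Thumb2) {
  if (OutOfOrder && Swift)
    return Sched::PreRAMachine;  // enableMachineScheduler() returns true
  if (!Thumb || Thumb2)
    return Sched::PostRAList;    // enablePostRAScheduler() returns true
  return Sched::None;            // Thumb1-only cores get neither scheduler
}

int main() {
  assert(pickScheduler(true, true, true, true) == Sched::PreRAMachine);   // swift
  assert(pickScheduler(false, false, false, false) == Sched::PostRAList); // ARM mode
  assert(pickScheduler(false, false, true, false) == Sched::None);        // Thumb1
  return 0;
}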


@@ -433,6 +433,9 @@ public:
/// compiler runtime or math libraries.
bool hasSinCos() const;
/// Returns true if machine scheduler should be enabled.
bool enableMachineScheduler() const override;
/// True for some subtargets at > -O0.
bool enablePostRAScheduler() const override;


@@ -11,25 +11,25 @@
; r0 = r0 / r2
; r1 = r1 / r3
;
; NOOPT: vmov [[B:d[0-9]+]], r2, r3
; NOOPT-NEXT: vmov [[A:d[0-9]+]], r0, r1
; NOOPT: vmov [[A:d[0-9]+]], r0, r1
; NOOPT-NEXT: vmov [[B:d[0-9]+]], r2, r3
; Move the low part of B into a register.
; Unfortunately, we cannot express that the 's' register is the low
; part of B, i.e., sIdx == BIdx x 2. E.g., B = d1, B_low = s2.
; NOOPT-NEXT: vmov [[B_LOW:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
; NOOPT-NEXT: vmov [[B_HIGH:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: vmov [[A_HIGH:r[0-9]+]], s{{[0-9]+}}
; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
; NOOPT-NEXT: vmov.32 [[RES:d[0-9]+]][0], [[RES_LOW]]
; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
; NOOPT-NEXT: vmov.32 [[RES]][1], [[RES_HIGH]]
; NOOPT-NEXT: vmov r0, r1, [[RES]]
; NOOPT-NEXT: bx lr
;
; OPT-NOT: vmov
; OPT: udiv r0, r0, r2
; OPT-NEXT: udiv r1, r1, r3
; OPT: udiv r1, r1, r3
; OPT-NEXT: udiv r0, r0, r2
; OPT-NEXT: bx lr
define <2 x i32> @simpleVectorDiv(<2 x i32> %A, <2 x i32> %B) nounwind {
entry:


@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
; dependency) when it isn't dependent on last CPSR defining instruction.
; rdar://8928208
@@ -7,8 +7,10 @@
define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
entry:
; CHECK-LABEL: t1:
; CHECK: muls [[REG:(r[0-9]+)]], r3, r2
; CHECK-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
; CHECK-CORTEX: muls [[REG:(r[0-9]+)]], r3, r2
; CHECK-CORTEX-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
; CHECK-SWIFT: muls [[REG2:(r[0-9]+)]], r1, r0
; CHECK-SWIFT-NEXT: mul [[REG:(r[0-9]+)]], r2, r3
; CHECK-NEXT: muls r0, [[REG]], [[REG2]]
%0 = mul nsw i32 %a, %b
%1 = mul nsw i32 %c, %d
@@ -21,8 +23,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
define void @t2(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind {
entry:
; CHECK-LABEL: t2:
%tobool7 = icmp eq i32* %ptr2, null
br i1 %tobool7, label %while.end, label %while.body
br label %while.body
while.body:
; CHECK: while.body
@@ -55,8 +56,7 @@ while.end:
define void @t3(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind minsize {
entry:
; CHECK-LABEL: t3:
%tobool7 = icmp eq i32* %ptr2, null
br i1 %tobool7, label %while.end, label %while.body
br label %while.body
while.body:
; CHECK: while.body


@@ -15,14 +15,14 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
; CHECK: bne [[LOOP]]
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: movs r0, #1
; CHECK: dmb ish
; CHECK: movs r0, #1
; CHECK: bx lr
; CHECK: [[FAILED]]:
; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
; CHECK: movs r0, #0
; CHECK: dmb ish
; CHECK: movs r0, #0
; CHECK: bx lr
%pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
@@ -34,8 +34,8 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
; CHECK-LABEL: test_return_bool:
; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
; CHECK: dmb ishst
; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]


@@ -20,8 +20,8 @@ entry:
for.body: ; preds = %entry, %for.body.3
; CHECK: %for.body
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.09
%0 = load i8, i8* %arrayidx, align 1
@@ -42,8 +42,8 @@ for.end: ; preds = %for.body, %for.body
for.body.1: ; preds = %for.body
; CHECK: %for.body.1
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %add5
%2 = load i8, i8* %arrayidx.1, align 1
%conv6.1 = zext i8 %2 to i32
@@ -60,8 +60,8 @@ for.body.1: ; preds = %for.body
for.body.2: ; preds = %for.body.1
; CHECK: %for.body.2
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %add5.1
%4 = load i8, i8* %arrayidx.2, align 1
%conv6.2 = zext i8 %4 to i32
@@ -78,8 +78,8 @@ for.body.2: ; preds = %for.body.1
for.body.3: ; preds = %for.body.2
; CHECK: %for.body.3
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
%arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %add5.2
%6 = load i8, i8* %arrayidx.3, align 1
%conv6.3 = zext i8 %6 to i32


@@ -238,12 +238,12 @@ define <4 x i32> @zextload_v8i8tov8i32(<4 x i8>** %ptr) {
define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
;CHECK-LABEL: zextload_v8i8tov8i32_fake_update:
;CHECK: ldr.w r[[PTRREG:[0-9]+]], [r0]
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32]
;CHECK: add.w r[[INCREG:[0-9]+]], r[[PTRREG]], #16
;CHECK: str.w r[[INCREG]], [r0]
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: str r[[INCREG]], [r0]
%A = load <4 x i8>*, <4 x i8>** %ptr
%lA = load <4 x i8>, <4 x i8>* %A, align 4
%inc = getelementptr <4 x i8>, <4 x i8>* %A, i38 4


@@ -228,9 +228,9 @@ define void @truncstore_v4i32tov4i8(<4 x i8>** %ptr, <4 x i32> %val) {
;CHECK: ldr.w r9, [sp]
;CHECK: vmov {{d[0-9]+}}, r3, r9
;CHECK: vmov {{d[0-9]+}}, r1, r2
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32]
%A = load <4 x i8>*, <4 x i8>** %ptr
%trunc = trunc <4 x i32> %val to <4 x i8>
@@ -243,10 +243,10 @@ define void @truncstore_v4i32tov4i8_fake_update(<4 x i8>** %ptr, <4 x i32> %val)
;CHECK: ldr.w r9, [sp]
;CHECK: vmov {{d[0-9]+}}, r3, r9
;CHECK: vmov {{d[0-9]+}}, r1, r2
;CHECK: movs [[IMM16:r[0-9]+]], #16
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
;CHECK: movs [[IMM16:r[0-9]+]], #16
;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32], [[IMM16]]
;CHECK: str r[[PTRREG]], [r0]
%A = load <4 x i8>*, <4 x i8>** %ptr