ARM: prefer allocating VFP regs at stride 4 on Darwin.
This is necessary for WatchOS support, where the compact unwind format assumes this kind of layout. For now we only want this on Swift-like CPUs though, where it's been the Xcode behaviour for ages. Also, since it can expand the prologue we don't want it at -Oz.

llvm-svn: 243884
commit 910dde7ab2 (parent 4fb46cb818)
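In outline, the patch gives the VFP register classes a second alternative allocation order and a new subtarget hook, useStride4VFPs(), that decides when to use it. Below is a minimal sketch of that selection logic, assuming the AltOrderSelect convention that 0 picks the class's default order and higher values pick the corresponding AltOrders entry; pickVFPAllocationOrder, IsSwiftLikeCPU and FunctionIsMinSize are names invented for the illustration, not LLVM API.

// Minimal sketch: how the AltOrderSelect value chooses among allocation
// orders. Index 0 is the class's default order; 1 and 2 are the AltOrders
// entries added in this patch. Names below are invented for illustration.
#include <cassert>

enum class VFPOrder {
  Default,   // S0, S1, S2, ... (plain sequential order)
  EvenFirst, // S0, S2, S4, ... then the odd registers (old behaviour)
  Stride4    // S0, S4, S8, ... preferred (new Darwin/Swift behaviour)
};

// Mirrors "return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);"
// together with the "isSwift() && !MinSize" check from the patch.
static VFPOrder pickVFPAllocationOrder(bool IsSwiftLikeCPU,
                                       bool FunctionIsMinSize) {
  bool UseStride4 = IsSwiftLikeCPU && !FunctionIsMinSize;
  unsigned Index = 1 + (UseStride4 ? 1 : 0);
  return static_cast<VFPOrder>(Index);
}

int main() {
  assert(pickVFPAllocationOrder(true, false) == VFPOrder::Stride4);
  assert(pickVFPAllocationOrder(true, true) == VFPOrder::EvenFirst);   // -Oz keeps the old order
  assert(pickVFPAllocationOrder(false, false) == VFPOrder::EvenFirst); // non-Swift CPUs unchanged
  return 0;
}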
@@ -266,12 +266,19 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
}

// Scalar single precision floating point register class..
// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to
// avoid partial-write dependencies on D registers (S registers are
// renamed as portions of D registers).
def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate
                                                (sequence "S%u", 0, 31), 2),
                                               (sequence "S%u", 0, 31))>;
// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack
// to avoid partial-write dependencies on D or Q (depending on platform)
// registers (S registers are renamed as portions of D/Q registers).
def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
  let AltOrders = [(add (decimate SPR, 2), SPR),
                   (add (decimate SPR, 4),
                        (decimate SPR, 2),
                        (decimate (rotl SPR, 1), 4),
                        (decimate (rotl SPR, 1), 2))];
  let AltOrderSelect = [{
    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
  }];
}

// Subset of SPR which can be used as a source of NEON scalars for 16-bit
// operations
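Read literally, the first AltOrders entry above is the old even-registers-first order and the second is the new stride-4 order, with AltOrderSelect returning 1 or 2 to pick between them. As a rough illustration of what those set expressions expand to, here is a small standalone sketch; it assumes the usual TableGen set semantics (decimate keeps every Nth element, rotl rotates the sequence, add concatenates and drops duplicates), and appendStride() is invented for the example rather than taken from LLVM.

// Sketch of the two alternative S-register allocation orders defined above.
// This illustrates one reading of the set expressions; it is not LLVM code.
#include <cstdio>
#include <set>
#include <vector>

// Append S<Start>, S<Start+Step>, ... (up to S31), skipping registers that are
// already listed, which is roughly how "add" merges the "decimate"/"rotl" sets.
static void appendStride(std::vector<int> &Order, std::set<int> &Seen,
                         int Start, int Step) {
  for (int R = Start; R < 32; R += Step)
    if (Seen.insert(R).second)
      Order.push_back(R);
}

int main() {
  // AltOrders[0]: (add (decimate SPR, 2), SPR)
  // -> S0, S2, ..., S30, then S1, S3, ..., S31.
  std::vector<int> EvenFirst;
  std::set<int> SeenEven;
  appendStride(EvenFirst, SeenEven, 0, 2);
  appendStride(EvenFirst, SeenEven, 0, 1);

  // AltOrders[1]: (add (decimate SPR, 4), (decimate SPR, 2),
  //                    (decimate (rotl SPR, 1), 4), (decimate (rotl SPR, 1), 2))
  // -> S0, S4, ..., S28, then S2, S6, ..., S30, then the odd registers.
  std::vector<int> Stride4;
  std::set<int> SeenStride4;
  appendStride(Stride4, SeenStride4, 0, 4);
  appendStride(Stride4, SeenStride4, 0, 2);
  appendStride(Stride4, SeenStride4, 1, 4);
  appendStride(Stride4, SeenStride4, 1, 2);

  for (int R : Stride4)
    printf("S%d ", R); // prints S0 S4 S8 ... S28 S2 S6 ... S30 S1 S5 ... S31
  printf("\n");
  return 0;
}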
@@ -283,9 +290,13 @@ def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
// is double-word alignment though.
def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
                        (sequence "D%u", 0, 31)> {
  // Allocate non-VFP2 registers D16-D31 first.
  let AltOrders = [(rotl DPR, 16)];
  let AltOrderSelect = [{ return 1; }];
  // Allocate non-VFP2 registers D16-D31 first, and prefer even registers on
  // Darwin platforms.
  let AltOrders = [(rotl DPR, 16),
                   (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))];
  let AltOrderSelect = [{
    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
  }];
}

// Subset of DPR that are accessible with VFP2 (and so that also have
@@ -167,6 +167,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
      ArchFS = FS;
  }
  ParseSubtargetFeatures(CPUString, ArchFS);
printf("A-class: %d\n", static_cast<bool>(getFeatureBits()[ARM::ProcSwift]));

  // FIXME: This used enable V6T2 support implicitly for Thumb2 mode.
  // Assert this for now to make the change obvious.
@@ -285,6 +286,10 @@ bool ARMSubtarget::enableAtomicExpand() const {
  return hasAnyDataBarrier() && !isThumb1Only();
}

bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
  return isSwift() && !MF.getFunction()->hasFnAttribute(Attribute::MinSize);
}

bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
  // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
  // immediates as it is inherently position independent, and may be out of
@@ -413,6 +413,8 @@ public:
    return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9;
  }

  bool useStride4VFPs(const MachineFunction &MF) const;

  bool useMovt(const MachineFunction &MF) const;

  bool supportsTailCall() const { return SupportsTailCall; }
@@ -60,8 +60,6 @@ define void @check_vfp_fold() minsize {
; CHECK: vpush {d6, d7, d8, d9}
; CHECK-NOT: sub sp,
; ...
; CHECK: vldmia r[[GLOBREG]], {d8, d9}
; ...
; CHECK-NOT: add sp,
; CHECK: vpop {d6, d7, d8, d9}
; CHECK: pop {r[[GLOBREG]], pc}
@@ -82,9 +80,8 @@ define void @check_vfp_fold() minsize {

  %var = alloca i8, i32 16

  %tmp = load %bigVec, %bigVec* @var
  call void asm "", "r,~{d8},~{d9}"(i8* %var)
  call void @bar(i8* %var)
  store %bigVec %tmp, %bigVec* @var

  ret void
}
@@ -0,0 +1,33 @@
; RUN: llc -mcpu=swift -mtriple=thumbv7s-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-STRIDE4
; RUN: llc -mcpu=cortex-a57 -mtriple=thumbv7-linux-gnueabihf -o - %s | FileCheck %s --check-prefix=CHECK-GENERIC

define void @test_reg_stride(double %a, double %b) {
; CHECK-STRIDE4-LABEL: test_reg_stride:
; CHECK-STRIDE4-DAG: vmov d16, r
; CHECK-STRIDE4-DAG: vmov d18, r

; CHECK-GENERIC-LABEL: test_reg_stride:
; CHECK-GENERIC-DAG: vmov.f64 d16, {{d[01]}}
; CHECK-GENERIC-DAG: vmov.f64 d17, {{d[01]}}

  call void asm "", "~{r0},~{r1},~{d0},~{d1}"()
  call arm_aapcs_vfpcc void @eat_doubles(double %a, double %b)
  ret void
}

define void @test_stride_minsize(float %a, float %b) minsize {
; CHECK-STRIDE4-LABEL: test_stride_minsize:
; CHECK-STRIDE4: vmov d2, {{r[01]}}
; CHECK-STRIDE4: vmov d3, {{r[01]}}

; CHECK-GENERIC-LABEL: test_stride_minsize:
; CHECK-GENERIC-DAG: vmov.f32 s4, {{s[01]}}
; CHECK-GENERIC-DAG: vmov.f32 s6, {{s[01]}}
  call void asm "", "~{r0},~{r1},~{s0},~{s1},~{d0},~{d1}"()
  call arm_aapcs_vfpcc void @eat_floats(float %a, float %b)
  ret void
}


declare arm_aapcs_vfpcc void @eat_doubles(double, double)
declare arm_aapcs_vfpcc void @eat_floats(float, float)