forked from OSchip/llvm-project
[ARM] Do not fuse VADD and VMUL on the Cortex-M4 and Cortex-M33
A sequence of VMUL and VADD instructions always gives the same or better performance than a fused VMLA instruction on the Cortex-M4 and Cortex-M33. Executing the VMUL and VADD back-to-back requires the same number of cycles, but having separate instructions allows the scheduler to avoid the hazard between these two instructions. Differential Revision: https://reviews.llvm.org/D52289 llvm-svn: 342874
This commit is contained in:
parent
5555c00902
commit
d986ede313
|
@ -966,6 +966,7 @@ def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
|
|||
FeatureVFPOnlySP,
|
||||
FeatureD16,
|
||||
FeaturePrefLoopAlign32,
|
||||
FeatureHasSlowFPVMLx,
|
||||
FeatureHasNoBranchPredictor]>;
|
||||
|
||||
def : ProcNoItin<"cortex-m7", [ARMv7em,
|
||||
|
@ -981,6 +982,7 @@ def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
|
|||
FeatureD16,
|
||||
FeatureVFPOnlySP,
|
||||
FeaturePrefLoopAlign32,
|
||||
FeatureHasSlowFPVMLx,
|
||||
FeatureHasNoBranchPredictor]>;
|
||||
|
||||
def : ProcNoItin<"cortex-a32", [ARMv8a,
|
||||
|
|
|
@ -353,10 +353,10 @@ def UseNegativeImmediates :
|
|||
let RecomputePerFunction = 1 in {
|
||||
def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
|
||||
def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
|
||||
def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
|
||||
def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
|
||||
def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
|
||||
def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
|
||||
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx() || MF->getFunction().optForMinSize()">;
|
||||
}
|
||||
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
|
||||
def UseMulOps : Predicate<"Subtarget->useMulOps()">;
|
||||
|
||||
// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=A8
|
||||
; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9
|
||||
; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard %s -o - | FileCheck %s -check-prefix=HARD
|
||||
; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard %s -o - | FileCheck %s -check-prefix=VMLA
|
||||
; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m33 -float-abi=hard %s -o - | FileCheck %s -check-prefix=VMLA
|
||||
|
||||
define float @t1(float %acc, float %a, float %b) {
|
||||
entry:
|
||||
|
@ -15,6 +17,21 @@ entry:
|
|||
; A8-LABEL: t1:
|
||||
; A8: vmul.f32
|
||||
; A8: vadd.f32
|
||||
|
||||
; VMLA-LABEL: t1:
|
||||
; VMLA: vmul.f32
|
||||
; VMLA-NEXT: vadd.f32
|
||||
|
||||
%0 = fmul float %a, %b
|
||||
%1 = fadd float %acc, %0
|
||||
ret float %1
|
||||
}
|
||||
|
||||
define float @vlma_minsize(float %acc, float %a, float %b) #0 {
|
||||
entry:
|
||||
; VMLA-LABEL: vlma_minsize:
|
||||
; VMLA: vmla.f32 s0, s1, s2
|
||||
|
||||
%0 = fmul float %a, %b
|
||||
%1 = fadd float %acc, %0
|
||||
ret float %1
|
||||
|
@ -102,3 +119,5 @@ entry:
|
|||
%3 = fadd float %1, %2
|
||||
ret float %3
|
||||
}
|
||||
|
||||
attributes #0 = { minsize nounwind optsize }
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
; RUN: llc < %s -mtriple=thumbv7-none-eabi -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=NONE
|
||||
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=VMLA
|
||||
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=NO-VMLA
|
||||
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m33 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=NO-VMLA
|
||||
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP -check-prefix=FP-ARMv8 -check-prefix=VMLA
|
||||
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=FP-ARMv8 -check-prefix=VMLA
|
||||
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=VFP4 -check-prefix=NO-VMLA
|
||||
|
@ -188,8 +189,6 @@ define float @round_f(float %a) {
|
|||
ret float %1
|
||||
}
|
||||
|
||||
; FIXME: why does cortex-m4 use vmla, while cortex-a7 uses vmul+vadd?
|
||||
; (these should be equivalent, even the rounding is the same)
|
||||
declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
|
||||
define float @fmuladd_f(float %a, float %b, float %c) {
|
||||
; CHECK-LABEL: fmuladd_f:
|
||||
|
|
Loading…
Reference in New Issue