forked from OSchip/llvm-project
[AArch64] Add new target feature to fuse arithmetic and logic operations
This feature enables the fusion of some arithmetic and logic instructions together. Differential revision: https://reviews.llvm.org/D56572 llvm-svn: 351139
This commit is contained in:
parent
ed2df18a48
commit
bf59cb02c3
|
@ -188,14 +188,18 @@ def FeatureFuseAES : SubtargetFeature<
|
|||
"fuse-aes", "HasFuseAES", "true",
|
||||
"CPU fuses AES crypto operations">;
|
||||
|
||||
def FeatureFuseCryptoEOR : SubtargetFeature<
|
||||
"fuse-crypto-eor", "HasFuseCryptoEOR", "true",
|
||||
"CPU fuses AES/PMULL and EOR operations">;
|
||||
def FeatureFuseArithmeticLogic : SubtargetFeature<
|
||||
"fuse-arith-logic", "HasFuseArithmeticLogic", "true",
|
||||
"CPU fuses arithmetic and logic operations">;
|
||||
|
||||
def FeatureFuseCCSelect : SubtargetFeature<
|
||||
"fuse-csel", "HasFuseCCSelect", "true",
|
||||
"CPU fuses conditional select operations">;
|
||||
|
||||
def FeatureFuseCryptoEOR : SubtargetFeature<
|
||||
"fuse-crypto-eor", "HasFuseCryptoEOR", "true",
|
||||
"CPU fuses AES/PMULL and EOR operations">;
|
||||
|
||||
def FeatureFuseLiterals : SubtargetFeature<
|
||||
"fuse-literals", "HasFuseLiterals", "true",
|
||||
"CPU fuses literal generation operations">;
|
||||
|
|
|
@ -270,7 +270,107 @@ static bool isCCSelectPair(const MachineInstr *FirstMI,
|
|||
return false;
|
||||
}
|
||||
|
||||
/// Check if the instr pair, FirstMI and SecondMI, should be fused
|
||||
// Arithmetic and logic.
|
||||
static bool isArithmeticLogicPair(const MachineInstr *FirstMI,
|
||||
const MachineInstr &SecondMI) {
|
||||
if (AArch64InstrInfo::hasShiftedReg(SecondMI))
|
||||
return false;
|
||||
|
||||
switch (SecondMI.getOpcode()) {
|
||||
// Arithmetic
|
||||
case AArch64::ADDWrr:
|
||||
case AArch64::ADDXrr:
|
||||
case AArch64::SUBWrr:
|
||||
case AArch64::SUBXrr:
|
||||
case AArch64::ADDWrs:
|
||||
case AArch64::ADDXrs:
|
||||
case AArch64::SUBWrs:
|
||||
case AArch64::SUBXrs:
|
||||
// Logic
|
||||
case AArch64::ANDWrr:
|
||||
case AArch64::ANDXrr:
|
||||
case AArch64::BICWrr:
|
||||
case AArch64::BICXrr:
|
||||
case AArch64::EONWrr:
|
||||
case AArch64::EONXrr:
|
||||
case AArch64::EORWrr:
|
||||
case AArch64::EORXrr:
|
||||
case AArch64::ORNWrr:
|
||||
case AArch64::ORNXrr:
|
||||
case AArch64::ORRWrr:
|
||||
case AArch64::ORRXrr:
|
||||
case AArch64::ANDWrs:
|
||||
case AArch64::ANDXrs:
|
||||
case AArch64::BICWrs:
|
||||
case AArch64::BICXrs:
|
||||
case AArch64::EONWrs:
|
||||
case AArch64::EONXrs:
|
||||
case AArch64::EORWrs:
|
||||
case AArch64::EORXrs:
|
||||
case AArch64::ORNWrs:
|
||||
case AArch64::ORNXrs:
|
||||
case AArch64::ORRWrs:
|
||||
case AArch64::ORRXrs:
|
||||
// Assume the 1st instr to be a wildcard if it is unspecified.
|
||||
if (FirstMI == nullptr)
|
||||
return true;
|
||||
|
||||
// Arithmetic
|
||||
switch (FirstMI->getOpcode()) {
|
||||
case AArch64::ADDWrr:
|
||||
case AArch64::ADDXrr:
|
||||
case AArch64::ADDSWrr:
|
||||
case AArch64::ADDSXrr:
|
||||
case AArch64::SUBWrr:
|
||||
case AArch64::SUBXrr:
|
||||
case AArch64::SUBSWrr:
|
||||
case AArch64::SUBSXrr:
|
||||
return true;
|
||||
case AArch64::ADDWrs:
|
||||
case AArch64::ADDXrs:
|
||||
case AArch64::ADDSWrs:
|
||||
case AArch64::ADDSXrs:
|
||||
case AArch64::SUBWrs:
|
||||
case AArch64::SUBXrs:
|
||||
case AArch64::SUBSWrs:
|
||||
case AArch64::SUBSXrs:
|
||||
return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
|
||||
}
|
||||
break;
|
||||
|
||||
// Arithmetic, setting flags.
|
||||
case AArch64::ADDSWrr:
|
||||
case AArch64::ADDSXrr:
|
||||
case AArch64::SUBSWrr:
|
||||
case AArch64::SUBSXrr:
|
||||
case AArch64::ADDSWrs:
|
||||
case AArch64::ADDSXrs:
|
||||
case AArch64::SUBSWrs:
|
||||
case AArch64::SUBSXrs:
|
||||
// Assume the 1st instr to be a wildcard if it is unspecified.
|
||||
if (FirstMI == nullptr)
|
||||
return true;
|
||||
|
||||
// Arithmetic, not setting flags.
|
||||
switch (FirstMI->getOpcode()) {
|
||||
case AArch64::ADDWrr:
|
||||
case AArch64::ADDXrr:
|
||||
case AArch64::SUBWrr:
|
||||
case AArch64::SUBXrr:
|
||||
return true;
|
||||
case AArch64::ADDWrs:
|
||||
case AArch64::ADDXrs:
|
||||
case AArch64::SUBWrs:
|
||||
case AArch64::SUBXrs:
|
||||
return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
|
||||
/// together. Given SecondMI, when FirstMI is unspecified, then check if
|
||||
/// SecondMI may be part of a fused pair at all.
|
||||
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
|
||||
|
@ -295,6 +395,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
|
|||
return true;
|
||||
if (ST.hasFuseCCSelect() && isCCSelectPair(FirstMI, SecondMI))
|
||||
return true;
|
||||
if (ST.hasFuseArithmeticLogic() && isArithmeticLogicPair(FirstMI, SecondMI))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -166,8 +166,9 @@ protected:
|
|||
bool HasArithmeticCbzFusion = false;
|
||||
bool HasFuseAddress = false;
|
||||
bool HasFuseAES = false;
|
||||
bool HasFuseCryptoEOR = false;
|
||||
bool HasFuseArithmeticLogic = false;
|
||||
bool HasFuseCCSelect = false;
|
||||
bool HasFuseCryptoEOR = false;
|
||||
bool HasFuseLiterals = false;
|
||||
bool DisableLatencySchedHeuristic = false;
|
||||
bool UseRSqrt = false;
|
||||
|
@ -311,14 +312,16 @@ public:
|
|||
bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
|
||||
bool hasFuseAddress() const { return HasFuseAddress; }
|
||||
bool hasFuseAES() const { return HasFuseAES; }
|
||||
bool hasFuseCryptoEOR() const { return HasFuseCryptoEOR; }
|
||||
bool hasFuseArithmeticLogic() const { return HasFuseArithmeticLogic; }
|
||||
bool hasFuseCCSelect() const { return HasFuseCCSelect; }
|
||||
bool hasFuseCryptoEOR() const { return HasFuseCryptoEOR; }
|
||||
bool hasFuseLiterals() const { return HasFuseLiterals; }
|
||||
|
||||
/// Return true if the CPU supports any kind of instruction fusion.
|
||||
bool hasFusion() const {
|
||||
return hasArithmeticBccFusion() || hasArithmeticCbzFusion() ||
|
||||
hasFuseAES() || hasFuseCCSelect() || hasFuseLiterals();
|
||||
hasFuseAES() || hasFuseArithmeticLogic() ||
|
||||
hasFuseCCSelect() || hasFuseLiterals();
|
||||
}
|
||||
|
||||
bool useRSqrt() const { return UseRSqrt; }
|
||||
|
|
|
@ -0,0 +1,111 @@
|
|||
# RUN: llc -o - %s -mtriple aarch64-unknown -mattr=fuse-arith-logic -run-pass=machine-scheduler -misched-print-dags |& FileCheck %s
|
||||
# REQUIRES: asserts
|
||||
|
||||
---
|
||||
name: arith
|
||||
body: |
|
||||
bb.0.entry:
|
||||
%0:gpr32 = SUBWrr undef $w0, undef $w1
|
||||
%1:gpr32 = ADDWrr undef $w1, undef $w2
|
||||
%2:gpr32 = SUBWrs %0, undef $w2, 0
|
||||
%3:gpr32 = ADDWrs %1, undef $w3, 0
|
||||
|
||||
; CHECK: SU(0): %0:gpr32 = SUBWrr undef $w0, undef $w1
|
||||
; CHECK: Successors:
|
||||
; CHECK: SU(2): Ord Latency=0 Cluster
|
||||
; CHECK: SU(1): %1:gpr32 = ADDWrr undef $w1, undef $w2
|
||||
; CHECK: Successors:
|
||||
; CHECK: SU(3): Ord Latency=0 Cluster
|
||||
; CHECK: SU(2): dead %2:gpr32 = SUBWrs %0:gpr32, undef $w2, 0
|
||||
; CHECK: Predecessors:
|
||||
; CHECK: SU(0): Ord Latency=0 Cluster
|
||||
; CHECK: SU(3): dead %3:gpr32 = ADDWrs %1:gpr32, undef $w3, 0
|
||||
; CHECK: Predecessors:
|
||||
; CHECK: SU(1): Ord Latency=0 Cluster
|
||||
...
|
||||
---
|
||||
name: compare
|
||||
body: |
|
||||
bb.0.entry:
|
||||
%0:gpr64 = ADDXrr undef $x0, undef $x1
|
||||
%1:gpr64 = SUBXrs undef $x1, undef $x2, 0
|
||||
%2:gpr64 = ADDSXrr %0, undef $x3, implicit-def $nzcv
|
||||
%3:gpr64 = SUBSXrs %1, undef $x4, 0, implicit-def $nzcv
|
||||
|
||||
; CHECK: SU(0): %0:gpr64 = ADDXrr undef $x0, undef $x1
|
||||
; CHECK: Successors:
|
||||
; CHECK: SU(2): Ord Latency=0 Cluster
|
||||
; CHECK: SU(1): %1:gpr64 = SUBXrs undef $x1, undef $x2, 0
|
||||
; CHECK: Successors:
|
||||
; CHECK: SU(3): Ord Latency=0 Cluster
|
||||
; CHECK: SU(2): dead %2:gpr64 = ADDSXrr %0:gpr64, undef $x3, implicit-def $nzcv
|
||||
; CHECK: Predecessors:
|
||||
; CHECK: SU(0): Ord Latency=0 Cluster
|
||||
; CHECK: SU(3): dead %3:gpr64 = SUBSXrs %1:gpr64, undef $x4, 0, implicit-def $nzcv
|
||||
; CHECK: Predecessors:
|
||||
; CHECK: SU(1): Ord Latency=0 Cluster
|
||||
...
|
||||
---
|
||||
name: logic
|
||||
body: |
|
||||
bb.0.entry:
|
||||
%0:gpr32 = ADDWrr undef $w0, undef $w1
|
||||
%1:gpr64 = SUBXrs undef $x1, undef $x2, 0
|
||||
%3:gpr32 = ANDWrs %0, undef $w3, 0
|
||||
%4:gpr64 = ORRXrr %1, undef $x4
|
||||
|
||||
; CHECK: SU(0): %0:gpr32 = ADDWrr undef $w0, undef $w1
|
||||
; CHECK: Successors:
|
||||
; CHECK: SU(2): Ord Latency=0 Cluster
|
||||
; CHECK: SU(1): %1:gpr64 = SUBXrs undef $x1, undef $x2, 0
|
||||
; CHECK: Successors:
|
||||
; CHECK: SU(3): Ord Latency=0 Cluster
|
||||
; CHECK: SU(2): dead %2:gpr32 = ANDWrs %0:gpr32, undef $w3, 0
|
||||
; CHECK: Predecessors:
|
||||
; CHECK: SU(0): Ord Latency=0 Cluster
|
||||
; CHECK: SU(3): dead %3:gpr64 = ORRXrr %1:gpr64, undef $x4
|
||||
; CHECK: Predecessors:
|
||||
; CHECK: SU(1): Ord Latency=0 Cluster
|
||||
...
|
||||
---
|
||||
name: nope
|
||||
body: |
|
||||
bb.0.entry:
|
||||
; Shifted register.
|
||||
%0:gpr32 = SUBWrr undef $w0, undef $w1
|
||||
%1:gpr32 = SUBWrs %0, undef $w2, 1
|
||||
; CHECK: SU(0): %0:gpr32 = SUBWrr undef $w0, undef $w1
|
||||
; CHECK: Successors:
|
||||
; CHECK-NOT: SU(1): Ord Latency=0 Cluster
|
||||
; CHECK: SU(1): dead %1:gpr32 = SUBWrs %0:gpr32, undef $w2, 1
|
||||
|
||||
; Multiple successors.
|
||||
%2:gpr64 = ADDXrr undef $x0, undef $x1
|
||||
%3:gpr32 = EXTRACT_SUBREG %2, %subreg.sub_32
|
||||
%4:gpr32 = ANDWrs %3, undef $w2, 0
|
||||
%5:gpr64 = ADDSXrr %2, undef $x3, implicit-def $nzcv
|
||||
; CHECK: SU(2): %2:gpr64 = ADDXrr undef $x0, undef $x1
|
||||
; CHECK: Successors:
|
||||
; CHECK-NOT: SU(3): Ord Latency=0 Cluster
|
||||
; CHECK: SU(5): Ord Latency=0 Cluster
|
||||
; CHECK: SU(3): %3:gpr32 = EXTRACT_SUBREG %2:gpr64, %subreg.sub_32
|
||||
; CHECK: SU(5): dead %5:gpr64 = ADDSXrr %2:gpr64, undef $x3, implicit-def $nzcv
|
||||
|
||||
; Different register sizes.
|
||||
%6:gpr32 = SUBWrr undef $w0, undef $w1
|
||||
%7:gpr64 = ADDXrr undef $x1, undef $x2
|
||||
%8:gpr64 = SUBXrr %7, undef $x3
|
||||
%9:gpr32 = ADDWrr %6, undef $w4
|
||||
; CHECK: SU(6): %6:gpr32 = SUBWrr undef $w0, undef $w1
|
||||
; CHECK: Successors:
|
||||
; CHECK-NOT: SU(8): Ord Latency=0 Cluster
|
||||
; CHECK: SU(7): %7:gpr64 = ADDXrr undef $x1, undef $x2
|
||||
; CHECK: Successors:
|
||||
; CHECK-NOT: SU(9): Ord Latency=0 Cluster
|
||||
; CHECK: SU(8): dead %8:gpr64 = SUBXrr %7:gpr64, undef $x3
|
||||
; CHECK: Predecessors:
|
||||
; CHECK: SU(7): Ord Latency=0 Cluster
|
||||
; CHECK: SU(9): dead %9:gpr32 = ADDWrr %6:gpr32, undef $w4
|
||||
; CHECK: Predecessors:
|
||||
; CHECK: SU(6): Ord Latency=0 Cluster
|
||||
...
|
Loading…
Reference in New Issue