[PowerPC] Implement more fusion types for Power10

This implements the remaining Power10 instruction fusion pairs described in
the user manual: 'wide immediate', 'load compare', 'zero move', and
'SHA3 assist'.

Only 'SHA3 assist' is enabled by default.

Reviewed By: shchenz

Differential Revision: https://reviews.llvm.org/D112912
This commit is contained in:
Qiu Chaofan 2021-11-23 17:21:17 +08:00
parent 8ea3e70fb0
commit 59f4b3d308
6 changed files with 230 additions and 1 deletions

View File

@ -203,6 +203,22 @@ def FeatureLogicalFusion :
SubtargetFeature<"fuse-logical", "HasLogicalFusion", "true",
"Target supports Logical Operations fusion",
[FeatureFusion]>;
// "fuse-sha3": sets the subtarget field HasSha3Fusion to "true" and implies
// FeatureFusion. Of the four fusion features added alongside it, this is the
// only one that the FusionFeatures list (the Power10 defaults) includes.
def FeatureSha3Fusion :
SubtargetFeature<"fuse-sha3", "HasSha3Fusion", "true",
"Target supports SHA3 assist fusion",
[FeatureFusion]>;
// "fuse-cmp": sets the subtarget field HasCompareFusion to "true" and implies
// FeatureFusion. Gates the load-compare fusion pairs; it is not part of the
// FusionFeatures list, so it must be requested explicitly (e.g. via -mattr).
def FeatureCompareFusion:
SubtargetFeature<"fuse-cmp", "HasCompareFusion", "true",
"Target supports Comparison Operations fusion",
[FeatureFusion]>;
// "fuse-wideimm": sets the subtarget field HasWideImmFusion to "true" and
// implies FeatureFusion. Gates the wide-immediate fusion pairs; it is not part
// of the FusionFeatures list, so it must be requested explicitly.
def FeatureWideImmFusion:
SubtargetFeature<"fuse-wideimm", "HasWideImmFusion", "true",
"Target supports Wide-Immediate fusion",
[FeatureFusion]>;
// "fuse-zeromove": sets the subtarget field HasZeroMoveFusion to "true" and
// implies FeatureFusion. Gates the mtctr/mtlr + branch fusion pairs; it is not
// part of the FusionFeatures list, so it must be requested explicitly.
def FeatureZeroMoveFusion:
SubtargetFeature<"fuse-zeromove", "HasZeroMoveFusion", "true",
"Target supports move to SPR with branch fusion",
[FeatureFusion]>;
def FeatureUnalignedFloats :
SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess",
"true", "CPU does not trap on unaligned FP access">;
@ -393,7 +409,7 @@ def ProcessorFeatures {
// still exist with the exception of those we know are Power9 specific.
list<SubtargetFeature> FusionFeatures = [
FeatureStoreFusion, FeatureAddLogicalFusion, FeatureLogicalAddFusion,
FeatureLogicalFusion, FeatureArithAddFusion
FeatureLogicalFusion, FeatureArithAddFusion, FeatureSha3Fusion,
];
list<SubtargetFeature> P10AdditionalFeatures =
!listconcat(FusionFeatures, [

View File

@ -149,6 +149,79 @@ static bool checkOpConstraints(FusionFeature::FusionKind Kd,
case FusionFeature::FK_SldiAdd:
return (matchingImmOps(FirstMI, 2, 3) && matchingImmOps(FirstMI, 3, 60)) ||
(matchingImmOps(FirstMI, 2, 6) && matchingImmOps(FirstMI, 3, 57));
// rldicl rx, ra, 1, 0 - xor
case FusionFeature::FK_RotateLeftXor:
return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 0);
// rldicr rx, ra, 1, 63 - xor
case FusionFeature::FK_RotateRightXor:
return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 63);
// We actually use CMPW* and CMPD*, 'l' doesn't exist as an operand in instr.
// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 }
// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 }
case FusionFeature::FK_LoadCmp1:
// { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 }
// { ld,ldx } - cmpli 0,1,rx,{ 0,1 }
case FusionFeature::FK_LoadCmp2: {
const MachineOperand &BT = SecondMI.getOperand(0);
if (!BT.isReg() ||
(!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0))
return false;
if (SecondMI.getOpcode() == PPC::CMPDI &&
matchingImmOps(SecondMI, 2, -1, 16))
return true;
return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1);
}
// { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 }
case FusionFeature::FK_LoadCmp3: {
const MachineOperand &BT = SecondMI.getOperand(0);
if (!BT.isReg() ||
(!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0))
return false;
return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1) ||
matchingImmOps(SecondMI, 2, -1, 16);
}
// mtctr - { bcctr,bcctrl }
case FusionFeature::FK_ZeroMoveCTR:
// ( mtctr rx ) is alias of ( mtspr 9, rx )
return (FirstMI.getOpcode() != PPC::MTSPR &&
FirstMI.getOpcode() != PPC::MTSPR8) ||
matchingImmOps(FirstMI, 0, 9);
// mtlr - { bclr,bclrl }
case FusionFeature::FK_ZeroMoveLR:
// ( mtlr rx ) is alias of ( mtspr 8, rx )
return (FirstMI.getOpcode() != PPC::MTSPR &&
FirstMI.getOpcode() != PPC::MTSPR8) ||
matchingImmOps(FirstMI, 0, 8);
// addis rx,ra,si - addi rt,rx,SI, SI >= 0
case FusionFeature::FK_AddisAddi: {
const MachineOperand &RA = FirstMI.getOperand(1);
const MachineOperand &SI = SecondMI.getOperand(2);
if (!SI.isImm() || !RA.isReg())
return false;
if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8)
return false;
return SignExtend64(SI.getImm(), 16) >= 0;
}
// addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2
case FusionFeature::FK_AddiAddis: {
const MachineOperand &RA = FirstMI.getOperand(1);
const MachineOperand &SI = FirstMI.getOperand(2);
if (!SI.isImm() || !RA.isReg())
return false;
if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8)
return false;
int64_t ExtendedSI = SignExtend64(SI.getImm(), 16);
return ExtendedSI >= 2;
}
}
llvm_unreachable("All the cases should have been handled");

View File

@ -78,5 +78,80 @@ FUSION_FEATURE(VecLogical, hasLogicalFusion, -1,
FUSION_FEATURE(SldiAdd, hasArithAddFusion, -1, FUSION_OP_SET(RLDICR, RLDICR_32),
FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8))
// SHA3 assist fusion, gated by hasSha3Fusion. The opcode sets below only pick
// candidate pairs; the rotate's two immediate operands (operands 2 and 3 of
// the first instruction) are validated separately in checkOpConstraints.
// NOTE(review): the third FUSION_FEATURE argument (1 here) is interpreted by
// the file that defines the macro, which is not visible here - confirm its
// meaning there before changing it.
// rldicl rx, ra, 1, 0 - xor
FUSION_FEATURE(RotateLeftXor, hasSha3Fusion, 1,
FUSION_OP_SET(RLDICL, RLDICL_32, RLDICL_32_64),
FUSION_OP_SET(XOR, XOR8))
// rldicr rx, ra, 1, 63 - xor
FUSION_FEATURE(RotateRightXor, hasSha3Fusion, 1,
FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(XOR, XOR8))
// There are two special cases in the 'load-compare' series, so the patterns
// are split into several groups to fit the current framework. This can be
// made clearer once we switch to a more expressive approach.
// All groups are gated by hasCompareFusion. For LoadCmp1 and LoadCmp2,
// checkOpConstraints additionally requires the compare's first operand to be
// a virtual register or CR0 and its immediate to be 0 or 1; an immediate of
// -1 is accepted only when the compare is CMPDI.
// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 }
// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 }
FUSION_FEATURE(LoadCmp1, hasCompareFusion, 1,
FUSION_OP_SET(LBZ, LBZ8, LBZX, LBZX8, LBZXTLS, LBZXTLS_,
LBZXTLS_32, LHZ, LHZ8, LHZX, LHZX8, LHZXTLS,
LHZXTLS_, LHZXTLS_32, LWZ, LWZ8, LWZX, LWZX8,
LWZXTLS, LWZXTLS_, LWZXTLS_32),
FUSION_OP_SET(CMPDI, CMPLDI, CMPLWI))
// { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 }
// { ld,ldx } - cmpli 0,1,rx,{ 0,1 }
FUSION_FEATURE(LoadCmp2, hasCompareFusion, 1,
FUSION_OP_SET(LD, LDX, LDXTLS, LDXTLS_),
FUSION_OP_SET(CMPDI, CMPLDI))
// Sign-extending loads fuse only with the signed compare-immediate form:
// { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 }
// The second opcode set therefore lists CMPDI/CMPWI (cmpi with L=1 / L=0)
// rather than the unsigned cmpli forms: the operand check for this kind
// accepts a sign-extended 16-bit immediate of -1 for every second opcode,
// which is only meaningful for a signed compare.
FUSION_FEATURE(LoadCmp3, hasCompareFusion, 1,
               FUSION_OP_SET(LHA, LHA8, LHAX, LHAX8, LWA, LWA_32, LWAX,
                             LWAX_32),
               FUSION_OP_SET(CMPDI, CMPWI))
// Wide-immediate fusion, gated by hasWideImmFusion. Each entry pairs the
// 32-bit and 64-bit opcode of the first instruction with those of the second.
// ori - oris
FUSION_FEATURE(OriOris, hasWideImmFusion, 1, FUSION_OP_SET(ORI, ORI8),
FUSION_OP_SET(ORIS, ORIS8))
// lis - ori
FUSION_FEATURE(LisOri, hasWideImmFusion, 1, FUSION_OP_SET(LIS, LIS8),
FUSION_OP_SET(ORI, ORI8))
// oris - ori
FUSION_FEATURE(OrisOri, hasWideImmFusion, 1, FUSION_OP_SET(ORIS, ORIS8),
FUSION_OP_SET(ORI, ORI8))
// xori - xoris
FUSION_FEATURE(XoriXoris, hasWideImmFusion, 1, FUSION_OP_SET(XORI, XORI8),
FUSION_OP_SET(XORIS, XORIS8))
// xoris - xori
FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8),
FUSION_OP_SET(XORI, XORI8))
// The two add pairs carry extra operand constraints that are enforced in
// checkOpConstraints: the first instruction's RA must not be ZERO/ZERO8, and
// the checked immediate must satisfy the bound given in the comment.
// addis rx,ra,si - addi rt,rx,SI, SI >= 0
FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1,
FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8),
FUSION_OP_SET(ADDI, ADDI8, ADDItocL))
// addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2
FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1,
FUSION_OP_SET(ADDI, ADDI8, ADDItocL),
FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8))
// Zero-move fusion, gated by hasZeroMoveFusion: a move to CTR/LR followed by
// the branch that consumes it. The generic MTSPR/MTSPR8 opcodes are listed in
// both entries; checkOpConstraints tells them apart by requiring operand 0 to
// be 9 for the CTR pair and 8 for the LR pair.
// NOTE(review): these entries pass -1 as the third FUSION_FEATURE argument
// where the other entries in this group of additions pass 1; its meaning is
// defined by the macro's consumer, which is not visible here.
// mtctr - { bcctr,bcctrl }
FUSION_FEATURE(ZeroMoveCTR, hasZeroMoveFusion, -1,
FUSION_OP_SET(MTCTR, MTCTRloop, MTSPR8, MTSPR),
FUSION_OP_SET(BCCTR, BCCTRn, BCCTR8, BCCTR8n, BCCTRL, BCCTRLn,
BCCTRL8, BCCTRL8n, gBCCTR, gBCCTRL))
// mtlr - { bclr,bclrl }
FUSION_FEATURE(ZeroMoveLR, hasZeroMoveFusion, -1,
FUSION_OP_SET(MTLR8, MTLR, MTSPR8, MTSPR),
FUSION_OP_SET(BCLR, BCLRn, gBCLR, BCLRL, BCLRLn, gBCLRL))
#undef FUSION_FEATURE
#undef FUSION_OP_SET

View File

@ -131,6 +131,10 @@ void PPCSubtarget::initializeEnvironment() {
HasAddLogicalFusion = false;
HasLogicalAddFusion = false;
HasLogicalFusion = false;
HasSha3Fusion = false;
HasCompareFusion = false;
HasWideImmFusion = false;
HasZeroMoveFusion = false;
IsISA2_06 = false;
IsISA2_07 = false;
IsISA3_0 = false;

View File

@ -151,6 +151,10 @@ protected:
bool HasAddLogicalFusion;
bool HasLogicalAddFusion;
bool HasLogicalFusion;
bool HasSha3Fusion;
bool HasCompareFusion;
bool HasWideImmFusion;
bool HasZeroMoveFusion;
bool IsISA2_06;
bool IsISA2_07;
bool IsISA3_0;
@ -340,6 +344,10 @@ public:
// Accessors for the individual macro-fusion subtarget flags. Each one returns
// the like-named Has*Fusion member unchanged; the fusion table refers to these
// getters by name to gate each fusion kind.
bool hasAddLogicalFusion() const { return HasAddLogicalFusion; }
bool hasLogicalAddFusion() const { return HasLogicalAddFusion; }
bool hasLogicalFusion() const { return HasLogicalFusion; }
bool hasCompareFusion() const { return HasCompareFusion; }
bool hasWideImmFusion() const { return HasWideImmFusion; }
bool hasSha3Fusion() const { return HasSha3Fusion; }
bool hasZeroMoveFusion() const { return HasZeroMoveFusion; }
/// Returns true only when VSX is available, the target is little-endian, and
/// P9 vector support is absent.
bool needsSwapsForVSXMemOps() const {
  // Queries run in the same order as the equivalent short-circuit conjunction.
  if (!hasVSX() || !isLittleEndian())
    return false;
  return !hasP9Vector();
}

View File

@ -1,6 +1,7 @@
# REQUIRES: asserts
# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 -x=mir < %s \
# RUN: -debug-only=machine-scheduler -start-before=postmisched 2>&1 \
# RUN: -mattr=+fuse-zeromove,+fuse-cmp,+fuse-wideimm \
# RUN: | FileCheck %s
# CHECK: add_mulld:%bb.0
@ -93,3 +94,55 @@ body: |
renamable $x3 = ADD8 killed renamable $x4, $x5
BLR8 implicit $lr8, implicit $rm, implicit $x3
...
# SHA3 assist fusion: an RLDICL with immediates 1, 0 whose result feeds XOR8
# is expected to be reported as a fused pair in the scheduler debug output.
# CHECK: rldicl_xor:%bb.0
# CHECK: Macro fuse: SU(0) - SU(1) / RLDICL - XOR8
---
name: rldicl_xor
tracksRegLiveness: true
body: |
bb.0.entry:
liveins: $x3, $x4, $x5
renamable $x4 = RLDICL $x3, 1, 0
renamable $x3 = XOR8 killed renamable $x4, $x5
BLR8 implicit $lr8, implicit $rm, implicit $x3
...
# SHA3 assist fusion: an RLDICR with immediates 1, 63 whose result feeds XOR8
# is expected to be reported as a fused pair in the scheduler debug output.
# CHECK: rldicr_xor:%bb.0
# CHECK: Macro fuse: SU(0) - SU(1) / RLDICR - XOR8
---
name: rldicr_xor
tracksRegLiveness: true
body: |
bb.0.entry:
liveins: $x3, $x4, $x5
renamable $x4 = RLDICR $x3, 1, 63
renamable $x3 = XOR8 killed renamable $x4, $x5
BLR8 implicit $lr8, implicit $rm, implicit $x3
...
# Wide-immediate fusion: ORI8 feeding ORIS8 is expected to be reported as a
# fused pair. The feature is not on by default; the llc invocation at the top
# of this file enables it with +fuse-wideimm.
# CHECK: ori_oris:%bb.0
# CHECK: Macro fuse: SU(0) - SU(1) / ORI8 - ORIS8
---
name: ori_oris
tracksRegLiveness: true
body: |
bb.0.entry:
liveins: $x3, $x4
renamable $x4 = ORI8 $x3, 63
renamable $x3 = ORIS8 killed renamable $x4, 20
BLR8 implicit $lr8, implicit $rm, implicit $x3
...
# CHECK: load_cmp:%bb.0
# CHECK: Macro fuse: SU(0) - SU(1) / LD - CMPDI
---
name: load_cmp
tracksRegLiveness: true
body: |
bb.0.entry:
liveins: $x3, $x4, $x5
renamable $x3 = LD 0, killed renamable $x3
renamable $cr0 = CMPDI killed renamable $x3, 0
renamable $x3 = ISEL8 killed renamable $x5, killed renamable $x4, renamable $cr0lt, implicit killed $cr0
BLR8 implicit $lr8, implicit $rm, implicit $x3