From c42a225545b4b494db7866d5db90255926059bc3 Mon Sep 17 00:00:00 2001
From: zhongyunde
Date: Mon, 13 Jun 2022 17:24:59 +0800
Subject: [PATCH] [MachineScheduler] Order more stores by ascending address

Following D125377, we order STP Q instructions by ascending address. On some
targets, paired 128-bit loads and stores are slow, so the STP is split into
STRQ and STUR stores; order those stores by ascending address as well. Also
add the subtarget feature ascend-store-address to control this more
aggressive ordering.

Reviewed By: dmgreen, fhahn

Differential Revision: https://reviews.llvm.org/D126700
---
 llvm/lib/Target/AArch64/AArch64.td            |   4 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  35 +++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |   9 ++
 .../AArch64/AArch64LoadStoreOptimizer.cpp     | 139 ++++++++----
 .../AArch64/AArch64MachineScheduler.cpp       |  71 +++++++--
 5 files changed, 165 insertions(+), 93 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 61c0521a198e..8d5bf16a9a65 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -215,6 +215,10 @@ def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
 def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128",
     "IsPaired128Slow", "true", "Paired 128 bit loads and stores are slow">;
 
+def FeatureAscendStoreAddress : SubtargetFeature<"ascend-store-address",
+    "IsStoreAddressAscend", "true",
+    "Schedule vector stores by ascending address">;
+
 def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow",
     "true", "STR of Q register with register offset is slow">;
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index b0930016be63..7636f3d07ea7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3152,6 +3152,41 @@ bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
   return isPreLd(MI) || isPreSt(MI);
 }
 
+bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi:
+  case AArch64::STPSi:
+  case AArch64::STPDi:
+  case AArch64::STPQi:
+  case AArch64::STPWi:
+  case AArch64::STPXi:
+  case AArch64::STGPi:
+    return true;
+  }
+}
+
+const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
+                                                                            : 1;
+  return MI.getOperand(Idx);
+}
+
+const MachineOperand &
+AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
+                                                                            : 2;
+  return MI.getOperand(Idx);
+}
+
 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                               Register Reg) {
   if (MI.getParent() == nullptr)
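The two hoisted accessors encode the usual operand layout of AArch64 memory
instructions: a plain register+immediate form is (Rt, Rn, imm), while paired
and pre/post-indexed forms carry an extra register before the base, shifting
the base to index 2 and the offset to index 3. The following sketch is
illustration only, not part of the patch; it is plain C++ with invented names
(FakeMI, baseIdx, offsetIdx) that merely model the index selection:

// Illustration only -- models the index choice made by getLdStBaseOp /
// getLdStOffsetOp for the three operand layouts involved:
//   STRXui  (str xt, [xn, #imm])   -> (Rt, Rn, imm)        base idx 1, offset idx 2
//   STPXi   (stp xt, xt2, [xn, #imm]) -> (Rt, Rt2, Rn, imm) base idx 2, offset idx 3
//   STRXpre (str xt, [xn, #imm]!)  -> (Rn_wb, Rt, Rn, imm)  base idx 2, offset idx 3
#include <cstdio>
#include <string>
#include <vector>

struct FakeMI {
  bool Paired;                  // LDP/STP-style instruction
  bool PreIndexed;              // pre/post-indexed (write-back) form
  std::vector<std::string> Ops; // operands in MachineInstr order
};

static unsigned baseIdx(const FakeMI &MI) { return MI.Paired || MI.PreIndexed ? 2 : 1; }
static unsigned offsetIdx(const FakeMI &MI) { return MI.Paired || MI.PreIndexed ? 3 : 2; }

int main() {
  FakeMI Str = {false, false, {"x1", "x0", "#2"}};     // str x1, [x0, #16]
  FakeMI Stp = {true, false, {"x1", "x2", "x0", "#2"}}; // stp x1, x2, [x0, #16]
  std::printf("str: base=%s offset=%s\n", Str.Ops[baseIdx(Str)].c_str(),
              Str.Ops[offsetIdx(Str)].c_str());
  std::printf("stp: base=%s offset=%s\n", Stp.Ops[baseIdx(Stp)].c_str(),
              Stp.Ops[offsetIdx(Stp)].c_str());
  return 0;
}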
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 337372644c01..b7a6ac301cdc 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -103,6 +103,15 @@ public:
   /// Returns whether the instruction is a pre-indexed load/store.
   static bool isPreLdSt(const MachineInstr &MI);
 
+  /// Returns whether the instruction is a paired load/store.
+  static bool isPairedLdSt(const MachineInstr &MI);
+
+  /// Returns the base register operand of a load/store.
+  static const MachineOperand &getLdStBaseOp(const MachineInstr &MI);
+
+  /// Returns the immediate offset operand of a load/store.
+  static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
+
   /// Returns whether the instruction is FP or NEON.
   static bool isFpOrNEON(const MachineInstr &MI);
 
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 502b6321d5bb..eaf39fc0dbb1 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -556,26 +556,6 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
   }
 }
 
-static bool isPairedLdSt(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    return false;
-  case AArch64::LDPSi:
-  case AArch64::LDPSWi:
-  case AArch64::LDPDi:
-  case AArch64::LDPQi:
-  case AArch64::LDPWi:
-  case AArch64::LDPXi:
-  case AArch64::STPSi:
-  case AArch64::STPDi:
-  case AArch64::STPQi:
-  case AArch64::STPWi:
-  case AArch64::STPXi:
-  case AArch64::STGPi:
-    return true;
-  }
-}
-
 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
 
   unsigned OpcA = FirstMI.getOpcode();
@@ -610,7 +590,7 @@ static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
 // Returns the scale and offset range of pre/post indexed variants of MI.
 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                        int &MinOffset, int &MaxOffset) {
-  bool IsPaired = isPairedLdSt(MI);
+  bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
   bool IsTagStore = isTagStore(MI);
   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
   // as in the "unsigned offset" variant.
@@ -632,17 +612,8 @@ static MachineOperand &getLdStRegOp(MachineInstr &MI,
   bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
   if (IsPreLdSt)
     PairedRegOp += 1;
-  unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
-  return MI.getOperand(Idx);
-}
-
-static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 : 1;
-  return MI.getOperand(Idx);
-}
-
-static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2;
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
   return MI.getOperand(Idx);
 }
 
@@ -652,12 +623,14 @@ static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
   int LoadSize = TII->getMemScale(LoadInst);
   int StoreSize = TII->getMemScale(StoreInst);
-  int UnscaledStOffset = TII->hasUnscaledLdStOffset(StoreInst)
-                             ? getLdStOffsetOp(StoreInst).getImm()
-                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
-  int UnscaledLdOffset = TII->hasUnscaledLdStOffset(LoadInst)
-                             ? getLdStOffsetOp(LoadInst).getImm()
-                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
+  int UnscaledStOffset =
+      TII->hasUnscaledLdStOffset(StoreInst)
+          ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
+          : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
+  int UnscaledLdOffset =
+      TII->hasUnscaledLdStOffset(LoadInst)
+          ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
+          : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
   return (UnscaledStOffset <= UnscaledLdOffset) &&
          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
 }
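The containment check in the hunk above is easier to read with concrete byte
offsets. A minimal standalone model (plain C++, not LLVM code; ldInRangeOfSt
and the registers in the comments are invented for illustration) of the test
that the load's bytes must lie entirely within the bytes written by the store,
after both immediates have been converted to byte offsets:

// Standalone model (not LLVM code) of isLdOffsetInRangeOfSt: the load's bytes
// must be fully contained in the bytes written by the store.
#include <cassert>

// Offsets are in bytes (i.e. already "unscaled").
static bool ldInRangeOfSt(int StOff, int StoreSize, int LdOff, int LoadSize) {
  return StOff <= LdOff && LdOff + LoadSize <= StOff + StoreSize;
}

int main() {
  // str x1, [x0, #16] writes [16, 24); ldr w2, [x0, #20] reads [20, 24).
  assert(ldInRangeOfSt(/*StOff=*/16, /*StoreSize=*/8, /*LdOff=*/20, /*LoadSize=*/4));
  // A load of [20, 28) would read past the store and is rejected.
  assert(!ldInRangeOfSt(16, 8, 20, 8));
  return 0;
}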
@@ -736,7 +709,7 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
   case AArch64::STPWi:
   case AArch64::STPXi:
     // Make sure this is a reg+imm (as opposed to an address reloc).
-    if (!getLdStOffsetOp(MI).isImm())
+    if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
       return false;
 
     return true;
@@ -770,17 +743,18 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
   // Also based on MergeForward is from where we copy the base register operand
   // so we get the flags compatible with the input code.
   const MachineOperand &BaseRegOp =
-      MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
+      MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
+                   : AArch64InstrInfo::getLdStBaseOp(*I);
 
   // Which register is Rt and which is Rt2 depends on the offset order.
   MachineInstr *RtMI;
-  if (getLdStOffsetOp(*I).getImm() ==
-      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
+  if (AArch64InstrInfo::getLdStOffsetOp(*I).getImm() ==
+      AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
     RtMI = &*MergeMI;
   else
     RtMI = &*I;
 
-  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
   // Change the scaled offset from small to large type.
   if (IsScaled) {
     assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
@@ -944,10 +918,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
   // Also based on MergeForward is from where we copy the base register operand
   // so we get the flags compatible with the input code.
   const MachineOperand &BaseRegOp =
-      MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
+      MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
+                   : AArch64InstrInfo::getLdStBaseOp(*I);
 
-  int Offset = getLdStOffsetOp(*I).getImm();
-  int PairedOffset = getLdStOffsetOp(*Paired).getImm();
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
+  int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
   bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
   if (IsUnscaled != PairedIsUnscaled) {
     // We're trying to pair instructions that differ in how they are scaled. If
@@ -982,7 +957,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     RtMI = &*I;
     Rt2MI = &*Paired;
   }
-  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
   // Scale the immediate offset, if necessary.
   if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
     assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
@@ -1140,12 +1115,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
     assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
            "Unsupported ld/st match");
     assert(LoadSize <= StoreSize && "Invalid load size");
-    int UnscaledLdOffset = IsUnscaled
-                               ? getLdStOffsetOp(*LoadI).getImm()
-                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
-    int UnscaledStOffset = IsUnscaled
-                               ? getLdStOffsetOp(*StoreI).getImm()
-                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
+    int UnscaledLdOffset =
+        IsUnscaled
+            ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
+            : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
+    int UnscaledStOffset =
+        IsUnscaled
+            ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
+            : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
     int Width = LoadSize * 8;
     Register DestReg =
         IsStoreXReg ? Register(TRI->getMatchingSuperReg(
@@ -1243,7 +1220,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
   MachineBasicBlock::iterator B = I->getParent()->begin();
   MachineBasicBlock::iterator MBBI = I;
   MachineInstr &LoadMI = *I;
-  Register BaseReg = getLdStBaseOp(LoadMI).getReg();
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
 
   // If the load is the first instruction in the block, there's obviously
   // not any matching store.
@@ -1272,7 +1249,8 @@ bool AArch64LoadStoreOpt::findMatchingStore(
     // Also we can't handle stores without an immediate offset operand,
     // while the operand might be the address for a global variable.
     if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
-        BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() &&
+        BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
+        AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
         isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
         ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
       StoreI = MBBI;
@@ -1539,8 +1517,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
   bool MayLoad = FirstMI.mayLoad();
   bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
   Register Reg = getLdStRegOp(FirstMI).getReg();
-  Register BaseReg = getLdStBaseOp(FirstMI).getReg();
-  int Offset = getLdStOffsetOp(FirstMI).getImm();
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
   int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
 
@@ -1575,7 +1553,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
     Flags.setSExtIdx(-1);
     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
-        getLdStOffsetOp(MI).isImm()) {
+        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
       assert(MI.mayLoadOrStore() && "Expected memory operation.");
       // If we've found another instruction with the same opcode, check to see
       // if the base and offset are compatible with our starting instruction.
@@ -1583,8 +1561,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
       // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
       // a relocation.
-      Register MIBaseReg = getLdStBaseOp(MI).getReg();
-      int MIOffset = getLdStOffsetOp(MI).getImm();
+      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
+      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
 
       bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
       if (IsUnscaled != MIIsUnscaled) {
         // We're trying to pair instructions that differ in how they are scaled.
@@ -1615,15 +1593,16 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
       // can't be paired: bail and keep looking.
       if (IsPreLdSt) {
         bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
-        bool IsBaseRegUsed =
-            !UsedRegUnits.available(getLdStBaseOp(MI).getReg());
-        bool IsBaseRegModified =
-            !ModifiedRegUnits.available(getLdStBaseOp(MI).getReg());
+        bool IsBaseRegUsed = !UsedRegUnits.available(
+            AArch64InstrInfo::getLdStBaseOp(MI).getReg());
+        bool IsBaseRegModified = !ModifiedRegUnits.available(
+            AArch64InstrInfo::getLdStBaseOp(MI).getReg());
         // If the stored value and the address of the second instruction is
         // the same, it needs to be using the updated register and therefore
         // it must not be folded.
-        bool IsMIRegTheSame = TRI->regsOverlap(getLdStRegOp(MI).getReg(),
-                                               getLdStBaseOp(MI).getReg());
+        bool IsMIRegTheSame =
+            TRI->regsOverlap(getLdStRegOp(MI).getReg(),
+                             AArch64InstrInfo::getLdStBaseOp(MI).getReg());
         if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
             IsMIRegTheSame) {
           LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
@@ -1776,7 +1755,7 @@ maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
       MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
       !(MI.getFlag(MachineInstr::FrameSetup) ||
         MI.getFlag(MachineInstr::FrameDestroy)) ||
-      getLdStBaseOp(MI).getReg() != AArch64::SP)
+      AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
     return End;
 
   const MachineFunction &MF = *MI.getParent()->getParent();
@@ -1823,12 +1802,12 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
   MachineInstrBuilder MIB;
   int Scale, MinOffset, MaxOffset;
   getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
-  if (!isPairedLdSt(*I)) {
+  if (!AArch64InstrInfo::isPairedLdSt(*I)) {
     // Non-paired instruction.
     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
               .add(getLdStRegOp(*Update))
               .add(getLdStRegOp(*I))
-              .add(getLdStBaseOp(*I))
+              .add(AArch64InstrInfo::getLdStBaseOp(*I))
               .addImm(Value / Scale)
               .setMemRefs(I->memoperands())
               .setMIFlags(I->mergeFlagsWith(*Update));
@@ -1838,7 +1817,7 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
               .add(getLdStRegOp(*Update))
               .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
-              .add(getLdStBaseOp(*I))
+              .add(AArch64InstrInfo::getLdStBaseOp(*I))
               .addImm(Value / Scale)
               .setMemRefs(I->memoperands())
               .setMIFlags(I->mergeFlagsWith(*Update));
@@ -1928,8 +1907,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
   MachineInstr &MemMI = *I;
   MachineBasicBlock::iterator MBBI = I;
 
-  Register BaseReg = getLdStBaseOp(MemMI).getReg();
-  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI);
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
+                         TII->getMemScale(MemMI);
 
   // Scan forward looking for post-index opportunities. Updating instructions
   // can't be formed if the memory instruction doesn't have the offset we're
@@ -1944,7 +1924,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
   // behavior in this case unlike normal stores, and always performs writeback
   // after reading the source register value.
   if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
-    bool IsPairedInsn = isPairedLdSt(MemMI);
+    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
       Register DestReg = getLdStRegOp(MemMI, i).getReg();
       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
@@ -2005,8 +1985,8 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   MachineBasicBlock::iterator MBBI = I;
   MachineFunction &MF = *MemMI.getMF();
 
-  Register BaseReg = getLdStBaseOp(MemMI).getReg();
-  int Offset = getLdStOffsetOp(MemMI).getImm();
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
 
   // If the load/store is the first instruction in the block, there's obviously
   // not any matching update. Ditto if the memory offset isn't zero.
@@ -2015,7 +1995,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   // If the base register overlaps a destination register, we can't
   // merge the update.
   if (!isTagStore(MemMI)) {
-    bool IsPairedInsn = isPairedLdSt(MemMI);
+    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
       Register DestReg = getLdStRegOp(MemMI, i).getReg();
       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
@@ -2085,7 +2065,7 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
 
   // Make sure this is a reg+imm.
   // FIXME: It is possible to extend it to handle reg+reg cases.
-  if (!getLdStOffsetOp(MI).isImm())
+  if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
     return false;
 
   // Look backward up to LdStLimit instructions.
@@ -2139,7 +2119,7 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
   bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
-  int Offset = getLdStOffsetOp(MI).getImm();
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
   int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
   // Allow one more for offset.
   if (Offset > 0)
@@ -2206,7 +2186,8 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   // The immediate in the load/store is scaled by the size of the memory
   // operation. The immediate in the add we're looking for,
   // however, is not, so adjust here.
-  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
+  int UnscaledOffset =
+      AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
 
   // Look forward to try to find a pre-index instruction. For example,
   // ldr x1, [x0, #64]
diff --git a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
index ff15c0b07aa4..9c69a3704548 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
@@ -7,10 +7,57 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64MachineScheduler.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 
 using namespace llvm;
 
+static bool needReorderStoreMI(const MachineInstr *MI) {
+  if (!MI)
+    return false;
+
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STURQi:
+  case AArch64::STRQui:
+    if (!MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend())
+      return false;
+    LLVM_FALLTHROUGH;
+  case AArch64::STPQi:
+    return AArch64InstrInfo::getLdStOffsetOp(*MI).getType() ==
+           MachineOperand::MO_Immediate;
+  }
+
+  return false;
+}
+
+// Return true if two stores with the same base address may overlap.
+static bool mayOverlapWrite(const MachineInstr &MI0, const MachineInstr &MI1,
+                            int64_t &Off0, int64_t &Off1) {
+  const MachineOperand &Base0 = AArch64InstrInfo::getLdStBaseOp(MI0);
+  const MachineOperand &Base1 = AArch64InstrInfo::getLdStBaseOp(MI1);
+
+  // Conservatively assume the writes may overlap if the two stores do not
+  // share the same base register.
+  if (!Base0.isIdenticalTo(Base1))
+    return true;
+
+  int StoreSize0 = AArch64InstrInfo::getMemScale(MI0);
+  int StoreSize1 = AArch64InstrInfo::getMemScale(MI1);
+  Off0 = AArch64InstrInfo::hasUnscaledLdStOffset(MI0.getOpcode())
+             ? AArch64InstrInfo::getLdStOffsetOp(MI0).getImm()
+             : AArch64InstrInfo::getLdStOffsetOp(MI0).getImm() * StoreSize0;
+  Off1 = AArch64InstrInfo::hasUnscaledLdStOffset(MI1.getOpcode())
+             ? AArch64InstrInfo::getLdStOffsetOp(MI1).getImm()
+             : AArch64InstrInfo::getLdStOffsetOp(MI1).getImm() * StoreSize1;
+
+  const MachineInstr &MI = (Off0 < Off1) ? MI0 : MI1;
+  int Multiples = AArch64InstrInfo::isPairedLdSt(MI) ? 2 : 1;
+  int StoreSize = AArch64InstrInfo::getMemScale(MI) * Multiples;
+
+  return llabs(Off0 - Off1) < StoreSize;
+}
+
 bool AArch64PostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                               SchedCandidate &TryCand) {
   bool OriginalResult = PostGenericScheduler::tryCandidate(Cand, TryCand);
@@ -18,20 +65,16 @@ bool AArch64PostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
   if (Cand.isValid()) {
     MachineInstr *Instr0 = TryCand.SU->getInstr();
     MachineInstr *Instr1 = Cand.SU->getInstr();
-    // When dealing with two STPqi's.
-    if (Instr0 && Instr1 && Instr0->getOpcode() == Instr1->getOpcode () &&
-        Instr0->getOpcode() == AArch64::STPQi)
-    {
-      MachineOperand &Base0 = Instr0->getOperand(2);
-      MachineOperand &Base1 = Instr1->getOperand(2);
-      int64_t Off0 = Instr0->getOperand(3).getImm();
-      int64_t Off1 = Instr1->getOperand(3).getImm();
-      // With the same base address and non-overlapping writes.
-      if (Base0.isIdenticalTo(Base1) && llabs (Off0 - Off1) >= 2) {
-        TryCand.Reason = NodeOrder;
-        // Order them by ascending offsets.
-        return Off0 < Off1;
-      }
+
+    if (!needReorderStoreMI(Instr0) || !needReorderStoreMI(Instr1))
+      return OriginalResult;
+
+    int64_t Off0, Off1;
+    // With the same base address and non-overlapping writes.
+    if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) {
+      TryCand.Reason = NodeOrder;
+      // Order them by ascending offsets.
+      return Off0 < Off1;
     }
   }
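To summarize the behaviour the new scheduler hook implements, here is a
hedged, standalone model (plain C++, not LLVM code; StoreDesc, mayOverlapWrite
and preferAscending are invented names, and byte offsets/sizes are supplied
directly instead of being read from MachineOperands): two candidate stores off
the same base register are placed in ascending address order only when their
byte ranges cannot overlap; otherwise the generic scheduler's decision stands.

// Illustration only -- a simplified model of needReorderStoreMI +
// mayOverlapWrite + the ascending-order decision in tryCandidate.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

struct StoreDesc {
  int BaseReg;          // stand-in for the base register operand
  std::int64_t ByteOff; // already-unscaled byte offset from the base
  int ByteSize;         // bytes written: 16 for STRQ/STURQ, 32 for STPQ
};

static bool mayOverlapWrite(const StoreDesc &A, const StoreDesc &B) {
  if (A.BaseReg != B.BaseReg)
    return true; // different bases: conservatively assume a possible overlap
  const StoreDesc &Lo = A.ByteOff < B.ByteOff ? A : B;
  return std::llabs(A.ByteOff - B.ByteOff) < Lo.ByteSize;
}

// True if Try should be placed before Cand, i.e. lower address first.
static bool preferAscending(const StoreDesc &Try, const StoreDesc &Cand) {
  if (mayOverlapWrite(Try, Cand))
    return false; // keep whatever the generic scheduler decided
  return Try.ByteOff < Cand.ByteOff;
}

int main() {
  StoreDesc StrQ = {/*BaseReg=*/0, /*ByteOff=*/0, /*ByteSize=*/16}; // str q0, [x0]
  StoreDesc SturQ = {0, 16, 16};                                    // stur q1, [x0, #16]
  std::printf("str before stur: %d\n", preferAscending(StrQ, SturQ)); // 1
  std::printf("stur before str: %d\n", preferAscending(SturQ, StrQ)); // 0
  return 0;
}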