forked from OSchip/llvm-project
[MachineScheduler] Add support for clustering mem ops with FI base operands
Before this patch, the following stores in `merge_fail` would fail to be merged, while they would get merged in `merge_ok`: ``` void use(unsigned long long *); void merge_fail(unsigned key, unsigned index) { unsigned long long args[8]; args[0] = key; args[1] = index; use(args); } void merge_ok(unsigned long long *dst, unsigned a, unsigned b) { dst[0] = a; dst[1] = b; } ``` The reason is that `getMemOpBaseImmOfs` would return false for FI base operands. This adds support for this. Differential Revision: https://reviews.llvm.org/D54847 llvm-svn: 347747
This commit is contained in:
parent
d7eebd6d83
commit
879087ce5b
|
@ -1490,8 +1490,20 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
|
||||||
: SU(su), BaseOp(Op), Offset(ofs) {}
|
: SU(su), BaseOp(Op), Offset(ofs) {}
|
||||||
|
|
||||||
bool operator<(const MemOpInfo &RHS) const {
|
bool operator<(const MemOpInfo &RHS) const {
|
||||||
|
if (BaseOp->getType() != RHS.BaseOp->getType())
|
||||||
|
return BaseOp->getType() < RHS.BaseOp->getType();
|
||||||
|
|
||||||
|
if (BaseOp->isReg())
|
||||||
return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) <
|
return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) <
|
||||||
std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset, RHS.SU->NodeNum);
|
std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset,
|
||||||
|
RHS.SU->NodeNum);
|
||||||
|
if (BaseOp->isFI())
|
||||||
|
return std::make_tuple(BaseOp->getIndex(), Offset, SU->NodeNum) <
|
||||||
|
std::make_tuple(RHS.BaseOp->getIndex(), RHS.Offset,
|
||||||
|
RHS.SU->NodeNum);
|
||||||
|
|
||||||
|
llvm_unreachable("MemOpClusterMutation only supports register or frame "
|
||||||
|
"index bases.");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1146,9 +1146,9 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
|
||||||
MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
|
MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// Retrieve the base register, offset from the base and width. Width
|
// Retrieve the base, offset from the base and width. Width
|
||||||
// is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
|
// is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
|
||||||
// base registers are identical, and the offset of a lower memory access +
|
// base are identical, and the offset of a lower memory access +
|
||||||
// the width doesn't overlap the offset of a higher memory access,
|
// the width doesn't overlap the offset of a higher memory access,
|
||||||
// then the memory accesses are different.
|
// then the memory accesses are different.
|
||||||
if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
|
if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
|
||||||
|
@ -2035,8 +2035,9 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
|
||||||
if (MI.hasOrderedMemoryRef())
|
if (MI.hasOrderedMemoryRef())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// Make sure this is a reg+imm (as opposed to an address reloc).
|
// Make sure this is a reg/fi+imm (as opposed to an address reloc).
|
||||||
assert(MI.getOperand(1).isReg() && "Expected a reg operand.");
|
assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
|
||||||
|
"Expected a reg or frame index operand.");
|
||||||
if (!MI.getOperand(2).isImm())
|
if (!MI.getOperand(2).isImm())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
@ -2086,11 +2087,13 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
|
||||||
// Handle only loads/stores with base register followed by immediate offset.
|
// Handle only loads/stores with base register followed by immediate offset.
|
||||||
if (LdSt.getNumExplicitOperands() == 3) {
|
if (LdSt.getNumExplicitOperands() == 3) {
|
||||||
// Non-paired instruction (e.g., ldr x1, [x0, #8]).
|
// Non-paired instruction (e.g., ldr x1, [x0, #8]).
|
||||||
if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm())
|
if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
|
||||||
|
!LdSt.getOperand(2).isImm())
|
||||||
return false;
|
return false;
|
||||||
} else if (LdSt.getNumExplicitOperands() == 4) {
|
} else if (LdSt.getNumExplicitOperands() == 4) {
|
||||||
// Paired instruction (e.g., ldp x1, x2, [x0, #8]).
|
// Paired instruction (e.g., ldp x1, x2, [x0, #8]).
|
||||||
if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() ||
|
if (!LdSt.getOperand(1).isReg() ||
|
||||||
|
(!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
|
||||||
!LdSt.getOperand(3).isImm())
|
!LdSt.getOperand(3).isImm())
|
||||||
return false;
|
return false;
|
||||||
} else
|
} else
|
||||||
|
@ -2117,9 +2120,9 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
|
||||||
Offset = LdSt.getOperand(3).getImm() * Scale;
|
Offset = LdSt.getOperand(3).getImm() * Scale;
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(
|
assert((BaseOp->isReg() || BaseOp->isFI()) &&
|
||||||
BaseOp->isReg() &&
|
"getMemOperandWithOffset only supports base "
|
||||||
"getMemOperandWithOffset only supports base operands of type register.");
|
"operands of type register or frame index.");
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -2275,31 +2278,33 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scale the unscaled offsets. Returns false if the unscaled offset can't be
|
static unsigned getOffsetStride(unsigned Opc) {
|
||||||
// scaled.
|
|
||||||
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
|
|
||||||
unsigned OffsetStride = 1;
|
|
||||||
switch (Opc) {
|
switch (Opc) {
|
||||||
default:
|
default:
|
||||||
return false;
|
return 0;
|
||||||
case AArch64::LDURQi:
|
case AArch64::LDURQi:
|
||||||
case AArch64::STURQi:
|
case AArch64::STURQi:
|
||||||
OffsetStride = 16;
|
return 16;
|
||||||
break;
|
|
||||||
case AArch64::LDURXi:
|
case AArch64::LDURXi:
|
||||||
case AArch64::LDURDi:
|
case AArch64::LDURDi:
|
||||||
case AArch64::STURXi:
|
case AArch64::STURXi:
|
||||||
case AArch64::STURDi:
|
case AArch64::STURDi:
|
||||||
OffsetStride = 8;
|
return 8;
|
||||||
break;
|
|
||||||
case AArch64::LDURWi:
|
case AArch64::LDURWi:
|
||||||
case AArch64::LDURSi:
|
case AArch64::LDURSi:
|
||||||
case AArch64::LDURSWi:
|
case AArch64::LDURSWi:
|
||||||
case AArch64::STURWi:
|
case AArch64::STURWi:
|
||||||
case AArch64::STURSi:
|
case AArch64::STURSi:
|
||||||
OffsetStride = 4;
|
return 4;
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scale the unscaled offsets. Returns false if the unscaled offset can't be
|
||||||
|
// scaled.
|
||||||
|
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
|
||||||
|
unsigned OffsetStride = getOffsetStride(Opc);
|
||||||
|
if (OffsetStride == 0)
|
||||||
|
return false;
|
||||||
// If the byte-offset isn't a multiple of the stride, we can't scale this
|
// If the byte-offset isn't a multiple of the stride, we can't scale this
|
||||||
// offset.
|
// offset.
|
||||||
if (Offset % OffsetStride != 0)
|
if (Offset % OffsetStride != 0)
|
||||||
|
@ -2311,6 +2316,19 @@ static bool scaleOffset(unsigned Opc, int64_t &Offset) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unscale the scaled offsets. Returns false if the scaled offset can't be
|
||||||
|
// unscaled.
|
||||||
|
static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
|
||||||
|
unsigned OffsetStride = getOffsetStride(Opc);
|
||||||
|
if (OffsetStride == 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Convert the "element" offset used by scaled pair load/store instructions
|
||||||
|
// into the byte-offset used by unscaled.
|
||||||
|
Offset *= OffsetStride;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
|
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
|
||||||
if (FirstOpc == SecondOpc)
|
if (FirstOpc == SecondOpc)
|
||||||
return true;
|
return true;
|
||||||
|
@ -2329,6 +2347,30 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
|
||||||
|
int64_t Offset1, unsigned Opcode1, int FI2,
|
||||||
|
int64_t Offset2, unsigned Opcode2) {
|
||||||
|
// Accesses through fixed stack object frame indices may access a different
|
||||||
|
// fixed stack slot. Check that the object offsets + offsets match.
|
||||||
|
if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
|
||||||
|
int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
|
||||||
|
int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
|
||||||
|
assert(ObjectOffset1 >= ObjectOffset2 && "Object offsets are not ordered.");
|
||||||
|
// Get the byte-offset from the object offset.
|
||||||
|
if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
|
||||||
|
return false;
|
||||||
|
ObjectOffset1 += Offset1;
|
||||||
|
ObjectOffset2 += Offset2;
|
||||||
|
// Get the "element" index in the object.
|
||||||
|
if (!scaleOffset(Opcode1, ObjectOffset1) ||
|
||||||
|
!scaleOffset(Opcode2, ObjectOffset2))
|
||||||
|
return false;
|
||||||
|
return ObjectOffset2 + 1 == ObjectOffset1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return FI1 == FI2;
|
||||||
|
}
|
||||||
|
|
||||||
/// Detect opportunities for ldp/stp formation.
|
/// Detect opportunities for ldp/stp formation.
|
||||||
///
|
///
|
||||||
/// Only called for LdSt for which getMemOperandWithOffset returns true.
|
/// Only called for LdSt for which getMemOperandWithOffset returns true.
|
||||||
|
@ -2340,9 +2382,11 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
|
||||||
if (BaseOp1.getType() != BaseOp2.getType())
|
if (BaseOp1.getType() != BaseOp2.getType())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
assert(BaseOp1.isReg() && "Only base registers are supported.");
|
assert(BaseOp1.isReg() ||
|
||||||
|
BaseOp1.isFI() &&
|
||||||
|
"Only base registers and frame indices are supported.");
|
||||||
|
|
||||||
// Check for base regs.
|
// Check for both base regs and base FI.
|
||||||
if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
|
if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
@ -2379,7 +2423,17 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// The caller should already have ordered First/SecondLdSt by offset.
|
// The caller should already have ordered First/SecondLdSt by offset.
|
||||||
assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
|
// Note: except for non-equal frame index bases
|
||||||
|
assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
|
||||||
|
"Caller should have ordered offsets.");
|
||||||
|
|
||||||
|
if (BaseOp1.isFI()) {
|
||||||
|
const MachineFrameInfo &MFI =
|
||||||
|
FirstLdSt.getParent()->getParent()->getFrameInfo();
|
||||||
|
return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
|
||||||
|
BaseOp2.getIndex(), Offset2, SecondOpc);
|
||||||
|
}
|
||||||
|
|
||||||
return Offset1 + 1 == Offset2;
|
return Offset1 + 1 == Offset2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -259,9 +259,9 @@ define void @memset_12_stack() {
|
||||||
define void @memset_16_stack() {
|
define void @memset_16_stack() {
|
||||||
; CHECK-LABEL: memset_16_stack:
|
; CHECK-LABEL: memset_16_stack:
|
||||||
; CHECK: mov x8, #-6148914691236517206
|
; CHECK: mov x8, #-6148914691236517206
|
||||||
|
; CHECK-NEXT: str x8, [sp, #-32]!
|
||||||
; CHECK-NEXT: mov x0, sp
|
; CHECK-NEXT: mov x0, sp
|
||||||
; CHECK-NEXT: stp x8, x30, [sp, #8]
|
; CHECK-NEXT: stp x8, x30, [sp, #8]
|
||||||
; CHECK-NEXT: str x8, [sp]
|
|
||||||
; CHECK-NEXT: bl something
|
; CHECK-NEXT: bl something
|
||||||
%buf = alloca [16 x i8], align 1
|
%buf = alloca [16 x i8], align 1
|
||||||
%cast = bitcast [16 x i8]* %buf to i8*
|
%cast = bitcast [16 x i8]* %buf to i8*
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
#RUN: llc -mtriple=aarch64-- -mcpu=cyclone -run-pass machine-scheduler -o - %s | FileCheck %s
|
||||||
|
...
|
||||||
|
---
|
||||||
|
name: merge_stack
|
||||||
|
# CHECK-LABEL: name: merge_stack
|
||||||
|
tracksRegLiveness: true
|
||||||
|
stack:
|
||||||
|
- { id: 0, size: 64, alignment: 8 }
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $w0, $w1
|
||||||
|
|
||||||
|
%0:gpr32 = COPY $w0
|
||||||
|
%1:gpr32 = COPY $w1
|
||||||
|
undef %3.sub_32:gpr64 = ORRWrs $wzr, %0, 0
|
||||||
|
STRXui %3, %stack.0, 0 :: (store 8)
|
||||||
|
undef %5.sub_32:gpr64 = ORRWrs $wzr, %1, 0
|
||||||
|
STRXui %5, %stack.0, 1 :: (store 8)
|
||||||
|
RET_ReallyLR
|
||||||
|
|
||||||
|
; CHECK: COPY
|
||||||
|
; CHECK-NEXT: COPY
|
||||||
|
; CHECK-NEXT: ORRWrs
|
||||||
|
; CHECK-NEXT: ORRWrs
|
||||||
|
; CHECK-NEXT: STRXui
|
||||||
|
; CHECK-NEXT: STRXui
|
||||||
|
; CHECK-NEXT: RET
|
Loading…
Reference in New Issue