[CodeGen] Add support for multiple memory operands in MachineInstr::mayAlias

Summary:
To support all targets, the mayAlias member function needs to handle instructions with multiple memory operands.

This revision also changes the order of the emitted instructions in some test cases.
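The gist of the change: a pair of instructions is conservatively treated as aliasing unless every pair of their memory operands can be shown disjoint, and instructions with no memory operands at all stay "may alias". Below is a minimal standalone sketch of that idea using simplified stand-in types rather than the LLVM classes; MemOp, SIZE_UNKNOWN, mayOverlap and instrsMayAlias are illustrative names and not part of the patch.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Simplified stand-in for a memory operand: a base object id, a byte offset
// and an access width (SIZE_UNKNOWN when the width is not known).
struct MemOp {
  int BaseID;
  int64_t Offset;
  uint64_t Width;
};
constexpr uint64_t SIZE_UNKNOWN = ~0ULL;

// Conservative overlap test for two operands, mirroring the known-width
// fast path of the patch: the access starting at the lower offset overlaps
// the other one iff it extends past the higher offset.
static bool mayOverlap(const MemOp &A, const MemOp &B) {
  if (A.BaseID != B.BaseID)
    return false; // Distinct base objects never alias in this toy model.
  if (A.Width == SIZE_UNKNOWN || B.Width == SIZE_UNKNOWN)
    return true; // Unknown width: assume aliasing.
  int64_t MinOff = std::min(A.Offset, B.Offset);
  int64_t MaxOff = std::max(A.Offset, B.Offset);
  uint64_t LowWidth = (MinOff == A.Offset) ? A.Width : B.Width;
  return MinOff + (int64_t)LowWidth > MaxOff;
}

// Two instructions may alias unless *every* pair of their memory operands is
// provably disjoint; instructions without memory operands are treated
// conservatively, as in the patch.
bool instrsMayAlias(const std::vector<MemOp> &IA, const std::vector<MemOp> &IB) {
  if (IA.empty() || IB.empty())
    return true;
  for (const MemOp &A : IA)
    for (const MemOp &B : IB)
      if (mayOverlap(A, B))
        return true;
  return false;
}

int main() {
  // Same object: bytes [0,4) and [4,8) do not overlap, [0,4) and [2,6) do.
  assert(!instrsMayAlias({{0, 0, 4}}, {{0, 4, 4}}));
  assert(instrsMayAlias({{0, 0, 4}}, {{0, 2, 4}}));
  // An instruction with several memory operands aliases if any pair overlaps.
  assert(instrsMayAlias({{0, 0, 4}, {0, 8, 4}}, {{0, 8, 4}}));
  return 0;
}

The patch itself additionally falls back to an AAResults query (optionally carrying TBAA metadata) when the two operands refer to different IR values, which the sketch above omits.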

Reviewers: efriedma, hfinkel, craig.topper, dmgreen

Reviewed By: efriedma

Subscribers: MatzeB, dmgreen, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D80161
Author: Jean-Michel Gorius
Date: 2020-05-21 16:30:48 +02:00
Parent: 689e616ed0
Commit: 7019cea26d
13 changed files with 246 additions and 91 deletions


@@ -1228,81 +1228,88 @@ bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other,
   if (TII->areMemAccessesTriviallyDisjoint(*this, Other))
     return false;
-  // FIXME: Need to handle multiple memory operands to support all targets.
-  if (!hasOneMemOperand() || !Other.hasOneMemOperand())
+  if (memoperands_empty() || Other.memoperands_empty())
     return true;
-  MachineMemOperand *MMOa = *memoperands_begin();
-  MachineMemOperand *MMOb = *Other.memoperands_begin();
-  // The following interface to AA is fashioned after DAGCombiner::isAlias
-  // and operates with MachineMemOperand offset with some important
-  // assumptions:
-  //   - LLVM fundamentally assumes flat address spaces.
-  //   - MachineOperand offset can *only* result from legalization and
-  //     cannot affect queries other than the trivial case of overlap
-  //     checking.
-  //   - These offsets never wrap and never step outside
-  //     of allocated objects.
-  //   - There should never be any negative offsets here.
-  //
-  // FIXME: Modify API to hide this math from "user"
-  // Even before we go to AA we can reason locally about some
-  // memory objects. It can save compile time, and possibly catch some
-  // corner cases not currently covered.
-  int64_t OffsetA = MMOa->getOffset();
-  int64_t OffsetB = MMOb->getOffset();
-  int64_t MinOffset = std::min(OffsetA, OffsetB);
-  uint64_t WidthA = MMOa->getSize();
-  uint64_t WidthB = MMOb->getSize();
-  bool KnownWidthA = WidthA != MemoryLocation::UnknownSize;
-  bool KnownWidthB = WidthB != MemoryLocation::UnknownSize;
-  const Value *ValA = MMOa->getValue();
-  const Value *ValB = MMOb->getValue();
-  bool SameVal = (ValA && ValB && (ValA == ValB));
-  if (!SameVal) {
-    const PseudoSourceValue *PSVa = MMOa->getPseudoValue();
-    const PseudoSourceValue *PSVb = MMOb->getPseudoValue();
-    if (PSVa && ValB && !PSVa->mayAlias(&MFI))
-      return false;
-    if (PSVb && ValA && !PSVb->mayAlias(&MFI))
-      return false;
-    if (PSVa && PSVb && (PSVa == PSVb))
-      SameVal = true;
-  }
-  if (SameVal) {
-    if (!KnownWidthA || !KnownWidthB)
-      return true;
-    int64_t MaxOffset = std::max(OffsetA, OffsetB);
-    int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
-    return (MinOffset + LowWidth > MaxOffset);
-  }
-  if (!AA)
-    return true;
-  if (!ValA || !ValB)
-    return true;
-  assert((OffsetA >= 0) && "Negative MachineMemOperand offset");
-  assert((OffsetB >= 0) && "Negative MachineMemOperand offset");
-  int64_t OverlapA = KnownWidthA ? WidthA + OffsetA - MinOffset
-                                 : MemoryLocation::UnknownSize;
-  int64_t OverlapB = KnownWidthB ? WidthB + OffsetB - MinOffset
-                                 : MemoryLocation::UnknownSize;
-  AliasResult AAResult = AA->alias(
-      MemoryLocation(ValA, OverlapA,
-                     UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
-      MemoryLocation(ValB, OverlapB,
-                     UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
-  return (AAResult != NoAlias);
+  auto HasAlias = [&](const MachineMemOperand &MMOa,
+                      const MachineMemOperand &MMOb) {
+    // The following interface to AA is fashioned after DAGCombiner::isAlias
+    // and operates with MachineMemOperand offset with some important
+    // assumptions:
+    //   - LLVM fundamentally assumes flat address spaces.
+    //   - MachineOperand offset can *only* result from legalization and
+    //     cannot affect queries other than the trivial case of overlap
+    //     checking.
+    //   - These offsets never wrap and never step outside
+    //     of allocated objects.
+    //   - There should never be any negative offsets here.
+    //
+    // FIXME: Modify API to hide this math from "user"
+    // Even before we go to AA we can reason locally about some
+    // memory objects. It can save compile time, and possibly catch some
+    // corner cases not currently covered.
+    int64_t OffsetA = MMOa.getOffset();
+    int64_t OffsetB = MMOb.getOffset();
+    int64_t MinOffset = std::min(OffsetA, OffsetB);
+    uint64_t WidthA = MMOa.getSize();
+    uint64_t WidthB = MMOb.getSize();
+    bool KnownWidthA = WidthA != MemoryLocation::UnknownSize;
+    bool KnownWidthB = WidthB != MemoryLocation::UnknownSize;
+    const Value *ValA = MMOa.getValue();
+    const Value *ValB = MMOb.getValue();
+    bool SameVal = (ValA && ValB && (ValA == ValB));
+    if (!SameVal) {
+      const PseudoSourceValue *PSVa = MMOa.getPseudoValue();
+      const PseudoSourceValue *PSVb = MMOb.getPseudoValue();
+      if (PSVa && ValB && !PSVa->mayAlias(&MFI))
+        return false;
+      if (PSVb && ValA && !PSVb->mayAlias(&MFI))
+        return false;
+      if (PSVa && PSVb && (PSVa == PSVb))
+        SameVal = true;
+    }
+    if (SameVal) {
+      if (!KnownWidthA || !KnownWidthB)
+        return true;
+      int64_t MaxOffset = std::max(OffsetA, OffsetB);
+      int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
+      return (MinOffset + LowWidth > MaxOffset);
+    }
+    if (!AA)
+      return true;
+    if (!ValA || !ValB)
+      return true;
+    assert((OffsetA >= 0) && "Negative MachineMemOperand offset");
+    assert((OffsetB >= 0) && "Negative MachineMemOperand offset");
+    int64_t OverlapA = KnownWidthA ? WidthA + OffsetA - MinOffset
+                                   : MemoryLocation::UnknownSize;
+    int64_t OverlapB = KnownWidthB ? WidthB + OffsetB - MinOffset
+                                   : MemoryLocation::UnknownSize;
+    AliasResult AAResult =
+        AA->alias(MemoryLocation(ValA, OverlapA,
+                                 UseTBAA ? MMOa.getAAInfo() : AAMDNodes()),
+                  MemoryLocation(ValB, OverlapB,
+                                 UseTBAA ? MMOb.getAAInfo() : AAMDNodes()));
+    return (AAResult != NoAlias);
+  };
+  for (auto &&MMOa : memoperands()) {
+    for (auto &&MMOb : Other.memoperands()) {
+      if (HasAlias(*MMOa, *MMOb))
+        return true;
+    }
+  }
+  return false;
 }
 
 /// hasOrderedMemoryRef - Return true if this instruction may have an ordered


@@ -544,9 +544,14 @@ static inline bool isGlobalMemoryObject(AAResults *AA, MachineInstr *MI) {
 void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
                                             unsigned Latency) {
   if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
+    LLVM_DEBUG(dbgs() << "Adding chain dependency\n from: " << *SUb->getInstr()
+                      << " to: " << *SUa->getInstr());
     SDep Dep(SUa, SDep::MayAliasMem);
     Dep.setLatency(Latency);
     SUb->addPred(Dep);
+  } else {
+    LLVM_DEBUG(dbgs() << "Not adding chain dependency\n from: "
+                      << *SUb->getInstr() << " to: " << *SUa->getInstr());
   }
 }


@@ -19,11 +19,11 @@ define void @test(%struct1* %fde, i32 %fd, void (i32, i32, i8*)* %func, i8* %arg
; A53-NEXT: mov x19, x8
; A53-NEXT: mov w0, w1
; A53-NEXT: mov w9, #256
; A53-NEXT: stp x2, x3, [x8, #32]
; A53-NEXT: mov x2, x8
; A53-NEXT: str q0, [x19, #16]!
; A53-NEXT: str w1, [x19]
; A53-NEXT: mov w1, #4
; A53-NEXT: stp x2, x3, [x8, #32]
; A53-NEXT: mov x2, x8
; A53-NEXT: str q0, [x8]
; A53-NEXT: strh w9, [x8, #24]
; A53-NEXT: str wzr, [x8, #20]


@@ -503,12 +503,12 @@ define void @conv_v8f16_to_i128( <8 x half> %a, i128* %store ) {
; CHECK-NEXT: vmov.32 r3, d16[1]
; CHECK-NEXT: vmov.32 r1, d16[0]
; CHECK-NEXT: subs r12, r12, #1
; CHECK-NEXT: str r12, [r0, #12]
; CHECK-NEXT: sbcs r2, r2, #0
; CHECK-NEXT: str r2, [r0, #8]
; CHECK-NEXT: sbcs r3, r3, #0
; CHECK-NEXT: sbc r1, r1, #0
; CHECK-NEXT: stm r0, {r1, r3}
; CHECK-NEXT: str r2, [r0, #8]
; CHECK-NEXT: str r12, [r0, #12]
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:


@@ -9,7 +9,7 @@
; CHECK: ********** MI Scheduling **********
; We need second, post-ra scheduling to have VLDM instruction combined from single-loads
; CHECK: ********** MI Scheduling **********
-; CHECK: VLDMDIA_UPD
+; CHECK: SU(1):{{.*}}VLDMDIA_UPD
; CHECK: rdefs left
; CHECK-NEXT: Latency : 6
; CHECK: Successors:


@@ -5,7 +5,7 @@
; We need second, post-ra scheduling to have VSTM instruction combined from single-stores
; CHECK: ********** MI Scheduling **********
; CHECK: schedule starting
-; CHECK: VSTMDIA_UPD
+; CHECK: SU(2):{{.*}}VSTMDIA_UPD
; CHECK: rdefs left
; CHECK-NEXT: Latency : 4
; CHECK: Successors:


@@ -5,7 +5,7 @@
; We need second, post-ra scheduling to have VSTM instruction combined from single-stores
; CHECK: ********** MI Scheduling **********
; CHECK: schedule starting
-; CHECK: VSTMDIA
+; CHECK: SU(3):{{.*}}VSTMDIA
; CHECK: rdefs left
; CHECK-NEXT: Latency : 2


@@ -1092,6 +1092,7 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
; CHECK-NEXT: ldrd lr, r10, [r12, #24]
; CHECK-NEXT: vstrb.8 q0, [r11], #16
; CHECK-NEXT: vldrw.u32 q0, [r8], #32
; CHECK-NEXT: strd r11, r1, [sp, #24] @ 8-byte Folded Spill
; CHECK-NEXT: vldrw.u32 q1, [r8, #-28]
; CHECK-NEXT: vmul.f32 q0, q0, r0
; CHECK-NEXT: vldrw.u32 q6, [r8, #-24]
@@ -1103,13 +1104,12 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
; CHECK-NEXT: vfma.f32 q0, q4, r6
; CHECK-NEXT: vldrw.u32 q3, [r8, #-8]
; CHECK-NEXT: vfma.f32 q0, q5, r5
; CHECK-NEXT: vldrw.u32 q1, [r8, #-4]
; CHECK-NEXT: vfma.f32 q0, q2, r3
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vfma.f32 q0, q2, r3
; CHECK-NEXT: vldrw.u32 q1, [r8, #-4]
; CHECK-NEXT: vfma.f32 q0, q3, lr
; CHECK-NEXT: strd r11, r1, [sp, #24] @ 8-byte Folded Spill
; CHECK-NEXT: vfma.f32 q0, q1, r10
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: vfma.f32 q0, q1, r10
; CHECK-NEXT: blo .LBB16_7
; CHECK-NEXT: @ %bb.5: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1


@@ -168,16 +168,14 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: vmov q1, q4
; CHECK-NEXT: vmov s1, r7
; CHECK-NEXT: vmov.32 q1[1], r6
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: vmov.32 q1[2], r5
; CHECK-NEXT: vmov.32 q5[0], r7
; CHECK-NEXT: vmov.32 q1[2], r5
; CHECK-NEXT: vmov s9, r4
; CHECK-NEXT: vmov.32 q1[3], r4
; CHECK-NEXT: strd r0, r10, [sp, #24]
; CHECK-NEXT: vdup.32 q6, r7
; CHECK-NEXT: vstrw.32 q1, [sp, #76]
; CHECK-NEXT: vmov q1, q5
; CHECK-NEXT: vmov s9, r4
; CHECK-NEXT: vmov.32 q1[1], r7
; CHECK-NEXT: vdup.32 q6, r7
; CHECK-NEXT: vmov.f32 s2, s1
; CHECK-NEXT: vmov.f32 s8, s0
; CHECK-NEXT: vmov.32 q1[2], r6
@@ -185,6 +183,7 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: vmov q7, q6
; CHECK-NEXT: vmov.f32 s10, s1
; CHECK-NEXT: mov.w r8, #4
; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: vmov.32 q1[3], r4
; CHECK-NEXT: vmov.32 q3[0], r4
; CHECK-NEXT: vmov.32 q7[1], r4
@@ -192,6 +191,7 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: vmov.f32 s11, s3
; CHECK-NEXT: movs r1, #64
; CHECK-NEXT: strh.w r8, [sp, #390]
; CHECK-NEXT: strd r0, r10, [sp, #24]
; CHECK-NEXT: vstrw.32 q0, [sp, #44]
; CHECK-NEXT: str r0, [r0]
; CHECK-NEXT: vstrw.32 q2, [r0]


@@ -24,8 +24,8 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
; CHECK-NEXT: vmov.f32 s9, s6
; CHECK-NEXT: vmov.f32 s10, s0
; CHECK-NEXT: vmov.f32 s11, s5
; CHECK-NEXT: strd r2, r0, [r1, #16]
; CHECK-NEXT: vstrw.32 q2, [r1]
; CHECK-NEXT: strd r2, r0, [r1, #16]
; CHECK-NEXT: pop {r4, pc}
entry:
%s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0


@@ -8,17 +8,17 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; THUMBV7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; THUMBV7-NEXT: .pad #44
; THUMBV7-NEXT: sub sp, #44
; THUMBV7-NEXT: ldrd r4, r7, [sp, #88]
; THUMBV7-NEXT: mov r5, r3
; THUMBV7-NEXT: str r0, [sp, #40] @ 4-byte Spill
; THUMBV7-NEXT: movs r0, #0
; THUMBV7-NEXT: strd r4, r7, [sp]
; THUMBV7-NEXT: mov r1, r3
; THUMBV7-NEXT: ldrd r4, r7, [sp, #88]
; THUMBV7-NEXT: mov r5, r3
; THUMBV7-NEXT: strd r0, r0, [sp, #8]
; THUMBV7-NEXT: mov r1, r3
; THUMBV7-NEXT: mov r6, r2
; THUMBV7-NEXT: mov r0, r2
; THUMBV7-NEXT: movs r2, #0
; THUMBV7-NEXT: movs r3, #0
; THUMBV7-NEXT: strd r4, r7, [sp]
; THUMBV7-NEXT: bl __multi3
; THUMBV7-NEXT: strd r1, r0, [sp, #32] @ 8-byte Folded Spill
; THUMBV7-NEXT: strd r3, r2, [sp, #24] @ 8-byte Folded Spill


@@ -0,0 +1,144 @@
# RUN: llc -mtriple=i686-- -o - -run-pass=machine-scheduler -debug %s 2>&1 | FileCheck %s
# REQUIRES: asserts
--- |
%struct.Macroblock.0.1.2.3.6.17 = type { i32, i32, i32, i32, i32, [8 x i32], %struct.Macroblock.0.1.2.3.6.17*, %struct.Macroblock.0.1.2.3.6.17*, i32, [2 x [4 x [4 x [2 x i32]]]], [16 x i8], [16 x i8], i32, i64, [4 x i32], [4 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i16, double, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
define void @stepsystem(i32 %x) {
entry:
%0 = load i32, i32* undef, align 8
%inc = add i32 %x, 1
store i32 %inc, i32* undef, align 8
store <2 x double> <double 0xD47D42AEA2879F2E, double 0xD47D42AEA2879F2E>, <2 x double>* undef, align 8
ret void
}
define void @dct_chroma() {
cond_true2732.preheader:
%tmp2666 = getelementptr %struct.Macroblock.0.1.2.3.6.17, %struct.Macroblock.0.1.2.3.6.17* null, i32 0, i32 13
%tmp2667.us.us = load i64, i64* %tmp2666, align 4
%tmp2670.us.us = load i64, i64* null, align 4
%tmp2675.us.us = shl i64 %tmp2670.us.us, 0
%tmp2675not.us.us = xor i64 %tmp2675.us.us, -1
%tmp2676.us.us = and i64 %tmp2667.us.us, %tmp2675not.us.us
store i64 %tmp2676.us.us, i64* %tmp2666, align 4
ret void
}
...
---
name: stepsystem
alignment: 16
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers:
- { id: 0, class: gr32, preferred-register: '' }
- { id: 1, class: gr32, preferred-register: '' }
- { id: 2, class: gr32, preferred-register: '' }
- { id: 3, class: gr32, preferred-register: '' }
- { id: 4, class: gr32, preferred-register: '' }
liveins: []
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 4294967295
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack:
- { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: default,
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
stack: []
callSites: []
constants: []
machineFunctionInfo: {}
body: |
bb.0.entry:
%1:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0)
%1:gr32 = INC32r %1, implicit-def dead $eflags
MOV32mr undef %2:gr32, 1, $noreg, 0, $noreg, %1 :: (store 4 into `i32* undef`, align 8)
MOV32mi undef %3:gr32, 1, $noreg, 0, $noreg, -729988434 :: (store 4 into `<2 x double>* undef` + 12)
MOV32mi undef %4:gr32, 1, $noreg, 0, $noreg, -1568170194 :: (store 4 into `<2 x double>* undef` + 8, align 8)
RET 0
# CHECK-LABEL: stepsystem
# CHECK: Not adding chain dependency{{[[:space:]]*}}from: MOV32mi {{.*}} :: (store 4 {{.*}}){{[[:space:]]*}}to: MOV32mi {{.*}} :: (store 4 {{.*}})
# CHECK: Adding chain dependency{{[[:space:]]*}}from: MOV32mi {{.*}} :: (store 4 {{.*}}){{[[:space:]]*}}to: MOV32mr {{.*}} :: (store 4 {{.*}})
...
---
name: dct_chroma
alignment: 16
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers:
- { id: 0, class: gr32, preferred-register: '' }
- { id: 1, class: gr32, preferred-register: '' }
- { id: 2, class: gr32, preferred-register: '' }
- { id: 3, class: gr32, preferred-register: '' }
- { id: 4, class: gr32, preferred-register: '' }
liveins: []
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 1
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 4294967295
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack: []
callSites: []
constants: []
machineFunctionInfo: {}
body: |
bb.0.cond_true2732.preheader:
%4:gr32 = MOV32rm $noreg, 1, $noreg, 0, $noreg :: (load 4 from `i64* null`)
%2:gr32 = MOV32rm $noreg, 1, $noreg, 4, $noreg :: (load 4 from `i64* null` + 4)
%2:gr32 = NOT32r %2
%4:gr32 = NOT32r %4
%4:gr32 = AND32rm %4, $noreg, 1, $noreg, 356, $noreg, implicit-def dead $eflags :: (load 4 from %ir.tmp2666)
AND32mr $noreg, 1, $noreg, 360, $noreg, %2, implicit-def dead $eflags :: (store 4 into %ir.tmp2666 + 4), (load 4 from %ir.tmp2666 + 4)
MOV32mr $noreg, 1, $noreg, 356, $noreg, %4 :: (store 4 into %ir.tmp2666)
RET 0
# Chain dependencies should not be systematically added when at least one of
# the instructions has more than one memory operand. They should only be added
# where they are actually needed.
# CHECK-LABEL: dct_chroma
# CHECK: Not adding chain dependency{{[[:space:]]*}}from: MOV32mr {{.*}} :: (store 4 {{.*}}){{[[:space:]]*}}to: AND32mr {{.*}} :: (store 4 {{.*}}), (load 4 {{.*}})
# CHECK: Adding chain dependency{{[[:space:]]*}}from: AND32mr {{.*}} :: (store 4 {{.*}}), (load 4 {{.*}}){{[[:space:]]*}}to: %{{.*}} = MOV32rm {{.*}} :: (load 4 {{.*}})


@@ -17,13 +17,12 @@ cond_true2732.preheader: ; preds = %entry
store i64 %tmp2676.us.us, i64* %tmp2666
ret i32 0
; INTEL: and {{e..}}, dword ptr [356]
; INTEL: and dword ptr [360], {{e..}}
; FIXME: mov dword ptr [356], {{e..}}
; The above line comes out as 'mov 360, eax', but when the register is ecx it works?
; INTEL: and {{e..}}, dword ptr [356]
; INTEL: mov dword ptr [356], {{e..}}
; ATT: andl 356, %{{e..}}
; ATT: andl %{{e..}}, 360
; ATT: andl 356, %{{e..}}
; ATT: movl %{{e..}}, 356
}