Revert "[CodeGen] Add support for multiple memory operands in MachineInstr::mayAlias"

This temporarily reverts commit 7019cea26d. It seems that, for some targets, there are instructions with a lot of memory operands (probably more than would be expected). This causes a lot of buildbots to timeout and notify failed builds. While investigations are ongoing to find out why this happens, revert the changes.
2020-05-22 21:26:46 +02:00 · 2020-05-22 21:26:46 +02:00 · 65cd2c7a80
parent 8cb7574541
commit 65cd2c7a80
13 changed files with 93 additions and 248 deletions
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@ -1228,88 +1228,81 @@ bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other,
  if (TII->areMemAccessesTriviallyDisjoint(*this, Other))
    return false;

-  if (memoperands_empty() || Other.memoperands_empty())
+  // FIXME: Need to handle multiple memory operands to support all targets.
+  if (!hasOneMemOperand() || !Other.hasOneMemOperand())
    return true;

-  auto HasAlias = [&](const MachineMemOperand &MMOa,
-                      const MachineMemOperand &MMOb) {
-    // The following interface to AA is fashioned after DAGCombiner::isAlias
-    // and operates with MachineMemOperand offset with some important
-    // assumptions:
-    //   - LLVM fundamentally assumes flat address spaces.
-    //   - MachineOperand offset can *only* result from legalization and
-    //     cannot affect queries other than the trivial case of overlap
-    //     checking.
-    //   - These offsets never wrap and never step outside
-    //     of allocated objects.
-    //   - There should never be any negative offsets here.
-    //
-    // FIXME: Modify API to hide this math from "user"
-    // Even before we go to AA we can reason locally about some
-    // memory objects. It can save compile time, and possibly catch some
-    // corner cases not currently covered.
+  MachineMemOperand *MMOa = *memoperands_begin();
+  MachineMemOperand *MMOb = *Other.memoperands_begin();

-    int64_t OffsetA = MMOa.getOffset();
-    int64_t OffsetB = MMOb.getOffset();
-    int64_t MinOffset = std::min(OffsetA, OffsetB);
+  // The following interface to AA is fashioned after DAGCombiner::isAlias
+  // and operates with MachineMemOperand offset with some important
+  // assumptions:
+  //   - LLVM fundamentally assumes flat address spaces.
+  //   - MachineOperand offset can *only* result from legalization and
+  //     cannot affect queries other than the trivial case of overlap
+  //     checking.
+  //   - These offsets never wrap and never step outside
+  //     of allocated objects.
+  //   - There should never be any negative offsets here.
+  //
+  // FIXME: Modify API to hide this math from "user"
+  // Even before we go to AA we can reason locally about some
+  // memory objects. It can save compile time, and possibly catch some
+  // corner cases not currently covered.

-    uint64_t WidthA = MMOa.getSize();
-    uint64_t WidthB = MMOb.getSize();
-    bool KnownWidthA = WidthA != MemoryLocation::UnknownSize;
-    bool KnownWidthB = WidthB != MemoryLocation::UnknownSize;
+  int64_t OffsetA = MMOa->getOffset();
+  int64_t OffsetB = MMOb->getOffset();
+  int64_t MinOffset = std::min(OffsetA, OffsetB);

-    const Value *ValA = MMOa.getValue();
-    const Value *ValB = MMOb.getValue();
-    bool SameVal = (ValA && ValB && (ValA == ValB));
-    if (!SameVal) {
-      const PseudoSourceValue *PSVa = MMOa.getPseudoValue();
-      const PseudoSourceValue *PSVb = MMOb.getPseudoValue();
-      if (PSVa && ValB && !PSVa->mayAlias(&MFI))
-        return false;
-      if (PSVb && ValA && !PSVb->mayAlias(&MFI))
-        return false;
-      if (PSVa && PSVb && (PSVa == PSVb))
-        SameVal = true;
-    }
+  uint64_t WidthA = MMOa->getSize();
+  uint64_t WidthB = MMOb->getSize();
+  bool KnownWidthA = WidthA != MemoryLocation::UnknownSize;
+  bool KnownWidthB = WidthB != MemoryLocation::UnknownSize;

-    if (SameVal) {
-      if (!KnownWidthA || !KnownWidthB)
-        return true;
-      int64_t MaxOffset = std::max(OffsetA, OffsetB);
-      int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
-      return (MinOffset + LowWidth > MaxOffset);
-    }
-
-    if (!AA)
-      return true;
-
-    if (!ValA || !ValB)
-      return true;
-
-    assert((OffsetA >= 0) && "Negative MachineMemOperand offset");
-    assert((OffsetB >= 0) && "Negative MachineMemOperand offset");
-
-    int64_t OverlapA = KnownWidthA ? WidthA + OffsetA - MinOffset
-                                   : MemoryLocation::UnknownSize;
-    int64_t OverlapB = KnownWidthB ? WidthB + OffsetB - MinOffset
-                                   : MemoryLocation::UnknownSize;
-
-    AliasResult AAResult =
-        AA->alias(MemoryLocation(ValA, OverlapA,
-                                 UseTBAA ? MMOa.getAAInfo() : AAMDNodes()),
-                  MemoryLocation(ValB, OverlapB,
-                                 UseTBAA ? MMOb.getAAInfo() : AAMDNodes()));
-
-    return (AAResult != NoAlias);
-  };
-
-  for (auto &&MMOa : memoperands()) {
-    for (auto &&MMOb : Other.memoperands()) {
-      if (HasAlias(*MMOa, *MMOb))
-        return true;
-    }
+  const Value *ValA = MMOa->getValue();
+  const Value *ValB = MMOb->getValue();
+  bool SameVal = (ValA && ValB && (ValA == ValB));
+  if (!SameVal) {
+    const PseudoSourceValue *PSVa = MMOa->getPseudoValue();
+    const PseudoSourceValue *PSVb = MMOb->getPseudoValue();
+    if (PSVa && ValB && !PSVa->mayAlias(&MFI))
+      return false;
+    if (PSVb && ValA && !PSVb->mayAlias(&MFI))
+      return false;
+    if (PSVa && PSVb && (PSVa == PSVb))
+      SameVal = true;
  }
-  return false;
+
+  if (SameVal) {
+    if (!KnownWidthA || !KnownWidthB)
+      return true;
+    int64_t MaxOffset = std::max(OffsetA, OffsetB);
+    int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
+    return (MinOffset + LowWidth > MaxOffset);
+  }
+
+  if (!AA)
+    return true;
+
+  if (!ValA || !ValB)
+    return true;
+
+  assert((OffsetA >= 0) && "Negative MachineMemOperand offset");
+  assert((OffsetB >= 0) && "Negative MachineMemOperand offset");
+
+  int64_t OverlapA = KnownWidthA ? WidthA + OffsetA - MinOffset
+                                 : MemoryLocation::UnknownSize;
+  int64_t OverlapB = KnownWidthB ? WidthB + OffsetB - MinOffset
+                                 : MemoryLocation::UnknownSize;
+
+  AliasResult AAResult = AA->alias(
+      MemoryLocation(ValA, OverlapA,
+                     UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
+      MemoryLocation(ValB, OverlapB,
+                     UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
+
+  return (AAResult != NoAlias);
 }

 /// hasOrderedMemoryRef - Return true if this instruction may have an ordered
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@ -544,14 +544,9 @@ static inline bool isGlobalMemoryObject(AAResults *AA, MachineInstr *MI) {
 void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
                                            unsigned Latency) {
  if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
-    LLVM_DEBUG(dbgs() << "Adding chain dependency\n  from: " << *SUb->getInstr()
-                      << "  to:   " << *SUa->getInstr());
    SDep Dep(SUa, SDep::MayAliasMem);
    Dep.setLatency(Latency);
    SUb->addPred(Dep);
-  } else {
-    LLVM_DEBUG(dbgs() << "Not adding chain dependency\n  from: "
-                      << *SUb->getInstr() << "  to:   " << *SUa->getInstr());
  }
 }

--- a/llvm/test/CodeGen/AArch64/merge-store-dependency.ll
+++ b/llvm/test/CodeGen/AArch64/merge-store-dependency.ll
@ -19,11 +19,11 @@ define void @test(%struct1* %fde, i32 %fd, void (i32, i32, i8*)* %func, i8* %arg
 ; A53-NEXT:    mov x19, x8
 ; A53-NEXT:    mov w0, w1
 ; A53-NEXT:    mov w9, #256
-; A53-NEXT:    stp x2, x3, [x8, #32]
-; A53-NEXT:    mov x2, x8
 ; A53-NEXT:    str q0, [x19, #16]!
 ; A53-NEXT:    str w1, [x19]
 ; A53-NEXT:    mov w1, #4
+; A53-NEXT:    stp x2, x3, [x8, #32]
+; A53-NEXT:    mov x2, x8
 ; A53-NEXT:    str q0, [x8]
 ; A53-NEXT:    strh w9, [x8, #24]
 ; A53-NEXT:    str wzr, [x8, #20]
--- a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll
+++ b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll
@ -503,12 +503,12 @@ define void @conv_v8f16_to_i128( <8 x half> %a, i128* %store ) {
 ; CHECK-NEXT:    vmov.32 r3, d16[1]
 ; CHECK-NEXT:    vmov.32 r1, d16[0]
 ; CHECK-NEXT:    subs r12, r12, #1
-; CHECK-NEXT:    str r12, [r0, #12]
 ; CHECK-NEXT:    sbcs r2, r2, #0
-; CHECK-NEXT:    str r2, [r0, #8]
 ; CHECK-NEXT:    sbcs r3, r3, #0
 ; CHECK-NEXT:    sbc r1, r1, #0
 ; CHECK-NEXT:    stm r0, {r1, r3}
+; CHECK-NEXT:    str r2, [r0, #8]
+; CHECK-NEXT:    str r12, [r0, #12]
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.1:
--- a/llvm/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
+++ b/llvm/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
@ -9,7 +9,7 @@
 ; CHECK:       ********** MI Scheduling **********
 ; We need second, post-ra scheduling to have VLDM instruction combined from single-loads
 ; CHECK:       ********** MI Scheduling **********
-; CHECK:       SU(1):{{.*}}VLDMDIA_UPD
+; CHECK:       VLDMDIA_UPD
 ; CHECK:       rdefs left
 ; CHECK-NEXT:  Latency            : 6
 ; CHECK:       Successors:
--- a/llvm/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
+++ b/llvm/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
@ -5,7 +5,7 @@
 ; We need second, post-ra scheduling to have VSTM instruction combined from single-stores
 ; CHECK:       ********** MI Scheduling **********
 ; CHECK:       schedule starting
-; CHECK:       SU(2):{{.*}}VSTMDIA_UPD
+; CHECK:       VSTMDIA_UPD
 ; CHECK:       rdefs left
 ; CHECK-NEXT:  Latency            : 4
 ; CHECK:       Successors:
--- a/llvm/test/CodeGen/ARM/cortex-a57-misched-vstm.ll
+++ b/llvm/test/CodeGen/ARM/cortex-a57-misched-vstm.ll
@ -5,7 +5,7 @@
 ; We need second, post-ra scheduling to have VSTM instruction combined from single-stores
 ; CHECK:       ********** MI Scheduling **********
 ; CHECK:       schedule starting
-; CHECK:       SU(3):{{.*}}VSTMDIA
+; CHECK:       VSTMDIA
 ; CHECK:       rdefs left
 ; CHECK-NEXT:  Latency            : 2

--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@ -1092,7 +1092,6 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    ldrd lr, r10, [r12, #24]
 ; CHECK-NEXT:    vstrb.8 q0, [r11], #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r8], #32
-; CHECK-NEXT:    strd r11, r1, [sp, #24] @ 8-byte Folded Spill
 ; CHECK-NEXT:    vldrw.u32 q1, [r8, #-28]
 ; CHECK-NEXT:    vmul.f32 q0, q0, r0
 ; CHECK-NEXT:    vldrw.u32 q6, [r8, #-24]
@ -1104,12 +1103,13 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    vfma.f32 q0, q4, r6
 ; CHECK-NEXT:    vldrw.u32 q3, [r8, #-8]
 ; CHECK-NEXT:    vfma.f32 q0, q5, r5
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    vfma.f32 q0, q2, r3
 ; CHECK-NEXT:    vldrw.u32 q1, [r8, #-4]
+; CHECK-NEXT:    vfma.f32 q0, q2, r3
+; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vfma.f32 q0, q3, lr
-; CHECK-NEXT:    cmp r0, #16
+; CHECK-NEXT:    strd r11, r1, [sp, #24] @ 8-byte Folded Spill
 ; CHECK-NEXT:    vfma.f32 q0, q1, r10
+; CHECK-NEXT:    cmp r0, #16
 ; CHECK-NEXT:    blo .LBB16_7
 ; CHECK-NEXT:  @ %bb.5: @ %for.body.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@ -168,14 +168,16 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT:    vmov q1, q4
 ; CHECK-NEXT:    vmov s1, r7
 ; CHECK-NEXT:    vmov.32 q1[1], r6
-; CHECK-NEXT:    vmov.32 q5[0], r7
+; CHECK-NEXT:    mov.w r10, #0
 ; CHECK-NEXT:    vmov.32 q1[2], r5
-; CHECK-NEXT:    vmov s9, r4
+; CHECK-NEXT:    vmov.32 q5[0], r7
 ; CHECK-NEXT:    vmov.32 q1[3], r4
-; CHECK-NEXT:    vdup.32 q6, r7
+; CHECK-NEXT:    strd r0, r10, [sp, #24]
 ; CHECK-NEXT:    vstrw.32 q1, [sp, #76]
 ; CHECK-NEXT:    vmov q1, q5
+; CHECK-NEXT:    vmov s9, r4
 ; CHECK-NEXT:    vmov.32 q1[1], r7
+; CHECK-NEXT:    vdup.32 q6, r7
 ; CHECK-NEXT:    vmov.f32 s2, s1
 ; CHECK-NEXT:    vmov.f32 s8, s0
 ; CHECK-NEXT:    vmov.32 q1[2], r6
@ -183,7 +185,6 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT:    vmov q7, q6
 ; CHECK-NEXT:    vmov.f32 s10, s1
 ; CHECK-NEXT:    mov.w r8, #4
-; CHECK-NEXT:    mov.w r10, #0
 ; CHECK-NEXT:    vmov.32 q1[3], r4
 ; CHECK-NEXT:    vmov.32 q3[0], r4
 ; CHECK-NEXT:    vmov.32 q7[1], r4
@ -191,7 +192,6 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT:    vmov.f32 s11, s3
 ; CHECK-NEXT:    movs r1, #64
 ; CHECK-NEXT:    strh.w r8, [sp, #390]
-; CHECK-NEXT:    strd r0, r10, [sp, #24]
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #44]
 ; CHECK-NEXT:    str r0, [r0]
 ; CHECK-NEXT:    vstrw.32 q2, [r0]
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@ -24,8 +24,8 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) {
 ; CHECK-NEXT:    vmov.f32 s9, s6
 ; CHECK-NEXT:    vmov.f32 s10, s0
 ; CHECK-NEXT:    vmov.f32 s11, s5
-; CHECK-NEXT:    vstrw.32 q2, [r1]
 ; CHECK-NEXT:    strd r2, r0, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q2, [r1]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
  %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
--- a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
@ -8,17 +8,17 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; THUMBV7-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; THUMBV7-NEXT:    .pad #44
 ; THUMBV7-NEXT:    sub sp, #44
-; THUMBV7-NEXT:    str r0, [sp, #40] @ 4-byte Spill
-; THUMBV7-NEXT:    movs r0, #0
 ; THUMBV7-NEXT:    ldrd r4, r7, [sp, #88]
 ; THUMBV7-NEXT:    mov r5, r3
-; THUMBV7-NEXT:    strd r0, r0, [sp, #8]
+; THUMBV7-NEXT:    str r0, [sp, #40] @ 4-byte Spill
+; THUMBV7-NEXT:    movs r0, #0
+; THUMBV7-NEXT:    strd r4, r7, [sp]
 ; THUMBV7-NEXT:    mov r1, r3
+; THUMBV7-NEXT:    strd r0, r0, [sp, #8]
 ; THUMBV7-NEXT:    mov r6, r2
 ; THUMBV7-NEXT:    mov r0, r2
 ; THUMBV7-NEXT:    movs r2, #0
 ; THUMBV7-NEXT:    movs r3, #0
-; THUMBV7-NEXT:    strd r4, r7, [sp]
 ; THUMBV7-NEXT:    bl __multi3
 ; THUMBV7-NEXT:    strd r1, r0, [sp, #32] @ 8-byte Folded Spill
 ; THUMBV7-NEXT:    strd r3, r2, [sp, #24] @ 8-byte Folded Spill
--- a/llvm/test/CodeGen/X86/instr-sched-multiple-memops.mir
+++ b/llvm/test/CodeGen/X86/instr-sched-multiple-memops.mir
@ -1,144 +0,0 @@
-# RUN: llc -mtriple=i686-- -o - -run-pass=machine-scheduler -debug %s 2>&1 | FileCheck %s
-# REQUIRES: asserts
-
--- |
-  %struct.Macroblock.0.1.2.3.6.17 = type { i32, i32, i32, i32, i32, [8 x i32], %struct.Macroblock.0.1.2.3.6.17*, %struct.Macroblock.0.1.2.3.6.17*, i32, [2 x [4 x [4 x [2 x i32]]]], [16 x i8], [16 x i8], i32, i64, [4 x i32], [4 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i16, double, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
-  
-  define void @stepsystem(i32 %x) {
-  entry:
-    %0 = load i32, i32* undef, align 8
-    %inc = add i32 %x, 1
-    store i32 %inc, i32* undef, align 8
-    store <2 x double> <double 0xD47D42AEA2879F2E, double 0xD47D42AEA2879F2E>, <2 x double>* undef, align 8
-    ret void
-  }
-  
-  define void @dct_chroma() {
-  cond_true2732.preheader:
-    %tmp2666 = getelementptr %struct.Macroblock.0.1.2.3.6.17, %struct.Macroblock.0.1.2.3.6.17* null, i32 0, i32 13
-    %tmp2667.us.us = load i64, i64* %tmp2666, align 4
-    %tmp2670.us.us = load i64, i64* null, align 4
-    %tmp2675.us.us = shl i64 %tmp2670.us.us, 0
-    %tmp2675not.us.us = xor i64 %tmp2675.us.us, -1
-    %tmp2676.us.us = and i64 %tmp2667.us.us, %tmp2675not.us.us
-    store i64 %tmp2676.us.us, i64* %tmp2666, align 4
-    ret void
-  }
-
-...
---
-name:            stepsystem
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-registers:
-  - { id: 0, class: gr32, preferred-register: '' }
-  - { id: 1, class: gr32, preferred-register: '' }
-  - { id: 2, class: gr32, preferred-register: '' }
-  - { id: 3, class: gr32, preferred-register: '' }
-  - { id: 4, class: gr32, preferred-register: '' }
-liveins:         []
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    4
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 4294967295
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  localFrameSize:  0
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:
-  - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: default, 
-      isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, 
-      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-stack:           []
-callSites:       []
-constants:       []
-machineFunctionInfo: {}
-body:             |
-  bb.0.entry:
-    %1:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0)
-    %1:gr32 = INC32r %1, implicit-def dead $eflags
-    MOV32mr undef %2:gr32, 1, $noreg, 0, $noreg, %1 :: (store 4 into `i32* undef`, align 8)
-    MOV32mi undef %3:gr32, 1, $noreg, 0, $noreg, -729988434 :: (store 4 into `<2 x double>* undef` + 12)
-    MOV32mi undef %4:gr32, 1, $noreg, 0, $noreg, -1568170194 :: (store 4 into `<2 x double>* undef` + 8, align 8)
-    RET 0
-
-# CHECK-LABEL: stepsystem
-# CHECK: Not adding chain dependency{{[[:space:]]*}}from: MOV32mi {{.*}} :: (store 4 {{.*}}){{[[:space:]]*}}to:   MOV32mi {{.*}} :: (store 4 {{.*}})
-# CHECK: Adding chain dependency{{[[:space:]]*}}from: MOV32mi {{.*}} :: (store 4 {{.*}}){{[[:space:]]*}}to:   MOV32mr {{.*}} :: (store 4 {{.*}})
-...
---
-name:            dct_chroma
-alignment:       16
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-failedISel:      false
-tracksRegLiveness: true
-hasWinCFI:       false
-registers:
-  - { id: 0, class: gr32, preferred-register: '' }
-  - { id: 1, class: gr32, preferred-register: '' }
-  - { id: 2, class: gr32, preferred-register: '' }
-  - { id: 3, class: gr32, preferred-register: '' }
-  - { id: 4, class: gr32, preferred-register: '' }
-liveins:         []
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    1
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 4294967295
-  cvBytesOfCalleeSavedRegisters: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  localFrameSize:  0
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:      []
-stack:           []
-callSites:       []
-constants:       []
-machineFunctionInfo: {}
-body:             |
-  bb.0.cond_true2732.preheader:
-    %4:gr32 = MOV32rm $noreg, 1, $noreg, 0, $noreg :: (load 4 from `i64* null`)
-    %2:gr32 = MOV32rm $noreg, 1, $noreg, 4, $noreg :: (load 4 from `i64* null` + 4)
-    %2:gr32 = NOT32r %2
-    %4:gr32 = NOT32r %4
-    %4:gr32 = AND32rm %4, $noreg, 1, $noreg, 356, $noreg, implicit-def dead $eflags :: (load 4 from %ir.tmp2666)
-    AND32mr $noreg, 1, $noreg, 360, $noreg, %2, implicit-def dead $eflags :: (store 4 into %ir.tmp2666 + 4), (load 4 from %ir.tmp2666 + 4)
-    MOV32mr $noreg, 1, $noreg, 356, $noreg, %4 :: (store 4 into %ir.tmp2666)
-    RET 0
-
-# Chain dependencies should not be systematically added when at least one of
-# the instructions has more than one memory operand. It should only be added
-# where it would be needed.
-# CHECK-LABEL: dct_chroma
-# CHECK: Not adding chain dependency{{[[:space:]]*}}from: MOV32mr {{.*}} :: (store 4 {{.*}}){{[[:space:]]*}}to:   AND32mr {{.*}} :: (store 4 {{.*}}), (load 4 {{.*}})
-# CHECK: Adding chain dependency{{[[:space:]]*}}from: AND32mr {{.*}} :: (store 4 {{.*}}), (load 4 {{.*}}){{[[:space:]]*}}to:   %{{.*}} = MOV32rm {{.*}} :: (load 4 {{.*}})
-
--- a/llvm/test/CodeGen/X86/store_op_load_fold2.ll
+++ b/llvm/test/CodeGen/X86/store_op_load_fold2.ll
@ -17,12 +17,13 @@ cond_true2732.preheader:                ; preds = %entry
        store i64 %tmp2676.us.us, i64* %tmp2666
        ret i32 0

-; INTEL:	and	dword ptr [360], {{e..}}
 ; INTEL: 	and	{{e..}}, dword ptr [356]
-; INTEL:	mov	dword ptr [356], {{e..}}
+; INTEL:	and	dword ptr [360], {{e..}}
+; FIXME:	mov	dword ptr [356], {{e..}}
+; The above line comes out as 'mov 360, eax', but when the register is ecx it works?

-; ATT:	andl	%{{e..}}, 360
 ; ATT: 	andl	356, %{{e..}}
+; ATT:	andl	%{{e..}}, 360
 ; ATT:	movl	%{{e..}}, 356

 }