Enable MI Sched for x86.

This changes the SelectionDAG scheduling preference to source order. Soon, the SelectionDAG scheduler can be bypassed saving a nice chunk of compile time. Performance differences that result from this change are often a consequence of register coalescing. The register coalescer is far from perfect. Bugs can be filed for deficiencies. On x86 SandyBridge/Haswell, the source order schedule is often preserved, particularly for small blocks. Register pressure is generally improved over the SD scheduler's ILP mode. However, we are still able to handle large blocks that require latency hiding, unlike the SD scheduler's BURR mode. MI scheduler also attempts to discover the critical path in single-block loops and adjust heuristics accordingly. The MI scheduler relies on the new machine model. This is currently unimplemented for AVX, so we may not be generating the best code yet. Unit tests are updated so they don't depend on SD scheduling heuristics. llvm-svn: 192750
2013-10-15 23:33:07 +00:00 · 2013-10-15 23:33:07 +00:00 · e97d8d6dde
parent a6c38a32a9
commit e97d8d6dde
67 changed files with 346 additions and 285 deletions
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@ -383,11 +383,14 @@ public:
  /// memset with zero passed as the second argument. Otherwise it
  /// returns null.
  const char *getBZeroEntry() const;
-  
+
  /// This function returns true if the target has sincos() routine in its
  /// compiler runtime or math libraries.
  bool hasSinCos() const;

+  /// Enable the MachineScheduler pass for all X86 subtargets.
+  bool enableMachineScheduler() const LLVM_OVERRIDE { return true; }
+
  /// enablePostRAScheduler - run for Atom optimization.
  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
                             TargetSubtargetInfo::AntiDepBreakMode& Mode,
--- a/llvm/test/CodeGen/X86/2006-05-02-InstrSched1.ll
+++ b/llvm/test/CodeGen/X86/2006-05-02-InstrSched1.ll
@ -1,7 +1,10 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -march=x86 -relocation-model=static -stats 2>&1 | \
-; RUN:   grep asm-printer | grep 14
+; RUN:   grep asm-printer | grep 16
 ;
+; It's possible to schedule this in 14 instructions by avoiding
+; callee-save registers, but the scheduler isn't currently that
+; conservative with registers.
@size20 = external global i32		; <i32*> [#uses=1]
@in5 = external global i8*		; <i8**> [#uses=1]

@ -21,4 +24,3 @@ define i32 @compare(i8* %a, i8* %b) nounwind {
 }

 declare i32 @memcmp(i8*, i8*, i32)
-
--- a/llvm/test/CodeGen/X86/2007-01-08-InstrSched.ll
+++ b/llvm/test/CodeGen/X86/2007-01-08-InstrSched.ll
@ -13,10 +13,10 @@ define float @foo(float %x) nounwind {

 ; CHECK: mulss
 ; CHECK: mulss
-; CHECK: addss
+; CHECK: mulss
 ; CHECK: mulss
 ; CHECK: addss
-; CHECK: mulss
+; CHECK: addss
 ; CHECK: addss
 ; CHECK: ret
 }
--- a/llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@ -17,9 +17,9 @@ bb4:		; preds = %bb.i, %bb26, %bb4, %entry
 ; CHECK: %bb4
 ; CHECK: xorl
 ; CHECK: callq
+; CHECK: xorl
+; CHECK: xorl
 ; CHECK: movq
-; CHECK: xorl
-; CHECK: xorl

 	%0 = call i32 (...)* @xxGetOffsetForCode(i32 undef) nounwind		; <i32> [#uses=0]
 	%ins = or i64 %p, 2097152		; <i64> [#uses=1]
--- a/llvm/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
+++ b/llvm/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
@ -1,9 +1,9 @@
-; RUN: llc -mcpu=generic -mtriple=i386-apple-darwin -tailcallopt < %s | FileCheck %s
+; RUN: llc -mcpu=generic -mtriple=i386-apple-darwin -tailcallopt -enable-misched=false < %s | FileCheck %s
 ; Check that lowered argumens do not overwrite the return address before it is moved.
 ; Bug 6225
 ;
 ; If a call is a fastcc tail call and tail call optimization is enabled, the
-; caller frame is replaced by the callee frame. This can require that arguments are 
+; caller frame is replaced by the callee frame. This can require that arguments are
 ; placed on the former return address stack slot. Special care needs to be taken
 ; taken that the return address is moved / or stored in a register before
 ; lowering of arguments potentially overwrites the value.
@ -51,5 +51,3 @@ false:
  tail call fastcc void @l298(i32 %r10, i32 %r9, i32 %r4) noreturn nounwind
  ret void
 }
-
-
--- a/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
+++ b/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
@ -19,8 +19,8 @@ entry:
 }

 ; CHECK: movq	___stack_chk_guard@GOTPCREL(%rip)
-; CHECK: movb   38(%rsp), [[R0:%.+]]
-; CHECK: movb   8(%rsp), [[R1:%.+]]
-; CHECK: movb   [[R1]], 8(%rsp)
-; CHECK: movb   [[R0]], 38(%rsp)
+; CHECK: movb   (%rsp), [[R1:%.+]]
+; CHECK: movb   30(%rsp), [[R0:%.+]]
+; CHECK: movb   [[R1]], (%rsp)
+; CHECK: movb   [[R0]], 30(%rsp)
 ; CHECK: callq	___stack_chk_fail
--- a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
+++ b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
@ -16,8 +16,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: main
 define i32 @main() nounwind uwtable {
 entry:
-; CHECK: pmovsxbq  j(%rip), %
 ; CHECK: pmovsxbq  i(%rip), %
+; CHECK: pmovsxbq  j(%rip), %
  %0 = load <2 x i8>* @i, align 8
  %1 = load <2 x i8>* @j, align 8
  %div = sdiv <2 x i8> %1, %0
@ -25,4 +25,3 @@ entry:
  ret i32 0
 ; CHECK: ret
 }
-
--- a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
@ -5,8 +5,8 @@
 ; It's hard to test for the ISEL condition because CodeGen optimizes
 ; away the bugpointed code. Just ensure the basics are still there.
 ;CHECK-LABEL: func:
-;CHECK: vxorps
-;CHECK: vinsertf128
+;CHECK: vpxor
+;CHECK: vinserti128
 ;CHECK: vpshufd
 ;CHECK: vpshufd
 ;CHECK: vmulps
--- a/llvm/test/CodeGen/X86/3addr-16bit.ll
+++ b/llvm/test/CodeGen/X86/3addr-16bit.ll
@ -34,7 +34,8 @@ entry:

 ; 64BIT-LABEL:     t2:
 ; 64BIT-NOT: movw %si, %ax
-; 64BIT:     leal -1(%rsi), %eax
+; 64BIT:     decl %eax
+; 64BIT:     movzwl %ax
  %0 = icmp eq i16 %k, %c                         ; <i1> [#uses=1]
  %1 = add i16 %k, -1                             ; <i16> [#uses=3]
  br i1 %0, label %bb, label %bb1
@ -58,7 +59,7 @@ entry:

 ; 64BIT-LABEL:     t3:
 ; 64BIT-NOT: movw %si, %ax
-; 64BIT:     leal 2(%rsi), %eax
+; 64BIT:     addl $2, %eax
  %0 = add i16 %k, 2                              ; <i16> [#uses=3]
  %1 = icmp eq i16 %k, %c                         ; <i1> [#uses=1]
  br i1 %1, label %bb, label %bb1
@ -81,7 +82,7 @@ entry:

 ; 64BIT-LABEL:     t4:
 ; 64BIT-NOT: movw %si, %ax
-; 64BIT:     leal (%rsi,%rdi), %eax
+; 64BIT:     addl %edi, %eax
  %0 = add i16 %k, %c                             ; <i16> [#uses=3]
  %1 = icmp eq i16 %k, %c                         ; <i1> [#uses=1]
  br i1 %1, label %bb, label %bb1
--- a/llvm/test/CodeGen/X86/StackColoring.ll
+++ b/llvm/test/CodeGen/X86/StackColoring.ll
@ -4,8 +4,8 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

-;YESCOLOR: subq  $136, %rsp
-;NOCOLOR: subq  $264, %rsp
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp

 define i32 @myCall_w2(i32 %in) {
 entry:
--- a/llvm/test/CodeGen/X86/abi-isel.ll
+++ b/llvm/test/CodeGen/X86/abi-isel.ll
@ -1,16 +1,16 @@
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-32-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-PIC

-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=LINUX-64-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-PIC

-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=DARWIN-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-32-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC

-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=DARWIN-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-PIC

@src = external global [131072 x i32]
@dst = external global [131072 x i32]
--- a/llvm/test/CodeGen/X86/add.ll
+++ b/llvm/test/CodeGen/X86/add.ll
@ -9,7 +9,7 @@ define i32 @test1(i32 inreg %a) nounwind {
  %b = add i32 %a, 128
  ret i32 %b
 ; X32: subl	$-128, %eax
-; X64: subl $-128, 
+; X64: subl $-128,
 }
 define i64 @test2(i64 inreg %a) nounwind {
  %b = add i64 %a, 2147483648
@ -20,7 +20,7 @@ define i64 @test2(i64 inreg %a) nounwind {
 define i64 @test3(i64 inreg %a) nounwind {
  %b = add i64 %a, 128
  ret i64 %b
-  
+
 ; X32: addl $128, %eax
 ; X64: subq	$-128,
 }
@ -38,7 +38,7 @@ normal:

 overflow:
  ret i1 false
-  
+
 ; X32-LABEL: test4:
 ; X32: addl
 ; X32-NEXT: jo
@ -82,11 +82,11 @@ define i64 @test6(i64 %A, i32 %B) nounwind {
        ret i64 %tmp5

 ; X32-LABEL: test6:
-; X32:	    movl 12(%esp), %edx
+; X32:	    movl 4(%esp), %eax
+; X32-NEXT: movl 12(%esp), %edx
 ; X32-NEXT: addl 8(%esp), %edx
-; X32-NEXT: movl 4(%esp), %eax
 ; X32-NEXT: ret
-        
+
 ; X64-LABEL: test6:
 ; X64:	shlq	$32, %r[[A1]]
 ; X64:	leaq	(%r[[A1]],%r[[A0]]), %rax
--- a/llvm/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/llvm/test/CodeGen/X86/alloca-align-rounding.ll
@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux -enable-misched=false | FileCheck %s

 declare void @bar(<2 x i64>* %n)

--- a/llvm/test/CodeGen/X86/avx-arith.ll
+++ b/llvm/test/CodeGen/X86/avx-arith.ll
@ -240,14 +240,14 @@ define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
 ; CHECK-NEXT: vpmuludq %xmm
 ; CHECK-NEXT: vpsllq $32, %xmm
 ; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vpmuludq %xmm
-; CHECK-NEXT: vpsrlq $32, %xmm
-; CHECK-NEXT: vpmuludq %xmm
-; CHECK-NEXT: vpsllq $32, %xmm
 ; CHECK-NEXT: vpsrlq $32, %xmm
 ; CHECK-NEXT: vpmuludq %xmm
 ; CHECK-NEXT: vpsllq $32, %xmm
 ; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsrlq $32, %xmm
+; CHECK-NEXT: vpmuludq %xmm
+; CHECK-NEXT: vpsllq $32, %xmm
 ; CHECK-NEXT: vpaddq %xmm
 ; CHECK-NEXT: vpsrlq $32, %xmm
 ; CHECK-NEXT: vpmuludq %xmm
@ -269,4 +269,3 @@ define <4 x float> @int_sqrt_ss() {
 %x2 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %x1) nounwind
 ret <4 x float> %x2
 }
-
--- a/llvm/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/avx-intel-ocl.ll
@ -32,7 +32,7 @@ declare i32 @func_int(i32, i32)
 define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
-  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>* %y, align 16
  %3 = fadd <16 x float> %2, %1
  ret <16 x float> %3
@ -43,21 +43,21 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
 ; preserved ymm6-ymm15
 ; WIN64: testf16_regs
 ; WIN64: call
-; WIN64: vaddps  {{%ymm[6-7]}}, %ymm0, %ymm0
-; WIN64: vaddps  {{%ymm[6-7]}}, %ymm1, %ymm1
+; WIN64: vaddps  {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
+; WIN64: vaddps  {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
 ; WIN64: ret

 ; preserved ymm8-ymm15
 ; X64: testf16_regs
 ; X64: call
-; X64: vaddps  {{%ymm[8-9]}}, %ymm0, %ymm0
-; X64: vaddps  {{%ymm[8-9]}}, %ymm1, %ymm1
+; X64: vaddps  {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
+; X64: vaddps  {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
 ; X64: ret

 define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
-  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>* %y, align 16
  %3 = fadd <16 x float> %1, %b
  %4 = fadd <16 x float> %2, %3
@ -166,4 +166,3 @@ entry:
  %8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %8
 }
-
--- a/llvm/test/CodeGen/X86/avx-shuffle.ll
+++ b/llvm/test/CodeGen/X86/avx-shuffle.ll
@ -81,7 +81,7 @@ entry:
 define i32 @test9(<4 x i32> %a) nounwind {
 ; CHECK: test9
 ; CHECK: vpextrd
-  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4> 
+  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4>
  %r = extractelement <8 x i32> %b, i32 2
 ; CHECK: ret
  ret i32 %r
@ -251,6 +251,8 @@ define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
 ; CHECK: swap8doubles
 ; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
 ; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
+; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
+; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
 ; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
 ; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
 ; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@ -167,8 +167,8 @@ define i32 @float_to_int(float %x) {
 }

 ; CHECK-LABEL: uitof64
-; CHECK: vextracti64x4
 ; CHECK: vcvtudq2pd
+; CHECK: vextracti64x4
 ; CHECK: vcvtudq2pd
 ; CHECK: ret
 define <16 x double> @uitof64(<16 x i32> %a) nounwind {
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@ -27,8 +27,8 @@ define i16 @mand16(i16 %x, i16 %y) {
  %md = xor <16 x i1> %ma, %mb
  %me = or <16 x i1> %mc, %md
  %ret = bitcast <16 x i1> %me to i16
-; CHECK: kxorw
 ; CHECK: kandw
+; CHECK: kxorw
 ; CHECK: korw
  ret i16 %ret
 }
@ -55,4 +55,3 @@ define i8 @shuf_test1(i16 %v) nounwind {
   %mask1 = bitcast <8 x i1> %mask to i8
   ret i8 %mask1
 }
-
--- a/llvm/test/CodeGen/X86/break-anti-dependencies.ll
+++ b/llvm/test/CodeGen/X86/break-anti-dependencies.ll
@ -1,7 +1,7 @@
 ; Without list-burr scheduling we may not see the difference in codegen here.
 ; Use a subtarget that has post-RA scheduling enabled because the anti-dependency
 ; breaker requires liveness information to be kept.
-; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
+; RUN: llc < %s -march=x86-64 -mcpu=atom -enable-misched=false -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
 ; RUN:   grep "%xmm0" %t | count 14
 ; RUN:   not grep "%xmm1" %t
 ; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -break-anti-dependencies=critical > %t
--- a/llvm/test/CodeGen/X86/bt.ll
+++ b/llvm/test/CodeGen/X86/bt.ll
@ -38,7 +38,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @test2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: test2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@ -56,7 +56,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @atest2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: atest2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@ -74,7 +74,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @atest2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: atest2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
 	%tmp4 = icmp eq i32 %tmp3, 0		; <i1> [#uses=1]
@ -91,7 +91,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @test3(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: test3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@ -109,7 +109,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @test3b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: test3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
@ -127,7 +127,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @testne2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: testne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@ -145,7 +145,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @testne2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: testne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@ -163,7 +163,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @atestne2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: atestne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@ -181,7 +181,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @atestne2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: atestne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@ -199,7 +199,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @testne3(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: testne3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@ -217,7 +217,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @testne3b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: testne3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
@ -235,7 +235,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@ -253,7 +253,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@ -271,7 +271,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @aquery2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: aquery2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@ -289,7 +289,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @aquery2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: aquery2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@ -307,7 +307,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query3(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@ -325,7 +325,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query3b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
@ -343,7 +343,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query3x(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query3x
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@ -361,7 +361,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query3bx(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query3bx
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
@ -379,7 +379,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@ -397,7 +397,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@ -415,7 +415,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @aqueryne2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: aqueryne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@ -433,7 +433,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @aqueryne2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: aqueryne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@ -451,7 +451,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne3(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@ -469,7 +469,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne3b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
@ -487,7 +487,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne3x(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne3x
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@ -505,7 +505,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne3bx(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne3bx
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
--- a/llvm/test/CodeGen/X86/byval7.ll
+++ b/llvm/test/CodeGen/X86/byval7.ll
@ -7,14 +7,14 @@
 define i32 @main() nounwind  {
 entry:
 ; CHECK-LABEL: main:
-; CHECK: movl $1, (%esp)
 ; CHECK: leal 16(%esp), %edi
 ; CHECK: leal 160(%esp), %esi
 ; CHECK: rep;movsl
+; CHECK: movl $1, (%esp)
 	%s = alloca %struct.S		; <%struct.S*> [#uses=2]
 	%tmp15 = getelementptr %struct.S* %s, i32 0, i32 0		; <<2 x i64>*> [#uses=1]
 	store <2 x i64> < i64 8589934595, i64 1 >, <2 x i64>* %tmp15, align 16
-	call void @t( i32 1, %struct.S* byval  %s ) nounwind 
+	call void @t( i32 1, %struct.S* byval  %s ) nounwind
 	ret i32 0
 }

--- a/llvm/test/CodeGen/X86/chain_order.ll
+++ b/llvm/test/CodeGen/X86/chain_order.ll
@ -3,8 +3,8 @@
 ;CHECK-LABEL: cftx020:
 ;CHECK: vmovsd  (%rdi), %xmm{{.*}}
 ;CHECK: vmovsd  16(%rdi), %xmm{{.*}}
-;CHECK: vmovhpd  8(%rdi), %xmm{{.*}}
 ;CHECK: vmovsd  24(%rdi), %xmm{{.*}}
+;CHECK: vmovhpd  8(%rdi), %xmm{{.*}}
 ;CHECK: vmovupd %xmm{{.*}}, (%rdi)
 ;CHECK: vmovupd %xmm{{.*}}, 16(%rdi)
 ;CHECK: ret
@ -35,4 +35,3 @@ entry:
  store <2 x double> %14, <2 x double>* %15, align 8
  ret void
 }
-
--- a/llvm/test/CodeGen/X86/cmov.ll
+++ b/llvm/test/CodeGen/X86/cmov.ll
@ -4,8 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
 entry:
 ; CHECK-LABEL: test1:
-; CHECK: movl	$12, %eax
-; CHECK-NEXT: btl
+; CHECK: btl
+; CHECK-NEXT: movl	$12, %eax
 ; CHECK-NEXT: cmovael	(%rcx), %eax
 ; CHECK-NEXT: ret

@ -19,8 +19,8 @@ entry:
 define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
 entry:
 ; CHECK-LABEL: test2:
-; CHECK: movl	$12, %eax
-; CHECK-NEXT: btl
+; CHECK: btl
+; CHECK-NEXT: movl	$12, %eax
 ; CHECK-NEXT: cmovbl	(%rcx), %eax
 ; CHECK-NEXT: ret

@ -92,7 +92,7 @@ bb.i.i.i:                                         ; preds = %entry
 ; CHECK: testb
 ; CHECK-NOT: xor
 ; CHECK: setne
-; CHECK-NEXT: testb
+; CHECK: testb

 func_4.exit.i:                                    ; preds = %bb.i.i.i, %entry
  %.not.i = xor i1 %2, true                       ; <i1> [#uses=1]
--- a/llvm/test/CodeGen/X86/commute-two-addr.ll
+++ b/llvm/test/CodeGen/X86/commute-two-addr.ll
@ -38,10 +38,10 @@ define i32 @t2(i32 %X, i32 %Y) nounwind {
 define %0 @t3(i32 %lb, i8 zeroext %has_lb, i8 zeroext %lb_inclusive, i32 %ub, i8 zeroext %has_ub, i8 zeroext %ub_inclusive) nounwind {
 entry:
 ; DARWIN-LABEL: t3:
-; DARWIN: shll $16
 ; DARWIN: shlq $32, %rcx
+; DARWIN-NEXT: orq %rcx, %rax
+; DARWIN-NEXT: shll $8
 ; DARWIN-NOT: leaq
-; DARWIN: orq %rcx, %rax
  %tmp21 = zext i32 %lb to i64
  %tmp23 = zext i32 %ub to i64
  %tmp24 = shl i64 %tmp23, 32
--- a/llvm/test/CodeGen/X86/fast-isel-mem.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-mem.ll
@ -40,7 +40,7 @@ entry:
 ; CHECK:	movl	L_LotsStuff$non_lazy_ptr, %ecx

 ; ATOM: _t:
-; ATOM:         movl    L_LotsStuff$non_lazy_ptr, %ecx
-; ATOM:         movl    $0, %eax
+; ATOM:         movl    L_LotsStuff$non_lazy_ptr, %e{{..}}
+; ATOM:         movl    $0, %e{{..}}

 }
--- a/llvm/test/CodeGen/X86/fastcc.ll
+++ b/llvm/test/CodeGen/X86/fastcc.ll
@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 -post-RA-scheduler=false | FileCheck %s
-; CHECK: movsd %xmm0, 8(%esp)
-; CHECK: xorl %ecx, %ecx
+; CHECK: movsd %xmm{{[0-9]}}, 8(%esp)
+; CHECK: xorl %eax, %eax

@d = external global double		; <double*> [#uses=1]
@c = external global double		; <double*> [#uses=1]
--- a/llvm/test/CodeGen/X86/fold-load.ll
+++ b/llvm/test/CodeGen/X86/fold-load.ll
@ -38,10 +38,10 @@ L:

  store i16 %A, i16* %Q
  ret i32 %D
-  
+
 ; CHECK-LABEL: test2:
 ; CHECK: 	movl	4(%esp), %eax
-; CHECK-NEXT:	movzwl	(%eax), %ecx
+; CHECK-NEXT:	movzwl	(%eax), %e{{..}}

 }

--- a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
+++ b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
@ -54,22 +54,27 @@ forbody:		; preds = %forcond
 	%mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer		; <<4 x float>> [#uses=2]
 	%mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer		; <<4 x float>> [#uses=1]
 	%cmpunord.i11 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i8 3) nounwind		; <<4 x float>> [#uses=1]
+	%tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
+	%bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32>		; <<4 x i32>> [#uses=1]
+	%andps.i5 = and <4 x i32> %bitcast.i3, zeroinitializer		; <<4 x i32>> [#uses=1]
+
+	call void null(<4 x float> %mul313, <4 x float> %cmpunord.i11, <4 x float> %tmp83, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
+
+	%tmp84 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul313, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
+
 	%bitcast6.i13 = bitcast <4 x float> %cmpunord.i11 to <4 x i32>		; <<4 x i32>> [#uses=2]
 	%andps.i14 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %bitcast6.i13		; <<4 x i32>> [#uses=1]
 	%not.i16 = xor <4 x i32> %bitcast6.i13, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
 	%andnps.i17 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %not.i16		; <<4 x i32>> [#uses=1]
 	%orps.i18 = or <4 x i32> %andnps.i17, %andps.i14		; <<4 x i32>> [#uses=1]
 	%bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
-	%bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andps.i5 = and <4 x i32> %bitcast.i3, zeroinitializer		; <<4 x i32>> [#uses=1]
+
 	%bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32>		; <<4 x i32>> [#uses=1]
 	%not.i7 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
 	%andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7		; <<4 x i32>> [#uses=1]
-	call void null(<4 x float> %mul313, <4 x float> %cmpunord.i11, <4 x float> %tmp83, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
 	%orps.i9 = or <4 x i32> %andnps.i8, %andps.i5		; <<4 x i32>> [#uses=1]
 	%bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp84 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul313, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
+
 	%bitcast6.i = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=2]
 	%andps.i = and <4 x i32> zeroinitializer, %bitcast6.i		; <<4 x i32>> [#uses=1]
 	%bitcast11.i = bitcast <4 x float> %tmp84 to <4 x i32>		; <<4 x i32>> [#uses=1]
--- a/llvm/test/CodeGen/X86/full-lsr.ll
+++ b/llvm/test/CodeGen/X86/full-lsr.ll
@ -4,7 +4,7 @@
 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
 ; ATOM: foo
 ; ATOM: addl
-; ATOM: leal
+; ATOM: addl
 ; ATOM: leal

 ; CHECK: foo
--- a/llvm/test/CodeGen/X86/gather-addresses.ll
+++ b/llvm/test/CodeGen/X86/gather-addresses.ll
@ -1,21 +1,35 @@
-; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN
+; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN
 ; rdar://7398554

 ; When doing vector gather-scatter index calculation with 32-bit indices,
 ; bounce the vector off of cache rather than shuffling each individual
 ; element out of the index vector.

-; CHECK: andps    ([[H:%rdx|%r8]]), %xmm0
-; CHECK: movaps   %xmm0, {{(-24)?}}(%rsp)
-; CHECK: movslq   {{(-24)?}}(%rsp), %rax
-; CHECK: movsd    ([[P:%rdi|%rcx]],%rax,8), %xmm0
-; CHECK: movslq   {{-20|4}}(%rsp), %rax
-; CHECK: movhpd   ([[P]],%rax,8), %xmm0
-; CHECK: movslq   {{-16|8}}(%rsp), %rax
-; CHECK: movsd    ([[P]],%rax,8), %xmm1
-; CHECK: movslq   {{-12|12}}(%rsp), %rax
-; CHECK: movhpd   ([[P]],%rax,8), %xmm1
+; LIN: foo:
+; LIN: movaps	(%rsi), %xmm0
+; LIN: andps	(%rdx), %xmm0
+; LIN: movaps	%xmm0, -24(%rsp)
+; LIN: movslq	-24(%rsp), %[[REG1:r.+]]
+; LIN: movslq	-20(%rsp), %[[REG2:r.+]]
+; LIN: movslq	-16(%rsp), %[[REG3:r.+]]
+; LIN: movslq	-12(%rsp), %[[REG4:r.+]]
+; LIN: movsd	(%rdi,%[[REG1]],8), %xmm0
+; LIN: movhpd	(%rdi,%[[REG2]],8), %xmm0
+; LIN: movsd	(%rdi,%[[REG3]],8), %xmm1
+; LIN: movhpd	(%rdi,%[[REG4]],8), %xmm1
+; WIN: foo:
+; WIN: movaps	(%rdx), %xmm0
+; WIN: andps	(%r8), %xmm0
+; WIN: movaps	%xmm0, (%rsp)
+; WIN: movslq	(%rsp), %[[REG1:r.+]]
+; WIN: movslq	4(%rsp), %[[REG2:r.+]]
+; WIN: movslq	8(%rsp), %[[REG3:r.+]]
+; WIN: movslq	12(%rsp), %[[REG4:r.+]]
+; WIN: movsd	(%rcx,%[[REG1]],8), %xmm0
+; WIN: movhpd	(%rcx,%[[REG2]],8), %xmm0
+; WIN: movsd	(%rcx,%[[REG3]],8), %xmm1
+; WIN: movhpd	(%rcx,%[[REG4]],8), %xmm1

 define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
  %a = load <4 x i32>* %i
--- a/llvm/test/CodeGen/X86/ghc-cc.ll
+++ b/llvm/test/CodeGen/X86/ghc-cc.ll
@ -28,10 +28,10 @@ entry:

 define cc 10 void @foo() nounwind {
 entry:
-  ; CHECK: movl base, %ebx
-  ; CHECK-NEXT: movl sp, %ebp
+  ; CHECK:      movl r1, %esi
  ; CHECK-NEXT: movl hp, %edi
-  ; CHECK-NEXT: movl r1, %esi
+  ; CHECK-NEXT: movl sp, %ebp
+  ; CHECK-NEXT: movl base, %ebx
  %0 = load i32* @r1
  %1 = load i32* @hp
  %2 = load i32* @sp
@ -42,4 +42,3 @@ entry:
 }

 declare cc 10 void @bar(i32, i32, i32, i32)
-
--- a/llvm/test/CodeGen/X86/ghc-cc64.ll
+++ b/llvm/test/CodeGen/X86/ghc-cc64.ll
@ -41,22 +41,22 @@ entry:

 define cc 10 void @foo() nounwind {
 entry:
-  ; CHECK: movq base(%rip), %r13
-  ; CHECK-NEXT: movq sp(%rip), %rbp
-  ; CHECK-NEXT: movq hp(%rip), %r12
-  ; CHECK-NEXT: movq r1(%rip), %rbx
-  ; CHECK-NEXT: movq r2(%rip), %r14
-  ; CHECK-NEXT: movq r3(%rip), %rsi
-  ; CHECK-NEXT: movq r4(%rip), %rdi
-  ; CHECK-NEXT: movq r5(%rip), %r8
-  ; CHECK-NEXT: movq r6(%rip), %r9
-  ; CHECK-NEXT: movq splim(%rip), %r15
-  ; CHECK-NEXT: movss f1(%rip), %xmm1
-  ; CHECK-NEXT: movss f2(%rip), %xmm2
-  ; CHECK-NEXT: movss f3(%rip), %xmm3
-  ; CHECK-NEXT: movss f4(%rip), %xmm4
+  ; CHECK:      movsd d2(%rip), %xmm6
  ; CHECK-NEXT: movsd d1(%rip), %xmm5
-  ; CHECK-NEXT: movsd d2(%rip), %xmm6
+  ; CHECK-NEXT: movss f4(%rip), %xmm4
+  ; CHECK-NEXT: movss f3(%rip), %xmm3
+  ; CHECK-NEXT: movss f2(%rip), %xmm2
+  ; CHECK-NEXT: movss f1(%rip), %xmm1
+  ; CHECK-NEXT: movq splim(%rip), %r15
+  ; CHECK-NEXT: movq r6(%rip), %r9
+  ; CHECK-NEXT: movq r5(%rip), %r8
+  ; CHECK-NEXT: movq r4(%rip), %rdi
+  ; CHECK-NEXT: movq r3(%rip), %rsi
+  ; CHECK-NEXT: movq r2(%rip), %r14
+  ; CHECK-NEXT: movq r1(%rip), %rbx
+  ; CHECK-NEXT: movq hp(%rip), %r12
+  ; CHECK-NEXT: movq sp(%rip), %rbp
+  ; CHECK-NEXT: movq base(%rip), %r13
  %0 = load double* @d2
  %1 = load double* @d1
  %2 = load float* @f4
@ -83,4 +83,3 @@ entry:

 declare cc 10 void @bar(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64,
                        float, float, float, float, double, double)
-
--- a/llvm/test/CodeGen/X86/hipe-cc.ll
+++ b/llvm/test/CodeGen/X86/hipe-cc.ll
@ -49,10 +49,10 @@ entry:
  store i32 %arg1, i32* %arg1_var
  store i32 %arg2, i32* %arg2_var

-  ; CHECK:      movl   4(%esp), %edx
-  ; CHECK-NEXT: movl   8(%esp), %eax
+  ; CHECK:      movl  16(%esp), %esi
  ; CHECK-NEXT: movl  12(%esp), %ebp
-  ; CHECK-NEXT: movl  16(%esp), %esi
+  ; CHECK-NEXT: movl   8(%esp), %eax
+  ; CHECK-NEXT: movl   4(%esp), %edx
  %0 = load i32* %hp_var
  %1 = load i32* %p_var
  %2 = load i32* %arg0_var
--- a/llvm/test/CodeGen/X86/hipe-cc64.ll
+++ b/llvm/test/CodeGen/X86/hipe-cc64.ll
@ -5,10 +5,10 @@
 define void @zap(i64 %a, i64 %b) nounwind {
 entry:
  ; CHECK:      movq %rsi, %rax
-  ; CHECK-NEXT: movq %rdi, %rsi
-  ; CHECK-NEXT: movq %rax, %rdx
  ; CHECK-NEXT: movl $8, %ecx
  ; CHECK-NEXT: movl $9, %r8d
+  ; CHECK-NEXT: movq %rdi, %rsi
+  ; CHECK-NEXT: movq %rax, %rdx
  ; CHECK-NEXT: callq addfour
  %0 = call cc 11 {i64, i64, i64} @addfour(i64 undef, i64 undef, i64 %a, i64 %b, i64 8, i64 9)
  %res = extractvalue {i64, i64, i64} %0, 2
@ -57,11 +57,11 @@ entry:
  store i64 %arg2, i64* %arg2_var
  store i64 %arg3, i64* %arg3_var

-  ; CHECK:      movq  8(%rsp), %rcx
-  ; CHECK-NEXT: movq  16(%rsp), %rdx
-  ; CHECK-NEXT: movq  24(%rsp), %rsi
+  ; CHECK:      movq  40(%rsp), %r15
  ; CHECK-NEXT: movq  32(%rsp), %rbp
-  ; CHECK-NEXT: movq  40(%rsp), %r15
+  ; CHECK-NEXT: movq  24(%rsp), %rsi
+  ; CHECK-NEXT: movq  16(%rsp), %rdx
+  ; CHECK-NEXT: movq  8(%rsp), %rcx
  %0 = load i64* %hp_var
  %1 = load i64* %p_var
  %2 = load i64* %arg0_var
--- a/llvm/test/CodeGen/X86/lea-recursion.ll
+++ b/llvm/test/CodeGen/X86/lea-recursion.ll
@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep lea | count 12
+; RUN: llc < %s -march=x86-64 | grep lea | count 13

 ; This testcase was written to demonstrate an instruction-selection problem,
 ; however it also happens to expose a limitation in the DAGCombiner's
@ -44,4 +44,3 @@ entry:
 	store i32 %tmp10.6, i32* getelementptr ([1000 x i32]* @g0, i32 0, i32 7)
 	ret void
 }
-
--- a/llvm/test/CodeGen/X86/lea.ll
+++ b/llvm/test/CodeGen/X86/lea.ll
@ -28,8 +28,7 @@ bb.nph:
 bb2:
 	ret i32 %x_offs
 ; CHECK-LABEL: test2:
-; CHECK: movl %e[[A0]], %eax
-; CHECK: addl $-5, %eax
+; CHECK:        leal    -5(%r[[A0:..]]), %eax
 ; CHECK:	andl	$-4, %eax
 ; CHECK:	negl	%eax
 ; CHECK:	leal	-4(%r[[A0]],%rax), %eax
--- a/llvm/test/CodeGen/X86/load-slice.ll
+++ b/llvm/test/CodeGen/X86/load-slice.ll
@ -17,14 +17,14 @@
 ; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned.
 ;
 ; STRESS-LABEL: t1:
-; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
-; STRESS: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]]
-; Add high slice: out[out_start].imm, this is base + 4.
-; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
 ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
-; STRESS-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]]
+; STRESS: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
 ; Add low slice: out[out_start].real, this is base + 0.
 ; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
+; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
+; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
+; Add high slice: out[out_start].imm, this is base + 4.
+; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
 ; Swap Imm and Real.
 ; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
 ; Put the results back into out[out_start].
@ -32,14 +32,14 @@
 ;
 ; Same for REGULAR, we eliminate register bank copy with each slices.
 ; REGULAR-LABEL: t1:
-; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
-; REGULAR: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]]
-; Add high slice: out[out_start].imm, this is base + 4.
-; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
 ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
-; REGULAR-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]]
+; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
 ; Add low slice: out[out_start].real, this is base + 0.
 ; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
+; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
+; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
+; Add high slice: out[out_start].imm, this is base + 4.
+; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
 ; Swap Imm and Real.
 ; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
 ; Put the results back into out[out_start].
@ -137,4 +137,3 @@ define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
  %res = add i32 %slice32_lowhigh, %tmpres
  ret i32 %res
 }
-
--- a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll
@ -2,12 +2,12 @@
 ; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s

 ; CHECK-LABEL: t:
-; CHECK: decq
-; CHECK-NEXT: movl (%r9,%rax,4), %eax
+; CHECK: movl (%r9,%rax,4), %e{{..}}
+; CHECK-NEXT: decq
 ; CHECK-NEXT: jne

 ; ATOM-LABEL: t:
-; ATOM: movl (%r9,%r{{.+}},4), %eax
+; ATOM: movl (%r9,%r{{.+}},4), %e{{..}}
 ; ATOM-NEXT: decq
 ; ATOM-NEXT: jne

--- a/llvm/test/CodeGen/X86/masked-iv-safe.ll
+++ b/llvm/test/CodeGen/X86/masked-iv-safe.ll
@ -1,15 +1,13 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 > %t
-; RUN: not grep and %t
-; RUN: not grep movz %t
-; RUN: not grep sar %t
-; RUN: not grep shl %t
-; RUN: grep add %t | count 5
-; RUN: grep inc %t | count 2
-; RUN: grep lea %t | count 3
+; RUN: llc < %s -mcpu=generic -march=x86-64 | FileCheck %s

 ; Optimize away zext-inreg and sext-inreg on the loop induction
 ; variable using trip-count information.

+; CHECK-LABEL: count_up
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: inc
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @count_up(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@ -38,6 +36,11 @@ return:
 	ret void
 }

+; CHECK-LABEL: count_down
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @count_down(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@ -66,6 +69,11 @@ return:
 	ret void
 }

+; CHECK-LABEL: count_up_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: inc
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @count_up_signed(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@ -96,6 +104,11 @@ return:
 	ret void
 }

+; CHECK-LABEL: count_down_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @count_down_signed(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@ -126,6 +139,11 @@ return:
 	ret void
 }

+; CHECK-LABEL: another_count_up
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @another_count_up(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@ -154,6 +172,11 @@ return:
 	ret void
 }

+; CHECK-LABEL: another_count_down
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: decq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @another_count_down(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@ -182,6 +205,11 @@ return:
 	ret void
 }

+; CHECK-LABEL: another_count_up_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @another_count_up_signed(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@ -212,6 +240,11 @@ return:
 	ret void
 }

+; CHECK-LABEL: another_count_down_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: decq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @another_count_down_signed(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
--- a/llvm/test/CodeGen/X86/memcpy-2.ll
+++ b/llvm/test/CodeGen/X86/memcpy-2.ll
@ -56,15 +56,15 @@ entry:
 define void @t2(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
 entry:
 ; SSE2-Darwin-LABEL: t2:
-; SSE2-Darwin: movaps (%eax), %xmm0
+; SSE2-Darwin: movaps (%ecx), %xmm0
 ; SSE2-Darwin: movaps %xmm0, (%eax)

 ; SSE2-Mingw32-LABEL: t2:
-; SSE2-Mingw32: movaps (%eax), %xmm0
+; SSE2-Mingw32: movaps (%ecx), %xmm0
 ; SSE2-Mingw32: movaps %xmm0, (%eax)

 ; SSE1-LABEL: t2:
-; SSE1: movaps (%eax), %xmm0
+; SSE1: movaps (%ecx), %xmm0
 ; SSE1: movaps %xmm0, (%eax)

 ; NOSSE-LABEL: t2:
@ -91,14 +91,14 @@ entry:
 define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
 entry:
 ; SSE2-Darwin-LABEL: t3:
-; SSE2-Darwin: movsd (%eax), %xmm0
-; SSE2-Darwin: movsd 8(%eax), %xmm1
+; SSE2-Darwin: movsd (%ecx), %xmm0
+; SSE2-Darwin: movsd 8(%ecx), %xmm1
 ; SSE2-Darwin: movsd %xmm1, 8(%eax)
 ; SSE2-Darwin: movsd %xmm0, (%eax)

 ; SSE2-Mingw32-LABEL: t3:
-; SSE2-Mingw32: movsd (%eax), %xmm0
-; SSE2-Mingw32: movsd 8(%eax), %xmm1
+; SSE2-Mingw32: movsd (%ecx), %xmm0
+; SSE2-Mingw32: movsd 8(%ecx), %xmm1
 ; SSE2-Mingw32: movsd %xmm1, 8(%eax)
 ; SSE2-Mingw32: movsd %xmm0, (%eax)

--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 -mattr=sse4.1 -mcpu=nehalem -stack-alignment=16 > %t
 ; RUN: grep pmul %t | count 12
-; RUN: grep mov %t | count 11
+; RUN: grep mov %t | count 14

 define <4 x i32> @a(<4 x i32> %i) nounwind  {
        %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
--- a/llvm/test/CodeGen/X86/pr14088.ll
+++ b/llvm/test/CodeGen/X86/pr14088.ll
@ -19,7 +19,14 @@ return:
  ret i32 %retval.0
 }

-; We were miscompiling this and using %ax instead of %cx in the movw.
-; CHECK: movswl	%cx, %ecx
-; CHECK: movw	%cx, (%rsi)
-; CHECK: movslq	%ecx, %rcx
+; We were miscompiling this and using %ax instead of %cx in the movw
+; in the following sequence:
+;	movswl	%cx, %ecx
+;	movw	%cx, (%rsi)
+;	movslq	%ecx, %rcx
+;
+; We can't produce the above sequence without special SD-level
+; heuristics. Now we produce this:
+; CHECK: movw	%ax, (%rsi)
+; CHECK: cwtl
+; CHECK: cltq
--- a/llvm/test/CodeGen/X86/pr1505b.ll
+++ b/llvm/test/CodeGen/X86/pr1505b.ll
@ -57,11 +57,10 @@ entry:
 	%tmp22 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZNSolsEd( %"struct.std::basic_ostream<char,std::char_traits<char> >"* %tmp16, double %tmp1920 )		; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=1]
 	%tmp30 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_( %"struct.std::basic_ostream<char,std::char_traits<char> >"* %tmp22 )		; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=0]
 ; reload:
-; CHECK: fld
-; CHECK: fstps
 ; CHECK: ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
 	%tmp34 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc( %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZSt4cout, i8* getelementptr ([13 x i8]* @.str1, i32 0, i32 0) )		; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=1]
 	%tmp3940 = fpext float %tmp1314 to double		; <double> [#uses=1]
+; CHECK: fld
 ; CHECK: fstpl
 ; CHECK: ZNSolsEd
 	%tmp42 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZNSolsEd( %"struct.std::basic_ostream<char,std::char_traits<char> >"* %tmp34, double %tmp3940 )		; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=1]
--- a/llvm/test/CodeGen/X86/pr16031.ll
+++ b/llvm/test/CodeGen/X86/pr16031.ll
@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=corei7-avx -enable-misched=false | FileCheck %s

 ; CHECK-LABEL: main:
 ; CHECK: pushl %esi
--- a/llvm/test/CodeGen/X86/pre-ra-sched.ll
+++ b/llvm/test/CodeGen/X86/pre-ra-sched.ll
@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -debug-only=pre-RA-sched \
-; RUN:     2>&1 | FileCheck %s
+; RUN-disabled: llc < %s -mtriple=x86_64-apple-macosx -pre-RA-sched=ilp -debug-only=pre-RA-sched \
+; RUN-disabled:     2>&1 | FileCheck %s
+; RUN: true
 ; REQUIRES: asserts
 ;
 ; rdar:13279013: pre-RA-sched should not check all interferences and
--- a/llvm/test/CodeGen/X86/rdrand.ll
+++ b/llvm/test/CodeGen/X86/rdrand.ll
@ -11,10 +11,10 @@ define i32 @_rdrand16_step(i16* %random_val) {
  ret i32 %isvalid
 ; CHECK-LABEL: _rdrand16_step:
 ; CHECK: rdrandw	%ax
-; CHECK: movw	%ax, (%r[[A0:di|cx]])
 ; CHECK: movzwl	%ax, %ecx
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%ecx, %eax
+; CHECK: movw	%cx, (%r[[A0:di|cx]])
 ; CHECK: ret
 }

@ -26,9 +26,9 @@ define i32 @_rdrand32_step(i32* %random_val) {
  ret i32 %isvalid
 ; CHECK-LABEL: _rdrand32_step:
 ; CHECK: rdrandl	%e[[T0:[a-z]+]]
-; CHECK: movl	%e[[T0]], (%r[[A0]])
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%e[[T0]], %eax
+; CHECK: movl	%e[[T0]], (%r[[A0]])
 ; CHECK: ret
 }

@ -40,9 +40,9 @@ define i32 @_rdrand64_step(i64* %random_val) {
  ret i32 %isvalid
 ; CHECK-LABEL: _rdrand64_step:
 ; CHECK: rdrandq	%r[[T1:[a-z]+]]
-; CHECK: movq	%r[[T1]], (%r[[A0]])
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%e[[T1]], %eax
+; CHECK: movq	%r[[T1]], (%r[[A0]])
 ; CHECK: ret
 }

--- a/llvm/test/CodeGen/X86/rdseed.ll
+++ b/llvm/test/CodeGen/X86/rdseed.ll
@ -12,10 +12,10 @@ define i32 @_rdseed16_step(i16* %random_val) {
  ret i32 %isvalid
 ; CHECK-LABEL: _rdseed16_step:
 ; CHECK: rdseedw	%ax
-; CHECK: movw	%ax, (%r[[A0:di|cx]])
 ; CHECK: movzwl	%ax, %ecx
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%ecx, %eax
+; CHECK: movw	%cx, (%r[[A0:di|cx]])
 ; CHECK: ret
 }

@ -27,9 +27,9 @@ define i32 @_rdseed32_step(i32* %random_val) {
  ret i32 %isvalid
 ; CHECK-LABEL: _rdseed32_step:
 ; CHECK: rdseedl	%e[[T0:[a-z]+]]
-; CHECK: movl	%e[[T0]], (%r[[A0]])
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%e[[T0]], %eax
+; CHECK: movl	%e[[T0]], (%r[[A0]])
 ; CHECK: ret
 }

@ -41,8 +41,8 @@ define i32 @_rdseed64_step(i64* %random_val) {
  ret i32 %isvalid
 ; CHECK-LABEL: _rdseed64_step:
 ; CHECK: rdseedq	%r[[T1:[a-z]+]]
-; CHECK: movq	%r[[T1]], (%r[[A0]])
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%e[[T1]], %eax
+; CHECK: movq	%r[[T1]], (%r[[A0]])
 ; CHECK: ret
 }
--- a/llvm/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/llvm/test/CodeGen/X86/segmented-stacks-dynamic.ll
@ -31,7 +31,7 @@ false:
 ; X32-NEXT: ret

 ; X32:      movl %esp, %eax
-; X32-NEXT: subl %ecx, %eax
+; X32:      subl %ecx, %eax
 ; X32-NEXT: cmpl %eax, %gs:48

 ; X32:      movl %eax, %esp
@ -52,7 +52,7 @@ false:
 ; X64-NEXT: ret

 ; X64:      movq %rsp, %[[RDI:rdi|rax]]
-; X64-NEXT: subq %{{.*}}, %[[RDI]]
+; X64:      subq %{{.*}}, %[[RDI]]
 ; X64-NEXT: cmpq %[[RDI]], %fs:112

 ; X64:      movq %[[RDI]], %rsp
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@ -34,12 +34,12 @@ bb90:		; preds = %bb84, %bb72
 bb91:		; preds = %bb84
 	ret i32 0
 ; CHECK-LABEL: test2:
-; CHECK: movnew
-; CHECK: movswl
+; CHECK: cmovnew
+; CHECK: cwtl

 ; ATOM-LABEL: test2:
-; ATOM: movnew
-; ATOM: movswl
+; ATOM: cmovnew
+; ATOM: cwtl
 }

 declare i1 @return_false()
@ -256,8 +256,8 @@ entry:
  %call = tail call noalias i8* @_Znam(i64 %D) nounwind noredzone
  ret i8* %call
 ; CHECK-LABEL: test12:
-; CHECK: movq $-1, %[[R:r..]]
 ; CHECK: mulq
+; CHECK: movq $-1, %[[R:r..]]
 ; CHECK: cmovnoq	%rax, %[[R]]
 ; CHECK: jmp	__Znam

--- a/llvm/test/CodeGen/X86/shift-bmi2.ll
+++ b/llvm/test/CodeGen/X86/shift-bmi2.ll
@ -30,10 +30,11 @@ entry:
  %x = load i32* %p
  %shl = shl i32 %x, %shamt
 ; BMI2: shl32p
-; BMI2: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; Source order scheduling prevents folding, rdar:14208996.
+; BMI2: shlxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI2: ret
 ; BMI264: shl32p
-; BMI264: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shlxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
  ret i32 %shl
 }
@ -74,7 +75,7 @@ entry:
  %x = load i64* %p
  %shl = shl i64 %x, %shamt
 ; BMI264: shl64p
-; BMI264: shlxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shlxq %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
  ret i64 %shl
 }
@ -106,10 +107,11 @@ entry:
  %x = load i32* %p
  %shl = lshr i32 %x, %shamt
 ; BMI2: lshr32p
-; BMI2: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; Source order scheduling prevents folding, rdar:14208996.
+; BMI2: shrxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI2: ret
 ; BMI264: lshr32p
-; BMI264: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shrxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
  ret i32 %shl
 }
@ -128,7 +130,7 @@ entry:
  %x = load i64* %p
  %shl = lshr i64 %x, %shamt
 ; BMI264: lshr64p
-; BMI264: shrxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shrxq %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
  ret i64 %shl
 }
@ -150,10 +152,11 @@ entry:
  %x = load i32* %p
  %shl = ashr i32 %x, %shamt
 ; BMI2: ashr32p
-; BMI2: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; Source order scheduling prevents folding, rdar:14208996.
+; BMI2: sarxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI2: ret
 ; BMI264: ashr32p
-; BMI264: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: sarxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
  ret i32 %shl
 }
@ -172,7 +175,7 @@ entry:
  %x = load i64* %p
  %shl = ashr i64 %x, %shamt
 ; BMI264: ashr64p
-; BMI264: sarxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: sarxq %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
  ret i64 %shl
 }
--- a/llvm/test/CodeGen/X86/sink-hoist.ll
+++ b/llvm/test/CodeGen/X86/sink-hoist.ll
@ -26,11 +26,10 @@ define double @foo(double %x, double %y, i1 %c) nounwind {

 ; CHECK-LABEL: split:
 ; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne
-; CHECK-NEXT: movaps
-; CHECK-NEXT: ret
+; CHECK-NEXT: je
 ; CHECK:      divsd
-; CHECK-NEXT: ret
+; CHECK:      movaps
+; CHECK:      ret
 define double @split(double %x, double %y, i1 %c) nounwind {
  %a = fdiv double %x, 3.2
  %z = select i1 %c, double %a, double %y
@ -65,7 +64,7 @@ return:
 ; Sink instructions with dead EFLAGS defs.

 ; FIXME: Unfail the zzz test if we can correctly mark pregs with the kill flag.
-; 
+;
 ; See <rdar://problem/8030636>. This test isn't valid after we made machine
 ; sinking more conservative about sinking instructions that define a preg into a
 ; block when we don't know if the preg is killed within the current block.
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@ -7,7 +7,7 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
 	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
 	store <2 x double> %tmp9, <2 x double>* %r, align 16
 	ret void
-        
+
 ; CHECK-LABEL: test1:
 ; CHECK: 	movl	8(%esp), %eax
 ; CHECK-NEXT: 	movapd	(%eax), %xmm0
@ -23,12 +23,12 @@ define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
 	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
 	store <2 x double> %tmp9, <2 x double>* %r, align 16
 	ret void
-        
+
 ; CHECK-LABEL: test2:
-; CHECK: 	movl	8(%esp), %eax
-; CHECK-NEXT: 	movapd	(%eax), %xmm0
+; CHECK: 	movl	4(%esp), %eax
+; CHECK: 	movl	8(%esp), %ecx
+; CHECK-NEXT: 	movapd	(%ecx), %xmm0
 ; CHECK-NEXT: 	movhpd	12(%esp), %xmm0
-; CHECK-NEXT: 	movl	4(%esp), %eax
 ; CHECK-NEXT: 	movapd	%xmm0, (%eax)
 ; CHECK-NEXT: 	ret
 }
@ -48,7 +48,7 @@ define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind
 	store <4 x float> %tmp13, <4 x float>* %res
 	ret void
 ; CHECK: @test3
-; CHECK: 	unpcklps	
+; CHECK: 	unpcklps
 }

 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
@ -85,9 +85,9 @@ define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
        store <4 x float> %tmp2, <4 x float>* %res
        ret void
-        
+
 ; CHECK-LABEL: test6:
-; CHECK: 	movaps	(%eax), %xmm0
+; CHECK: 	movaps	(%ecx), %xmm0
 ; CHECK:	movaps	%xmm0, (%eax)
 }

@ -96,7 +96,7 @@ define void @test7() nounwind {
        shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
        store <4 x float> %2, <4 x float>* null
        ret void
-        
+
 ; CHECK-LABEL: test7:
 ; CHECK:	xorps	%xmm0, %xmm0
 ; CHECK:	movaps	%xmm0, 0
@ -166,7 +166,7 @@ define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x fl
        store <4 x float> %tmp11, <4 x float>* %res
        ret void
 ; CHECK: test13
-; CHECK: shufps	$69, (%eax), %xmm0
+; CHECK: shufps	$69, (%ecx), %xmm0
 ; CHECK: pshufd	$-40, %xmm0, %xmm0
 }

@ -178,8 +178,8 @@ define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
        %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
        ret <4 x float> %tmp27
 ; CHECK-LABEL: test14:
-; CHECK: 	subps	[[X1:%xmm[0-9]+]], [[X2:%xmm[0-9]+]]
-; CHECK: 	addps	[[X1]], [[X0:%xmm[0-9]+]]
+; CHECK: 	addps	[[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]]
+; CHECK: 	subps	[[X1]], [[X2:%xmm[0-9]+]]
 ; CHECK: 	movlhps	[[X2]], [[X0]]
 }

@ -221,4 +221,3 @@ entry:
 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
 ret <4 x float> %double2float.i
 }
-
--- a/llvm/test/CodeGen/X86/store-narrow.ll
+++ b/llvm/test/CodeGen/X86/store-narrow.ll
@ -12,7 +12,7 @@ entry:
  %D = or i32 %C, %B
  store i32 %D, i32* %a0, align 4
  ret void
-  
+
 ; X64-LABEL: test1:
 ; X64: movb	%sil, (%rdi)

@ -34,8 +34,8 @@ entry:
 ; X64: movb	%sil, 1(%rdi)

 ; X32-LABEL: test2:
-; X32: movb	8(%esp), %al
-; X32: movb	%al, 1(%{{.*}})
+; X32: movb	8(%esp), %[[REG:[abcd]l]]
+; X32: movb	%[[REG]], 1(%{{.*}})
 }

 define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@ -67,8 +67,8 @@ entry:
 ; X64: movw	%si, 2(%rdi)

 ; X32-LABEL: test4:
-; X32: movl	8(%esp), %eax
-; X32: movw	%ax, 2(%{{.*}})
+; X32: movl	8(%esp), %e[[REG:[abcd]x]]
+; X32: movw	%[[REG]], 2(%{{.*}})
 }

 define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@ -84,8 +84,8 @@ entry:
 ; X64: movw	%si, 2(%rdi)

 ; X32-LABEL: test5:
-; X32: movzwl	8(%esp), %eax
-; X32: movw	%ax, 2(%{{.*}})
+; X32: movzwl	8(%esp), %e[[REG:[abcd]x]]
+; X32: movw	%[[REG]], 2(%{{.*}})
 }

 define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
@ -102,8 +102,8 @@ entry:


 ; X32-LABEL: test6:
-; X32: movb	8(%esp), %al
-; X32: movb	%al, 5(%{{.*}})
+; X32: movb	8(%esp), %[[REG:[abcd]l]]
+; X32: movb	%[[REG]], 5(%{{.*}})
 }

 define i32 @test7(i64* nocapture %a0, i8 zeroext %a1, i32* %P2) nounwind {
@ -121,8 +121,8 @@ entry:


 ; X32-LABEL: test7:
-; X32: movb	8(%esp), %cl
-; X32: movb	%cl, 5(%{{.*}})
+; X32: movb	8(%esp), %[[REG:[abcd]l]]
+; X32: movb	%[[REG]], 5(%{{.*}})
 }

 ; PR7833
--- a/llvm/test/CodeGen/X86/tailcall-largecode.ll
+++ b/llvm/test/CodeGen/X86/tailcall-largecode.ll
@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -tailcallopt -code-model=large | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -tailcallopt -code-model=large -enable-misched=false | FileCheck %s

 declare fastcc i32 @callee(i32 %arg)
 define fastcc i32 @directcall(i32 %arg) {
--- a/llvm/test/CodeGen/X86/test-nofold.ll
+++ b/llvm/test/CodeGen/X86/test-nofold.ll
@ -2,10 +2,10 @@
 ; rdar://5752025

 ; We want:
-;      CHECK: movl	$42, %ecx
-; CHECK-NEXT: movl	4(%esp), %eax
-; CHECK-NEXT: andl	$15, %eax
-; CHECK-NEXT: cmovnel	%ecx, %eax
+;      CHECK: movl	4(%esp), %ecx
+; CHECK-NEXT: andl	$15, %ecx
+; CHECK-NEXT: movl	$42, %eax
+; CHECK-NEXT: cmovel	%ecx, %eax
 ; CHECK-NEXT: ret
 ;
 ; We don't want:
@ -39,4 +39,3 @@ entry:
 	%retval = select i1 %tmp4, i32 %tmp2, i32 42		; <i32> [#uses=1]
 	ret i32 %retval
 }
-
--- a/llvm/test/CodeGen/X86/trunc-to-bool.ll
+++ b/llvm/test/CodeGen/X86/trunc-to-bool.ll
@ -22,7 +22,7 @@ ret_false:
    ret i1 false
 }
 ; CHECK-LABEL: test2:
-; CHECK: btl %eax
+; CHECK: btl

 define i32 @test3(i8* %ptr) nounwind {
    %val = load i8* %ptr
--- a/llvm/test/CodeGen/X86/v-binop-widen.ll
+++ b/llvm/test/CodeGen/X86/v-binop-widen.ll
@ -1,7 +1,7 @@
 ; RUN: llc -mcpu=generic -march=x86 -mattr=+sse < %s | FileCheck %s
+; CHECK: divps
+; CHECK: divps
 ; CHECK: divss
-; CHECK: divps
-; CHECK: divps

 %vec = type <9 x float>
 define %vec @vecdiv( %vec %p1, %vec %p2)
@ -9,4 +9,3 @@ define %vec @vecdiv( %vec %p1, %vec %p2)
  %result = fdiv %vec %p1, %p2
  ret %vec %result
 }
-
--- a/llvm/test/CodeGen/X86/v-binop-widen2.ll
+++ b/llvm/test/CodeGen/X86/v-binop-widen2.ll
@ -2,9 +2,9 @@
 ; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s

 %vec = type <6 x float>
-; CHECK: divss
-; CHECK: divss
 ; CHECK: divps
+; CHECK: divss
+; CHECK: divss

 ; Scheduler causes a different instruction order to be produced on Intel Atom
 ; ATOM: divps
--- a/llvm/test/CodeGen/X86/vec_shuffle-27.ll
+++ b/llvm/test/CodeGen/X86/vec_shuffle-27.ll
@ -7,10 +7,10 @@ target triple = "i686-apple-cl.1.0"
 define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone {
 entry:
 ; CHECK: subps
-; CHECK: mulps
-; CHECK: addps
 ; CHECK: subps
 ; CHECK: mulps
+; CHECK: mulps
+; CHECK: addps
 ; CHECK: addps
 	%tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 >		; <<8 x float>> [#uses=1]
 	%sub = fsub <8 x float> %T1, %T0		; <<8 x float>> [#uses=1]
--- a/llvm/test/CodeGen/X86/vec_shuffle-39.ll
+++ b/llvm/test/CodeGen/X86/vec_shuffle-39.ll
@ -54,8 +54,8 @@ entry:
 define <2 x double> @t3() nounwind readonly {
 bb:
 ; CHECK-LABEL: t3:
-; CHECK: punpcklqdq %xmm1, %xmm0
 ; CHECK: movq (%rax), %xmm1
+; CHECK: punpcklqdq %xmm2, %xmm0
 ; CHECK: movsd %xmm1, %xmm0
  %tmp0 = load i128* null, align 1
  %tmp1 = load <2 x i32>* undef, align 8
@ -72,9 +72,9 @@ bb:
 define <2 x i64> @t4() nounwind readonly {
 bb:
 ; CHECK-LABEL: t4:
-; CHECK: punpcklqdq %xmm0, %xmm1
 ; CHECK: movq (%rax), %xmm0
-; CHECK: movsd %xmm1, %xmm0
+; CHECK: punpcklqdq %{{xmm.}}, %[[XMM:xmm[0-9]]]
+; CHECK: movsd %[[XMM]], %xmm0
  %tmp0 = load i128* null, align 1
  %tmp1 = load <2 x i32>* undef, align 8
  %tmp2 = bitcast i128 %tmp0 to <16 x i8>
--- a/llvm/test/CodeGen/X86/widen_cast-1.ll
+++ b/llvm/test/CodeGen/X86/widen_cast-1.ll
@ -1,8 +1,8 @@
 ; RUN: llc -march=x86 -mcpu=generic -mattr=+sse4.2 < %s | FileCheck %s
 ; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s

-; CHECK: paddd
 ; CHECK: movl
+; CHECK: paddd
 ; CHECK: movlpd

 ; Scheduler causes a different instruction order to be produced
--- a/llvm/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/llvm/test/CodeGen/X86/win64_alloca_dynalloca.ll
@ -1,6 +1,6 @@
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
+; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
+; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
+; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
 ; PR8777
 ; PR8778

@ -52,18 +52,18 @@ entry:
  %r = call i64 @bar(i64 %n, i64 %x, i64 %n, i8* %buf0, i8* %buf1) nounwind

 ; M64: subq  $48, %rsp
-; M64: leaq  -4096(%rbp), %r9
 ; M64: movq  %rax, 32(%rsp)
+; M64: leaq  -4096(%rbp), %r9
 ; M64: callq bar

 ; W64: subq  $48, %rsp
-; W64: leaq  -4096(%rbp), %r9
 ; W64: movq  %rax, 32(%rsp)
+; W64: leaq  -4096(%rbp), %r9
 ; W64: callq bar

 ; EFI: subq  $48, %rsp
-; EFI: leaq  -[[B0OFS]](%rbp), %r9
 ; EFI: movq  [[R64]], 32(%rsp)
+; EFI: leaq  -[[B0OFS]](%rbp), %r9
 ; EFI: callq _bar

  ret i64 %r
--- a/llvm/test/CodeGen/X86/x86-64-psub.ll
+++ b/llvm/test/CodeGen/X86/x86-64-psub.ll
@ -4,8 +4,8 @@
 ; This test checks that the operands of packed sub instructions are
 ; never interchanged by the "Two-Address instruction pass".

-declare { i64, double } @getFirstParam() 
-declare { i64, double } @getSecondParam() 
+declare { i64, double } @getFirstParam()
+declare { i64, double } @getSecondParam()

 define i64 @test_psubb() {
 entry:
@ -28,9 +28,10 @@ entry:

 ; CHECK-LABEL: test_psubb:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubb [[PARAM2]], [[PARAM1]]
 ; CHECK: ret

@ -55,9 +56,10 @@ entry:

 ; CHECK-LABEL: test_psubw:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubw [[PARAM2]], [[PARAM1]]
 ; CHECK: ret

@ -83,9 +85,10 @@ entry:

 ; CHECK-LABEL: test_psubd:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubd [[PARAM2]], [[PARAM1]]
 ; CHECK: ret

@ -110,9 +113,10 @@ entry:

 ; CHECK-LABEL: test_psubsb:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubsb [[PARAM2]], [[PARAM1]]
 ; CHECK: ret

@ -137,9 +141,10 @@ entry:

 ; CHECK-LABEL: test_psubswv:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubsw [[PARAM2]], [[PARAM1]]
 ; CHECK: ret

@ -164,9 +169,10 @@ entry:

 ; CHECK-LABEL: test_psubusbv:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubusb [[PARAM2]], [[PARAM1]]
 ; CHECK: ret

@ -191,9 +197,10 @@ entry:

 ; CHECK-LABEL: test_psubuswv:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubusw [[PARAM2]], [[PARAM1]]
 ; CHECK: ret

--- a/llvm/test/CodeGen/X86/x86-shifts.ll
+++ b/llvm/test/CodeGen/X86/x86-shifts.ll
@ -6,8 +6,8 @@
 define <4 x i32> @shl4(<4 x i32> %A) nounwind {
 entry:
 ; CHECK:      shl4
-; CHECK:      padd
 ; CHECK:      pslld
+; CHECK:      padd
 ; CHECK:      ret
  %B = shl <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
  %C = shl <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
@ -67,8 +67,8 @@ entry:
 define <8 x i16> @shl8(<8 x i16> %A) nounwind {
 entry:
 ; CHECK:      shl8
-; CHECK:      padd
 ; CHECK:      psllw
+; CHECK:      padd
 ; CHECK:      ret
  %B = shl <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  %C = shl <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
--- a/llvm/test/CodeGen/X86/zext-fold.ll
+++ b/llvm/test/CodeGen/X86/zext-fold.ll
@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -enable-misched=false | FileCheck %s

 ;; Simple case
 define i32 @test1(i8 %x) nounwind readnone {
@ -10,7 +10,7 @@ define i32 @test1(i8 %x) nounwind readnone {
 ; CHECK: movzbl
 ; CHECK-NEXT: andl {{.*}}224

-;; Multiple uses of %x but easily extensible. 
+;; Multiple uses of %x but easily extensible.
 define i32 @test2(i8 %x) nounwind readnone {
  %A = and i8 %x, -32
  %B = zext i8 %A to i32
@ -21,8 +21,8 @@ define i32 @test2(i8 %x) nounwind readnone {
 }
 ; CHECK: test2
 ; CHECK: movzbl
-; CHECK: orl $63
 ; CHECK: andl $224
+; CHECK: orl $63

 declare void @use(i32, i8)

--- a/llvm/test/CodeGen/X86/zext-sext.ll
+++ b/llvm/test/CodeGen/X86/zext-sext.ll
@ -34,10 +34,10 @@ entry:
  %tmp12 = add i64 %tmp11, 5089792279245435153

 ; CHECK:      addl	$2138875574, %e[[REGISTER_zext:[a-z0-9]+]]
-; CHECK-NEXT: cmpl	$-8608074, %e[[REGISTER_zext]]
-; CHECK:      movslq	%e[[REGISTER_zext]], [[REGISTER_tmp:%r[a-z0-9]+]]
-; CHECK:      movq	[[REGISTER_tmp]], [[REGISTER_sext:%r[a-z0-9]+]]
+; CHECK:      movslq	%e[[REGISTER_zext]], [[REGISTER_sext:%r[a-z0-9]+]]
+; CHECK:      cmpl	$-8608074, %e[[REGISTER_zext]]
 ; CHECK-NOT:  [[REGISTER_zext]]
+; CHECK-DAG:  testl     %e[[REGISTER_zext]]
 ; CHECK:      subq	%r[[REGISTER_zext]], [[REGISTER_sext]]

  %tmp13 = sub i64 %tmp12, 2138875574
--- a/llvm/test/DebugInfo/X86/dbg-value-dag-combine.ll
+++ b/llvm/test/DebugInfo/X86/dbg-value-dag-combine.ll
@ -16,7 +16,7 @@ entry:
  call void @llvm.dbg.value(metadata !12, i64 0, metadata !13), !dbg !14
  %tmp2 = load i32 addrspace(1)* %ip, align 4, !dbg !15
  %tmp3 = add i32 0, %tmp2, !dbg !15
-; CHECK:  ##DEBUG_VALUE: idx <- EAX{{$}}
+; CHECK:  ##DEBUG_VALUE: idx <- E{{..$}}
  call void @llvm.dbg.value(metadata !{i32 %tmp3}, i64 0, metadata !13), !dbg
 !15
  %arrayidx = getelementptr i32 addrspace(1)* %ip, i32 %1, !dbg !16