Committing X86-64 support.

llvm-svn: 30177
Evan Cheng 2006-09-08 06:48:29 +00:00
parent 02a7d09b40
commit 11b0a5dbd4
25 changed files with 3607 additions and 466 deletions


@ -0,0 +1,269 @@
//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
Implement different PIC models? Right now we only support Mac OS X with the
small PIC code model.
//===---------------------------------------------------------------------===//
Make use of the "Red Zone" (the 128 bytes below %rsp that the x86-64 ABI
guarantees leaf functions may use without adjusting the stack pointer).
//===---------------------------------------------------------------------===//
Implement __int128 and long double support.
//===---------------------------------------------------------------------===//
For this:
extern void xx(void);
void bar(void) {
xx();
}
gcc compiles to:
.globl _bar
_bar:
jmp _xx
We need to do the tailcall optimization as well.
//===---------------------------------------------------------------------===//
For this:
int test(int a)
{
return a * 3;
}
We generate
leal (%edi,%edi,2), %eax
We should be generating
leal (%rdi,%rdi,2), %eax
instead. The latter form does not require the 67H address-size prefix.
It's probably ok to simply emit the corresponding 64-bit super class registers
in this case?
//===---------------------------------------------------------------------===//
AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
multiplication by a constant. How much of it applies to Intel's X86-64
implementation? There are definite trade-offs to consider: latency vs. register
pressure vs. code size.
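To make these trade-offs concrete, here is the usual shift-and-add
decomposition for one constant. This is only a sketch; the function name is
made up, and the assembly in the comment is typical compiler output, not taken
from either vendor's manual.
/* x*10 == (x + 4*x) * 2: one lea plus one add instead of an imul.
   Typical x86-64 output (approximate):
     leal (%rdi,%rdi,4), %eax   # eax = x*5
     addl %eax, %eax            # eax = x*10 */
unsigned mul10(unsigned x) {
  return (x + (x << 2)) << 1;
}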
//===---------------------------------------------------------------------===//
Are we better off using branches instead of cmove to implement FP to
unsigned i64?
_conv:
ucomiss LC0(%rip), %xmm0
cvttss2siq %xmm0, %rdx
jb L3
subss LC0(%rip), %xmm0
movabsq $-9223372036854775808, %rax
cvttss2siq %xmm0, %rdx
xorq %rax, %rdx
L3:
movq %rdx, %rax
ret
instead of
_conv:
movss LCPI1_0(%rip), %xmm1
cvttss2siq %xmm0, %rcx
movaps %xmm0, %xmm2
subss %xmm1, %xmm2
cvttss2siq %xmm2, %rax
movabsq $-9223372036854775808, %rdx
xorq %rdx, %rax
ucomiss %xmm1, %xmm0
cmovb %rcx, %rax
ret
Seems like the jb branch has a high likelihood of being taken. In that case it
would save a few instructions.
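For reference, the source behind _conv is presumably a plain float to unsigned
i64 conversion; this is an assumption, since the note only shows the assembly.
/* Presumed source of _conv (assumption). Values >= 2^63 need the
   "subtract 2^63, convert, then flip the sign bit" sequence seen in both
   versions above; cvttss2siq alone only covers the signed range. */
unsigned long long conv(float x) {
  return (unsigned long long)x;
}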
//===---------------------------------------------------------------------===//
Poor codegen:
int X[2];
int b;
void test(void) {
memset(X, b, 2*sizeof(X[0]));
}
llc:
movq _b@GOTPCREL(%rip), %rax
movzbq (%rax), %rax
movq %rax, %rcx
shlq $8, %rcx
orq %rax, %rcx
movq %rcx, %rax
shlq $16, %rax
orq %rcx, %rax
movq %rax, %rcx
shlq $32, %rcx
movq _X@GOTPCREL(%rip), %rdx
orq %rax, %rcx
movq %rcx, (%rdx)
ret
gcc:
movq _b@GOTPCREL(%rip), %rax
movabsq $72340172838076673, %rdx
movzbq (%rax), %rax
imulq %rdx, %rax
movq _X@GOTPCREL(%rip), %rdx
movq %rax, (%rdx)
ret
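The movabsq constant in the gcc version is 0x0101010101010101 (decimal
72340172838076673): multiplying the zero-extended fill byte by it replicates
the byte across all eight byte lanes, so a single store finishes the memset. A
sketch of the idea (the function name is illustrative):
#include <stdint.h>

/* Byte-splat by multiplication: b * 0x0101010101010101 places a copy of b
   in every byte of the 64-bit result. */
uint64_t splat8(uint8_t b) {
  return (uint64_t)b * 0x0101010101010101ULL;
}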
//===---------------------------------------------------------------------===//
Vararg function prologues can be further optimized. Currently all XMM registers
are stored into the register save area. Most of these stores can be eliminated,
since the number of XMM registers used (an upper bound) is passed in %al. gcc
produces something like the following:
movzbl %al, %edx
leaq 0(,%rdx,4), %rax
leaq 4+L2(%rip), %rdx
leaq 239(%rsp), %rax
jmp *%rdx
movaps %xmm7, -15(%rax)
movaps %xmm6, -31(%rax)
movaps %xmm5, -47(%rax)
movaps %xmm4, -63(%rax)
movaps %xmm3, -79(%rax)
movaps %xmm2, -95(%rax)
movaps %xmm1, -111(%rax)
movaps %xmm0, -127(%rax)
L2:
It jumps over the movaps stores that are not needed. It is hard to see this
being significant, as it adds 5 instructions (including an indirect branch) to
avoid executing 0 to 8 stores in the function prologue.
Perhaps we can optimize for the common case where no XMM registers are used for
parameter passing, i.e. if %al == 0, jump over all the stores. Or, in the case
of a leaf function where we can determine that no XMM input parameter is
needed, avoid emitting the stores at all.
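For instance, a purely integer varargs function still pays for all eight XMM
stores in its prologue today; a minimal example (illustrative, not from a test
case):
#include <stdarg.h>

/* No FP argument is ever passed, yet the register save area for
   %xmm0-%xmm7 is still populated on entry. */
int sum(int n, ...) {
  va_list ap;
  int i, s = 0;
  va_start(ap, n);
  for (i = 0; i < n; ++i)
    s += va_arg(ap, int);
  va_end(ap);
  return s;
}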
//===---------------------------------------------------------------------===//
AMD64 has a complex calling convention for aggregate passing by value:
1. If the size of an object is larger than two eightbytes, or in C++, is a non-
POD structure or union type, or contains unaligned fields, it has class
MEMORY.
2. Both eightbytes get initialized to class NO_CLASS.
3. Each field of an object is classified recursively so that always two fields
are considered. The resulting class is calculated according to the classes
of the fields in the eightbyte:
(a) If both classes are equal, this is the resulting class.
(b) If one of the classes is NO_CLASS, the resulting class is the other
class.
(c) If one of the classes is MEMORY, the result is the MEMORY class.
(d) If one of the classes is INTEGER, the result is INTEGER.
(e) If one of the classes is X87, X87UP, or COMPLEX_X87, MEMORY is used as
the class.
(f) Otherwise class SSE is used.
4. Then a post-merger cleanup is done:
(a) If one of the classes is MEMORY, the whole argument is passed in memory.
(b) If SSEUP is not preceded by SSE, it is converted to SSE.
(The merge in step 3 is sketched in code below.)
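The pairwise merge in step 3 translates almost directly into code; a minimal
sketch (the enum and function names are illustrative, not from any header):
enum Class { NO_CLASS, INTEGER, SSE, SSEUP, X87, X87UP, COMPLEX_X87, MEMORY };

/* Merge the classes of two fields sharing an eightbyte, per rules (a)-(f). */
static enum Class merge(enum Class a, enum Class b) {
  if (a == b) return a;                                /* (a) */
  if (a == NO_CLASS) return b;                         /* (b) */
  if (b == NO_CLASS) return a;
  if (a == MEMORY || b == MEMORY) return MEMORY;       /* (c) */
  if (a == INTEGER || b == INTEGER) return INTEGER;    /* (d) */
  if (a == X87 || a == X87UP || a == COMPLEX_X87 ||
      b == X87 || b == X87UP || b == COMPLEX_X87)
    return MEMORY;                                     /* (e) */
  return SSE;                                          /* (f) */
}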
Currently the llvm frontend does not handle this correctly.
Problem 1:
typedef struct { int i; double d; } QuadWordS;
It is currently passed in two i64 integer registers. However, a gcc-compiled
callee expects the second element 'd' to be passed in XMM0.
Problem 2:
typedef struct { int32_t i; float j; double d; } QuadWordS;
The first two fields together are the size of an i64, so they will be combined
and passed in an integer register (RDI). The third field is still passed in
XMM0.
Problem 3:
typedef struct { int64_t i; int8_t j; int64_t d; } S;
void test(S s)
The size of this aggregate is greater than two i64s, so it should be passed in
memory. Currently llvm breaks it down and passes it in three integer
registers.
Problem 4:
Taking problem 3 one step further: a function expects an aggregate value in
memory followed by more parameters passed in registers.
void test(S s, int b)
LLVM IR does not allow passing aggregates by value, so the frontend must break
the aggregate values (in problems 3 and 4) into a number of scalar values:
void %test(long %s.i, byte %s.j, long %s.d);
However, if the backend were to lower this code literally, it would pass the 3
values in integer registers. To force them to be passed in memory, the frontend
should change the function signature to:
void %test(long %undef1, long %undef2, long %undef3, long %undef4,
long %undef5, long %undef6,
long %s.i, byte %s.j, long %s.d);
And the call site would look something like this:
call void %test( undef, undef, undef, undef, undef, undef,
%tmp.s.i, %tmp.s.j, %tmp.s.d );
The first 6 undef parameters would exhaust the 6 integer registers used for
parameter passing. The following three integer values would then be forced into
memory.
For problem 4, the parameter 'b' would be moved to the front of the parameter
list so it will be passed in a register:
void %test(int %b,
long %undef1, long %undef2, long %undef3, long %undef4,
long %undef5, long %undef6,
long %s.i, byte %s.j, long %s.d);
//===---------------------------------------------------------------------===//
For this:
extern int dst[];
extern int* ptr;
void test(void) {
ptr = dst;
}
We generate this code for the static relocation model:
_test:
leaq _dst(%rip), %rax
movq %rax, _ptr(%rip)
ret
If we are in the small code model, then we can treat _dst as a 32-bit constant:
movq $_dst, _ptr(%rip)
Note, however, that we should continue to use RIP-relative addressing as much
as possible. The form above is actually one byte shorter than
movq $_dst, _ptr
//===---------------------------------------------------------------------===//
Right now the asm printer assumes that a GlobalAddress is accessed via RIP
relative addressing. Therefore, it is not possible to generate this:
movabsq $__ZTV10polynomialIdE+16, %rax
That is ok for now since we currently only support the small code model, so the
above is selected as
leaq __ZTV10polynomialIdE+16(%rip), %rax
This is probably slightly slower, but it is much shorter than movabsq. However,
if we were to support medium or larger code models, we would need to use the
movabs instruction. We should probably introduce something like AbsoluteAddress
to
distinguish it from GlobalAddress so the asm printer and JIT code emitter can
do the right thing.


@ -20,8 +20,8 @@ include "../Target.td"
// X86 Subtarget features.
//
def Feature64Bit : SubtargetFeature<"64bit", "Is64Bit", "true",
"Enable 64-bit instructions">;
def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
"Support 64-bit instructions">;
def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX",
"Enable MMX instructions">;
def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
@ -61,6 +61,8 @@ def : Proc<"prescott", [FeatureMMX, FeatureSSE1, FeatureSSE2,
FeatureSSE3]>;
def : Proc<"nocona", [FeatureMMX, FeatureSSE1, FeatureSSE2,
FeatureSSE3, Feature64Bit]>;
def : Proc<"core2", [FeatureMMX, FeatureSSE1, FeatureSSE2,
FeatureSSE3, Feature64Bit]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>;
@ -105,16 +107,20 @@ def X86InstrInfo : InstrInfo {
// should be kept up-to-date with the fields in the X86InstrInfo.h file.
let TSFlagsFields = ["FormBits",
"hasOpSizePrefix",
"hasAdSizePrefix",
"Prefix",
"hasREX_WPrefix",
"ImmTypeBits",
"FPFormBits",
"Opcode"];
let TSFlagsShifts = [0,
6,
7,
11,
8,
12,
13,
16];
16,
24];
}
// The X86 target supports two different syntaxes for emitting machine code.


@ -126,8 +126,9 @@ void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
O << '%';
unsigned Reg = MO.getReg();
if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
MVT::ValueType VT = (strcmp(Modifier,"subreg16") == 0)
? MVT::i16 : MVT::i8;
MVT::ValueType VT = (strcmp(Modifier+6,"64") == 0) ?
MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
Reg = getX86SubSuperRegister(Reg, VT);
}
for (const char *Name = RI.get(Reg).Name; *Name; ++Name)
@ -148,9 +149,11 @@ void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
if (!isMemOp) O << '$';
O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << "_"
<< MO.getJumpTableIndex();
if (Subtarget->isTargetDarwin() &&
if (X86PICStyle == PICStyle::Stub &&
TM.getRelocationModel() == Reloc::PIC_)
O << "-\"L" << getFunctionNumber() << "$pb\"";
if (Subtarget->is64Bit())
O << "(%rip)";
return;
}
case MachineOperand::MO_ConstantPoolIndex: {
@ -158,7 +161,7 @@ void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
if (!isMemOp) O << '$';
O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
<< MO.getConstantPoolIndex();
if (Subtarget->isTargetDarwin() &&
if (X86PICStyle == PICStyle::Stub &&
TM.getRelocationModel() == Reloc::PIC_)
O << "-\"L" << getFunctionNumber() << "$pb\"";
int Offset = MO.getOffset();
@ -166,47 +169,59 @@ void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
O << "+" << Offset;
else if (Offset < 0)
O << Offset;
if (Subtarget->is64Bit())
O << "(%rip)";
return;
}
case MachineOperand::MO_GlobalAddress: {
bool isCallOp = Modifier && !strcmp(Modifier, "call");
bool isMemOp = Modifier && !strcmp(Modifier, "mem");
if (!isMemOp && !isCallOp) O << '$';
// Darwin block shameless ripped from PPCAsmPrinter.cpp
if (Subtarget->isTargetDarwin() &&
GlobalValue *GV = MO.getGlobal();
std::string Name = Mang->getValueName(GV);
bool isExt = (GV->isExternal() || GV->hasWeakLinkage() ||
GV->hasLinkOnceLinkage());
if (X86PICStyle == PICStyle::Stub &&
TM.getRelocationModel() != Reloc::Static) {
GlobalValue *GV = MO.getGlobal();
std::string Name = Mang->getValueName(GV);
// Link-once, External, or Weakly-linked global variables need
// non-lazily-resolved stubs
if (GV->isExternal() || GV->hasWeakLinkage() ||
GV->hasLinkOnceLinkage()) {
if (isExt) {
// Dynamically-resolved functions need a stub for the function.
if (isCallOp && isa<Function>(GV) && cast<Function>(GV)->isExternal()) {
if (isCallOp && isa<Function>(GV)) {
FnStubs.insert(Name);
O << "L" << Name << "$stub";
} else {
GVStubs.insert(Name);
O << "L" << Name << "$non_lazy_ptr";
}
} else {
O << Mang->getValueName(GV);
}
} else
O << Name;
if (!isCallOp && TM.getRelocationModel() == Reloc::PIC_)
O << "-\"L" << getFunctionNumber() << "$pb\"";
} else
O << Mang->getValueName(MO.getGlobal());
} else
O << Name;
int Offset = MO.getOffset();
if (Offset > 0)
O << "+" << Offset;
else if (Offset < 0)
O << Offset;
if (!isCallOp &&
Subtarget->is64Bit()) {
if (isExt && TM.getRelocationModel() != Reloc::Static)
O << "@GOTPCREL";
O << "(%rip)";
}
return;
}
case MachineOperand::MO_ExternalSymbol: {
bool isCallOp = Modifier && !strcmp(Modifier, "call");
if (isCallOp &&
Subtarget->isTargetDarwin() &&
X86PICStyle == PICStyle::Stub &&
TM.getRelocationModel() != Reloc::Static) {
std::string Name(TAI->getGlobalPrefix());
Name += MO.getSymbolName();
@ -216,6 +231,11 @@ void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
}
if (!isCallOp) O << '$';
O << TAI->getGlobalPrefix() << MO.getSymbolName();
if (!isCallOp &&
Subtarget->is64Bit())
O << "(%rip)";
return;
}
default:
@ -238,7 +258,8 @@ void X86ATTAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
}
}
void X86ATTAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op){
void X86ATTAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
const char *Modifier){
assert(isMem(MI, Op) && "Invalid memory reference!");
const MachineOperand &BaseReg = MI->getOperand(Op);
@ -266,12 +287,13 @@ void X86ATTAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op){
if (IndexReg.getReg() || BaseReg.getReg()) {
O << "(";
if (BaseReg.getReg())
printOperand(MI, Op);
if (BaseReg.getReg()) {
printOperand(MI, Op, Modifier);
}
if (IndexReg.getReg()) {
O << ",";
printOperand(MI, Op+2);
printOperand(MI, Op+2, Modifier);
if (ScaleVal != 1)
O << "," << ScaleVal;
}
@ -350,43 +372,25 @@ bool X86ATTAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
///
void X86ATTAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
++EmittedInsts;
// This works around some Darwin assembler bugs.
if (Subtarget->isTargetDarwin()) {
switch (MI->getOpcode()) {
case X86::REP_MOVSB:
O << "rep/movsb (%esi),(%edi)\n";
return;
case X86::REP_MOVSD:
O << "rep/movsl (%esi),(%edi)\n";
return;
case X86::REP_MOVSW:
O << "rep/movsw (%esi),(%edi)\n";
return;
case X86::REP_STOSB:
O << "rep/stosb\n";
return;
case X86::REP_STOSD:
O << "rep/stosl\n";
return;
case X86::REP_STOSW:
O << "rep/stosw\n";
return;
default:
break;
}
}
// See if a truncate instruction can be turned into a nop.
switch (MI->getOpcode()) {
default: break;
case X86::TRUNC_GR32_GR16:
case X86::TRUNC_GR32_GR8:
case X86::TRUNC_GR16_GR8: {
case X86::TRUNC_64to32:
case X86::TRUNC_64to16:
case X86::TRUNC_32to16:
case X86::TRUNC_32to8:
case X86::TRUNC_16to8:
case X86::TRUNC_32_to8:
case X86::TRUNC_16_to8: {
const MachineOperand &MO0 = MI->getOperand(0);
const MachineOperand &MO1 = MI->getOperand(1);
unsigned Reg0 = MO0.getReg();
unsigned Reg1 = MO1.getReg();
if (MI->getOpcode() == X86::TRUNC_GR32_GR16)
unsigned Opc = MI->getOpcode();
if (Opc == X86::TRUNC_64to32)
Reg1 = getX86SubSuperRegister(Reg1, MVT::i32);
else if (Opc == X86::TRUNC_32to16 || Opc == X86::TRUNC_64to16)
Reg1 = getX86SubSuperRegister(Reg1, MVT::i16);
else
Reg1 = getX86SubSuperRegister(Reg1, MVT::i8);
@ -395,6 +399,9 @@ void X86ATTAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
O << "\n\t";
break;
}
case X86::PsMOVZX64rr32:
O << TAI->getCommentString() << " ZERO-EXTEND " << "\n\t";
break;
}
// Call the autogenerated instruction printer routines.


@ -60,6 +60,9 @@ struct X86ATTAsmPrinter : public X86SharedAsmPrinter {
void printf128mem(const MachineInstr *MI, unsigned OpNo) {
printMemReference(MI, OpNo);
}
void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
printMemReference(MI, OpNo, "subreg64");
}
bool printAsmMRegister(const MachineOperand &MO, const char Mode);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@ -69,7 +72,8 @@ struct X86ATTAsmPrinter : public X86SharedAsmPrinter {
void printMachineInstruction(const MachineInstr *MI);
void printSSECC(const MachineInstr *MI, unsigned Op);
void printMemReference(const MachineInstr *MI, unsigned Op);
void printMemReference(const MachineInstr *MI, unsigned Op,
const char *Modifier=NULL);
void printPICLabel(const MachineInstr *MI, unsigned Op);
bool runOnMachineFunction(MachineFunction &F);
};


@ -30,8 +30,12 @@ Statistic<> llvm::EmittedInsts("asm-printer",
"Number of machine instrs printed");
/// doInitialization
bool X86SharedAsmPrinter::doInitialization(Module &M) {
if (Subtarget->isTargetDarwin()) {
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
if (!Subtarget->is64Bit())
X86PICStyle = PICStyle::Stub;
// Emit initial debug information.
DW.BeginModule(&M);
}


@ -29,12 +29,19 @@ namespace llvm {
extern Statistic<> EmittedInsts;
// FIXME: Move this to CodeGen/AsmPrinter.h
namespace PICStyle {
enum X86AsmPICStyle {
Stub, GOT
};
}
struct VISIBILITY_HIDDEN X86SharedAsmPrinter : public AsmPrinter {
DwarfWriter DW;
X86SharedAsmPrinter(std::ostream &O, X86TargetMachine &TM,
const TargetAsmInfo *T)
: AsmPrinter(O, TM, T), DW(O, this, T) {
: AsmPrinter(O, TM, T), DW(O, this, T), X86PICStyle(PICStyle::GOT) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
}
@ -49,6 +56,8 @@ struct VISIBILITY_HIDDEN X86SharedAsmPrinter : public AsmPrinter {
MachineFunctionPass::getAnalysisUsage(AU);
}
PICStyle::X86AsmPICStyle X86PICStyle;
const X86Subtarget *Subtarget;
// Necessary for Darwin to print out the appropriate types of linker stubs


@ -12,6 +12,8 @@
//
//===----------------------------------------------------------------------===//
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "X86Relocations.h"
#include "X86.h"
@ -35,14 +37,16 @@ namespace {
namespace {
class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass {
const X86InstrInfo *II;
TargetMachine &TM;
const TargetData *TD;
TargetMachine &TM;
MachineCodeEmitter &MCE;
bool Is64BitMode;
public:
explicit Emitter(TargetMachine &tm, MachineCodeEmitter &mce)
: II(0), TM(tm), MCE(mce) {}
: II(0), TD(0), TM(tm), MCE(mce), Is64BitMode(false) {}
Emitter(TargetMachine &tm, MachineCodeEmitter &mce,
const X86InstrInfo& ii)
: II(&ii), TM(tm), MCE(mce) {}
const X86InstrInfo &ii, const TargetData &td, bool is64)
: II(&ii), TD(&td), TM(tm), MCE(mce), Is64BitMode(is64) {}
bool runOnMachineFunction(MachineFunction &MF);
@ -54,20 +58,29 @@ namespace {
private:
void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
void emitPCRelativeValue(unsigned Address);
void emitGlobalAddressForCall(GlobalValue *GV, bool isTailCall);
void emitGlobalAddressForPtr(GlobalValue *GV, int Disp = 0);
void emitPCRelativeValue(intptr_t Address);
void emitGlobalAddressForCall(GlobalValue *GV, bool DoesntNeedStub);
void emitGlobalAddressForPtr(GlobalValue *GV, bool isPCRelative,
int Disp = 0, unsigned PCAdj = 0);
void emitExternalSymbolAddress(const char *ES, bool isPCRelative);
void emitPCRelativeConstPoolAddress(unsigned CPI, int Disp = 0,
unsigned PCAdj = 0);
void emitPCRelativeJumpTableAddress(unsigned JTI, unsigned PCAdj = 0);
void emitDisplacementField(const MachineOperand *RelocOp, int DispVal);
void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
unsigned PCAdj = 0);
void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
void emitConstant(unsigned Val, unsigned Size);
void emitConstant(uint64_t Val, unsigned Size);
void emitMemModRMByte(const MachineInstr &MI,
unsigned Op, unsigned RegOpcodeField);
unsigned Op, unsigned RegOpcodeField,
unsigned PCAdj = 0);
unsigned getX86RegNum(unsigned RegNo);
bool isX86_64ExtendedReg(const MachineOperand &MO);
unsigned determineREX(const MachineInstr &MI);
};
}
@ -83,6 +96,9 @@ bool Emitter::runOnMachineFunction(MachineFunction &MF) {
MF.getTarget().getRelocationModel() != Reloc::Static) &&
"JIT relocation model must be set to static or default!");
II = ((X86TargetMachine&)MF.getTarget()).getInstrInfo();
TD = ((X86TargetMachine&)MF.getTarget()).getTargetData();
Is64BitMode =
((X86TargetMachine&)MF.getTarget()).getSubtarget<X86Subtarget>().is64Bit();
do {
MCE.startFunction(MF);
@ -98,9 +114,9 @@ bool Emitter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
/// emitPCRelativeValue - Emit a 32-bit PC relative address.
/// emitPCRelativeValue - Emit a PC relative address.
///
void Emitter::emitPCRelativeValue(unsigned Address) {
void Emitter::emitPCRelativeValue(intptr_t Address) {
MCE.emitWordLE(Address-MCE.getCurrentPCValue()-4);
}
@ -119,20 +135,22 @@ void Emitter::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
/// emitGlobalAddressForCall - Emit the specified address to the code stream
/// assuming this is part of a function call, which is PC relative.
///
void Emitter::emitGlobalAddressForCall(GlobalValue *GV, bool isTailCall) {
void Emitter::emitGlobalAddressForCall(GlobalValue *GV, bool DoesntNeedStub) {
MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(),
X86::reloc_pcrel_word, GV, 0,
!isTailCall /*Doesn'tNeedStub*/));
DoesntNeedStub));
MCE.emitWordLE(0);
}
/// emitGlobalAddress - Emit the specified address to the code stream assuming
/// this is part of a "take the address of a global" instruction, which is not
/// PC relative.
/// this is part of a "take the address of a global" instruction.
///
void Emitter::emitGlobalAddressForPtr(GlobalValue *GV, int Disp /* = 0 */) {
MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(),
X86::reloc_absolute_word, GV));
void Emitter::emitGlobalAddressForPtr(GlobalValue *GV, bool isPCRelative,
int Disp /* = 0 */,
unsigned PCAdj /* = 0 */) {
unsigned rt = isPCRelative ? X86::reloc_pcrel_word : X86::reloc_absolute_word;
MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), rt,
GV, PCAdj));
MCE.emitWordLE(Disp); // The relocated value will be added to the displacement
}
@ -145,6 +163,26 @@ void Emitter::emitExternalSymbolAddress(const char *ES, bool isPCRelative) {
MCE.emitWordLE(0);
}
/// emitPCRelativeConstPoolAddress - Arrange for the address of a constant pool
/// to be emitted to the current location in the function, and allow it to be PC
/// relative.
void Emitter::emitPCRelativeConstPoolAddress(unsigned CPI, int Disp /* = 0 */,
unsigned PCAdj /* = 0 */) {
MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
X86::reloc_pcrel_word, CPI, PCAdj));
MCE.emitWordLE(Disp); // The relocated value will be added to the displacement
}
/// emitPCRelativeJumpTableAddress - Arrange for the address of a jump table to
/// be emitted to the current location in the function, and allow it to be PC
/// relative.
void Emitter::emitPCRelativeJumpTableAddress(unsigned JTI,
unsigned PCAdj /* = 0 */) {
MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
X86::reloc_pcrel_word, JTI, PCAdj));
MCE.emitWordLE(0); // The relocated value will be added to the displacement
}
/// N86 namespace - Native X86 Register numbers... used by X86 backend.
///
namespace N86 {
@ -153,28 +191,53 @@ namespace N86 {
};
}
// getX86RegNum - This function maps LLVM register identifiers to their X86
// specific numbering, which is used in various places encoding instructions.
//
static unsigned getX86RegNum(unsigned RegNo) {
unsigned Emitter::getX86RegNum(unsigned RegNo) {
switch(RegNo) {
case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
case X86::ESP: case X86::SP: case X86::AH: return N86::ESP;
case X86::EBP: case X86::BP: case X86::CH: return N86::EBP;
case X86::ESI: case X86::SI: case X86::DH: return N86::ESI;
case X86::EDI: case X86::DI: case X86::BH: return N86::EDI;
case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
return N86::ESP;
case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
return N86::EBP;
case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
return N86::ESI;
case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
return N86::EDI;
case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
return N86::EAX;
case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
return N86::ECX;
case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
return N86::EDX;
case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
return N86::EBX;
case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
return N86::ESP;
case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
return N86::EBP;
case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
return N86::ESI;
case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
return N86::EDI;
case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
return RegNo-X86::ST0;
case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7:
return RegNo-X86::XMM0;
case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7:
return II->getRegisterInfo().getDwarfRegNum(RegNo) -
II->getRegisterInfo().getDwarfRegNum(X86::XMM0);
case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
return II->getRegisterInfo().getDwarfRegNum(RegNo) -
II->getRegisterInfo().getDwarfRegNum(X86::XMM8);
default:
assert(MRegisterInfo::isVirtualRegister(RegNo) &&
@ -199,7 +262,7 @@ void Emitter::emitSIBByte(unsigned SS, unsigned Index, unsigned Base) {
MCE.emitByte(ModRMByte(SS, Index, Base));
}
void Emitter::emitConstant(unsigned Val, unsigned Size) {
void Emitter::emitConstant(uint64_t Val, unsigned Size) {
// Output the constant in little endian byte order...
for (unsigned i = 0; i != Size; ++i) {
MCE.emitByte(Val & 255);
@ -214,7 +277,7 @@ static bool isDisp8(int Value) {
}
void Emitter::emitDisplacementField(const MachineOperand *RelocOp,
int DispVal) {
int DispVal, unsigned PCAdj) {
// If this is a simple integer displacement that doesn't require a relocation,
// emit it now.
if (!RelocOp) {
@ -225,14 +288,27 @@ void Emitter::emitDisplacementField(const MachineOperand *RelocOp,
// Otherwise, this is something that requires a relocation. Emit it as such
// now.
if (RelocOp->isGlobalAddress()) {
emitGlobalAddressForPtr(RelocOp->getGlobal(), RelocOp->getOffset());
// In 64-bit static small code model, we could potentially emit absolute.
// But it's probably not beneficial.
// 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative
// 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute
emitGlobalAddressForPtr(RelocOp->getGlobal(), Is64BitMode,
RelocOp->getOffset(), PCAdj);
} else if (RelocOp->isConstantPoolIndex()) {
// Must be in 64-bit mode.
emitPCRelativeConstPoolAddress(RelocOp->getConstantPoolIndex(),
RelocOp->getOffset(), PCAdj);
} else if (RelocOp->isJumpTableIndex()) {
// Must be in 64-bit mode.
emitPCRelativeJumpTableAddress(RelocOp->getJumpTableIndex(), PCAdj);
} else {
assert(0 && "Unknown value to relocate!");
}
}
void Emitter::emitMemModRMByte(const MachineInstr &MI,
unsigned Op, unsigned RegOpcodeField) {
unsigned Op, unsigned RegOpcodeField,
unsigned PCAdj) {
const MachineOperand &Op3 = MI.getOperand(Op+3);
int DispVal = 0;
const MachineOperand *DispForReloc = 0;
@ -241,10 +317,18 @@ void Emitter::emitMemModRMByte(const MachineInstr &MI,
if (Op3.isGlobalAddress()) {
DispForReloc = &Op3;
} else if (Op3.isConstantPoolIndex()) {
DispVal += MCE.getConstantPoolEntryAddress(Op3.getConstantPoolIndex());
DispVal += Op3.getOffset();
if (Is64BitMode) {
DispForReloc = &Op3;
} else {
DispVal += MCE.getConstantPoolEntryAddress(Op3.getConstantPoolIndex());
DispVal += Op3.getOffset();
}
} else if (Op3.isJumpTableIndex()) {
DispVal += MCE.getJumpTableEntryAddress(Op3.getJumpTableIndex());
if (Is64BitMode) {
DispForReloc = &Op3;
} else {
DispVal += MCE.getJumpTableEntryAddress(Op3.getJumpTableIndex());
}
} else {
DispVal = Op3.getImm();
}
@ -256,12 +340,13 @@ void Emitter::emitMemModRMByte(const MachineInstr &MI,
unsigned BaseReg = Base.getReg();
// Is a SIB byte needed?
if (IndexReg.getReg() == 0 && BaseReg != X86::ESP) {
if (IndexReg.getReg() == 0 &&
(BaseReg == 0 || getX86RegNum(BaseReg) != N86::ESP)) {
if (BaseReg == 0) { // Just a displacement?
// Emit special case [disp32] encoding
MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
emitDisplacementField(DispForReloc, DispVal);
emitDisplacementField(DispForReloc, DispVal, PCAdj);
} else {
unsigned BaseRegNo = getX86RegNum(BaseReg);
if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
@ -274,12 +359,13 @@ void Emitter::emitMemModRMByte(const MachineInstr &MI,
} else {
// Emit the most general non-SIB encoding: [REG+disp32]
MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
emitDisplacementField(DispForReloc, DispVal);
emitDisplacementField(DispForReloc, DispVal, PCAdj);
}
}
} else { // We need a SIB byte, so start by outputting the ModR/M byte first
assert(IndexReg.getReg() != X86::ESP && "Cannot use ESP as index reg!");
assert(IndexReg.getReg() != X86::ESP &&
IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
bool ForceDisp32 = false;
bool ForceDisp8 = false;
@ -292,7 +378,7 @@ void Emitter::emitMemModRMByte(const MachineInstr &MI,
// Emit the normal disp32 encoding.
MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
ForceDisp32 = true;
} else if (DispVal == 0 && BaseReg != X86::EBP) {
} else if (DispVal == 0 && getX86RegNum(BaseReg) != N86::EBP) {
// Emit no displacement ModR/M byte
MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
} else if (isDisp8(DispVal)) {
@ -327,7 +413,7 @@ void Emitter::emitMemModRMByte(const MachineInstr &MI,
if (ForceDisp8) {
emitConstant(DispVal, 1);
} else if (DispVal != 0 || ForceDisp32) {
emitDisplacementField(DispForReloc, DispVal);
emitDisplacementField(DispForReloc, DispVal, PCAdj);
}
}
}
@ -337,11 +423,131 @@ static unsigned sizeOfImm(const TargetInstrDescriptor &Desc) {
case X86II::Imm8: return 1;
case X86II::Imm16: return 2;
case X86II::Imm32: return 4;
case X86II::Imm64: return 8;
default: assert(0 && "Immediate size not set!");
return 0;
}
}
/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended register?
/// e.g. r8, xmm8, etc.
bool Emitter::isX86_64ExtendedReg(const MachineOperand &MO) {
if (!MO.isRegister()) return false;
unsigned RegNo = MO.getReg();
int DWNum = II->getRegisterInfo().getDwarfRegNum(RegNo);
if (DWNum >= II->getRegisterInfo().getDwarfRegNum(X86::R8) &&
DWNum <= II->getRegisterInfo().getDwarfRegNum(X86::R15))
return true;
if (DWNum >= II->getRegisterInfo().getDwarfRegNum(X86::XMM8) &&
DWNum <= II->getRegisterInfo().getDwarfRegNum(X86::XMM15))
return true;
return false;
}
inline static bool isX86_64TruncToByte(unsigned oc) {
return (oc == X86::TRUNC_64to8 || oc == X86::TRUNC_32to8 ||
oc == X86::TRUNC_16to8);
}
inline static bool isX86_64NonExtLowByteReg(unsigned reg) {
return (reg == X86::SPL || reg == X86::BPL ||
reg == X86::SIL || reg == X86::DIL);
}
/// determineREX - Determine if the MachineInstr has to be encoded with an X86-64
/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
/// size, and 3) use of X86-64 extended registers.
unsigned Emitter::determineREX(const MachineInstr &MI) {
unsigned REX = 0;
unsigned Opcode = MI.getOpcode();
const TargetInstrDescriptor &Desc = II->get(Opcode);
// Pseudo instructions do not need REX prefix byte.
if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
return 0;
if (Desc.TSFlags & X86II::REX_W)
REX |= 1 << 3;
if (MI.getNumOperands()) {
// If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
bool isTrunc8 = isX86_64TruncToByte(Opcode);
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand& MO = MI.getOperand(i);
if (MO.isRegister()) {
unsigned Reg = MO.getReg();
// Truncations to byte are actually movb instructions. The real source
// operand is the low byte of the register.
if (isTrunc8 && i == 1)
Reg = getX86SubSuperRegister(Reg, MVT::i8);
if (isX86_64NonExtLowByteReg(Reg))
REX |= 0x40;
}
}
switch (Desc.TSFlags & X86II::FormMask) {
case X86II::MRMInitReg:
if (isX86_64ExtendedReg(MI.getOperand(0)))
REX |= (1 << 0) | (1 << 2);
break;
case X86II::MRMSrcReg: {
if (isX86_64ExtendedReg(MI.getOperand(0)))
REX |= 1 << 2;
for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand& MO = MI.getOperand(i);
if (isX86_64ExtendedReg(MO))
REX |= 1 << 0;
}
break;
}
case X86II::MRMSrcMem: {
if (isX86_64ExtendedReg(MI.getOperand(0)))
REX |= 1 << 2;
unsigned Bit = 0;
for (unsigned i = 1; i != 5; ++i) {
const MachineOperand& MO = MI.getOperand(i);
if (MO.isRegister()) {
if (isX86_64ExtendedReg(MO))
REX |= 1 << Bit;
Bit++;
}
}
break;
}
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
case X86II::MRM6m: case X86II::MRM7m:
case X86II::MRMDestMem: {
if (MI.getNumOperands() >= 5 &&
isX86_64ExtendedReg(MI.getOperand(4)))
REX |= 1 << 2;
unsigned Bit = 0;
for (unsigned i = 0; i != 4; ++i) {
const MachineOperand& MO = MI.getOperand(i);
if (MO.isRegister()) {
if (isX86_64ExtendedReg(MO))
REX |= 1 << Bit;
Bit++;
}
}
break;
}
default: {
if (isX86_64ExtendedReg(MI.getOperand(0)))
REX |= 1 << 0;
for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand& MO = MI.getOperand(i);
if (isX86_64ExtendedReg(MO))
REX |= 1 << 2;
}
break;
}
}
}
return REX;
}
void Emitter::emitInstruction(const MachineInstr &MI) {
NumEmitted++; // Keep track of the # of mi's emitted
@ -354,18 +560,22 @@ void Emitter::emitInstruction(const MachineInstr &MI) {
// Emit the operand size opcode prefix as needed.
if (Desc.TSFlags & X86II::OpSize) MCE.emitByte(0x66);
// Emit the address size opcode prefix as needed.
if (Desc.TSFlags & X86II::AdSize) MCE.emitByte(0x67);
bool Need0FPrefix = false;
switch (Desc.TSFlags & X86II::Op0Mask) {
case X86II::TB:
MCE.emitByte(0x0F); // Two-byte opcode prefix
Need0FPrefix = true; // Two-byte opcode prefix
break;
case X86II::REP: break; // already handled.
case X86II::XS: // F3 0F
MCE.emitByte(0xF3);
MCE.emitByte(0x0F);
Need0FPrefix = true;
break;
case X86II::XD: // F2 0F
MCE.emitByte(0xF2);
MCE.emitByte(0x0F);
Need0FPrefix = true;
break;
case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
@ -377,6 +587,17 @@ void Emitter::emitInstruction(const MachineInstr &MI) {
case 0: break; // No prefix!
}
if (Is64BitMode) {
// REX prefix
unsigned REX = determineREX(MI);
if (REX)
MCE.emitByte(0x40 | REX);
}
// 0x0F escape code must be emitted just before the opcode.
if (Need0FPrefix)
MCE.emitByte(0x0F);
// If this is a two-address instruction, skip one of the register operands.
unsigned CurOp = 0;
CurOp += (Desc.Flags & M_2_ADDR_FLAG) != 0;
@ -397,6 +618,7 @@ void Emitter::emitInstruction(const MachineInstr &MI) {
case X86::IMPLICIT_DEF_GR8:
case X86::IMPLICIT_DEF_GR16:
case X86::IMPLICIT_DEF_GR32:
case X86::IMPLICIT_DEF_GR64:
case X86::IMPLICIT_DEF_FR32:
case X86::IMPLICIT_DEF_FR64:
case X86::IMPLICIT_DEF_VR64:
@ -417,7 +639,7 @@ void Emitter::emitInstruction(const MachineInstr &MI) {
} else if (MO.isGlobalAddress()) {
bool isTailCall = Opcode == X86::TAILJMPd ||
Opcode == X86::TAILJMPr || Opcode == X86::TAILJMPm;
emitGlobalAddressForCall(MO.getGlobal(), isTailCall);
emitGlobalAddressForCall(MO.getGlobal(), !isTailCall);
} else if (MO.isExternalSymbol()) {
emitExternalSymbolAddress(MO.getSymbolName(), true);
} else if (MO.isImmediate()) {
@ -434,15 +656,15 @@ void Emitter::emitInstruction(const MachineInstr &MI) {
if (CurOp != MI.getNumOperands()) {
const MachineOperand &MO1 = MI.getOperand(CurOp++);
if (MO1.isGlobalAddress()) {
assert(sizeOfImm(Desc) == 4 &&
assert(sizeOfImm(Desc) == TD->getPointerSize() &&
"Don't know how to emit non-pointer values!");
emitGlobalAddressForPtr(MO1.getGlobal(), MO1.getOffset());
emitGlobalAddressForPtr(MO1.getGlobal(), Is64BitMode, MO1.getOffset());
} else if (MO1.isExternalSymbol()) {
assert(sizeOfImm(Desc) == 4 &&
assert(sizeOfImm(Desc) == TD->getPointerSize() &&
"Don't know how to emit non-pointer values!");
emitExternalSymbolAddress(MO1.getSymbolName(), false);
} else if (MO1.isJumpTableIndex()) {
assert(sizeOfImm(Desc) == 4 &&
assert(sizeOfImm(Desc) == TD->getPointerSize() &&
"Don't know how to emit non-pointer values!");
emitConstant(MCE.getJumpTableEntryAddress(MO1.getJumpTableIndex()), 4);
} else {
@ -460,13 +682,14 @@ void Emitter::emitInstruction(const MachineInstr &MI) {
emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
break;
}
case X86II::MRMDestMem:
case X86II::MRMDestMem: {
MCE.emitByte(BaseOpcode);
emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(CurOp+4).getReg()));
CurOp += 5;
if (CurOp != MI.getNumOperands())
emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
break;
}
case X86II::MRMSrcReg:
MCE.emitByte(BaseOpcode);
@ -477,13 +700,17 @@ void Emitter::emitInstruction(const MachineInstr &MI) {
emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
break;
case X86II::MRMSrcMem:
case X86II::MRMSrcMem: {
unsigned PCAdj = (CurOp+5 != MI.getNumOperands()) ? sizeOfImm(Desc) : 0;
MCE.emitByte(BaseOpcode);
emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()));
emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()),
PCAdj);
CurOp += 5;
if (CurOp != MI.getNumOperands())
emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
break;
}
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
@ -500,9 +727,13 @@ void Emitter::emitInstruction(const MachineInstr &MI) {
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
case X86II::MRM6m: case X86II::MRM7m:
case X86II::MRM6m: case X86II::MRM7m: {
unsigned PCAdj = (CurOp+4 != MI.getNumOperands()) ?
(MI.getOperand(CurOp+4).isImmediate() ? sizeOfImm(Desc) : 4) : 0;
MCE.emitByte(BaseOpcode);
emitMemModRMByte(MI, CurOp, (Desc.TSFlags & X86II::FormMask)-X86II::MRM0m);
emitMemModRMByte(MI, CurOp, (Desc.TSFlags & X86II::FormMask)-X86II::MRM0m,
PCAdj);
CurOp += 4;
if (CurOp != MI.getNumOperands()) {
@ -510,13 +741,14 @@ void Emitter::emitInstruction(const MachineInstr &MI) {
if (MO.isImmediate())
emitConstant(MO.getImm(), sizeOfImm(Desc));
else if (MO.isGlobalAddress())
emitGlobalAddressForPtr(MO.getGlobal(), MO.getOffset());
emitGlobalAddressForPtr(MO.getGlobal(), Is64BitMode, MO.getOffset());
else if (MO.isJumpTableIndex())
emitConstant(MCE.getJumpTableEntryAddress(MO.getJumpTableIndex()), 4);
else
assert(0 && "Unknown operand!");
}
break;
}
case X86II::MRMInitReg:
MCE.emitByte(BaseOpcode);


@ -30,8 +30,9 @@
#include "llvm/CodeGen/SSARegMap.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/ADT/Statistic.h"
#include <deque>
#include <iostream>
@ -58,16 +59,19 @@ namespace {
int FrameIndex;
} Base;
bool isRIPRel; // RIP relative?
unsigned Scale;
SDOperand IndexReg;
unsigned Disp;
GlobalValue *GV;
Constant *CP;
const char *ES;
int JT;
unsigned Align; // CP alignment.
X86ISelAddressMode()
: BaseType(RegBase), Scale(1), IndexReg(), Disp(0), GV(0),
CP(0), Align(0) {
: BaseType(RegBase), isRIPRel(false), Scale(1), IndexReg(), Disp(0),
GV(0), CP(0), ES(0), JT(-1), Align(0) {
}
};
}
@ -92,6 +96,10 @@ namespace {
///
bool FastISel;
/// TM - Keep a reference to X86TargetMachine.
///
X86TargetMachine &TM;
/// X86Lowering - This object fully describes how to lower LLVM code to an
/// X86-specific SelectionDAG.
X86TargetLowering X86Lowering;
@ -100,12 +108,14 @@ namespace {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
/// GlobalBaseReg - keeps track of the virtual register mapped onto global
/// base register.
unsigned GlobalBaseReg;
public:
X86DAGToDAGISel(X86TargetMachine &TM, bool fast)
X86DAGToDAGISel(X86TargetMachine &tm, bool fast)
: SelectionDAGISel(X86Lowering),
ContainsFPCode(false), FastISel(fast),
ContainsFPCode(false), FastISel(fast), TM(tm),
X86Lowering(*TM.getTargetLowering()),
Subtarget(&TM.getSubtarget<X86Subtarget>()) {}
@ -156,13 +166,22 @@ namespace {
SDOperand &Scale, SDOperand &Index,
SDOperand &Disp) {
Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, MVT::i32) : AM.Base.Reg;
CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
AM.Base.Reg;
Scale = getI8Imm(AM.Scale);
Index = AM.IndexReg;
Disp = AM.GV ? CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp)
: (AM.CP ?
CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Align, AM.Disp)
: getI32Imm(AM.Disp));
// These are 32-bit even in 64-bit mode since RIP relative offset
// is 32-bit.
if (AM.GV)
Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp);
else if (AM.CP)
Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Align, AM.Disp);
else if (AM.ES)
Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32);
else if (AM.JT != -1)
Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32);
else
Disp = getI32Imm(AM.Disp);
}
/// getI8Imm - Return a target constant with the specified value, of type
@ -476,26 +495,56 @@ void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
/// addressing mode
bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
bool isRoot) {
// RIP relative addressing: %rip + 32-bit displacement!
if (AM.isRIPRel) {
if (!AM.ES && AM.JT != -1 && N.getOpcode() == ISD::Constant) {
uint64_t Val = cast<ConstantSDNode>(N)->getValue();
if (isInt32(AM.Disp + Val)) {
AM.Disp += Val;
return false;
}
}
return true;
}
int id = N.Val->getNodeId();
bool Available = isSelected(id);
switch (N.getOpcode()) {
default: break;
case ISD::Constant:
AM.Disp += cast<ConstantSDNode>(N)->getValue();
return false;
case ISD::Constant: {
uint64_t Val = cast<ConstantSDNode>(N)->getValue();
if (isInt32(AM.Disp + Val)) {
AM.Disp += Val;
return false;
}
break;
}
case X86ISD::Wrapper:
// If both base and index components have been picked, we can't fit
// the result available in the register in the addressing mode. Duplicate
// GlobalAddress or ConstantPool as displacement.
if (!Available || (AM.Base.Reg.Val && AM.IndexReg.Val)) {
// If the value is available in a register and both base and index components
// have been picked, we can't fit the register into the addressing mode.
// Duplicate the GlobalAddress or ConstantPool as the displacement instead.
// Can't fit GV or CP in addressing mode for X86-64 medium or large code
// model since the displacement field is 32-bit. Ok for small code model.
// For X86-64 PIC code, only allow GV / CP + displacement so we can use RIP
// relative addressing mode.
if ((!Subtarget->is64Bit() || TM.getCodeModel() == CodeModel::Small) &&
(!Available || (AM.Base.Reg.Val && AM.IndexReg.Val))) {
bool isRIP = Subtarget->is64Bit();
if (isRIP && (AM.Base.Reg.Val || AM.Scale > 1 || AM.IndexReg.Val ||
AM.BaseType == X86ISelAddressMode::FrameIndexBase))
break;
if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(N.getOperand(0))) {
if (AM.CP == 0) {
AM.CP = CP->get();
AM.Align = CP->getAlignment();
AM.Disp += CP->getOffset();
if (isRIP)
AM.isRIPRel = true;
return false;
}
} else if (GlobalAddressSDNode *G =
@ -503,6 +552,20 @@ bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
if (AM.GV == 0) {
AM.GV = G->getGlobal();
AM.Disp += G->getOffset();
if (isRIP)
AM.isRIPRel = true;
return false;
}
} else if (isRoot && isRIP) {
if (ExternalSymbolSDNode *S =
dyn_cast<ExternalSymbolSDNode>(N.getOperand(0))) {
AM.ES = S->getSymbol();
AM.isRIPRel = true;
return false;
} else if (JumpTableSDNode *J =
dyn_cast<JumpTableSDNode>(N.getOperand(0))) {
AM.JT = J->getIndex();
AM.isRIPRel = true;
return false;
}
}
@ -533,7 +596,11 @@ bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
AM.IndexReg = ShVal.Val->getOperand(0);
ConstantSDNode *AddVal =
cast<ConstantSDNode>(ShVal.Val->getOperand(1));
AM.Disp += AddVal->getValue() << Val;
uint64_t Disp = AM.Disp + (AddVal->getValue() << Val);
if (isInt32(Disp))
AM.Disp = Disp;
else
AM.IndexReg = ShVal;
} else {
AM.IndexReg = ShVal;
}
@ -563,7 +630,11 @@ bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
Reg = MulVal.Val->getOperand(0);
ConstantSDNode *AddVal =
cast<ConstantSDNode>(MulVal.Val->getOperand(1));
AM.Disp += AddVal->getValue() * CN->getValue();
uint64_t Disp = AM.Disp + AddVal->getValue() * CN->getValue();
if (isInt32(Disp))
AM.Disp = Disp;
else
Reg = N.Val->getOperand(0);
} else {
Reg = N.Val->getOperand(0);
}
@ -641,13 +712,14 @@ bool X86DAGToDAGISel::SelectAddr(SDOperand N, SDOperand &Base, SDOperand &Scale,
if (MatchAddress(N, AM))
return false;
MVT::ValueType VT = N.getValueType();
if (AM.BaseType == X86ISelAddressMode::RegBase) {
if (!AM.Base.Reg.Val)
AM.Base.Reg = CurDAG->getRegister(0, MVT::i32);
AM.Base.Reg = CurDAG->getRegister(0, VT);
}
if (!AM.IndexReg.Val)
AM.IndexReg = CurDAG->getRegister(0, MVT::i32);
AM.IndexReg = CurDAG->getRegister(0, VT);
getAddressOperands(AM, Base, Scale, Index, Disp);
return true;
@ -662,19 +734,20 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDOperand N, SDOperand &Base,
if (MatchAddress(N, AM))
return false;
MVT::ValueType VT = N.getValueType();
unsigned Complexity = 0;
if (AM.BaseType == X86ISelAddressMode::RegBase)
if (AM.Base.Reg.Val)
Complexity = 1;
else
AM.Base.Reg = CurDAG->getRegister(0, MVT::i32);
AM.Base.Reg = CurDAG->getRegister(0, VT);
else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
Complexity = 4;
if (AM.IndexReg.Val)
Complexity++;
else
AM.IndexReg = CurDAG->getRegister(0, MVT::i32);
AM.IndexReg = CurDAG->getRegister(0, VT);
if (AM.Scale > 2)
Complexity += 2;
@ -687,8 +760,14 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDOperand N, SDOperand &Base,
// optimal (especially for code size consideration). LEA is nice because of
// its three-address nature. Tweak the cost function again when we can run
// convertToThreeAddress() at register allocation time.
if (AM.GV || AM.CP)
Complexity += 2;
if (AM.GV || AM.CP || AM.ES || AM.JT != -1) {
// For X86-64, we should always use lea to materialize RIP relative
// addresses.
if (Subtarget->is64Bit())
Complexity = 4;
else
Complexity += 2;
}
if (AM.Disp && (AM.Base.Reg.Val || AM.IndexReg.Val))
Complexity++;
@ -721,6 +800,7 @@ static bool isRegister0(SDOperand Op) {
/// base address to use for accessing globals into a register.
///
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
assert(!Subtarget->is64Bit() && "X86-64 PIC uses RIP relative addressing");
if (!GlobalBaseReg) {
// Insert the set of GlobalBaseReg into the first MBB of the function
MachineBasicBlock &FirstMBB = BB->getParent()->front();
@ -732,7 +812,7 @@ SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
BuildMI(FirstMBB, MBBI, X86::MovePCtoStack, 0);
BuildMI(FirstMBB, MBBI, X86::POP32r, 1, GlobalBaseReg);
}
return CurDAG->getRegister(GlobalBaseReg, MVT::i32).Val;
return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).Val;
}
static SDNode *FindCallStartFromCall(SDNode *Node) {
@ -776,9 +856,11 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
// Turn ADD X, c to MOV32ri X+c. This cannot be done with tblgen'd
// code and is matched first so to prevent it from being turned into
// LEA32r X+c.
// In 64-bit mode, use LEA to take advantage of RIP-relative addressing.
MVT::ValueType PtrVT = TLI.getPointerTy();
SDOperand N0 = N.getOperand(0);
SDOperand N1 = N.getOperand(1);
if (N.Val->getValueType(0) == MVT::i32 &&
if (N.Val->getValueType(0) == PtrVT &&
N0.getOpcode() == X86ISD::Wrapper &&
N1.getOpcode() == ISD::Constant) {
unsigned Offset = (unsigned)cast<ConstantSDNode>(N1)->getValue();
@ -786,17 +868,23 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
// TODO: handle ExternalSymbolSDNode.
if (GlobalAddressSDNode *G =
dyn_cast<GlobalAddressSDNode>(N0.getOperand(0))) {
C = CurDAG->getTargetGlobalAddress(G->getGlobal(), MVT::i32,
C = CurDAG->getTargetGlobalAddress(G->getGlobal(), PtrVT,
G->getOffset() + Offset);
} else if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(N0.getOperand(0))) {
C = CurDAG->getTargetConstantPool(CP->get(), MVT::i32,
C = CurDAG->getTargetConstantPool(CP->get(), PtrVT,
CP->getAlignment(),
CP->getOffset()+Offset);
}
if (C.Val)
return CurDAG->SelectNodeTo(N.Val, X86::MOV32ri, MVT::i32, C);
if (C.Val) {
if (Subtarget->is64Bit()) {
SDOperand Ops[] = { CurDAG->getRegister(0, PtrVT), getI8Imm(1),
CurDAG->getRegister(0, PtrVT), C };
return CurDAG->SelectNodeTo(N.Val, X86::LEA64r, MVT::i64, Ops, 4);
} else
return CurDAG->SelectNodeTo(N.Val, X86::MOV32ri, PtrVT, C);
}
}
// Other cases are handled by auto-generated code.
@ -811,6 +899,7 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
}
else
switch (NVT) {
@ -818,6 +907,7 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
}
unsigned LoReg, HiReg;
@ -826,6 +916,7 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break;
case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break;
case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
}
SDOperand N0 = Node->getOperand(0);
@ -899,6 +990,7 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
}
else
switch (NVT) {
@ -906,6 +998,7 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
}
unsigned LoReg, HiReg;
@ -927,6 +1020,11 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
ClrOpcode = X86::MOV32r0;
SExtOpcode = X86::CDQ;
break;
case MVT::i64:
LoReg = X86::RAX; HiReg = X86::RDX;
ClrOpcode = X86::MOV64r0;
SExtOpcode = X86::CQO;
break;
}
SDOperand N0 = Node->getOperand(0);
@ -994,7 +1092,7 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
}
case ISD::TRUNCATE: {
if (NVT == MVT::i8) {
if (!Subtarget->is64Bit() && NVT == MVT::i8) {
unsigned Opc2;
MVT::ValueType VT;
switch (Node->getOperand(0).getValueType()) {
@ -1002,12 +1100,12 @@ SDNode *X86DAGToDAGISel::Select(SDOperand N) {
case MVT::i16:
Opc = X86::MOV16to16_;
VT = MVT::i16;
Opc2 = X86::TRUNC_GR16_GR8;
Opc2 = X86::TRUNC_16_to8;
break;
case MVT::i32:
Opc = X86::MOV32to32_;
VT = MVT::i32;
Opc2 = X86::TRUNC_GR32_GR8;
Opc2 = X86::TRUNC_32_to8;
break;
}

File diff suppressed because it is too large.


@ -267,6 +267,9 @@ namespace llvm {
// X86TargetLowering - X86 Implementation of the TargetLowering interface
class X86TargetLowering : public TargetLowering {
int VarArgsFrameIndex; // FrameIndex for start of varargs area.
int RegSaveFrameIndex; // X86-64 vararg func register save area.
unsigned VarArgsGPOffset; // X86-64 vararg func int reg offset.
unsigned VarArgsFPOffset; // X86-64 vararg func fp reg offset.
int ReturnAddrIndex; // FrameIndex for return slot.
int BytesToPopOnReturn; // Number of arg bytes ret should pop.
int BytesCallerReserves; // Number of arg bytes caller makes.
@ -347,6 +350,9 @@ namespace llvm {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
/// X86StackPtr - X86 physical register used as stack ptr.
unsigned X86StackPtr;
/// X86ScalarSSE - Select between SSE2 or x87 floating point ops.
bool X86ScalarSSE;
@ -354,6 +360,10 @@ namespace llvm {
SDOperand LowerCCCArguments(SDOperand Op, SelectionDAG &DAG);
SDOperand LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG);
// X86-64 C Calling Convention implementation.
SDOperand LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG);
SDOperand LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG);
// Fast Calling Convention implementation.
SDOperand LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG);
SDOperand LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG);


@ -22,7 +22,7 @@ using namespace llvm;
X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
: TargetInstrInfo(X86Insts, sizeof(X86Insts)/sizeof(X86Insts[0])),
TM(tm), RI(*this) {
TM(tm), RI(tm, *this) {
}
@ -30,7 +30,8 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
unsigned& sourceReg,
unsigned& destReg) const {
MachineOpCode oc = MI.getOpcode();
if (oc == X86::MOV8rr || oc == X86::MOV16rr || oc == X86::MOV32rr ||
if (oc == X86::MOV8rr || oc == X86::MOV16rr ||
oc == X86::MOV32rr || oc == X86::MOV64rr ||
oc == X86::MOV16to16_ || oc == X86::MOV32to32_ ||
oc == X86::FpMOV || oc == X86::MOVSSrr || oc == X86::MOVSDrr ||
oc == X86::FsMOVAPSrr || oc == X86::FsMOVAPDrr ||
@ -59,6 +60,7 @@ unsigned X86InstrInfo::isLoadFromStackSlot(MachineInstr *MI,
case X86::MOV16_rm:
case X86::MOV32rm:
case X86::MOV32_rm:
case X86::MOV64rm:
case X86::FpLD64m:
case X86::MOVSSrm:
case X86::MOVSDrm:
@ -86,6 +88,7 @@ unsigned X86InstrInfo::isStoreToStackSlot(MachineInstr *MI,
case X86::MOV16_mr:
case X86::MOV32mr:
case X86::MOV32_mr:
case X86::MOV64mr:
case X86::FpSTP64m:
case X86::MOVSSmr:
case X86::MOVSDmr:
@ -145,16 +148,20 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr *MI) const {
switch (MI->getOpcode()) {
case X86::INC32r:
case X86::INC64_32r:
assert(MI->getNumOperands() == 2 && "Unknown inc instruction!");
return addRegOffset(BuildMI(X86::LEA32r, 5, Dest), Src, 1);
case X86::INC16r:
case X86::INC64_16r:
if (DisableLEA16) return 0;
assert(MI->getNumOperands() == 2 && "Unknown inc instruction!");
return addRegOffset(BuildMI(X86::LEA16r, 5, Dest), Src, 1);
case X86::DEC32r:
case X86::DEC64_32r:
assert(MI->getNumOperands() == 2 && "Unknown dec instruction!");
return addRegOffset(BuildMI(X86::LEA32r, 5, Dest), Src, -1);
case X86::DEC16r:
case X86::DEC64_16r:
if (DisableLEA16) return 0;
assert(MI->getNumOperands() == 2 && "Unknown dec instruction!");
return addRegOffset(BuildMI(X86::LEA16r, 5, Dest), Src, -1);
@ -264,3 +271,10 @@ X86InstrInfo::reverseBranchCondition(MachineBasicBlock::iterator MI) const {
return BuildMI(*MBB, MBB->erase(MI), ROpcode, 1).addMBB(TMBB);
}
const TargetRegisterClass *X86InstrInfo::getPointerRegClass() const {
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
if (Subtarget->is64Bit())
return &X86::GR64RegClass;
else
return &X86::GR32RegClass;
}


@ -18,6 +18,7 @@
#include "X86RegisterInfo.h"
namespace llvm {
class X86RegisterInfo;
class X86TargetMachine;
/// X86II - This namespace holds all of the target specific flags that
@ -90,12 +91,18 @@ namespace X86II {
// instead of 32 bit data.
OpSize = 1 << 6,
// AdSize - Set if this instruction requires an address-size prefix (0x67),
// which most often indicates that the instruction uses 16-bit addresses
// instead of 32-bit ones (or 32-bit addresses in 64-bit mode).
AdSize = 1 << 7,
//===------------------------------------------------------------------===//
// Op0Mask - There are several prefix bytes that are used to form two byte
// opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is
// used to obtain the setting of this field. If no bits in this field are
// set, there is no prefix byte for obtaining a multibyte opcode.
//
Op0Shift = 7,
Op0Shift = 8,
Op0Mask = 0xF << Op0Shift,
// TB - TwoByte - Set if this instruction has a two byte opcode, which
@ -118,19 +125,29 @@ namespace X86II {
XD = 11 << Op0Shift, XS = 12 << Op0Shift,
//===------------------------------------------------------------------===//
// This two-bit field describes the size of an immediate operand. Zero is
// REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
// They are used to specify GPRs and SSE registers, 64-bit operand size,
// etc. We only care about the REX.W and REX.R bits, and only the former is
// statically determined.
//
REXShift = 12,
REX_W = 1 << REXShift,
//===------------------------------------------------------------------===//
// This three-bit field describes the size of an immediate operand. Zero is
// unused so that we can tell if we forgot to set a value.
ImmShift = 11,
ImmMask = 3 << ImmShift,
ImmShift = 13,
ImmMask = 7 << ImmShift,
Imm8 = 1 << ImmShift,
Imm16 = 2 << ImmShift,
Imm32 = 3 << ImmShift,
Imm64 = 4 << ImmShift,
//===------------------------------------------------------------------===//
// FP Instruction Classification... Zero is non-fp instruction.
// FPTypeMask - Mask for all of the FP types...
FPTypeShift = 13,
FPTypeShift = 16,
FPTypeMask = 7 << FPTypeShift,
// NotFP - The default, set for instructions that do not use FP registers.
@ -162,9 +179,9 @@ namespace X86II {
// SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
SpecialFP = 7 << FPTypeShift,
OpcodeShift = 16,
// Bits 19 -> 23 are unused
OpcodeShift = 24,
OpcodeMask = 0xFF << OpcodeShift
// Bits 25 -> 31 are unused
};
}
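// A minimal sketch (not part of this change) of how the repacked TSFlags
// fields above can be decoded; the masks and shifts are the ones defined
// in this enum, everything else is illustrative.
static bool hasREXWPrefix(unsigned TSFlags) {
  return (TSFlags & X86II::REX_W) != 0;       // REX.W now lives at bit 12
}
static unsigned immediateBytes(unsigned TSFlags) {
  switch (TSFlags & X86II::ImmMask) {         // 3-bit field at shift 13
  case X86II::Imm8:  return 1;
  case X86II::Imm16: return 2;
  case X86II::Imm32: return 4;
  case X86II::Imm64: return 8;
  default:           return 0;                // NoImm
  }
}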
@ -216,6 +233,8 @@ public:
virtual MachineBasicBlock::iterator
reverseBranchCondition(MachineBasicBlock::iterator MI) const;
const TargetRegisterClass *getPointerRegClass() const;
// getBaseOpcodeFor - This function returns the "base" X86 opcode for the
// specified opcode number.
//


@ -39,7 +39,7 @@ def SDT_X86CallSeqStart : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>;
def SDT_X86CallSeqEnd : SDTypeProfile<0, 2, [ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
def SDT_X86Call : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def SDT_X86Call : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
@ -95,7 +95,7 @@ def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
class X86MemOperand<string printMethod> : Operand<iPTR> {
let PrintMethod = printMethod;
let NumMIOperands = 4;
let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
}
def i8mem : X86MemOperand<"printi8mem">;
@ -107,6 +107,12 @@ def f32mem : X86MemOperand<"printf32mem">;
def f64mem : X86MemOperand<"printf64mem">;
def f128mem : X86MemOperand<"printf128mem">;
def lea32mem : Operand<i32> {
let PrintMethod = "printi32mem";
let NumMIOperands = 4;
let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
}
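// Illustration (hypothetical registers): every X86 memory operand above is
// the same four machine operands -- base register, scale immediate, index
// register, displacement -- so [SrcReg + 8] becomes base=SrcReg, scale=1,
// index=0, disp=8, e.g. via the addRegOffset helper this commit already
// uses elsewhere:
//   addRegOffset(BuildMI(X86::LEA32r, 5, DestReg), SrcReg, 8);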
def SSECC : Operand<i8> {
let PrintMethod = "printSSECC";
}
@ -129,9 +135,9 @@ def brtarget : Operand<OtherVT>;
//
// Define X86 specific addressing mode.
def addr : ComplexPattern<iPTR, 4, "SelectAddr", []>;
def leaaddr : ComplexPattern<iPTR, 4, "SelectLEAAddr",
[add, mul, shl, or, frameindex]>;
def addr : ComplexPattern<iPTR, 4, "SelectAddr", []>;
def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
[add, mul, shl, or, frameindex]>;
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
@ -158,11 +164,13 @@ def MRMInitReg : Format<32>;
//===----------------------------------------------------------------------===//
// X86 Instruction Predicate Definitions.
def HasMMX : Predicate<"Subtarget->hasMMX()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
def FPStack : Predicate<"!Subtarget->hasSSE2()">;
def HasMMX : Predicate<"Subtarget->hasMMX()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
def FPStack : Predicate<"!Subtarget->hasSSE2()">;
def In32BitMode : Predicate<"!Subtarget->is64Bit()">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">;
//===----------------------------------------------------------------------===//
// X86 specific pattern fragments.
@ -171,13 +179,14 @@ def FPStack : Predicate<"!Subtarget->hasSSE2()">;
// ImmType - This specifies the immediate type used by an instruction. This is
// part of the ad-hoc solution used to emit machine instruction encodings by our
// machine code emitter.
class ImmType<bits<2> val> {
bits<2> Value = val;
class ImmType<bits<3> val> {
bits<3> Value = val;
}
def NoImm : ImmType<0>;
def Imm8 : ImmType<1>;
def Imm16 : ImmType<2>;
def Imm32 : ImmType<3>;
def Imm64 : ImmType<4>;
// FPFormat - This specifies what form this FP instruction has. This is used by
// the Floating-Point stackifier pass.
@ -202,7 +211,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag ops, string AsmStr>
Format Form = f;
bits<6> FormBits = Form.Value;
ImmType ImmT = i;
bits<2> ImmTypeBits = ImmT.Value;
bits<3> ImmTypeBits = ImmT.Value;
dag OperandList = ops;
string AsmString = AsmStr;
@ -210,9 +219,11 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag ops, string AsmStr>
//
// Attributes specific to X86 instructions...
//
bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix?
bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix?
bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix?
bits<4> Prefix = 0; // Which prefix byte does this inst have?
bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix?
FPFormat FPForm; // What flavor of FP instruction is this?
bits<3> FPFormBits = 0;
}
@ -226,6 +237,8 @@ class Imp<list<Register> uses, list<Register> defs> {
// Prefix byte classes which are used to indicate to the ad-hoc machine code
// emitter that various prefix bytes are required.
class OpSize { bit hasOpSizePrefix = 1; }
class AdSize { bit hasAdSizePrefix = 1; }
class REX_W { bit hasREX_WPrefix = 1; }
class TB { bits<4> Prefix = 1; }
class REP { bits<4> Prefix = 2; }
class D8 { bits<4> Prefix = 3; }
@ -276,8 +289,6 @@ def i32immSExt8 : PatLeaf<(i32 imm), [{
}]>;
// Helper fragments for loads.
def loadiPTR : PatFrag<(ops node:$ptr), (iPTR (load node:$ptr))>;
def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
def loadi16 : PatFrag<(ops node:$ptr), (i16 (load node:$ptr))>;
def loadi32 : PatFrag<(ops node:$ptr), (i32 (load node:$ptr))>;
@ -308,6 +319,7 @@ def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extload node:$ptr, i16))>;
//===----------------------------------------------------------------------===//
// Instruction templates...
//
class I<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
: X86Inst<o, f, NoImm, ops, asm> {
@ -355,13 +367,13 @@ def IMPLICIT_DEF_GR32 : I<0, Pseudo, (ops GR32:$dst),
def NOOP : I<0x90, RawFrm, (ops), "nop", []>;
// Truncate
def TRUNC_GR32_GR8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR32_:$src),
"mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}", []>;
def TRUNC_GR16_GR8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR16_:$src),
"mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}", []>;
def TRUNC_GR32_GR16 : I<0x89, MRMDestReg, (ops GR16:$dst, GR32:$src),
"mov{w} {${src:subreg16}, $dst|$dst, ${src:subreg16}}",
[(set GR16:$dst, (trunc GR32:$src))]>;
def TRUNC_32_to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR32_:$src),
"mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}", []>;
def TRUNC_16_to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR16_:$src),
"mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}", []>;
def TRUNC_32to16 : I<0x89, MRMDestReg, (ops GR16:$dst, GR32:$src),
"mov{w} {${src:subreg16}, $dst|$dst, ${src:subreg16}}",
[(set GR16:$dst, (trunc GR32:$src))]>;
//===----------------------------------------------------------------------===//
// Control Flow Instructions...
@ -388,7 +400,7 @@ let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1 in {
def JMP32r : I<0xFF, MRM4r, (ops GR32:$dst), "jmp{l} {*}$dst",
[(brind GR32:$dst)]>;
def JMP32m : I<0xFF, MRM4m, (ops i32mem:$dst), "jmp{l} {*}$dst",
[(brind (loadiPTR addr:$dst))]>;
[(brind (loadi32 addr:$dst))]>;
}
// Conditional branches
@ -510,9 +522,9 @@ def LEA16r : I<0x8D, MRMSrcMem,
(ops GR16:$dst, i32mem:$src),
"lea{w} {$src|$dst}, {$dst|$src}", []>, OpSize;
def LEA32r : I<0x8D, MRMSrcMem,
(ops GR32:$dst, i32mem:$src),
(ops GR32:$dst, lea32mem:$src),
"lea{l} {$src|$dst}, {$dst|$src}",
[(set GR32:$dst, leaaddr:$src)]>;
[(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
def REP_MOVSB : I<0xA4, RawFrm, (ops), "{rep;movsb|rep movsb}",
[(X86rep_movs i8)]>,
@ -1101,9 +1113,10 @@ def INC8r : I<0xFE, MRM0r, (ops GR8 :$dst, GR8 :$src), "inc{b} $dst",
[(set GR8:$dst, (add GR8:$src, 1))]>;
let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
def INC16r : I<0x40, AddRegFrm, (ops GR16:$dst, GR16:$src), "inc{w} $dst",
[(set GR16:$dst, (add GR16:$src, 1))]>, OpSize;
[(set GR16:$dst, (add GR16:$src, 1))]>,
OpSize, Requires<[In32BitMode]>;
def INC32r : I<0x40, AddRegFrm, (ops GR32:$dst, GR32:$src), "inc{l} $dst",
[(set GR32:$dst, (add GR32:$src, 1))]>;
[(set GR32:$dst, (add GR32:$src, 1))]>, Requires<[In32BitMode]>;
}
let isTwoAddress = 0, CodeSize = 2 in {
def INC8m : I<0xFE, MRM0m, (ops i8mem :$dst), "inc{b} $dst",
@ -1119,9 +1132,10 @@ def DEC8r : I<0xFE, MRM1r, (ops GR8 :$dst, GR8 :$src), "dec{b} $dst",
[(set GR8:$dst, (add GR8:$src, -1))]>;
let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
def DEC16r : I<0x48, AddRegFrm, (ops GR16:$dst, GR16:$src), "dec{w} $dst",
[(set GR16:$dst, (add GR16:$src, -1))]>, OpSize;
[(set GR16:$dst, (add GR16:$src, -1))]>,
OpSize, Requires<[In32BitMode]>;
def DEC32r : I<0x48, AddRegFrm, (ops GR32:$dst, GR32:$src), "dec{l} $dst",
[(set GR32:$dst, (add GR32:$src, -1))]>;
[(set GR32:$dst, (add GR32:$src, -1))]>, Requires<[In32BitMode]>;
}
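// Presumably the In32BitMode requirement above exists because the one-byte
// 0x40-0x4F INC/DEC encodings are reused as REX prefixes in 64-bit mode,
// leaving only the two-byte 0xFF forms available there.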
let isTwoAddress = 0, CodeSize = 2 in {
@ -2455,7 +2469,7 @@ def DWARF_LABEL : I<0, Pseudo, (ops i32imm:$id),
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
// ConstantPool GlobalAddress, ExternalSymbol
// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
@ -2477,18 +2491,16 @@ def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
// Calls
def : Pat<(X86tailcall GR32:$dst),
(CALL32r GR32:$dst)>;
(CALL32r GR32:$dst)>;
def : Pat<(X86tailcall tglobaladdr:$dst),
def : Pat<(X86tailcall (i32 tglobaladdr:$dst)),
(CALLpcrel32 tglobaladdr:$dst)>;
def : Pat<(X86tailcall texternalsym:$dst),
def : Pat<(X86tailcall (i32 texternalsym:$dst)),
(CALLpcrel32 texternalsym:$dst)>;
def : Pat<(X86call tglobaladdr:$dst),
def : Pat<(X86call (i32 tglobaladdr:$dst)),
(CALLpcrel32 tglobaladdr:$dst)>;
def : Pat<(X86call texternalsym:$dst),
def : Pat<(X86call (i32 texternalsym:$dst)),
(CALLpcrel32 texternalsym:$dst)>;
// X86 specific add which produces a flag.
@ -2611,3 +2623,9 @@ include "X86InstrMMX.td"
//===----------------------------------------------------------------------===//
include "X86InstrSSE.td"
//===----------------------------------------------------------------------===//
// X86-64 Support
//===----------------------------------------------------------------------===//
include "X86InstrX86-64.td"

File diff suppressed because it is too large.


@ -86,8 +86,9 @@ void X86IntelAsmPrinter::printOp(const MachineOperand &MO,
if (MRegisterInfo::isPhysicalRegister(MO.getReg())) {
unsigned Reg = MO.getReg();
if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
MVT::ValueType VT = (strcmp(Modifier,"subreg16") == 0)
? MVT::i16 : MVT::i8;
MVT::ValueType VT = (strcmp(Modifier, "subreg64") == 0) ?
  MVT::i64 : ((strcmp(Modifier, "subreg32") == 0) ? MVT::i32 :
  ((strcmp(Modifier, "subreg16") == 0) ? MVT::i16 : MVT::i8));
Reg = getX86SubSuperRegister(Reg, VT);
}
O << RI.get(Reg).Name;
@ -137,7 +138,8 @@ void X86IntelAsmPrinter::printOp(const MachineOperand &MO,
}
}
void X86IntelAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op){
void X86IntelAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
const char *Modifier) {
assert(isMem(MI, Op) && "Invalid memory reference!");
const MachineOperand &BaseReg = MI->getOperand(Op);
@ -156,7 +158,7 @@ void X86IntelAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op){
O << "[";
bool NeedPlus = false;
if (BaseReg.getReg()) {
printOp(BaseReg, "mem");
printOp(BaseReg, Modifier);
NeedPlus = true;
}
@ -164,7 +166,7 @@ void X86IntelAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op){
if (NeedPlus) O << " + ";
if (ScaleVal != 1)
O << ScaleVal << "*";
printOp(IndexReg);
printOp(IndexReg, Modifier);
NeedPlus = true;
}
@ -259,14 +261,21 @@ void X86IntelAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
// See if a truncate instruction can be turned into a nop.
switch (MI->getOpcode()) {
default: break;
case X86::TRUNC_GR32_GR16:
case X86::TRUNC_GR32_GR8:
case X86::TRUNC_GR16_GR8: {
case X86::TRUNC_64to32:
case X86::TRUNC_64to16:
case X86::TRUNC_32to16:
case X86::TRUNC_32to8:
case X86::TRUNC_16to8:
case X86::TRUNC_32_to8:
case X86::TRUNC_16_to8: {
const MachineOperand &MO0 = MI->getOperand(0);
const MachineOperand &MO1 = MI->getOperand(1);
unsigned Reg0 = MO0.getReg();
unsigned Reg1 = MO1.getReg();
if (MI->getOpcode() == X86::TRUNC_GR32_GR16)
unsigned Opc = MI->getOpcode();
if (Opc == X86::TRUNC_64to32)
Reg1 = getX86SubSuperRegister(Reg1, MVT::i32);
else if (Opc == X86::TRUNC_32to16 || Opc == X86::TRUNC_64to16)
Reg1 = getX86SubSuperRegister(Reg1, MVT::i16);
else
Reg1 = getX86SubSuperRegister(Reg1, MVT::i8);
@ -275,6 +284,9 @@ void X86IntelAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
O << "\n\t";
break;
}
case X86::PsMOVZX64rr32:
O << TAI->getCommentString() << " ZERO-EXTEND " << "\n\t";
break;
}
// Call the autogenerated instruction printer routines.


@ -80,6 +80,10 @@ struct X86IntelAsmPrinter : public X86SharedAsmPrinter {
O << "XMMWORD PTR ";
printMemReference(MI, OpNo);
}
void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
O << "QWORD PTR ";
printMemReference(MI, OpNo, "subreg64");
}
bool printAsmMRegister(const MachineOperand &MO, const char Mode);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@ -89,7 +93,8 @@ struct X86IntelAsmPrinter : public X86SharedAsmPrinter {
void printMachineInstruction(const MachineInstr *MI);
void printOp(const MachineOperand &MO, const char *Modifier = 0);
void printSSECC(const MachineInstr *MI, unsigned Op);
void printMemReference(const MachineInstr *MI, unsigned Op);
void printMemReference(const MachineInstr *MI, unsigned Op,
const char *Modifier=NULL);
void printPICLabel(const MachineInstr *MI, unsigned Op);
bool runOnMachineFunction(MachineFunction &F);
bool doInitialization(Module &M);


@ -42,7 +42,65 @@ static TargetJITInfo::JITCompilerFn JITCompilerFunction;
// Provide a wrapper for X86CompilationCallback2 that saves non-traditional
// callee saved registers, for the fastcc calling convention.
extern "C" {
#if defined(__i386__) || defined(i386) || defined(_M_IX86)
#if defined(__x86_64__)
// No need to save EAX/EDX for X86-64.
void X86CompilationCallback(void);
asm(
".text\n"
".align 8\n"
".globl _X86CompilationCallback\n"
"_X86CompilationCallback:\n"
// Save RBP
"pushq %rbp\n"
// Save RSP
"movq %rsp, %rbp\n"
// Save all int arg registers
"pushq %rdi\n"
"pushq %rsi\n"
"pushq %rdx\n"
"pushq %rcx\n"
"pushq %r8\n"
"pushq %r9\n"
// Align stack on 16-byte boundary. RSP might not be properly aligned
// (8 byte) if this is called from an indirect stub.
"andq $-16, %rsp\n"
// Save all XMM arg registers
"subq $128, %rsp\n"
"movaps %xmm0, (%rsp)\n"
"movaps %xmm1, 16(%rsp)\n"
"movaps %xmm2, 32(%rsp)\n"
"movaps %xmm3, 48(%rsp)\n"
"movaps %xmm4, 64(%rsp)\n"
"movaps %xmm5, 80(%rsp)\n"
"movaps %xmm6, 96(%rsp)\n"
"movaps %xmm7, 112(%rsp)\n"
// JIT callee
"movq %rbp, %rdi\n" // Pass prev frame and return address
"movq 8(%rbp), %rsi\n"
"call _X86CompilationCallback2\n"
// Restore all XMM arg registers
"movaps 112(%rsp), %xmm7\n"
"movaps 96(%rsp), %xmm6\n"
"movaps 80(%rsp), %xmm5\n"
"movaps 64(%rsp), %xmm4\n"
"movaps 48(%rsp), %xmm3\n"
"movaps 32(%rsp), %xmm2\n"
"movaps 16(%rsp), %xmm1\n"
"movaps (%rsp), %xmm0\n"
// Restore RSP
"movq %rbp, %rsp\n"
// Restore all int arg registers
"subq $48, %rsp\n"
"popq %r9\n"
"popq %r8\n"
"popq %rcx\n"
"popq %rdx\n"
"popq %rsi\n"
"popq %rdi\n"
// Restore RBP
"popq %rbp\n"
"ret\n");
#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
#ifndef _MSC_VER
void X86CompilationCallback(void);
asm(
@ -122,7 +180,7 @@ extern "C" void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
"Could not find return address on the stack!");
// It's a stub if there is an interrupt marker after the call.
bool isStub = ((unsigned char*)(intptr_t)RetAddr)[0] == 0xCD;
bool isStub = ((unsigned char*)RetAddr)[0] == 0xCD;
// The call instruction should have pushed the return value onto the stack...
RetAddr -= 4; // Backtrack to the reference itself...
@ -135,20 +193,20 @@ extern "C" void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
#endif
// Sanity check to make sure this really is a call instruction.
assert(((unsigned char*)(intptr_t)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
unsigned NewVal = (intptr_t)JITCompilerFunction((void*)(intptr_t)RetAddr);
intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
// Rewrite the call target... so that we don't end up here every time we
// execute the call.
*(unsigned*)(intptr_t)RetAddr = NewVal-RetAddr-4;
*(unsigned *)RetAddr = (unsigned)(NewVal-RetAddr-4);
if (isStub) {
// If this is a stub, rewrite the call into an unconditional branch
// instruction so that two return addresses are not pushed onto the stack
// when the requested function finally gets called. This also makes the
// 0xCD byte (interrupt) dead, so the marker doesn't affect anything.
((unsigned char*)(intptr_t)RetAddr)[-1] = 0xE9;
((unsigned char*)RetAddr)[-1] = 0xE9;
}
// Change the return address to reexecute the call instruction...
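// Condensed sketch of the patching arithmetic above (helper name
// hypothetical, not the commit's own code): CALL rel32 stores
// Target - EndOfCall, and the rel32 field is the last four bytes of the
// five-byte call instruction.
static void patchCallTarget(intptr_t RetAddr, intptr_t NewTarget) {
  // RetAddr points just past the call instruction.
  *(unsigned *)(RetAddr - 4) = (unsigned)(NewTarget - RetAddr);
}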
@ -189,16 +247,17 @@ void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
switch ((X86::RelocationType)MR->getRelocationType()) {
case X86::reloc_pcrel_word:
case X86::reloc_pcrel_word: {
// PC relative relocation, add the relocated value to the value already in
// memory, after we adjust it for where the PC is.
ResultPtr = ResultPtr-(intptr_t)RelocPos-4;
*((intptr_t*)RelocPos) += ResultPtr;
ResultPtr = ResultPtr-(intptr_t)RelocPos-4-MR->getConstantVal();
*((unsigned*)RelocPos) += (unsigned)ResultPtr;
break;
}
case X86::reloc_absolute_word:
// Absolute relocation, just add the relocated value to the value already
// in memory.
*((intptr_t*)RelocPos) += ResultPtr;
*((unsigned*)RelocPos) += (unsigned)ResultPtr;
break;
}
}
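// Both cases above add into the 32-bit word already stored at RelocPos; the
// pc-relative case first rebases the target on the end of the field. A
// restatement of that arithmetic (helper name hypothetical):
static unsigned pcrel32Addend(intptr_t Target, intptr_t RelocPos,
                              intptr_t ConstantVal) {
  return (unsigned)(Target - RelocPos - 4 - ConstantVal);
}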


@ -14,13 +14,13 @@
#include "X86.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/Constants.h"
#include "llvm/Type.h"
#include "llvm/Function.h"
#include "llvm/Type.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
@ -46,15 +46,32 @@ namespace {
cl::Hidden);
}
X86RegisterInfo::X86RegisterInfo(const TargetInstrInfo &tii)
: X86GenRegisterInfo(X86::ADJCALLSTACKDOWN, X86::ADJCALLSTACKUP), TII(tii) {}
X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
const TargetInstrInfo &tii)
: X86GenRegisterInfo(X86::ADJCALLSTACKDOWN, X86::ADJCALLSTACKUP),
TM(tm), TII(tii) {
// Cache some information.
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
Is64Bit = Subtarget->is64Bit();
if (Is64Bit) {
SlotSize = 8;
StackPtr = X86::RSP;
FramePtr = X86::RBP;
} else {
SlotSize = 4;
StackPtr = X86::ESP;
FramePtr = X86::EBP;
}
}
void X86RegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, int FrameIdx,
const TargetRegisterClass *RC) const {
unsigned Opc;
if (RC == &X86::GR32RegClass) {
if (RC == &X86::GR64RegClass) {
Opc = X86::MOV64mr;
} else if (RC == &X86::GR32RegClass) {
Opc = X86::MOV32mr;
} else if (RC == &X86::GR16RegClass) {
Opc = X86::MOV16mr;
@ -84,7 +101,9 @@ void X86RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC) const{
unsigned Opc;
if (RC == &X86::GR32RegClass) {
if (RC == &X86::GR64RegClass) {
Opc = X86::MOV64rm;
} else if (RC == &X86::GR32RegClass) {
Opc = X86::MOV32rm;
} else if (RC == &X86::GR16RegClass) {
Opc = X86::MOV16rm;
@ -114,7 +133,9 @@ void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SrcReg,
const TargetRegisterClass *RC) const {
unsigned Opc;
if (RC == &X86::GR32RegClass) {
if (RC == &X86::GR64RegClass) {
Opc = X86::MOV64rr;
} else if (RC == &X86::GR32RegClass) {
Opc = X86::MOV32rr;
} else if (RC == &X86::GR16RegClass) {
Opc = X86::MOV16rr;
@ -270,12 +291,18 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::ADC32ri, X86::ADC32mi },
{ X86::ADC32ri8, X86::ADC32mi8 },
{ X86::ADC32rr, X86::ADC32mr },
{ X86::ADC64ri32, X86::ADC64mi32 },
{ X86::ADC64ri8, X86::ADC64mi8 },
{ X86::ADC64rr, X86::ADC64mr },
{ X86::ADD16ri, X86::ADD16mi },
{ X86::ADD16ri8, X86::ADD16mi8 },
{ X86::ADD16rr, X86::ADD16mr },
{ X86::ADD32ri, X86::ADD32mi },
{ X86::ADD32ri8, X86::ADD32mi8 },
{ X86::ADD32rr, X86::ADD32mr },
{ X86::ADD64ri32, X86::ADD64mi32 },
{ X86::ADD64ri8, X86::ADD64mi8 },
{ X86::ADD64rr, X86::ADD64mr },
{ X86::ADD8ri, X86::ADD8mi },
{ X86::ADD8rr, X86::ADD8mr },
{ X86::AND16ri, X86::AND16mi },
@ -284,19 +311,30 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::AND32ri, X86::AND32mi },
{ X86::AND32ri8, X86::AND32mi8 },
{ X86::AND32rr, X86::AND32mr },
{ X86::AND64ri32, X86::AND64mi32 },
{ X86::AND64ri8, X86::AND64mi8 },
{ X86::AND64rr, X86::AND64mr },
{ X86::AND8ri, X86::AND8mi },
{ X86::AND8rr, X86::AND8mr },
{ X86::DEC16r, X86::DEC16m },
{ X86::DEC32r, X86::DEC32m },
{ X86::DEC64_16r, X86::DEC16m },
{ X86::DEC64_32r, X86::DEC32m },
{ X86::DEC64r, X86::DEC64m },
{ X86::DEC8r, X86::DEC8m },
{ X86::INC16r, X86::INC16m },
{ X86::INC32r, X86::INC32m },
{ X86::INC64_16r, X86::INC16m },
{ X86::INC64_32r, X86::INC32m },
{ X86::INC64r, X86::INC64m },
{ X86::INC8r, X86::INC8m },
{ X86::NEG16r, X86::NEG16m },
{ X86::NEG32r, X86::NEG32m },
{ X86::NEG64r, X86::NEG64m },
{ X86::NEG8r, X86::NEG8m },
{ X86::NOT16r, X86::NOT16m },
{ X86::NOT32r, X86::NOT32m },
{ X86::NOT64r, X86::NOT64m },
{ X86::NOT8r, X86::NOT8m },
{ X86::OR16ri, X86::OR16mi },
{ X86::OR16ri8, X86::OR16mi8 },
@ -304,6 +342,9 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::OR32ri, X86::OR32mi },
{ X86::OR32ri8, X86::OR32mi8 },
{ X86::OR32rr, X86::OR32mr },
{ X86::OR64ri32, X86::OR64mi32 },
{ X86::OR64ri8, X86::OR64mi8 },
{ X86::OR64rr, X86::OR64mr },
{ X86::OR8ri, X86::OR8mi },
{ X86::OR8rr, X86::OR8mr },
{ X86::ROL16r1, X86::ROL16m1 },
@ -312,6 +353,9 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::ROL32r1, X86::ROL32m1 },
{ X86::ROL32rCL, X86::ROL32mCL },
{ X86::ROL32ri, X86::ROL32mi },
{ X86::ROL64r1, X86::ROL64m1 },
{ X86::ROL64rCL, X86::ROL64mCL },
{ X86::ROL64ri, X86::ROL64mi },
{ X86::ROL8r1, X86::ROL8m1 },
{ X86::ROL8rCL, X86::ROL8mCL },
{ X86::ROL8ri, X86::ROL8mi },
@ -321,6 +365,9 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::ROR32r1, X86::ROR32m1 },
{ X86::ROR32rCL, X86::ROR32mCL },
{ X86::ROR32ri, X86::ROR32mi },
{ X86::ROR64r1, X86::ROR64m1 },
{ X86::ROR64rCL, X86::ROR64mCL },
{ X86::ROR64ri, X86::ROR64mi },
{ X86::ROR8r1, X86::ROR8m1 },
{ X86::ROR8rCL, X86::ROR8mCL },
{ X86::ROR8ri, X86::ROR8mi },
@ -330,18 +377,27 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::SAR32r1, X86::SAR32m1 },
{ X86::SAR32rCL, X86::SAR32mCL },
{ X86::SAR32ri, X86::SAR32mi },
{ X86::SAR64r1, X86::SAR64m1 },
{ X86::SAR64rCL, X86::SAR64mCL },
{ X86::SAR64ri, X86::SAR64mi },
{ X86::SAR8r1, X86::SAR8m1 },
{ X86::SAR8rCL, X86::SAR8mCL },
{ X86::SAR8ri, X86::SAR8mi },
{ X86::SBB32ri, X86::SBB32mi },
{ X86::SBB32ri8, X86::SBB32mi8 },
{ X86::SBB32rr, X86::SBB32mr },
{ X86::SBB64ri32, X86::SBB64mi32 },
{ X86::SBB64ri8, X86::SBB64mi8 },
{ X86::SBB64rr, X86::SBB64mr },
{ X86::SHL16r1, X86::SHL16m1 },
{ X86::SHL16rCL, X86::SHL16mCL },
{ X86::SHL16ri, X86::SHL16mi },
{ X86::SHL32r1, X86::SHL32m1 },
{ X86::SHL32rCL, X86::SHL32mCL },
{ X86::SHL32ri, X86::SHL32mi },
{ X86::SHL64r1, X86::SHL64m1 },
{ X86::SHL64rCL, X86::SHL64mCL },
{ X86::SHL64ri, X86::SHL64mi },
{ X86::SHL8r1, X86::SHL8m1 },
{ X86::SHL8rCL, X86::SHL8mCL },
{ X86::SHL8ri, X86::SHL8mi },
@ -349,12 +405,17 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::SHLD16rri8, X86::SHLD16mri8 },
{ X86::SHLD32rrCL, X86::SHLD32mrCL },
{ X86::SHLD32rri8, X86::SHLD32mri8 },
{ X86::SHLD64rrCL, X86::SHLD64mrCL },
{ X86::SHLD64rri8, X86::SHLD64mri8 },
{ X86::SHR16r1, X86::SHR16m1 },
{ X86::SHR16rCL, X86::SHR16mCL },
{ X86::SHR16ri, X86::SHR16mi },
{ X86::SHR32r1, X86::SHR32m1 },
{ X86::SHR32rCL, X86::SHR32mCL },
{ X86::SHR32ri, X86::SHR32mi },
{ X86::SHR64r1, X86::SHR64m1 },
{ X86::SHR64rCL, X86::SHR64mCL },
{ X86::SHR64ri, X86::SHR64mi },
{ X86::SHR8r1, X86::SHR8m1 },
{ X86::SHR8rCL, X86::SHR8mCL },
{ X86::SHR8ri, X86::SHR8mi },
@ -362,12 +423,17 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::SHRD16rri8, X86::SHRD16mri8 },
{ X86::SHRD32rrCL, X86::SHRD32mrCL },
{ X86::SHRD32rri8, X86::SHRD32mri8 },
{ X86::SHRD64rrCL, X86::SHRD64mrCL },
{ X86::SHRD64rri8, X86::SHRD64mri8 },
{ X86::SUB16ri, X86::SUB16mi },
{ X86::SUB16ri8, X86::SUB16mi8 },
{ X86::SUB16rr, X86::SUB16mr },
{ X86::SUB32ri, X86::SUB32mi },
{ X86::SUB32ri8, X86::SUB32mi8 },
{ X86::SUB32rr, X86::SUB32mr },
{ X86::SUB64ri32, X86::SUB64mi32 },
{ X86::SUB64ri8, X86::SUB64mi8 },
{ X86::SUB64rr, X86::SUB64mr },
{ X86::SUB8ri, X86::SUB8mi },
{ X86::SUB8rr, X86::SUB8mr },
{ X86::XOR16ri, X86::XOR16mi },
@ -376,6 +442,9 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::XOR32ri, X86::XOR32mi },
{ X86::XOR32ri8, X86::XOR32mi8 },
{ X86::XOR32rr, X86::XOR32mr },
{ X86::XOR64ri32, X86::XOR64mi32 },
{ X86::XOR64ri8, X86::XOR64mi8 },
{ X86::XOR64rr, X86::XOR64mr },
{ X86::XOR8ri, X86::XOR8mi },
{ X86::XOR8rr, X86::XOR8mr }
};
@ -388,6 +457,8 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
return MakeM0Inst(X86::MOV16mi, FrameIndex, MI);
else if (MI->getOpcode() == X86::MOV32r0)
return MakeM0Inst(X86::MOV32mi, FrameIndex, MI);
else if (MI->getOpcode() == X86::MOV64r0)
return MakeM0Inst(X86::MOV64mi32, FrameIndex, MI);
else if (MI->getOpcode() == X86::MOV8r0)
return MakeM0Inst(X86::MOV8mi, FrameIndex, MI);
@ -399,19 +470,24 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::CMP8ri, X86::CMP8mi },
{ X86::DIV16r, X86::DIV16m },
{ X86::DIV32r, X86::DIV32m },
{ X86::DIV64r, X86::DIV64m },
{ X86::DIV8r, X86::DIV8m },
{ X86::FsMOVAPDrr, X86::MOVSDmr },
{ X86::FsMOVAPSrr, X86::MOVSSmr },
{ X86::IDIV16r, X86::IDIV16m },
{ X86::IDIV32r, X86::IDIV32m },
{ X86::IDIV64r, X86::IDIV64m },
{ X86::IDIV8r, X86::IDIV8m },
{ X86::IMUL16r, X86::IMUL16m },
{ X86::IMUL32r, X86::IMUL32m },
{ X86::IMUL64r, X86::IMUL64m },
{ X86::IMUL8r, X86::IMUL8m },
{ X86::MOV16ri, X86::MOV16mi },
{ X86::MOV16rr, X86::MOV16mr },
{ X86::MOV32ri, X86::MOV32mi },
{ X86::MOV32rr, X86::MOV32mr },
{ X86::MOV64ri32, X86::MOV64mi32 },
{ X86::MOV64rr, X86::MOV64mr },
{ X86::MOV8ri, X86::MOV8mi },
{ X86::MOV8rr, X86::MOV8mr },
{ X86::MOVAPDrr, X86::MOVAPDmr },
@ -424,6 +500,7 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::MOVUPSrr, X86::MOVUPSmr },
{ X86::MUL16r, X86::MUL16m },
{ X86::MUL32r, X86::MUL32m },
{ X86::MUL64r, X86::MUL64m },
{ X86::MUL8r, X86::MUL8m },
{ X86::SETAEr, X86::SETAEm },
{ X86::SETAr, X86::SETAm },
@ -441,9 +518,11 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::SETSr, X86::SETSm },
{ X86::TEST16ri, X86::TEST16mi },
{ X86::TEST32ri, X86::TEST32mi },
{ X86::TEST64ri32, X86::TEST64mi32 },
{ X86::TEST8ri, X86::TEST8mi },
{ X86::XCHG16rr, X86::XCHG16mr },
{ X86::XCHG32rr, X86::XCHG32mr },
{ X86::XCHG64rr, X86::XCHG64mr },
{ X86::XCHG8rr, X86::XCHG8mr }
};
ASSERT_SORTED(OpcodeTable);
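// These tables pair a register-form opcode with its memory-form twin and
// are kept sorted (ASSERT_SORTED) so a lookup can binary-search on the
// first field. A standalone sketch of such a lookup, with the TableEntry
// layout assumed from context:
struct TableEntry { unsigned from, to; };
static unsigned lookupFoldedOpcode(const TableEntry *Tab, unsigned Size,
                                   unsigned Opcode) {
  unsigned Lo = 0, Hi = Size;
  while (Lo < Hi) {                       // classic lower_bound on "from"
    unsigned Mid = Lo + (Hi - Lo) / 2;
    if (Tab[Mid].from < Opcode)
      Lo = Mid + 1;
    else
      Hi = Mid;
  }
  return (Lo < Size && Tab[Lo].from == Opcode) ? Tab[Lo].to : 0;
}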
@ -453,16 +532,23 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
static const TableEntry OpcodeTable[] = {
{ X86::CMP16rr, X86::CMP16rm },
{ X86::CMP32rr, X86::CMP32rm },
{ X86::CMP64ri32, X86::CMP64mi32 },
{ X86::CMP64ri8, X86::CMP64mi8 },
{ X86::CMP64rr, X86::CMP64rm },
{ X86::CMP8rr, X86::CMP8rm },
{ X86::CMPPDrri, X86::CMPPDrmi },
{ X86::CMPPSrri, X86::CMPPSrmi },
{ X86::CMPSDrr, X86::CMPSDrm },
{ X86::CMPSSrr, X86::CMPSSrm },
{ X86::CVTSD2SSrr, X86::CVTSD2SSrm },
{ X86::CVTSI2SD64rr, X86::CVTSI2SD64rm },
{ X86::CVTSI2SDrr, X86::CVTSI2SDrm },
{ X86::CVTSI2SS64rr, X86::CVTSI2SS64rm },
{ X86::CVTSI2SSrr, X86::CVTSI2SSrm },
{ X86::CVTSS2SDrr, X86::CVTSS2SDrm },
{ X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm },
{ X86::CVTTSD2SIrr, X86::CVTTSD2SIrm },
{ X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm },
{ X86::CVTTSS2SIrr, X86::CVTTSS2SIrm },
{ X86::FsMOVAPDrr, X86::MOVSDrm },
{ X86::FsMOVAPSrr, X86::MOVSSrm },
@ -470,6 +556,9 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::IMUL16rri8, X86::IMUL16rmi8 },
{ X86::IMUL32rri, X86::IMUL32rmi },
{ X86::IMUL32rri8, X86::IMUL32rmi8 },
{ X86::IMUL64rr, X86::IMUL64rm },
{ X86::IMUL64rri32, X86::IMUL64rmi32 },
{ X86::IMUL64rri8, X86::IMUL64rmi8 },
{ X86::Int_CMPSDrr, X86::Int_CMPSDrm },
{ X86::Int_CMPSSrr, X86::Int_CMPSSrm },
{ X86::Int_COMISDrr, X86::Int_COMISDrm },
@ -480,20 +569,27 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm },
{ X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm },
{ X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm },
{ X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm },
{ X86::Int_CVTSD2SIrr, X86::Int_CVTSD2SIrm },
{ X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm },
{ X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm },
{ X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm },
{ X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm },
{ X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm },
{ X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm },
{ X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm },
{ X86::Int_CVTSS2SIrr, X86::Int_CVTSS2SIrm },
{ X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm },
{ X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm },
{ X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm },
{ X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm },
{ X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm },
{ X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm },
{ X86::Int_UCOMISDrr, X86::Int_UCOMISDrm },
{ X86::Int_UCOMISSrr, X86::Int_UCOMISSrm },
{ X86::MOV16rr, X86::MOV16rm },
{ X86::MOV32rr, X86::MOV32rm },
{ X86::MOV64rr, X86::MOV64rm },
{ X86::MOV8rr, X86::MOV8rm },
{ X86::MOVAPDrr, X86::MOVAPDrm },
{ X86::MOVAPSrr, X86::MOVAPSrm },
@ -509,22 +605,30 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::MOVSX16rr8, X86::MOVSX16rm8 },
{ X86::MOVSX32rr16, X86::MOVSX32rm16 },
{ X86::MOVSX32rr8, X86::MOVSX32rm8 },
{ X86::MOVSX64rr16, X86::MOVSX64rm16 },
{ X86::MOVSX64rr32, X86::MOVSX64rm32 },
{ X86::MOVSX64rr8, X86::MOVSX64rm8 },
{ X86::MOVUPDrr, X86::MOVUPDrm },
{ X86::MOVUPSrr, X86::MOVUPSrm },
{ X86::MOVZX16rr8, X86::MOVZX16rm8 },
{ X86::MOVZX32rr16, X86::MOVZX32rm16 },
{ X86::MOVZX32rr8, X86::MOVZX32rm8 },
{ X86::MOVZX64rr16, X86::MOVZX64rm16 },
{ X86::MOVZX64rr8, X86::MOVZX64rm8 },
{ X86::PSHUFDri, X86::PSHUFDmi },
{ X86::PSHUFHWri, X86::PSHUFHWmi },
{ X86::PSHUFLWri, X86::PSHUFLWmi },
{ X86::PsMOVZX64rr32, X86::PsMOVZX64rm32 },
{ X86::TEST16rr, X86::TEST16rm },
{ X86::TEST32rr, X86::TEST32rm },
{ X86::TEST64rr, X86::TEST64rm },
{ X86::TEST8rr, X86::TEST8rm },
// FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
{ X86::UCOMISDrr, X86::UCOMISDrm },
{ X86::UCOMISSrr, X86::UCOMISSrm },
{ X86::XCHG16rr, X86::XCHG16rm },
{ X86::XCHG32rr, X86::XCHG32rm },
{ X86::XCHG64rr, X86::XCHG64rm },
{ X86::XCHG8rr, X86::XCHG8rm }
};
ASSERT_SORTED(OpcodeTable);
@ -533,8 +637,10 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
} else if (i == 2) {
static const TableEntry OpcodeTable[] = {
{ X86::ADC32rr, X86::ADC32rm },
{ X86::ADC64rr, X86::ADC64rm },
{ X86::ADD16rr, X86::ADD16rm },
{ X86::ADD32rr, X86::ADD32rm },
{ X86::ADD64rr, X86::ADD64rm },
{ X86::ADD8rr, X86::ADD8rm },
{ X86::ADDPDrr, X86::ADDPDrm },
{ X86::ADDPSrr, X86::ADDPSrm },
@ -544,6 +650,7 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::ADDSUBPSrr, X86::ADDSUBPSrm },
{ X86::AND16rr, X86::AND16rm },
{ X86::AND32rr, X86::AND32rm },
{ X86::AND64rr, X86::AND64rm },
{ X86::AND8rr, X86::AND8rm },
{ X86::ANDNPDrr, X86::ANDNPDrm },
{ X86::ANDNPSrr, X86::ANDNPSrm },
@ -551,32 +658,46 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::ANDPSrr, X86::ANDPSrm },
{ X86::CMOVA16rr, X86::CMOVA16rm },
{ X86::CMOVA32rr, X86::CMOVA32rm },
{ X86::CMOVA64rr, X86::CMOVA64rm },
{ X86::CMOVAE16rr, X86::CMOVAE16rm },
{ X86::CMOVAE32rr, X86::CMOVAE32rm },
{ X86::CMOVAE64rr, X86::CMOVAE64rm },
{ X86::CMOVB16rr, X86::CMOVB16rm },
{ X86::CMOVB32rr, X86::CMOVB32rm },
{ X86::CMOVB64rr, X86::CMOVB64rm },
{ X86::CMOVBE16rr, X86::CMOVBE16rm },
{ X86::CMOVBE32rr, X86::CMOVBE32rm },
{ X86::CMOVBE64rr, X86::CMOVBE64rm },
{ X86::CMOVE16rr, X86::CMOVE16rm },
{ X86::CMOVE32rr, X86::CMOVE32rm },
{ X86::CMOVE64rr, X86::CMOVE64rm },
{ X86::CMOVG16rr, X86::CMOVG16rm },
{ X86::CMOVG32rr, X86::CMOVG32rm },
{ X86::CMOVG64rr, X86::CMOVG64rm },
{ X86::CMOVGE16rr, X86::CMOVGE16rm },
{ X86::CMOVGE32rr, X86::CMOVGE32rm },
{ X86::CMOVGE64rr, X86::CMOVGE64rm },
{ X86::CMOVL16rr, X86::CMOVL16rm },
{ X86::CMOVL32rr, X86::CMOVL32rm },
{ X86::CMOVL64rr, X86::CMOVL64rm },
{ X86::CMOVLE16rr, X86::CMOVLE16rm },
{ X86::CMOVLE32rr, X86::CMOVLE32rm },
{ X86::CMOVLE64rr, X86::CMOVLE64rm },
{ X86::CMOVNE16rr, X86::CMOVNE16rm },
{ X86::CMOVNE32rr, X86::CMOVNE32rm },
{ X86::CMOVNE64rr, X86::CMOVNE64rm },
{ X86::CMOVNP16rr, X86::CMOVNP16rm },
{ X86::CMOVNP32rr, X86::CMOVNP32rm },
{ X86::CMOVNP64rr, X86::CMOVNP64rm },
{ X86::CMOVNS16rr, X86::CMOVNS16rm },
{ X86::CMOVNS32rr, X86::CMOVNS32rm },
{ X86::CMOVNS64rr, X86::CMOVNS64rm },
{ X86::CMOVP16rr, X86::CMOVP16rm },
{ X86::CMOVP32rr, X86::CMOVP32rm },
{ X86::CMOVP64rr, X86::CMOVP64rm },
{ X86::CMOVS16rr, X86::CMOVS16rm },
{ X86::CMOVS32rr, X86::CMOVS32rm },
{ X86::CMOVS64rr, X86::CMOVS64rm },
{ X86::DIVPDrr, X86::DIVPDrm },
{ X86::DIVPSrr, X86::DIVPSrm },
{ X86::DIVSDrr, X86::DIVSDrm },
@ -597,6 +718,7 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::MULSSrr, X86::MULSSrm },
{ X86::OR16rr, X86::OR16rm },
{ X86::OR32rr, X86::OR32rm },
{ X86::OR64rr, X86::OR64rm },
{ X86::OR8rr, X86::OR8rm },
{ X86::ORPDrr, X86::ORPDrm },
{ X86::ORPSrr, X86::ORPSrm },
@ -655,6 +777,7 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::RCPPSr, X86::RCPPSm },
{ X86::RSQRTPSr, X86::RSQRTPSm },
{ X86::SBB32rr, X86::SBB32rm },
{ X86::SBB64rr, X86::SBB64rm },
{ X86::SHUFPDrri, X86::SHUFPDrmi },
{ X86::SHUFPSrri, X86::SHUFPSrmi },
{ X86::SQRTPDr, X86::SQRTPDm },
@ -663,6 +786,7 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::SQRTSSr, X86::SQRTSSm },
{ X86::SUB16rr, X86::SUB16rm },
{ X86::SUB32rr, X86::SUB32rm },
{ X86::SUB64rr, X86::SUB64rm },
{ X86::SUB8rr, X86::SUB8rm },
{ X86::SUBPDrr, X86::SUBPDrm },
{ X86::SUBPSrr, X86::SUBPSrm },
@ -675,6 +799,7 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
{ X86::UNPCKLPSrr, X86::UNPCKLPSrm },
{ X86::XOR16rr, X86::XOR16rm },
{ X86::XOR32rr, X86::XOR32rm },
{ X86::XOR64rr, X86::XOR64rm },
{ X86::XOR8rr, X86::XOR8rm },
{ X86::XORPDrr, X86::XORPDrm },
{ X86::XORPSrr, X86::XORPSrm }
@ -707,19 +832,29 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
const unsigned *X86RegisterInfo::getCalleeSaveRegs() const {
static const unsigned CalleeSaveRegs[] = {
static const unsigned CalleeSaveRegs32Bit[] = {
X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
};
return CalleeSaveRegs;
static const unsigned CalleeSaveRegs64Bit[] = {
X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
};
return Is64Bit ? CalleeSaveRegs64Bit : CalleeSaveRegs32Bit;
}
const TargetRegisterClass* const*
X86RegisterInfo::getCalleeSaveRegClasses() const {
static const TargetRegisterClass * const CalleeSaveRegClasses[] = {
static const TargetRegisterClass * const CalleeSaveRegClasses32Bit[] = {
&X86::GR32RegClass, &X86::GR32RegClass,
&X86::GR32RegClass, &X86::GR32RegClass, 0
};
return CalleeSaveRegClasses;
static const TargetRegisterClass * const CalleeSaveRegClasses64Bit[] = {
&X86::GR64RegClass, &X86::GR64RegClass,
&X86::GR64RegClass, &X86::GR64RegClass,
&X86::GR64RegClass, &X86::GR64RegClass, 0
};
return Is64Bit ? CalleeSaveRegClasses64Bit : CalleeSaveRegClasses32Bit;
}
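// Usage note: both arrays above end with a 0 sentinel instead of carrying a
// length, so callers simply walk to the terminator (sketch):
static unsigned countCalleeSaves(const unsigned *Regs) {
  unsigned N = 0;
  while (Regs[N] != 0)
    ++N;
  return N;        // 4 in 32-bit mode, 6 in 64-bit mode
}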
//===----------------------------------------------------------------------===//
@ -754,15 +889,18 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineInstr *New = 0;
if (Old->getOpcode() == X86::ADJCALLSTACKDOWN) {
New=BuildMI(X86::SUB32ri, 2, X86::ESP).addReg(X86::ESP).addImm(Amount);
New=BuildMI(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri, 1, StackPtr)
.addReg(StackPtr).addImm(Amount);
} else {
assert(Old->getOpcode() == X86::ADJCALLSTACKUP);
// factor out the amount the callee already popped.
unsigned CalleeAmt = Old->getOperand(1).getImmedValue();
Amount -= CalleeAmt;
if (Amount) {
unsigned Opc = Amount < 128 ? X86::ADD32ri8 : X86::ADD32ri;
New = BuildMI(Opc, 2, X86::ESP).addReg(X86::ESP).addImm(Amount);
unsigned Opc = (Amount < 128) ?
(Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
(Is64Bit ? X86::ADD64ri32 : X86::ADD32ri);
New = BuildMI(Opc, 1, StackPtr).addReg(StackPtr).addImm(Amount);
}
}
@ -774,9 +912,11 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// something off the stack pointer, add it back. We do this until we have
// more advanced stack pointer tracking ability.
if (unsigned CalleeAmt = I->getOperand(1).getImmedValue()) {
unsigned Opc = CalleeAmt < 128 ? X86::SUB32ri8 : X86::SUB32ri;
unsigned Opc = (CalleeAmt < 128) ?
(Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri);
MachineInstr *New =
BuildMI(Opc, 1, X86::ESP).addReg(X86::ESP).addImm(CalleeAmt);
BuildMI(Opc, 1, StackPtr).addReg(StackPtr).addImm(CalleeAmt);
MBB.insert(I, New);
}
}
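// The "< 128" tests above select the sign-extended 8-bit immediate forms
// when the adjustment fits, saving three bytes per instruction; since stack
// adjustments are non-negative here, only the top of the signed 8-bit range
// [-128, 127] needs checking.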
@ -794,19 +934,18 @@ void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II) const{
}
int FrameIndex = MI.getOperand(i).getFrameIndex();
// This must be part of a four operand memory reference. Replace the
// FrameIndex with base register with EBP. Add add an offset to the offset.
MI.getOperand(i).ChangeToRegister(hasFP(MF) ? X86::EBP : X86::ESP, false);
// FrameIndex with the base register (frame or stack pointer). Add an offset
// to the offset.
MI.getOperand(i).ChangeToRegister(hasFP(MF) ? FramePtr : StackPtr, false);
// Now add the frame object offset to the offset from EBP.
int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
MI.getOperand(i+3).getImmedValue()+4;
MI.getOperand(i+3).getImmedValue()+SlotSize;
if (!hasFP(MF))
Offset += MF.getFrameInfo()->getStackSize();
else
Offset += 4; // Skip the saved EBP
Offset += SlotSize; // Skip the saved EBP
MI.getOperand(i+3).ChangeToImmediate(Offset);
}
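// Worked restatement of the offset computed above (helper hypothetical):
static int frameIndexOffset(int ObjectOffset, int OrigDisp, int SlotSize,
                            bool HasFP, int StackSize) {
  int Off = ObjectOffset + OrigDisp + SlotSize; // skip the return address
  return HasFP ? Off + SlotSize                 // skip the saved EBP/RBP too
               : Off + StackSize;               // else rebase off ESP/RSP
}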
@ -815,7 +954,7 @@ void
X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
if (hasFP(MF)) {
// Create a frame entry for the EBP register that must be saved.
int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, -8);
int FrameIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize,SlotSize * -2);
assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
"Slot for EBP register must be last in order to be found!");
}
@ -840,9 +979,9 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
if (!hasFP(MF))
NumBytes += MFI->getMaxCallFrameSize();
// Round the size to a multiple of the alignment (don't forget the 4 byte
// Round the size to a multiple of the alignment (don't forget the 4/8 byte
// offset though).
NumBytes = ((NumBytes+4)+Align-1)/Align*Align - 4;
NumBytes = ((NumBytes+SlotSize)+Align-1)/Align*Align - SlotSize;
}
// Update frame info to pretend that this is part of the stack...
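// Worked example of the rounding above, with SlotSize = 8 and Align = 16:
//   NumBytes = 20  ->  ((20 + 8) + 15) / 16 * 16 - 8  =  32 - 8  =  24,
// so 24 bytes of locals plus the 8-byte return-address slot stay 16-byte
// aligned.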
@ -859,8 +998,10 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
MI = BuildMI(X86::CALLpcrel32, 1).addExternalSymbol("_alloca");
MBB.insert(MBBI, MI);
} else {
unsigned Opc = NumBytes < 128 ? X86::SUB32ri8 : X86::SUB32ri;
MI = BuildMI(Opc, 2, X86::ESP).addReg(X86::ESP).addImm(NumBytes);
unsigned Opc = (NumBytes < 128) ?
(Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri);
MI= BuildMI(Opc, 1, StackPtr).addReg(StackPtr).addImm(NumBytes);
MBB.insert(MBBI, MI);
}
}
@ -868,18 +1009,21 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
if (hasFP(MF)) {
// Get the offset of the stack slot for the EBP register... which is
// guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
int EBPOffset = MFI->getObjectOffset(MFI->getObjectIndexBegin())+4;
int EBPOffset = MFI->getObjectOffset(MFI->getObjectIndexBegin())+SlotSize;
// Save EBP into the appropriate stack slot...
MI = addRegOffset(BuildMI(X86::MOV32mr, 5), // mov [ESP-<offset>], EBP
X86::ESP, EBPOffset+NumBytes).addReg(X86::EBP);
// mov [ESP-<offset>], EBP
MI = addRegOffset(BuildMI(Is64Bit ? X86::MOV64mr : X86::MOV32mr, 5),
StackPtr, EBPOffset+NumBytes).addReg(FramePtr);
MBB.insert(MBBI, MI);
// Update EBP with the new base value...
if (NumBytes == 4) // mov EBP, ESP
MI = BuildMI(X86::MOV32rr, 2, X86::EBP).addReg(X86::ESP);
if (NumBytes == SlotSize) // mov EBP, ESP
MI = BuildMI(Is64Bit ? X86::MOV64rr : X86::MOV32rr, 2, FramePtr).
addReg(StackPtr);
else // lea EBP, [ESP+StackSize]
MI = addRegOffset(BuildMI(X86::LEA32r, 5, X86::EBP), X86::ESP,NumBytes-4);
MI = addRegOffset(BuildMI(Is64Bit ? X86::LEA64r : X86::LEA32r,
5, FramePtr), StackPtr, NumBytes-SlotSize);
MBB.insert(MBBI, MI);
}
@ -916,13 +1060,14 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
if (hasFP(MF)) {
// Get the offset of the stack slot for the EBP register... which is
// guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
int EBPOffset = MFI->getObjectOffset(MFI->getObjectIndexEnd()-1)+4;
int EBPOffset = MFI->getObjectOffset(MFI->getObjectIndexEnd()-1)+SlotSize;
// mov ESP, EBP
BuildMI(MBB, MBBI, X86::MOV32rr, 1, X86::ESP).addReg(X86::EBP);
BuildMI(MBB, MBBI, Is64Bit ? X86::MOV64rr : X86::MOV32rr, 1, StackPtr).
addReg(FramePtr);
// pop EBP
BuildMI(MBB, MBBI, X86::POP32r, 0, X86::EBP);
BuildMI(MBB, MBBI, Is64Bit ? X86::POP64r : X86::POP32r, 0, FramePtr);
} else {
// Get the number of bytes allocated from the FrameInfo...
unsigned NumBytes = MFI->getStackSize();
@ -932,14 +1077,15 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
// instruction, merge the two instructions.
if (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PI = prior(MBBI);
if ((PI->getOpcode() == X86::ADD32ri ||
PI->getOpcode() == X86::ADD32ri8) &&
PI->getOperand(0).getReg() == X86::ESP) {
unsigned Opc = PI->getOpcode();
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
PI->getOperand(0).getReg() == StackPtr) {
NumBytes += PI->getOperand(2).getImmedValue();
MBB.erase(PI);
} else if ((PI->getOpcode() == X86::SUB32ri ||
PI->getOpcode() == X86::SUB32ri8) &&
PI->getOperand(0).getReg() == X86::ESP) {
} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
PI->getOperand(0).getReg() == StackPtr) {
NumBytes -= PI->getOperand(2).getImmedValue();
MBB.erase(PI);
} else if (PI->getOpcode() == X86::ADJSTACKPTRri) {
@ -949,11 +1095,15 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
}
if (NumBytes > 0) {
unsigned Opc = NumBytes < 128 ? X86::ADD32ri8 : X86::ADD32ri;
BuildMI(MBB, MBBI, Opc, 2, X86::ESP).addReg(X86::ESP).addImm(NumBytes);
unsigned Opc = (NumBytes < 128) ?
(Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
(Is64Bit ? X86::ADD64ri32 : X86::ADD32ri);
BuildMI(MBB, MBBI, Opc, 2, StackPtr).addReg(StackPtr).addImm(NumBytes);
} else if ((int)NumBytes < 0) {
unsigned Opc = -NumBytes < 128 ? X86::SUB32ri8 : X86::SUB32ri;
BuildMI(MBB, MBBI, Opc, 2, X86::ESP).addReg(X86::ESP).addImm(-NumBytes);
unsigned Opc = (-NumBytes < 128) ?
(Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri);
BuildMI(MBB, MBBI, Opc, 2, StackPtr).addReg(StackPtr).addImm(-NumBytes);
}
}
}
@ -964,7 +1114,7 @@ unsigned X86RegisterInfo::getRARegister() const {
}
unsigned X86RegisterInfo::getFrameRegister(MachineFunction &MF) const {
return hasFP(MF) ? X86::EBP : X86::ESP;
return hasFP(MF) ? FramePtr : StackPtr;
}
namespace llvm {
@ -974,68 +1124,160 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::ValueType VT, bool High) {
case MVT::i8:
if (High) {
switch (Reg) {
default: return Reg;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX:
default: return 0;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::AH;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX:
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
return X86::DH;
case X86::CH: case X86::CL: case X86::CX: case X86::ECX:
case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
return X86::CH;
case X86::BH: case X86::BL: case X86::BX: case X86::EBX:
case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
return X86::BH;
}
} else {
switch (Reg) {
default: return Reg;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX:
default: return 0;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::AL;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX:
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
return X86::DL;
case X86::CH: case X86::CL: case X86::CX: case X86::ECX:
case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
return X86::CL;
case X86::BH: case X86::BL: case X86::BX: case X86::EBX:
case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
return X86::BL;
case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
return X86::SIL;
case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
return X86::DIL;
case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
return X86::BPL;
case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
return X86::SPL;
case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
return X86::R8B;
case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
return X86::R9B;
case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
return X86::R10B;
case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
return X86::R11B;
case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
return X86::R12B;
case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
return X86::R13B;
case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
return X86::R14B;
case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
return X86::R15B;
}
}
case MVT::i16:
switch (Reg) {
default: return Reg;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX:
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::AX;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX:
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
return X86::DX;
case X86::CH: case X86::CL: case X86::CX: case X86::ECX:
case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
return X86::CX;
case X86::BH: case X86::BL: case X86::BX: case X86::EBX:
case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
return X86::BX;
case X86::ESI:
case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
return X86::SI;
case X86::EDI:
case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
return X86::DI;
case X86::EBP:
case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
return X86::BP;
case X86::ESP:
case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
return X86::SP;
case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
return X86::R8W;
case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
return X86::R9W;
case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
return X86::R10W;
case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
return X86::R11W;
case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
return X86::R12W;
case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
return X86::R13W;
case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
return X86::R14W;
case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
return X86::R15W;
}
case MVT::i32:
switch (Reg) {
default: return true;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX:
default: return Reg;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::EAX;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX:
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
return X86::EDX;
case X86::CH: case X86::CL: case X86::CX: case X86::ECX:
case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
return X86::ECX;
case X86::BH: case X86::BL: case X86::BX: case X86::EBX:
case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
return X86::EBX;
case X86::SI:
case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
return X86::ESI;
case X86::DI:
case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
return X86::EDI;
case X86::BP:
case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
return X86::EBP;
case X86::SP:
case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
return X86::ESP;
case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
return X86::R8D;
case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
return X86::R9D;
case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
return X86::R10D;
case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
return X86::R11D;
case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
return X86::R12D;
case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
return X86::R13D;
case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
return X86::R14D;
case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
return X86::R15D;
}
case MVT::i64:
switch (Reg) {
default: return Reg;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::RAX;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
return X86::RDX;
case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
return X86::RCX;
case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
return X86::RBX;
case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
return X86::RSI;
case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
return X86::RDI;
case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
return X86::RBP;
case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
return X86::RSP;
case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
return X86::R8;
case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
return X86::R9;
case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
return X86::R10;
case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
return X86::R11;
case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
return X86::R12;
case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
return X86::R13;
case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
return X86::R14;
case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
return X86::R15;
}
}
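// Usage sketch (results read off the tables above): any alias of a GPR maps
// to the register of the requested width, e.g.
//   getX86SubSuperRegister(X86::RAX, MVT::i16)                -> X86::AX
//   getX86SubSuperRegister(X86::BL,  MVT::i64)                -> X86::RBX
//   getX86SubSuperRegister(X86::EAX, MVT::i8, /*High=*/true)  -> X86::AH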


@ -20,10 +20,26 @@
namespace llvm {
class Type;
class TargetInstrInfo;
class X86TargetMachine;
struct X86RegisterInfo : public X86GenRegisterInfo {
X86TargetMachine &TM;
const TargetInstrInfo &TII;
X86RegisterInfo(const TargetInstrInfo &tii);
private:
/// Is64Bit - Is the target 64-bit.
bool Is64Bit;
/// SlotSize - Stack slot size in bytes.
unsigned SlotSize;
/// StackPtr - X86 physical register used as stack ptr.
unsigned StackPtr;
/// FramePtr - X86 physical register used as frame ptr.
unsigned FramePtr;
public:
X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
/// Code Generation virtual methods...
void storeRegToStackSlot(MachineBasicBlock &MBB,


@ -23,35 +23,92 @@ let Namespace = "X86" in {
// because the register file generator is smart enough to figure out that
// AL aliases AX if we tell it that AX aliases AL (for example).
// FIXME: X86-64 has different Dwarf numbers.
// 64-bit registers, X86-64 only
def RAX : Register<"RAX">, DwarfRegNum<0>;
def RDX : Register<"RDX">, DwarfRegNum<1>;
def RCX : Register<"RCX">, DwarfRegNum<2>;
def RBX : Register<"RBX">, DwarfRegNum<3>;
def RSI : Register<"RSI">, DwarfRegNum<4>;
def RDI : Register<"RDI">, DwarfRegNum<5>;
def RBP : Register<"RBP">, DwarfRegNum<6>;
def RSP : Register<"RSP">, DwarfRegNum<7>;
def R8 : Register<"R8">, DwarfRegNum<8>;
def R9 : Register<"R9">, DwarfRegNum<9>;
def R10 : Register<"R10">, DwarfRegNum<10>;
def R11 : Register<"R11">, DwarfRegNum<11>;
def R12 : Register<"R12">, DwarfRegNum<12>;
def R13 : Register<"R13">, DwarfRegNum<13>;
def R14 : Register<"R14">, DwarfRegNum<14>;
def R15 : Register<"R15">, DwarfRegNum<15>;
// 32-bit registers
def EAX : Register<"EAX">, DwarfRegNum<0>;
def ECX : Register<"ECX">, DwarfRegNum<1>;
def EDX : Register<"EDX">, DwarfRegNum<2>;
def EBX : Register<"EBX">, DwarfRegNum<3>;
def ESP : Register<"ESP">, DwarfRegNum<4>;
def EBP : Register<"EBP">, DwarfRegNum<5>;
def ESI : Register<"ESI">, DwarfRegNum<6>;
def EDI : Register<"EDI">, DwarfRegNum<7>;
def EAX : RegisterGroup<"EAX", [RAX]>, DwarfRegNum<0>;
def ECX : RegisterGroup<"ECX", [RCX]>, DwarfRegNum<1>;
def EDX : RegisterGroup<"EDX", [RDX]>, DwarfRegNum<2>;
def EBX : RegisterGroup<"EBX", [RBX]>, DwarfRegNum<3>;
def ESP : RegisterGroup<"ESP", [RSP]>, DwarfRegNum<4>;
def EBP : RegisterGroup<"EBP", [RBP]>, DwarfRegNum<5>;
def ESI : RegisterGroup<"ESI", [RSI]>, DwarfRegNum<6>;
def EDI : RegisterGroup<"EDI", [RDI]>, DwarfRegNum<7>;
// X86-64 only
def R8D : RegisterGroup<"R8D", [R8]>, DwarfRegNum<8>;
def R9D : RegisterGroup<"R9D", [R9]>, DwarfRegNum<9>;
def R10D : RegisterGroup<"R10D", [R10]>, DwarfRegNum<10>;
def R11D : RegisterGroup<"R11D", [R11]>, DwarfRegNum<11>;
def R12D : RegisterGroup<"R12D", [R12]>, DwarfRegNum<12>;
def R13D : RegisterGroup<"R13D", [R13]>, DwarfRegNum<13>;
def R14D : RegisterGroup<"R14D", [R14]>, DwarfRegNum<14>;
def R15D : RegisterGroup<"R15D", [R15]>, DwarfRegNum<15>;
// 16-bit registers
def AX : RegisterGroup<"AX", [EAX]>, DwarfRegNum<0>;
def CX : RegisterGroup<"CX", [ECX]>, DwarfRegNum<1>;
def DX : RegisterGroup<"DX", [EDX]>, DwarfRegNum<2>;
def BX : RegisterGroup<"BX", [EBX]>, DwarfRegNum<3>;
def SP : RegisterGroup<"SP", [ESP]>, DwarfRegNum<4>;
def BP : RegisterGroup<"BP", [EBP]>, DwarfRegNum<5>;
def SI : RegisterGroup<"SI", [ESI]>, DwarfRegNum<6>;
def DI : RegisterGroup<"DI", [EDI]>, DwarfRegNum<7>;
def AX : RegisterGroup<"AX", [EAX,RAX]>, DwarfRegNum<0>;
def CX : RegisterGroup<"CX", [ECX,RCX]>, DwarfRegNum<1>;
def DX : RegisterGroup<"DX", [EDX,RDX]>, DwarfRegNum<2>;
def BX : RegisterGroup<"BX", [EBX,RBX]>, DwarfRegNum<3>;
def SP : RegisterGroup<"SP", [ESP,RSP]>, DwarfRegNum<4>;
def BP : RegisterGroup<"BP", [EBP,RBP]>, DwarfRegNum<5>;
def SI : RegisterGroup<"SI", [ESI,RSI]>, DwarfRegNum<6>;
def DI : RegisterGroup<"DI", [EDI,RDI]>, DwarfRegNum<7>;
// X86-64 only
def R8W : RegisterGroup<"R8W", [R8D,R8]>, DwarfRegNum<8>;
def R9W : RegisterGroup<"R9W", [R9D,R9]>, DwarfRegNum<9>;
def R10W : RegisterGroup<"R10W", [R10D,R10]>, DwarfRegNum<10>;
def R11W : RegisterGroup<"R11W", [R11D,R11]>, DwarfRegNum<11>;
def R12W : RegisterGroup<"R12W", [R12D,R12]>, DwarfRegNum<12>;
def R13W : RegisterGroup<"R13W", [R13D,R13]>, DwarfRegNum<13>;
def R14W : RegisterGroup<"R14W", [R14D,R14]>, DwarfRegNum<14>;
def R15W : RegisterGroup<"R15W", [R15D,R15]>, DwarfRegNum<15>;
// 8-bit registers
def AL : RegisterGroup<"AL", [AX,EAX]>, DwarfRegNum<0>;
def CL : RegisterGroup<"CL", [CX,ECX]>, DwarfRegNum<1>;
def DL : RegisterGroup<"DL", [DX,EDX]>, DwarfRegNum<2>;
def BL : RegisterGroup<"BL", [BX,EBX]>, DwarfRegNum<3>;
def AH : RegisterGroup<"AH", [AX,EAX]>, DwarfRegNum<0>;
def CH : RegisterGroup<"CH", [CX,ECX]>, DwarfRegNum<1>;
def DH : RegisterGroup<"DH", [DX,EDX]>, DwarfRegNum<2>;
def BH : RegisterGroup<"BH", [BX,EBX]>, DwarfRegNum<3>;
// Low registers
def AL : RegisterGroup<"AL", [AX,EAX,RAX]>, DwarfRegNum<0>;
def CL : RegisterGroup<"CL", [CX,ECX,RCX]>, DwarfRegNum<1>;
def DL : RegisterGroup<"DL", [DX,EDX,RDX]>, DwarfRegNum<2>;
def BL : RegisterGroup<"BL", [BX,EBX,RBX]>, DwarfRegNum<3>;
// X86-64 only
def SIL : RegisterGroup<"SIL", [SI,ESI,RSI]>, DwarfRegNum<4>;
def DIL : RegisterGroup<"DIL", [DI,EDI,RDI]>, DwarfRegNum<5>;
def BPL : RegisterGroup<"BPL", [BP,EBP,RBP]>, DwarfRegNum<6>;
def SPL : RegisterGroup<"SPL", [SP,ESP,RSP]>, DwarfRegNum<7>;
def R8B : RegisterGroup<"R8B", [R8W,R8D,R8]>, DwarfRegNum<8>;
def R9B : RegisterGroup<"R9B", [R9W,R9D,R9]>, DwarfRegNum<9>;
def R10B : RegisterGroup<"R10B", [R10W,R10D,R10]>, DwarfRegNum<10>;
def R11B : RegisterGroup<"R11B", [R11W,R11D,R11]>, DwarfRegNum<11>;
def R12B : RegisterGroup<"R12B", [R12W,R12D,R12]>, DwarfRegNum<12>;
def R13B : RegisterGroup<"R13B", [R13W,R13D,R13]>, DwarfRegNum<13>;
def R14B : RegisterGroup<"R14B", [R14W,R14D,R14]>, DwarfRegNum<14>;
def R15B : RegisterGroup<"R15B", [R15W,R15D,R15]>, DwarfRegNum<15>;
// High registers X86-32 only
def AH : RegisterGroup<"AH", [AX,EAX,RAX]>, DwarfRegNum<0>;
def CH : RegisterGroup<"CH", [CX,ECX,RCX]>, DwarfRegNum<1>;
def DH : RegisterGroup<"DH", [DX,EDX,RDX]>, DwarfRegNum<2>;
def BH : RegisterGroup<"BH", [BX,EBX,RBX]>, DwarfRegNum<3>;
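// --- Editor's sketch, not part of this file ---------------------------------
// TableGen lowers the Register / RegisterGroup defs above into alias tables,
// so the allocator knows that writing AL also touches AX, EAX, and RAX.
// A minimal consumer, assuming this era's MRegisterInfo::getAliasSet()
// (which returns a 0-terminated array of aliased register numbers):
//
//   static bool anyAliasLive(const MRegisterInfo &MRI, unsigned Reg,
//                            const bool *IsRegLive) {
//     for (const unsigned *AS = MRI.getAliasSet(Reg); AS && *AS; ++AS)
//       if (IsRegLive[*AS])   // e.g. AL is unusable while RAX is live.
//         return true;
//     return false;
//   }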
// MMX Registers. These are actually aliased to ST0 .. ST7
def MM0 : Register<"MM0">, DwarfRegNum<29>;
@ -73,14 +130,24 @@ let Namespace = "X86" in {
def FP6 : Register<"FP6">, DwarfRegNum<-1>;
// XMM Registers, used by the various SSE instruction set extensions
def XMM0: Register<"XMM0">, DwarfRegNum<21>;
def XMM1: Register<"XMM1">, DwarfRegNum<22>;
def XMM2: Register<"XMM2">, DwarfRegNum<23>;
def XMM3: Register<"XMM3">, DwarfRegNum<24>;
def XMM4: Register<"XMM4">, DwarfRegNum<25>;
def XMM5: Register<"XMM5">, DwarfRegNum<26>;
def XMM6: Register<"XMM6">, DwarfRegNum<27>;
def XMM7: Register<"XMM7">, DwarfRegNum<28>;
def XMM0: Register<"XMM0">, DwarfRegNum<17>;
def XMM1: Register<"XMM1">, DwarfRegNum<18>;
def XMM2: Register<"XMM2">, DwarfRegNum<19>;
def XMM3: Register<"XMM3">, DwarfRegNum<20>;
def XMM4: Register<"XMM4">, DwarfRegNum<21>;
def XMM5: Register<"XMM5">, DwarfRegNum<22>;
def XMM6: Register<"XMM6">, DwarfRegNum<23>;
def XMM7: Register<"XMM7">, DwarfRegNum<24>;
// X86-64 only
def XMM8: Register<"XMM8">, DwarfRegNum<25>;
def XMM9: Register<"XMM9">, DwarfRegNum<26>;
def XMM10: Register<"XMM10">, DwarfRegNum<27>;
def XMM11: Register<"XMM11">, DwarfRegNum<28>;
def XMM12: Register<"XMM12">, DwarfRegNum<29>;
def XMM13: Register<"XMM13">, DwarfRegNum<30>;
def XMM14: Register<"XMM14">, DwarfRegNum<31>;
def XMM15: Register<"XMM15">, DwarfRegNum<32>;
// Floating point stack registers
def ST0 : Register<"ST(0)">, DwarfRegNum<11>;
@ -99,52 +166,247 @@ let Namespace = "X86" in {
// implicitly defined to be the register allocation order.
//
// List AL,CL,DL before AH,CH,DH, as X86 processors often suffer from false
// dependences between upper and lower parts of the register. BL and BH are
// last because they are call clobbered. Both Athlon and P4 chips suffer this
// issue.
def GR8 : RegisterClass<"X86", [i8], 8, [AL, CL, DL, AH, CH, DH, BL, BH]>;
def GR16 : RegisterClass<"X86", [i16], 16, [AX, CX, DX, SI, DI, BX, BP, SP]> {
// List call-clobbered registers before callee-save registers. RBX, RBP (and
// R12, R13, R14, and R15 for X86-64) are callee-save registers.
// In 64-bit mode there are 12 additional i8 registers: SIL, DIL, BPL, SPL,
// and R8B ... R15B.
// FIXME: Allow AH, CH, DH, BH in 64-bit mode for non-REX instructions.
def GR8 : RegisterClass<"X86", [i8], 8,
[AL, CL, DL, BL, AH, CH, DH, BH, SIL, DIL, BPL, SPL,
R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]> {
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// Does the function dedicate RBP / EBP to being a frame ptr?
// If so, don't allocate SPL or BPL.
static const unsigned X86_GR8_AO_64_fp[] =
{X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
X86::R8B, X86::R9B, X86::R10B, X86::R11B,
X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B};
// If not, just don't allocate SPL.
static const unsigned X86_GR8_AO_64[] =
{X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
X86::R8B, X86::R9B, X86::R10B, X86::R11B,
X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B, X86::BPL};
// In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
static const unsigned X86_GR8_AO_32[] =
{X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH};
GR8Class::iterator
GR8Class::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (!Subtarget.is64Bit())
return X86_GR8_AO_32;
else if (hasFP(MF))
return X86_GR8_AO_64_fp;
else
return X86_GR8_AO_64;
}
GR8Class::iterator
GR8Class::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (!Subtarget.is64Bit())
return X86_GR8_AO_32 + (sizeof(X86_GR8_AO_32) / sizeof(unsigned));
else if (hasFP(MF))
return X86_GR8_AO_64_fp + (sizeof(X86_GR8_AO_64_fp) / sizeof(unsigned));
else
return X86_GR8_AO_64 + (sizeof(X86_GR8_AO_64) / sizeof(unsigned));
}
}];
}
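// --- Editor's sketch, not part of this file ---------------------------------
// The MethodBodies above give GR8 a subtarget-dependent allocation order.
// An allocator consumes it by walking the [begin, end) range in preference
// order; roughly (era API, with const unsigned* iterators):
//
//   static unsigned pickReg(const TargetRegisterClass &RC,
//                           const MachineFunction &MF,
//                           const bool *IsRegFree) {
//     for (TargetRegisterClass::iterator I = RC.allocation_order_begin(MF),
//            E = RC.allocation_order_end(MF); I != E; ++I)
//       if (IsRegFree[*I])
//         return *I;   // First free register in preference order wins.
//     return 0;        // Nothing free: the allocator must spill.
//   }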
def GR16 : RegisterClass<"X86", [i16], 16,
[AX, CX, DX, SI, DI, BX, BP, SP,
R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]> {
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// Does the function dedicate RBP / EBP to being a frame ptr?
// If so, don't allocate SP or BP.
static const unsigned X86_GR16_AO_64_fp[] =
{X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
X86::R8W, X86::R9W, X86::R10W, X86::R11W,
X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W};
static const unsigned X86_GR16_AO_32_fp[] =
{X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX};
// If not, just don't allocate SP.
static const unsigned X86_GR16_AO_64[] =
{X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
X86::R8W, X86::R9W, X86::R10W, X86::R11W,
X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP};
static const unsigned X86_GR16_AO_32[] =
{X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP};
GR16Class::iterator
GR16Class::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (Subtarget.is64Bit()) {
if (hasFP(MF))
return X86_GR16_AO_64_fp;
else
return X86_GR16_AO_64;
} else {
if (hasFP(MF))
return X86_GR16_AO_32_fp;
else
return X86_GR16_AO_32;
}
}
GR16Class::iterator
GR16Class::allocation_order_end(const MachineFunction &MF) const {
if (hasFP(MF)) // Does the function dedicate EBP to being a frame ptr?
return end()-2; // If so, don't allocate SP or BP
else
return end()-1; // If not, just don't allocate SP
const TargetMachine &TM = MF.getTarget();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (Subtarget.is64Bit()) {
if (hasFP(MF))
return X86_GR16_AO_64_fp+(sizeof(X86_GR16_AO_64_fp)/sizeof(unsigned));
else
return X86_GR16_AO_64 + (sizeof(X86_GR16_AO_64) / sizeof(unsigned));
} else {
if (hasFP(MF))
return X86_GR16_AO_32_fp+(sizeof(X86_GR16_AO_32_fp)/sizeof(unsigned));
else
return X86_GR16_AO_32 + (sizeof(X86_GR16_AO_32) / sizeof(unsigned));
}
}
}];
}
def GR32 : RegisterClass<"X86", [i32], 32,
[EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
[EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]> {
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// Does the function dedicate RBP / EBP to being a frame ptr?
// If so, don't allocate ESP or EBP.
static const unsigned X86_GR32_AO_64_fp[] =
{X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
X86::R8D, X86::R9D, X86::R10D, X86::R11D,
X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D};
static const unsigned X86_GR32_AO_32_fp[] =
{X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX};
// If not, just don't allocate ESP.
static const unsigned X86_GR32_AO_64[] =
{X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
X86::R8D, X86::R9D, X86::R10D, X86::R11D,
X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP};
static const unsigned X86_GR32_AO_32[] =
{X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP};
GR32Class::iterator
GR32Class::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (Subtarget.is64Bit()) {
if (hasFP(MF))
return X86_GR32_AO_64_fp;
else
return X86_GR32_AO_64;
} else {
if (hasFP(MF))
return X86_GR32_AO_32_fp;
else
return X86_GR32_AO_32;
}
}
GR32Class::iterator
GR32Class::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (Subtarget.is64Bit()) {
if (hasFP(MF))
return X86_GR32_AO_64_fp+(sizeof(X86_GR32_AO_64_fp)/sizeof(unsigned));
else
return X86_GR32_AO_64 + (sizeof(X86_GR32_AO_64) / sizeof(unsigned));
} else {
if (hasFP(MF))
return X86_GR32_AO_32_fp+(sizeof(X86_GR32_AO_32_fp)/sizeof(unsigned));
else
return X86_GR32_AO_32 + (sizeof(X86_GR32_AO_32) / sizeof(unsigned));
}
}
}];
}
def GR64 : RegisterClass<"X86", [i64], 64,
[RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
RBX, R14, R15, R12, R13, RBP, RSP]> {
let MethodProtos = [{
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
GR32Class::iterator
GR32Class::allocation_order_end(const MachineFunction &MF) const {
if (hasFP(MF)) // Does the function dedicate EBP to being a frame ptr?
return end()-2; // If so, don't allocate ESP or EBP
GR64Class::iterator
GR64Class::allocation_order_end(const MachineFunction &MF) const {
if (hasFP(MF)) // Does the function dedicate RBP to being a frame ptr?
return end()-2; // If so, don't allocate RSP or RBP
else
return end()-1; // If not, just don't allocate ESP
return end()-1; // If not, just don't allocate RSP
}
}];
}
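// Editor's note: the end()-N idiom above works because the allocation order
// is a static array and end() points one past its last element. GR64 lists
// RBP and RSP last, so trimming the range reserves them with no extra table:
//   hasFP(MF) ? end()-2   // drop both RSP and RBP (RBP is the frame ptr)
//             : end()-1   // drop only RSP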
// GR16, GR32 subclasses which contain only the registers that have 8-bit
// sub-registers (AX/EAX, CX/ECX, DX/EDX, BX/EBX). These should only be
// used in 32-bit mode.
def GR16_ : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]>;
def GR32_ : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]>;
// Scalar SSE2 floating point registers.
def FR32 : RegisterClass<"X86", [f32], 32,
[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>;
[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11,
XMM12, XMM13, XMM14, XMM15]> {
let MethodProtos = [{
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
FR32Class::iterator
FR32Class::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (!Subtarget.is64Bit())
return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
else
return end();
}
}];
}
def FR64 : RegisterClass<"X86", [f64], 64,
[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>;
[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11,
XMM12, XMM13, XMM14, XMM15]> {
let MethodProtos = [{
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
FR64Class::iterator
FR64Class::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (!Subtarget.is64Bit())
return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
else
return end();
}
}];
}
// FIXME: This sets up the floating point register files as though they are f64
// values, though they really are f80 values. This will cause us to spill
@ -174,4 +436,21 @@ def RST : RegisterClass<"X86", [f64], 32,
def VR64 : RegisterClass<"X86", [v8i8, v4i16, v2i32], 64,
[MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>;
[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11,
XMM12, XMM13, XMM14, XMM15]> {
let MethodProtos = [{
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
VR128Class::iterator
VR128Class::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
if (!Subtarget.is64Bit())
return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
else
return end();
}
}];
}


@ -12,9 +12,10 @@
//===----------------------------------------------------------------------===//
#include "X86Subtarget.h"
#include "X86GenSubtarget.inc"
#include "llvm/Module.h"
#include "llvm/Support/CommandLine.h"
#include "X86GenSubtarget.inc"
#include <iostream>
using namespace llvm;
cl::opt<X86Subtarget::AsmWriterFlavorTy>
@ -29,7 +30,18 @@ AsmWriterFlavor("x86-asm-syntax", cl::init(X86Subtarget::unset),
/// specified arguments. If we can't run cpuid on the host, return true.
static bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
unsigned *rECX, unsigned *rEDX) {
#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
#if defined(__x86_64__)
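// Editor's note: cpuid clobbers EBX, which GCC reserves as the PIC base
// register on i386, so it cannot appear in the clobber list there; this
// 64-bit variant follows the same pattern, preserving RBX around the cpuid
// and returning the EBX result through ESI (the "=S" constraint).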
asm ("pushq\t%%rbx\n\t"
"cpuid\n\t"
"movl\t%%ebx, %%esi\n\t"
"popq\t%%rbx"
: "=a" (*rEAX),
"=S" (*rEBX),
"=c" (*rECX),
"=d" (*rEDX)
: "a" (value));
return false;
#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
#if defined(__GNUC__)
asm ("pushl\t%%ebx\n\t"
"cpuid\n\t"
@ -99,8 +111,8 @@ static const char *GetCurrentX86CPU() {
case 9:
case 13: return "pentium-m";
case 14: return "yonah";
default:
return (Model > 14) ? "yonah" : "i686";
case 15: return "core2";
default: return "i686";
}
case 15: {
switch (Model) {
@ -154,14 +166,16 @@ static const char *GetCurrentX86CPU() {
}
}
X86Subtarget::X86Subtarget(const Module &M, const std::string &FS) {
stackAlignment = 8;
// FIXME: this is a known good value for Yonah. Not sure about others.
MinRepStrSizeThreshold = 128;
X86SSELevel = NoMMXSSE;
X863DNowLevel = NoThreeDNow;
AsmFlavor = AsmWriterFlavor;
Is64Bit = false;
X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit)
: AsmFlavor(AsmWriterFlavor)
, X86SSELevel(NoMMXSSE)
, X863DNowLevel(NoThreeDNow)
, HasX86_64(false)
, stackAlignment(8)
// FIXME: this is a known good value for Yonah. How about others?
, MinRepStrSizeThreshold(128)
, Is64Bit(is64Bit)
, TargetType(isELF) { // Default to ELF unless otherwise specified.
// Determine default and user specified characteristics
std::string CPU = GetCurrentX86CPU();
@ -169,9 +183,12 @@ X86Subtarget::X86Subtarget(const Module &M, const std::string &FS) {
// Parse features string.
ParseSubtargetFeatures(FS, CPU);
// Default to ELF unless otherwise specified.
TargetType = isELF;
if (Is64Bit && !HasX86_64) {
std::cerr << "Warning: Generation of 64-bit code for a 32-bit processor "
"requested.\n";
HasX86_64 = true;
}
// Set the boolean corresponding to the current target triple, or the default
// if one cannot be determined, to true.
const std::string& TT = M.getTargetTriple();

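// --- Editor's sketch, not part of this commit -------------------------------
// HasX86_64 is set by the feature-string parser above, but the underlying
// host probe rests on the GetCpuIDAndInfo helper: CPUID leaf 0x80000001
// reports long mode (AMD64/EM64T) in bit 29 of EDX. A hypothetical helper:
static bool HostHasLongMode() {
  unsigned EAX, EBX, ECX, EDX;
  if (GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX))
    return false;             // cpuid is unavailable on this host.
  return (EDX >> 29) & 1;     // The "LM" (long mode) capability bit.
}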

@ -44,9 +44,9 @@ protected:
/// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
X863DNowEnum X863DNowLevel;
/// Is64Bit - True if the processor supports EM64T.
bool Is64Bit;
/// HasX86_64 - True if the processor supports X86-64 instructions.
bool HasX86_64;
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
@ -55,6 +55,11 @@ protected:
/// Min. memset / memcpy size that is turned into rep/movs, rep/stos ops.
unsigned MinRepStrSizeThreshold;
private:
/// Is64Bit - True if the processor supports 64-bit instructions and the
/// module pointer size is 64 bits.
bool Is64Bit;
public:
enum {
isELF, isCygwin, isDarwin, isWindows
@ -63,7 +68,7 @@ public:
/// This constructor initializes the data members to match that
/// of the specified module.
///
X86Subtarget(const Module &M, const std::string &FS);
X86Subtarget(const Module &M, const std::string &FS, bool is64Bit);
/// getStackAlignment - Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every


@ -26,13 +26,16 @@ X86TargetAsmInfo::X86TargetAsmInfo(const X86TargetMachine &TM) {
case X86Subtarget::isDarwin:
AlignmentIsInBytes = false;
GlobalPrefix = "_";
Data64bitsDirective = 0; // we can't emit a 64-bit unit
if (!Subtarget->is64Bit())
Data64bitsDirective = 0; // we can't emit a 64-bit unit
ZeroDirective = "\t.space\t"; // ".space N" emits N zeros.
PrivateGlobalPrefix = "L"; // Marker for constant pool idxs
ConstantPoolSection = "\t.const\n";
JumpTableDataSection = "\t.const\n"; // FIXME: depends on PIC mode
FourByteConstantSection = "\t.literal4\n";
EightByteConstantSection = "\t.literal8\n";
if (Subtarget->is64Bit())
SixteenByteConstantSection = "\t.literal16\n";
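// Editor's note: .literal16 holds 16-byte constants (e.g. SSE vector
// immediates); this commit only enables the section for the 64-bit target.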
LCOMMDirective = "\t.lcomm\t";
COMMDirectiveTakesAlignment = false;
HasDotTypeDotSizeDirective = false;


@ -33,22 +33,31 @@ int X86TargetMachineModule = 0;
namespace {
// Register the target.
RegisterTarget<X86TargetMachine> X("x86", " IA-32 (Pentium and above)");
RegisterTarget<X86_32TargetMachine>
X("x86", " 32-bit X86: Pentium-Pro and above");
RegisterTarget<X86_64TargetMachine>
Y("x86-64", " 64-bit X86: EM64T and AMD64");
}
const TargetAsmInfo *X86TargetMachine::createTargetAsmInfo() const {
return new X86TargetAsmInfo(*this);
}
unsigned X86TargetMachine::getJITMatchQuality() {
unsigned X86_32TargetMachine::getJITMatchQuality() {
#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
return 10;
#else
return 0;
#endif
return 0;
}
unsigned X86TargetMachine::getModuleMatchQuality(const Module &M) {
unsigned X86_64TargetMachine::getJITMatchQuality() {
#if defined(__x86_64__)
return 10;
#endif
return 0;
}
unsigned X86_32TargetMachine::getModuleMatchQuality(const Module &M) {
// We strongly match "i[3-9]86-*".
std::string TT = M.getTargetTriple();
if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' &&
@ -65,18 +74,55 @@ unsigned X86TargetMachine::getModuleMatchQuality(const Module &M) {
return getJITMatchQuality()/2;
}
unsigned X86_64TargetMachine::getModuleMatchQuality(const Module &M) {
// We strongly match "x86_64-*".
std::string TT = M.getTargetTriple();
if (TT.size() >= 7 && TT[0] == 'x' && TT[1] == '8' && TT[2] == '6' &&
TT[3] == '_' && TT[4] == '6' && TT[5] == '4' && TT[6] == '-')
return 20;
if (M.getEndianness() == Module::LittleEndian &&
M.getPointerSize() == Module::Pointer64)
return 10; // Weak match
else if (M.getEndianness() != Module::AnyEndianness ||
M.getPointerSize() != Module::AnyPointerSize)
return 0; // Match for some other target
return getJITMatchQuality()/2;
}
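// Editor's note: e.g. a triple of "x86_64-apple-darwin8" passes the prefix
// test above and scores 20; a triple-less little-endian module with 64-bit
// pointers only earns the weak score of 10.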
X86_32TargetMachine::X86_32TargetMachine(const Module &M, const std::string &FS)
: X86TargetMachine(M, FS, false) {
}
X86_64TargetMachine::X86_64TargetMachine(const Module &M, const std::string &FS)
: X86TargetMachine(M, FS, true) {
}
/// X86TargetMachine ctor - Create an ILP32 architecture model by default,
/// or a 64-bit model when is64Bit is set.
///
X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS)
: Subtarget(M, FS), DataLayout("e-p:32:32-d:32-l:32"),
X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS, bool is64Bit)
: Subtarget(M, FS, is64Bit),
DataLayout(Subtarget.is64Bit() ?
std::string("e-p:64:64-d:32-l:32") :
std::string("e-p:32:32-d:32-l:32")),
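// Editor's gloss, assuming the era's TargetData string syntax: "e" is
// little-endian, "p:64:64" / "p:32:32" give pointer size and ABI
// alignment, and the trailing "d:32-l:32" alignments are shared by
// both layouts here.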
FrameInfo(TargetFrameInfo::StackGrowsDown,
Subtarget.getStackAlignment(), -4),
Subtarget.getStackAlignment(), Subtarget.is64Bit() ? -8 : -4),
InstrInfo(*this), JITInfo(*this), TLInfo(*this) {
if (getRelocationModel() == Reloc::Default)
if (Subtarget.isTargetDarwin())
setRelocationModel(Reloc::DynamicNoPIC);
else
setRelocationModel(Reloc::PIC_);
if (Subtarget.is64Bit()) {
// No DynamicNoPIC support under X86-64.
if (getRelocationModel() == Reloc::DynamicNoPIC)
setRelocationModel(Reloc::PIC_);
// Default X86-64 code model is small.
if (getCodeModel() == CodeModel::Default)
setCodeModel(CodeModel::Small);
}
}
//===----------------------------------------------------------------------===//


@ -37,7 +37,7 @@ protected:
virtual const TargetAsmInfo *createTargetAsmInfo() const;
public:
X86TargetMachine(const Module &M, const std::string &FS);
X86TargetMachine(const Module &M, const std::string &FS, bool is64Bit);
virtual const X86InstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
@ -54,6 +54,7 @@ public:
static unsigned getModuleMatchQuality(const Module &M);
static unsigned getJITMatchQuality();
// Set up the pass pipeline.
virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);
virtual bool addPostRegAlloc(FunctionPassManager &PM, bool Fast);
@ -64,6 +65,27 @@ public:
virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast,
MachineCodeEmitter &MCE);
};
/// X86_32TargetMachine - X86 32-bit target machine.
///
class X86_32TargetMachine : public X86TargetMachine {
public:
X86_32TargetMachine(const Module &M, const std::string &FS);
static unsigned getJITMatchQuality();
static unsigned getModuleMatchQuality(const Module &M);
};
/// X86_64TargetMachine - X86 64-bit target machine.
///
class X86_64TargetMachine : public X86TargetMachine {
public:
X86_64TargetMachine(const Module &M, const std::string &FS);
static unsigned getJITMatchQuality();
static unsigned getModuleMatchQuality(const Module &M);
};
} // End llvm namespace
#endif