ARM backend contribution from Apple.

llvm-svn: 33353
Evan Cheng 2007-01-19 07:51:42 +00:00
parent 28c5b8618a
commit 10043e215b
32 changed files with 8730 additions and 2003 deletions

View File: ARM.h

@@ -20,43 +20,77 @@
#include <cassert>
namespace llvm {
// Enums corresponding to ARM condition codes
namespace ARMCC {
enum CondCodes {
EQ,
NE,
CS,
CC,
MI,
PL,
VS,
VC,
HI,
LS,
GE,
LT,
GT,
LE,
AL
};
class ARMTargetMachine;
class FunctionPass;
// Enums corresponding to ARM condition codes
namespace ARMCC {
enum CondCodes {
EQ,
NE,
HS,
LO,
MI,
PL,
VS,
VC,
HI,
LS,
GE,
LT,
GT,
LE,
AL
};
inline static CondCodes getOppositeCondition(CondCodes CC){
switch (CC) {
default: assert(0 && "Unknown condition code");
case EQ: return NE;
case NE: return EQ;
case HS: return LO;
case LO: return HS;
case MI: return PL;
case PL: return MI;
case VS: return VC;
case VC: return VS;
case HI: return LS;
case LS: return HI;
case GE: return LT;
case LT: return GE;
case GT: return LE;
case LE: return GT;
}
}
}
namespace ARMShift {
enum ShiftTypes {
LSL,
LSR,
ASR,
ROR,
RRX
};
inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
switch (CC) {
default: assert(0 && "Unknown condition code");
case ARMCC::EQ: return "eq";
case ARMCC::NE: return "ne";
case ARMCC::HS: return "hs";
case ARMCC::LO: return "lo";
case ARMCC::MI: return "mi";
case ARMCC::PL: return "pl";
case ARMCC::VS: return "vs";
case ARMCC::VC: return "vc";
case ARMCC::HI: return "hi";
case ARMCC::LS: return "ls";
case ARMCC::GE: return "ge";
case ARMCC::LT: return "lt";
case ARMCC::GT: return "gt";
case ARMCC::LE: return "le";
case ARMCC::AL: return "al";
}
}
class FunctionPass;
class TargetMachine;
FunctionPass *createARMISelDag(ARMTargetMachine &TM);
FunctionPass *createARMCodePrinterPass(std::ostream &O, ARMTargetMachine &TM);
FunctionPass *createARMLoadStoreOptimizationPass();
FunctionPass *createARMConstantIslandPass();
FunctionPass *createARMISelDag(TargetMachine &TM);
FunctionPass *createARMCodePrinterPass(std::ostream &OS, TargetMachine &TM);
FunctionPass *createARMFixMulPass();
} // end namespace llvm;
// Defines symbolic names for ARM registers. This defines a mapping from
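As a quick illustration of the condition-code helpers above (a minimal sketch, not part of the commit), this is exactly the flip that ReverseBranchCondition in ARMInstrInfo.cpp performs on a branch predicate:

  ARMCC::CondCodes CC  = ARMCC::GE;
  ARMCC::CondCodes Opp = ARMCC::getOppositeCondition(CC); // ARMCC::LT
  const char *Name     = ARMCondCodeToString(Opp);        // "lt"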

View File: ARM.td

@@ -17,6 +17,73 @@
include "../Target.td"
//===----------------------------------------------------------------------===//
// ARM Subtarget features.
//
def ArchV4T : SubtargetFeature<"v4t", "ARMArchVersion", "V4T",
"ARM v4T">;
def ArchV5T : SubtargetFeature<"v5t", "ARMArchVersion", "V5T",
"ARM v5T">;
def ArchV5TE : SubtargetFeature<"v5te", "ARMArchVersion", "V5TE",
"ARM v5TE, v5TEj, v5TExp">;
def ArchV6 : SubtargetFeature<"v6", "ARMArchVersion", "V6",
"ARM v6">;
def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFP2", "true",
"Enable VFP2 instructions ">;
//===----------------------------------------------------------------------===//
// ARM Processors supported.
//
class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
// V4 Processors.
def : Proc<"generic", []>;
def : Proc<"arm8", []>;
def : Proc<"arm810", []>;
def : Proc<"strongarm", []>;
def : Proc<"strongarm110", []>;
def : Proc<"strongarm1100", []>;
def : Proc<"strongarm1110", []>;
// V4T Processors.
def : Proc<"arm7tdmi", [ArchV4T]>;
def : Proc<"arm7tdmi-s", [ArchV4T]>;
def : Proc<"arm710t", [ArchV4T]>;
def : Proc<"arm720t", [ArchV4T]>;
def : Proc<"arm9", [ArchV4T]>;
def : Proc<"arm9tdmi", [ArchV4T]>;
def : Proc<"arm920", [ArchV4T]>;
def : Proc<"arm920t", [ArchV4T]>;
def : Proc<"arm922t", [ArchV4T]>;
def : Proc<"arm940t", [ArchV4T]>;
def : Proc<"ep9312", [ArchV4T]>;
// V5T Processors.
def : Proc<"arm10tdmi", [ArchV5T]>;
def : Proc<"arm1020t", [ArchV5T]>;
// V5TE Processors.
def : Proc<"arm9e", [ArchV5TE]>;
def : Proc<"arm946e-s", [ArchV5TE]>;
def : Proc<"arm966e-s", [ArchV5TE]>;
def : Proc<"arm968e-s", [ArchV5TE]>;
def : Proc<"arm10e", [ArchV5TE]>;
def : Proc<"arm1020e", [ArchV5TE]>;
def : Proc<"arm1022e", [ArchV5TE]>;
def : Proc<"xscale", [ArchV5TE]>;
def : Proc<"iwmmxt", [ArchV5TE]>;
// V6 Processors.
def : Proc<"arm1136j-s", [ArchV6]>;
def : Proc<"arm1136jf-s", [ArchV6, FeatureVFP2]>;
def : Proc<"arm1176jz-s", [ArchV6]>;
def : Proc<"arm1176jzf-s", [ArchV6, FeatureVFP2]>;
def : Proc<"mpcorenovfp", [ArchV6]>;
def : Proc<"mpcore", [ArchV6, FeatureVFP2]>;
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -31,8 +98,14 @@ include "ARMInstrInfo.td"
def ARMInstrInfo : InstrInfo {
// Define how we want to layout our target-specific information field.
let TSFlagsFields = [];
let TSFlagsShifts = [];
let TSFlagsFields = ["AddrModeBits",
"SizeFlag",
"IndexModeBits",
"Opcode"];
let TSFlagsShifts = [0,
4,
7,
9];
}
//===----------------------------------------------------------------------===//

View File: ARMAddressingModes.h

@@ -0,0 +1,394 @@
//===- ARMAddressingModes.h - ARM Addressing Modes --------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Chris Lattner and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the ARM addressing mode implementation stuff.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
namespace llvm {
/// ARM_AM - ARM Addressing Mode Stuff
namespace ARM_AM {
enum ShiftOpc {
no_shift = 0,
asr,
lsl,
lsr,
ror,
rrx
};
enum AddrOpc {
add = '+', sub = '-'
};
static inline const char *getShiftOpcStr(ShiftOpc Op) {
switch (Op) {
default: assert(0 && "Unknown shift opc!");
case ARM_AM::asr: return "asr";
case ARM_AM::lsl: return "lsl";
case ARM_AM::lsr: return "lsr";
case ARM_AM::ror: return "ror";
case ARM_AM::rrx: return "rrx";
}
}
static inline ShiftOpc getShiftOpcForNode(SDOperand N) {
switch (N.getOpcode()) {
default: return ARM_AM::no_shift;
case ISD::SHL: return ARM_AM::lsl;
case ISD::SRL: return ARM_AM::lsr;
case ISD::SRA: return ARM_AM::asr;
case ISD::ROTR: return ARM_AM::ror;
//case ISD::ROTL: // Only if imm -> turn into ROTR.
// Can't handle RRX here, because it would require folding a flag into
// the addressing mode. :( This causes us to miss certain things.
//case ARMISD::RRX: return ARM_AM::rrx;
}
}
enum AMSubMode {
bad_am_submode = 0,
ia,
ib,
da,
db
};
static inline const char *getAMSubModeStr(AMSubMode Mode) {
switch (Mode) {
default: assert(0 && "Unknown addressing sub-mode!");
case ARM_AM::ia: return "ia";
case ARM_AM::ib: return "ib";
case ARM_AM::da: return "da";
case ARM_AM::db: return "db";
}
}
static inline const char *getAMSubModeAltStr(AMSubMode Mode, bool isLD) {
switch (Mode) {
default: assert(0 && "Unknown addressing sub-mode!");
case ARM_AM::ia: return isLD ? "fd" : "ea";
case ARM_AM::ib: return isLD ? "ed" : "fa";
case ARM_AM::da: return isLD ? "fa" : "ed";
case ARM_AM::db: return isLD ? "ea" : "fd";
}
}
/// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
///
static inline unsigned rotr32(unsigned Val, unsigned Amt) {
assert(Amt < 32 && "Invalid rotate amount");
return (Val >> Amt) | (Val << ((32-Amt)&31));
}
/// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
///
static inline unsigned rotl32(unsigned Val, unsigned Amt) {
assert(Amt < 32 && "Invalid rotate amount");
return (Val << Amt) | (Val >> ((32-Amt)&31));
}
//===--------------------------------------------------------------------===//
// Addressing Mode #1: shift_operand with registers
//===--------------------------------------------------------------------===//
//
// This 'addressing mode' is used for arithmetic instructions. It can
// represent things like:
// reg
// reg [asr|lsl|lsr|ror|rrx] reg
// reg [asr|lsl|lsr|ror|rrx] imm
//
// This is stored as three operands [rega, regb, opc]. The first is the base
// reg, the second is the shift register (or reg0 when the shift is by an
// immediate or absent). The third operand encodes the shift opcode and the
// immediate when no register is present.
//
static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
return ShOp | (Imm << 3);
}
static inline unsigned getSORegOffset(unsigned Op) {
return Op >> 3;
}
static inline ShiftOpc getSORegShOp(unsigned Op) {
return (ShiftOpc)(Op & 7);
}
/// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
/// the 8-bit imm value.
static inline unsigned getSOImmValImm(unsigned Imm) {
return Imm & 0xFF;
}
/// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
/// the rotate amount.
static inline unsigned getSOImmValRot(unsigned Imm) {
return (Imm >> 8) * 2;
}
/// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
/// computing the rotate amount to use. If this immediate value cannot be
/// handled with a single shifter-op, determine a good rotate amount that will
/// take a maximal chunk of bits out of the immediate.
static inline unsigned getSOImmValRotate(unsigned Imm) {
// 8-bit (or less) immediates are trivially shifter_operands with a rotate
// of zero.
if ((Imm & ~255U) == 0) return 0;
// Use CTZ to compute the rotate amount.
unsigned TZ = CountTrailingZeros_32(Imm);
// Rotate amount must be even. Something like 0x200 must be rotated 8 bits,
// not 9.
unsigned RotAmt = TZ & ~1;
// If we can handle this spread, return it.
if ((rotr32(Imm, RotAmt) & ~255U) == 0)
return (32-RotAmt)&31; // HW rotates right, not left.
// For values like 0xF000000F, we should skip the first run of ones, then
// retry the hunt.
if (Imm & 1) {
unsigned TrailingOnes = CountTrailingZeros_32(~Imm);
if (TrailingOnes != 32) { // Avoid overflow on 0xFFFFFFFF
// Restart the search for a high-order bit after the initial sequence of
// ones.
unsigned TZ2 = CountTrailingZeros_32(Imm & ~((1 << TrailingOnes)-1));
// Rotate amount must be even.
unsigned RotAmt2 = TZ2 & ~1;
// If this fits, use it.
if (RotAmt2 != 32 && (rotr32(Imm, RotAmt2) & ~255U) == 0)
return (32-RotAmt2)&31; // HW rotates right, not left.
}
}
// Otherwise, we have no way to cover this span of bits with a single
// shifter_op immediate. Return a chunk of bits that will be useful to
// handle.
return (32-RotAmt)&31; // HW rotates right, not left.
}
/// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
/// into a shifter_operand immediate operand, return the 12-bit encoding for
/// it. If not, return -1.
static inline int getSOImmVal(unsigned Arg) {
// 8-bit (or less) immediates are trivially shifter_operands with a rotate
// of zero.
if ((Arg & ~255U) == 0) return Arg;
unsigned RotAmt = getSOImmValRotate(Arg);
// If this cannot be handled with a single shifter_op, bail out.
if (rotr32(~255U, RotAmt) & Arg)
return -1;
// Encode this correctly.
return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
}
/// isSOImmTwoPartVal - Return true if the specified value can be obtained by
/// or'ing together two SOImmVal's.
static inline bool isSOImmTwoPartVal(unsigned V) {
// If this can be handled with a single shifter_op, bail out.
V = rotr32(~255U, getSOImmValRotate(V)) & V;
if (V == 0)
return false;
// If this can be handled with two shifter_op's, accept.
V = rotr32(~255U, getSOImmValRotate(V)) & V;
return V == 0;
}
/// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
/// return the first chunk of it.
static inline unsigned getSOImmTwoPartFirst(unsigned V) {
return rotr32(255U, getSOImmValRotate(V)) & V;
}
/// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
/// return the second chunk of it.
static inline unsigned getSOImmTwoPartSecond(unsigned V) {
// Mask out the first chunk.
V = rotr32(~255U, getSOImmValRotate(V)) & V;
// Take what's left.
assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
return V;
}
/// getThumbImmValShift - Try to handle Imm with an 8-bit immediate followed
/// by a left shift. Returns the shift amount to use.
static inline unsigned getThumbImmValShift(unsigned Imm) {
// 8-bit (or less) immediates are trivially immediate operands with a shift
// of zero.
if ((Imm & ~255U) == 0) return 0;
// Use CTZ to compute the shift amount.
return CountTrailingZeros_32(Imm);
}
/// isThumbImmShiftedVal - Return true if the specified value can be obtained
/// by left shifting an 8-bit immediate.
static inline bool isThumbImmShiftedVal(unsigned V) {
// Mask off the bits an 8-bit immediate shifted into place would cover;
// anything left over means V doesn't qualify.
V = (~255U << getThumbImmValShift(V)) & V;
return V == 0;
}
/// getThumbImmNonShiftedVal - If V is a value that satisfies
/// isThumbImmShiftedVal, return the non-shifted value.
static inline unsigned getThumbImmNonShiftedVal(unsigned V) {
return V >> getThumbImmValShift(V);
}
//===--------------------------------------------------------------------===//
// Addressing Mode #2
//===--------------------------------------------------------------------===//
//
// This is used for most simple load/store instructions.
//
// addrmode2 := reg +/- reg shop imm
// addrmode2 := reg +/- imm12
//
// The first operand is always a Reg. The second operand is a reg if in
// reg/reg form, otherwise it's reg#0. The third field encodes the operation
// in bit 12, the immediate in bits 0-11, and the shift op in bits 13-15.
//
// If this addressing mode is a frame index (before prolog/epilog insertion
// and code rewriting), this operand will have the form: FI#, reg0, <offs>
// with no shift amount for the frame offset.
//
static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) {
assert(Imm12 < (1 << 12) && "Imm too large!");
bool isSub = Opc == sub;
return Imm12 | ((int)isSub << 12) | (SO << 13);
}
static inline unsigned getAM2Offset(unsigned AM2Opc) {
return AM2Opc & ((1 << 12)-1);
}
static inline AddrOpc getAM2Op(unsigned AM2Opc) {
return ((AM2Opc >> 12) & 1) ? sub : add;
}
static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
return (ShiftOpc)(AM2Opc >> 13);
}
//===--------------------------------------------------------------------===//
// Addressing Mode #3
//===--------------------------------------------------------------------===//
//
// This is used for sign-extending loads, and load/store-pair instructions.
//
// addrmode3 := reg +/- reg
// addrmode3 := reg +/- imm8
//
// The first operand is always a Reg. The second operand is a reg if in
// reg/reg form, otherwise it's reg#0. The third field encodes the operation
// in bit 8, the immediate in bits 0-7.
/// getAM3Opc - This function encodes the addrmode3 opc field.
static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) {
bool isSub = Opc == sub;
return ((int)isSub << 8) | Offset;
}
static inline unsigned char getAM3Offset(unsigned AM3Opc) {
return AM3Opc & 0xFF;
}
static inline AddrOpc getAM3Op(unsigned AM3Opc) {
return ((AM3Opc >> 8) & 1) ? sub : add;
}
//===--------------------------------------------------------------------===//
// Addressing Mode #4
//===--------------------------------------------------------------------===//
//
// This is used for load / store multiple instructions.
//
// addrmode4 := reg, <mode>
//
// The four modes are:
// IA - Increment after
// IB - Increment before
// DA - Decrement after
// DB - Decrement before
//
// If the 4th bit (writeback) is set, then the base register is updated after
// the memory transfer.
static inline AMSubMode getAM4SubMode(unsigned Mode) {
return (AMSubMode)(Mode & 0x7);
}
static inline unsigned getAM4ModeImm(AMSubMode SubMode, bool WB = false) {
return (int)SubMode | ((int)WB << 3);
}
static inline bool getAM4WBFlag(unsigned Mode) {
return (Mode >> 3) & 1;
}
//===--------------------------------------------------------------------===//
// Addressing Mode #5
//===--------------------------------------------------------------------===//
//
// This is used for coprocessor instructions, such as FP load/stores.
//
// addrmode5 := reg +/- imm8*4
//
// The first operand is always a Reg. The third field encodes the operation
// in bit 8, the immediate in bits 0-7.
//
// This can also be used for FP load/store multiple ops. The third field encodes
// writeback mode in bit 8, the number of registers (or 2 times the number of
// registers for DPR ops) in bits 0-7. In addition, bits 9-11 encode one of the
// following two sub-modes:
//
// IA - Increment after
// DB - Decrement before
/// getAM5Opc - This function encodes the addrmode5 opc field.
static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
bool isSub = Opc == sub;
return ((int)isSub << 8) | Offset;
}
static inline unsigned char getAM5Offset(unsigned AM5Opc) {
return AM5Opc & 0xFF;
}
static inline AddrOpc getAM5Op(unsigned AM5Opc) {
return ((AM5Opc >> 8) & 1) ? sub : add;
}
/// getAM5Opc - This function encodes the addrmode5 opc field for FLDM and
/// FSTM instructions.
static inline unsigned getAM5Opc(AMSubMode SubMode, bool WB,
unsigned char Offset) {
assert((SubMode == ia || SubMode == db) &&
"Illegal addressing mode 5 sub-mode!");
return ((int)SubMode << 9) | ((int)WB << 8) | Offset;
}
static inline AMSubMode getAM5SubMode(unsigned AM5Opc) {
return (AMSubMode)((AM5Opc >> 9) & 0x7);
}
static inline bool getAM5WBFlag(unsigned AM5Opc) {
return ((AM5Opc >> 8) & 1);
}
} // end namespace ARM_AM
} // end namespace llvm
#endif
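Some hand-traced values for the shifter-operand and addressing-mode encoders above (an illustrative sketch, not part of the commit; the results follow from the functions as written):

  #include "ARMAddressingModes.h"
  #include <cassert>
  using namespace llvm;

  static void checkEncodings() {
    // 0xFF000000 is 0xFF rotated right by 8: imm8 = 0xFF, rotate field = 4.
    assert(ARM_AM::getSOImmVal(0xFF000000) == 0x4FF);
    // 0x00F000F0 doesn't fit one shifter_op immediate, but fits two.
    assert(ARM_AM::getSOImmVal(0x00F000F0) == -1);
    assert(ARM_AM::isSOImmTwoPartVal(0x00F000F0));
    assert(ARM_AM::getSOImmTwoPartFirst(0x00F000F0)  == 0x000000F0);
    assert(ARM_AM::getSOImmTwoPartSecond(0x00F000F0) == 0x00F00000);
    // Thumb: 0x3FC0 is 0xFF << 6.
    assert(ARM_AM::isThumbImmShiftedVal(0x3FC0));
    assert(ARM_AM::getThumbImmNonShiftedVal(0x3FC0) == 0xFF);
    // Addressing mode 2 round-trip: subtract an immediate offset of 4.
    unsigned AM2 = ARM_AM::getAM2Opc(ARM_AM::sub, 4, ARM_AM::no_shift);
    assert(ARM_AM::getAM2Offset(AM2) == 4 &&
           ARM_AM::getAM2Op(AM2) == ARM_AM::sub);
  }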

File diff suppressed because it is too large

View File: ARMCommon.cpp

@@ -1,84 +0,0 @@
//===-- ARMCommon.cpp - Define support functions for ARM --------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by the "Instituto Nokia de Tecnologia" and
// is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//
//
//===----------------------------------------------------------------------===//
#include "ARMCommon.h"
static inline unsigned rotateL(unsigned x, unsigned n){
return ((x << n) | (x >> (32 - n)));
}
static inline unsigned rotateR(unsigned x, unsigned n){
return ((x >> n) | (x << (32 - n)));
}
// finds the end position of the largest sequence of zeros in the binary
// representation of 'immediate'.
static int findLargestZeroSequence(unsigned immediate){
int max_zero_pos = 0;
int max_zero_length = 0;
int zero_pos;
int zero_length;
int pos = 0;
int end_pos;
while ((immediate & 0x3) == 0) {
immediate = rotateR(immediate, 2);
pos+=2;
}
end_pos = pos+32;
while (pos<end_pos){
while (((immediate & 0x3) != 0)&&(pos<end_pos)) {
immediate = rotateR(immediate, 2);
pos+=2;
}
zero_pos = pos;
while (((immediate & 0x3) == 0)&&(pos<end_pos)) {
immediate = rotateR(immediate, 2);
pos+=2;
}
zero_length = pos - zero_pos;
if (zero_length > max_zero_length){
max_zero_length = zero_length;
max_zero_pos = zero_pos % 32;
}
}
return (max_zero_pos + max_zero_length) % 32;
}
std::vector<unsigned> splitImmediate(unsigned immediate){
std::vector<unsigned> immediatePieces;
if (immediate == 0){
immediatePieces.push_back(0);
} else {
int start_pos = findLargestZeroSequence(immediate);
unsigned immediate_tmp = rotateR(immediate, start_pos);
int pos = 0;
while (pos < 32){
while(((immediate_tmp&0x3) == 0)&&(pos<32)){
immediate_tmp = rotateR(immediate_tmp,2);
pos+=2;
}
if (pos < 32){
immediatePieces.push_back(rotateL(immediate_tmp&0xFF,
(start_pos + pos) % 32 ));
immediate_tmp = rotateR(immediate_tmp,8);
pos+=8;
}
}
}
return immediatePieces;
}
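For reference, the removed helper decomposed a constant into 8-bit chunks that each fit an ARM rotate-immediate. A hand-traced example (illustrative only):

  std::vector<unsigned> Pieces = splitImmediate(0xF00F);
  // Pieces == {0x0000000F, 0x0000F000}: each piece is an 8-bit value
  // rotated into position, so each can be materialized with a single
  // ARM immediate operand and combined to rebuild 0xF00F.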

View File: ARMCommon.h

@@ -1,22 +0,0 @@
//===-- ARMCommon.h - Define support functions for ARM ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by the "Instituto Nokia de Tecnologia" and
// is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//
//
//===----------------------------------------------------------------------===//
#ifndef ARM_COMMON_H
#define ARM_COMMON_H
#include <vector>
std::vector<unsigned> splitImmediate(unsigned immediate);
#endif

View File: ARMConstantIslandPass.cpp

@@ -0,0 +1,490 @@
//===-- ARMConstantIslandPass.cpp - ARM constant islands --------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Chris Lattner and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that splits the constant pool up into 'islands'
// which are scattered throughout the function. This is required due to the
// limited pc-relative displacements that ARM has.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "arm-cp-islands"
#include "ARM.h"
#include "ARMInstrInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/Target/TargetAsmInfo.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include <iostream>
using namespace llvm;
STATISTIC(NumSplit, "Number of uncond branches inserted");
namespace {
/// ARMConstantIslands - Due to limited pc-relative displacements, ARM
/// requires constant pool entries to be scattered among the instructions
/// inside a function. To do this, it completely ignores the normal LLVM
/// constant pool; instead, it places constants wherever it feels like with
/// special instructions.
///
/// The terminology used in this pass includes:
/// Islands - Clumps of constants placed in the function.
/// Water - Potential places where an island could be formed.
/// CPE - A constant pool entry that has been placed somewhere, which
/// tracks a list of users.
class VISIBILITY_HIDDEN ARMConstantIslands : public MachineFunctionPass {
/// NextUID - Assign unique ID's to CPE's.
unsigned NextUID;
/// BBSizes - The size of each MachineBasicBlock in bytes of code, indexed
/// by MBB Number.
std::vector<unsigned> BBSizes;
/// WaterList - A sorted list of basic blocks where islands could be placed
/// (i.e. blocks that don't fall through to the following block, due
/// to a return, unreachable, or unconditional branch).
std::vector<MachineBasicBlock*> WaterList;
/// CPUser - One user of a constant pool, keeping the machine instruction
/// pointer, the constant pool being referenced, and the max displacement
/// allowed from the instruction to the CP.
struct CPUser {
MachineInstr *MI;
MachineInstr *CPEMI;
unsigned MaxDisp;
CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp)
: MI(mi), CPEMI(cpemi), MaxDisp(maxdisp) {}
};
/// CPUsers - Keep track of all of the machine instructions that use various
/// constant pools and their max displacement.
std::vector<CPUser> CPUsers;
const TargetInstrInfo *TII;
const TargetAsmInfo *TAI;
public:
virtual bool runOnMachineFunction(MachineFunction &Fn);
virtual const char *getPassName() const {
return "ARM constant island placement pass";
}
private:
void DoInitialPlacement(MachineFunction &Fn,
std::vector<MachineInstr*> &CPEMIs);
void InitialFunctionScan(MachineFunction &Fn,
const std::vector<MachineInstr*> &CPEMIs);
void SplitBlockBeforeInstr(MachineInstr *MI);
bool HandleConstantPoolUser(MachineFunction &Fn, CPUser &U);
void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB);
unsigned GetInstSize(MachineInstr *MI) const;
unsigned GetOffsetOf(MachineInstr *MI) const;
};
}
/// createARMConstantIslandPass - returns an instance of the constant island
/// placement pass.
FunctionPass *llvm::createARMConstantIslandPass() {
return new ARMConstantIslands();
}
bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
// If there are no constants, there is nothing to do.
MachineConstantPool &MCP = *Fn.getConstantPool();
if (MCP.isEmpty()) return false;
TII = Fn.getTarget().getInstrInfo();
TAI = Fn.getTarget().getTargetAsmInfo();
// Renumber all of the machine basic blocks in the function, guaranteeing that
// the numbers agree with the position of the block in the function.
Fn.RenumberBlocks();
// Perform the initial placement of the constant pool entries. To start with,
// we put them all at the end of the function.
std::vector<MachineInstr*> CPEMIs;
DoInitialPlacement(Fn, CPEMIs);
/// The next UID to take is the first unused one.
NextUID = CPEMIs.size();
// Do the initial scan of the function, building up information about the
// sizes of each block, the location of all the water, and finding all of the
// constant pool users.
InitialFunctionScan(Fn, CPEMIs);
CPEMIs.clear();
// Iteratively place constant pool entries until there is no change.
bool MadeChange;
do {
MadeChange = false;
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
MadeChange |= HandleConstantPoolUser(Fn, CPUsers[i]);
} while (MadeChange);
BBSizes.clear();
WaterList.clear();
CPUsers.clear();
return true;
}
/// DoInitialPlacement - Perform the initial placement of the constant pool
/// entries. To start with, we put them all at the end of the function.
void ARMConstantIslands::DoInitialPlacement(MachineFunction &Fn,
std::vector<MachineInstr*> &CPEMIs){
// Create the basic block to hold the CPE's.
MachineBasicBlock *BB = new MachineBasicBlock();
Fn.getBasicBlockList().push_back(BB);
// Add all of the constants from the constant pool to the end block, using an
// identity mapping of CPI's to CPE's.
const std::vector<MachineConstantPoolEntry> &CPs =
Fn.getConstantPool()->getConstants();
const TargetData &TD = *Fn.getTarget().getTargetData();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeSize(CPs[i].getType());
// Verify that all constant pool entries are a multiple of 4 bytes. If not,
// we would have to pad them out or something so that instructions stay
// aligned.
assert((Size & 3) == 0 && "CP Entry not multiple of 4 bytes!");
MachineInstr *CPEMI =
BuildMI(BB, TII->get(ARM::CONSTPOOL_ENTRY))
.addImm(i).addConstantPoolIndex(i).addImm(Size);
CPEMIs.push_back(CPEMI);
DEBUG(std::cerr << "Moved CPI#" << i << " to end of function as #"
<< i << "\n");
}
}
/// BBHasFallthrough - Return true if the specified basic block can fallthrough
/// into the block immediately after it.
static bool BBHasFallthrough(MachineBasicBlock *MBB) {
// Get the next machine basic block in the function.
MachineFunction::iterator MBBI = MBB;
if (next(MBBI) == MBB->getParent()->end()) // Can't fall off end of function.
return false;
MachineBasicBlock *NextBB = next(MBBI);
for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
E = MBB->succ_end(); I != E; ++I)
if (*I == NextBB)
return true;
return false;
}
/// InitialFunctionScan - Do the initial scan of the function, building up
/// information about the sizes of each block, the location of all the water,
/// and finding all of the constant pool users.
void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
const std::vector<MachineInstr*> &CPEMIs) {
for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
MBBI != E; ++MBBI) {
MachineBasicBlock &MBB = *MBBI;
// If this block doesn't fall through into the next MBB, then this is
// 'water' where a constant pool island could be placed.
if (!BBHasFallthrough(&MBB))
WaterList.push_back(&MBB);
unsigned MBBSize = 0;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
// Add instruction size to MBBSize.
MBBSize += GetInstSize(I);
// Scan the instructions for constant pool operands.
for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
if (I->getOperand(op).isConstantPoolIndex()) {
// We found one. The addressing mode tells us the max displacement
// from the PC that this instruction permits.
unsigned MaxOffs = 0;
// Basic size info comes from the TSFlags field.
unsigned TSFlags = I->getInstrDescriptor()->TSFlags;
switch (TSFlags & ARMII::AddrModeMask) {
default:
// Constant pool entries can reach anything.
if (I->getOpcode() == ARM::CONSTPOOL_ENTRY)
continue;
assert(0 && "Unknown addressing mode for CP reference!");
case ARMII::AddrMode1: // AM1: 8 bits << 2
MaxOffs = 1 << (8+2); // Taking the address of a CP entry.
break;
case ARMII::AddrMode2:
MaxOffs = 1 << 12; // +-offset_12
break;
case ARMII::AddrMode3:
MaxOffs = 1 << 8; // +-offset_8
break;
// addrmode4 has no immediate offset.
case ARMII::AddrMode5:
MaxOffs = 1 << (8+2); // +-(offset_8*4)
break;
case ARMII::AddrModeT1:
MaxOffs = 1 << 5;
break;
case ARMII::AddrModeT2:
MaxOffs = 1 << (5+1);
break;
case ARMII::AddrModeT4:
MaxOffs = 1 << (5+2);
break;
}
// Remember that this is a user of a CP entry.
MachineInstr *CPEMI =CPEMIs[I->getOperand(op).getConstantPoolIndex()];
CPUsers.push_back(CPUser(I, CPEMI, MaxOffs));
// Instructions can only use one CP entry, don't bother scanning the
// rest of the operands.
break;
}
}
BBSizes.push_back(MBBSize);
}
}
/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing
static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
unsigned JTI) DISABLE_INLINE;
static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
unsigned JTI) {
return JT[JTI].MBBs.size();
}
/// GetInstSize - Return the size of the specified MachineInstr.
///
unsigned ARMConstantIslands::GetInstSize(MachineInstr *MI) const {
// Basic size info comes from the TSFlags field.
unsigned TSFlags = MI->getInstrDescriptor()->TSFlags;
switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) {
default:
// If this machine instr is an inline asm, measure it.
if (MI->getOpcode() == ARM::INLINEASM)
return TAI->getInlineAsmLength(MI->getOperand(0).getSymbolName());
assert(0 && "Unknown or unset size field for instr!");
break;
case ARMII::Size8Bytes: return 8; // Arm instruction x 2.
case ARMII::Size4Bytes: return 4; // Arm instruction.
case ARMII::Size2Bytes: return 2; // Thumb instruction.
case ARMII::SizeSpecial: {
switch (MI->getOpcode()) {
case ARM::CONSTPOOL_ENTRY:
// If this machine instr is a constant pool entry, its size is recorded as
// operand #2.
return MI->getOperand(2).getImm();
case ARM::BR_JTr:
case ARM::BR_JTm:
case ARM::BR_JTadd: {
// These are jumptable branches, i.e. a branch followed by an inlined
// jumptable. The size is 4 + 4 * number of entries.
unsigned JTI = MI->getOperand(MI->getNumOperands()-2).getJumpTableIndex();
const MachineFunction *MF = MI->getParent()->getParent();
MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
assert(JTI < JT.size());
return getNumJTEntries(JT, JTI) * 4 + 4;
}
default:
// Otherwise, pseudo-instruction sizes are zero.
return 0;
}
}
}
}
/// GetOffsetOf - Return the current offset of the specified machine instruction
/// from the start of the function. This offset changes as stuff is moved
/// around inside the function.
unsigned ARMConstantIslands::GetOffsetOf(MachineInstr *MI) const {
MachineBasicBlock *MBB = MI->getParent();
// The offset is composed of two things: the sum of the sizes of all MBB's
// before this instruction's block, and the offset from the start of the block
// it is in.
unsigned Offset = 0;
// Sum block sizes before MBB.
for (unsigned BB = 0, e = MBB->getNumber(); BB != e; ++BB)
Offset += BBSizes[BB];
// Sum instructions before MI in MBB.
for (MachineBasicBlock::iterator I = MBB->begin(); ; ++I) {
assert(I != MBB->end() && "Didn't find MI in its own basic block?");
if (&*I == MI) return Offset;
Offset += GetInstSize(I);
}
}
/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
/// ID.
static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
const MachineBasicBlock *RHS) {
return LHS->getNumber() < RHS->getNumber();
}
/// UpdateForInsertedWaterBlock - When a block is newly inserted into the
/// machine function, it upsets all of the block numbers. Renumber the blocks
/// and update the arrays that parallel this numbering.
void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
// Renumber the MBB's to keep them consecutive.
NewBB->getParent()->RenumberBlocks(NewBB);
// Insert a size into BBSizes to align it properly with the (newly
// renumbered) block numbers.
BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0);
// Next, update WaterList. Specifically, we need to add NewBB as having
// available water after it.
std::vector<MachineBasicBlock*>::iterator IP =
std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
CompareMBBNumbers);
WaterList.insert(IP, NewBB);
}
/// Split the basic block containing MI into two blocks, which are joined by
/// an unconditional branch. Update data structures and renumber blocks to
/// account for this change.
void ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
MachineBasicBlock *OrigBB = MI->getParent();
// Create a new MBB for the code after the OrigBB.
MachineBasicBlock *NewBB = new MachineBasicBlock(OrigBB->getBasicBlock());
MachineFunction::iterator MBBI = OrigBB; ++MBBI;
OrigBB->getParent()->getBasicBlockList().insert(MBBI, NewBB);
// Splice the instructions starting with MI over to NewBB.
NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
// Add an unconditional branch from OrigBB to NewBB.
BuildMI(OrigBB, TII->get(ARM::B)).addMBB(NewBB);
NumSplit++;
// Update the CFG. All succs of OrigBB are now succs of NewBB.
while (!OrigBB->succ_empty()) {
MachineBasicBlock *Succ = *OrigBB->succ_begin();
OrigBB->removeSuccessor(Succ);
NewBB->addSuccessor(Succ);
// This pass should be run after register allocation, so there should be no
// PHI nodes to update.
assert((Succ->empty() || Succ->begin()->getOpcode() != TargetInstrInfo::PHI)
&& "PHI nodes should be eliminated by now!");
}
// OrigBB branches to NewBB.
OrigBB->addSuccessor(NewBB);
// Update internal data structures to account for the newly inserted MBB.
UpdateForInsertedWaterBlock(NewBB);
// Figure out how large NewBB is.
unsigned NewBBSize = 0;
for (MachineBasicBlock::iterator I = NewBB->begin(), E = NewBB->end();
I != E; ++I)
NewBBSize += GetInstSize(I);
// Set the size of NewBB in BBSizes.
BBSizes[NewBB->getNumber()] = NewBBSize;
// We removed instructions from OrigBB, so subtract their size (NewBBSize)
// from it, but add 4 back for the unconditional branch we appended.
BBSizes[OrigBB->getNumber()] -= NewBBSize-4;
}
/// HandleConstantPoolUser - Analyze the specified user, checking to see if it
/// is out-of-range. If so, pick up the constant pool value and move it some
/// place in-range.
bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn, CPUser &U){
MachineInstr *UserMI = U.MI;
MachineInstr *CPEMI = U.CPEMI;
unsigned UserOffset = GetOffsetOf(UserMI);
unsigned CPEOffset = GetOffsetOf(CPEMI);
DEBUG(std::cerr << "User of CPE#" << CPEMI->getOperand(0).getImm()
<< " max delta=" << U.MaxDisp
<< " at offset " << int(UserOffset-CPEOffset) << "\t"
<< *UserMI);
// Check to see if the CPE is already in-range.
if (UserOffset < CPEOffset) {
// User before the CPE.
if (CPEOffset-UserOffset <= U.MaxDisp)
return false;
} else {
if (UserOffset-CPEOffset <= U.MaxDisp)
return false;
}
// Solution guaranteed to work: split the user's MBB right before the user and
// insert a clone of the CPE into the newly created water.
// If the user isn't at the start of its MBB, or if there is a fall-through
// into the user's MBB, split the MBB before the User.
MachineBasicBlock *UserMBB = UserMI->getParent();
if (&UserMBB->front() != UserMI ||
UserMBB == &Fn.front() || // entry MBB of function.
BBHasFallthrough(prior(MachineFunction::iterator(UserMBB)))) {
// TODO: Search for the best place to split the code. In practice, using
// loop nesting information to insert these guys outside of loops would be
// sufficient.
SplitBlockBeforeInstr(UserMI);
// UserMI's BB may have changed.
UserMBB = UserMI->getParent();
}
// Okay, we know we can put an island before UserMBB now, do it!
MachineBasicBlock *NewIsland = new MachineBasicBlock();
Fn.getBasicBlockList().insert(UserMBB, NewIsland);
// Update internal data structures to account for the newly inserted MBB.
UpdateForInsertedWaterBlock(NewIsland);
// Now that we have an island to add the CPE to, clone the original CPE and
// add it to the island.
unsigned ID = NextUID++;
unsigned CPI = CPEMI->getOperand(1).getConstantPoolIndex();
unsigned Size = CPEMI->getOperand(2).getImm();
// Build a new CPE for this user.
U.CPEMI = BuildMI(NewIsland, TII->get(ARM::CONSTPOOL_ENTRY))
.addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
// Increase the size of the island block to account for the new entry.
BBSizes[NewIsland->getNumber()] += Size;
// Finally, change the CPI in the instruction operand to be ID.
for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
if (UserMI->getOperand(i).isConstantPoolIndex()) {
UserMI->getOperand(i).setConstantPoolIndex(ID);
break;
}
DEBUG(std::cerr << " Moved CPE to #" << ID << " CPI=" << CPI << "\t"
<< *UserMI);
return true;
}
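The in-range test at the top of HandleConstantPoolUser reduces to an absolute-distance check. A minimal equivalent (hypothetical helper, not part of the pass):

  static bool CPEIsInRange(unsigned UserOffset, unsigned CPEOffset,
                           unsigned MaxDisp) {
    // Offsets are from the start of the function; the displacement may go
    // in either direction.
    return UserOffset < CPEOffset ? CPEOffset - UserOffset <= MaxDisp
                                  : UserOffset - CPEOffset <= MaxDisp;
  }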

View File: ARMConstantPoolValue.cpp

@@ -0,0 +1,55 @@
//===- ARMConstantPoolValue.cpp - ARM constantpool value --------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Evan Cheng and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARM specific constantpool value class.
//
//===----------------------------------------------------------------------===//
#include "ARMConstantPoolValue.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/GlobalValue.h"
using namespace llvm;
ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, unsigned id,
bool isNonLazy, unsigned char PCAdj)
: MachineConstantPoolValue((const Type*)gv->getType()),
GV(gv), LabelId(id), isNonLazyPtr(isNonLazy), PCAdjust(PCAdj) {}
int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment) {
unsigned AlignMask = (1 << Alignment)-1;
const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
if (Constants[i].isMachineConstantPoolEntry() &&
(Constants[i].Offset & AlignMask) == 0) {
ARMConstantPoolValue *CPV =
(ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
if (CPV->GV == GV && CPV->LabelId == LabelId &&
CPV->isNonLazyPtr == isNonLazyPtr)
return i;
}
}
return -1;
}
void
ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) {
ID.AddPointer(GV);
ID.AddInteger(LabelId);
ID.AddInteger((unsigned)isNonLazyPtr);
ID.AddInteger(PCAdjust);
}
void ARMConstantPoolValue::print(std::ostream &O) const {
O << GV->getName();
if (isNonLazyPtr) O << "$non_lazy_ptr";
if (PCAdjust != 0) O << "-(LPIC" << LabelId << "+"
<< (unsigned)PCAdjust << ")";
}
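As an illustration of print() above: for a global named foo (an assumed name), label id 3, isNonLazyPtr set, and the ARM PCAdjust of 8, the emitted constant pool expression would be:

  foo$non_lazy_ptr-(LPIC3+8)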

View File: ARMConstantPoolValue.h

@@ -0,0 +1,50 @@
//===- ARMConstantPoolValue.h - ARM constantpool value ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Evan Cheng and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARM specific constantpool value class.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
#include "llvm/CodeGen/MachineConstantPool.h"
namespace llvm {
/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
/// represent PC relative displacement between the address of the load
/// instruction and the global value being loaded, i.e. (&GV-(LPIC+8)).
class ARMConstantPoolValue : public MachineConstantPoolValue {
GlobalValue *GV; // GlobalValue being loaded.
unsigned LabelId; // Label id of the load.
bool isNonLazyPtr; // True if loading a Mac OS X non_lazy_ptr stub.
unsigned char PCAdjust; // Extra adjustment if constantpool is pc relative.
// 8 for ARM, 4 for Thumb.
public:
ARMConstantPoolValue(GlobalValue *gv, unsigned id, bool isNonLazy = false,
unsigned char PCAdj = 0);
GlobalValue *getGV() const { return GV; }
unsigned getLabelId() const { return LabelId; }
bool isNonLazyPointer() const { return isNonLazyPtr; }
unsigned char getPCAdjustment() const { return PCAdjust; }
virtual int getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment);
virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID);
virtual void print(std::ostream &O) const;
};
}
#endif

View File: ARMFrameInfo.h

@@ -17,17 +17,15 @@
#include "ARM.h"
#include "llvm/Target/TargetFrameInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "ARMSubtarget.h"
namespace llvm {
class ARMFrameInfo: public TargetFrameInfo {
class ARMFrameInfo : public TargetFrameInfo {
public:
ARMFrameInfo()
: TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 8, 0) {
ARMFrameInfo(const ARMSubtarget &ST)
: TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0) {
}
};
} // End llvm namespace

File diff suppressed because it is too large

File diff suppressed because it is too large

View File: ARMISelLowering.h

@@ -0,0 +1,134 @@
//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Evan Cheng and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#ifndef ARMISELLOWERING_H
#define ARMISELLOWERING_H
#include "llvm/Target/TargetLowering.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include <vector>
namespace llvm {
class ARMConstantPoolValue;
class ARMSubtarget;
namespace ARMISD {
// ARM Specific DAG Nodes
enum NodeType {
// Start the numbering where the builtin ops and target ops leave off.
FIRST_NUMBER = ISD::BUILTIN_OP_END+ARM::INSTRUCTION_LIST_END,
Wrapper, // Wrapper - A wrapper node for TargetConstantPool,
// TargetExternalSymbol, and TargetGlobalAddress.
WrapperCall, // WrapperCall - Same as wrapper, but mark the wrapped
// node as call operand.
WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable
CALL, // Function call.
CALL_NOLINK, // Function call with branch not branch-and-link.
tCALL, // Thumb function call.
BRCOND, // Conditional branch.
BR_JT, // Jumptable branch.
RET_FLAG, // Return with a flag operand.
PIC_ADD, // Add with a PC operand and a PIC label.
CMP, // ARM compare instructions.
CMPFP, // ARM VFP compare instruction, sets FPSCR.
CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR.
FMSTAT, // ARM fmstat instruction.
CMOV, // ARM conditional move instructions.
CNEG, // ARM conditional negate instructions.
FTOSI, // FP to sint within a FP register.
FTOUI, // FP to uint within a FP register.
SITOF, // sint to FP within a FP register.
UITOF, // uint to FP within a FP register.
MULHILOU, // Lo,Hi = umul LHS, RHS.
MULHILOS, // Lo,Hi = smul LHS, RHS.
SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out.
SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out.
RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag.
FMRRD, // double to two gprs.
FMDRR // Two gprs to double.
};
}
//===----------------------------------------------------------------------===//
// ARMTargetLowering - ARM implementation of the TargetLowering interface
class ARMTargetLowering : public TargetLowering {
int VarArgsFrameIndex; // FrameIndex for start of varargs area.
public:
ARMTargetLowering(TargetMachine &TM);
virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
virtual const char *getTargetNodeName(unsigned Opcode) const;
virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI,
MachineBasicBlock *MBB);
/// isLegalAddressImmediate - Return true if the integer value or
/// GlobalValue can be used as the offset of the target addressing mode.
virtual bool isLegalAddressImmediate(int64_t V) const;
virtual bool isLegalAddressImmediate(GlobalValue *GV) const;
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
virtual bool getPreIndexedAddressParts(SDNode *N, SDOperand &Base,
SDOperand &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG);
/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDOperand &Base, SDOperand &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG);
virtual void computeMaskedBitsForTargetNode(const SDOperand Op,
uint64_t Mask,
uint64_t &KnownZero,
uint64_t &KnownOne,
unsigned Depth) const;
ConstraintType getConstraintType(char ConstraintLetter) const;
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
MVT::ValueType VT) const;
std::vector<unsigned>
getRegClassForInlineAsmConstraint(const std::string &Constraint,
MVT::ValueType VT) const;
private:
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
/// make the right decision when generating code for different targets.
const ARMSubtarget *Subtarget;
/// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
///
unsigned ARMPCLabelIndex;
SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG);
SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG);
SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
SDOperand LowerBR_JT(SDOperand Op, SelectionDAG &DAG);
};
}
#endif // ARMISELLOWERING_H

View File: ARMInstrInfo.cpp

@@ -14,46 +14,409 @@
#include "ARMInstrInfo.h"
#include "ARM.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "ARMAddressingModes.h"
#include "ARMGenInstrInfo.inc"
#include "ARMMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
ARMInstrInfo::ARMInstrInfo()
static cl::opt<bool> EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
cl::desc("Enable ARM 2-addr to 3-addr conv"));
ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
: TargetInstrInfo(ARMInsts, sizeof(ARMInsts)/sizeof(ARMInsts[0])),
RI(*this) {
RI(*this, STI) {
}
unsigned ARMInstrInfo::getDWARF_LABELOpcode() const {
return ARM::DWARF_LABEL;
}
const TargetRegisterClass *ARMInstrInfo::getPointerRegClass() const {
return &ARM::IntRegsRegClass;
return &ARM::GPRRegClass;
}
/// Return true if the instruction is a register to register move and
/// leave the source and dest operands in the passed parameters.
///
bool ARMInstrInfo::isMoveInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg) const {
unsigned &SrcReg, unsigned &DstReg) const {
MachineOpCode oc = MI.getOpcode();
switch (oc) {
case ARM::MOV: {
assert(MI.getNumOperands() == 4 &&
MI.getOperand(0).isRegister() &&
default:
return false;
case ARM::FCPYS:
case ARM::FCPYD:
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
return true;
case ARM::MOVrr:
case ARM::tMOVrr:
assert(MI.getNumOperands() == 2 && MI.getOperand(0).isRegister() &&
MI.getOperand(1).isRegister() &&
"Invalid ARM MOV instruction");
const MachineOperand &Arg = MI.getOperand(1);
const MachineOperand &Shift = MI.getOperand(2);
if (Arg.isRegister() && Shift.isImmediate() && Shift.getImmedValue() == 0) {
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
return true;
}
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
return true;
}
}
return false;
}
void ARMInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
const std::vector<MachineOperand> &Cond)const{
// Can only insert uncond branches so far.
assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!");
BuildMI(&MBB, get(ARM::b)).addMBB(TBB);
unsigned ARMInstrInfo::isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const {
switch (MI->getOpcode()) {
default: break;
case ARM::LDR:
if (MI->getOperand(1).isFrameIndex() &&
MI->getOperand(2).isReg() &&
MI->getOperand(3).isImmediate() &&
MI->getOperand(2).getReg() == 0 &&
MI->getOperand(3).getImmedValue() == 0) {
FrameIndex = MI->getOperand(1).getFrameIndex();
return MI->getOperand(0).getReg();
}
break;
case ARM::FLDD:
case ARM::FLDS:
if (MI->getOperand(1).isFrameIndex() &&
MI->getOperand(2).isImmediate() &&
MI->getOperand(2).getImmedValue() == 0) {
FrameIndex = MI->getOperand(1).getFrameIndex();
return MI->getOperand(0).getReg();
}
break;
case ARM::tLDRspi:
if (MI->getOperand(1).isFrameIndex() &&
MI->getOperand(2).isImmediate() &&
MI->getOperand(2).getImmedValue() == 0) {
FrameIndex = MI->getOperand(1).getFrameIndex();
return MI->getOperand(0).getReg();
}
break;
}
return 0;
}
unsigned ARMInstrInfo::isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const {
switch (MI->getOpcode()) {
default: break;
case ARM::STR:
if (MI->getOperand(1).isFrameIndex() &&
MI->getOperand(2).isReg() &&
MI->getOperand(3).isImmediate() &&
MI->getOperand(2).getReg() == 0 &&
MI->getOperand(3).getImmedValue() == 0) {
FrameIndex = MI->getOperand(1).getFrameIndex();
return MI->getOperand(0).getReg();
}
break;
case ARM::FSTD:
case ARM::FSTS:
if (MI->getOperand(1).isFrameIndex() &&
MI->getOperand(2).isImmediate() &&
MI->getOperand(2).getImmedValue() == 0) {
FrameIndex = MI->getOperand(1).getFrameIndex();
return MI->getOperand(0).getReg();
}
break;
case ARM::tSTRspi:
if (MI->getOperand(1).isFrameIndex() &&
MI->getOperand(2).isImmediate() &&
MI->getOperand(2).getImmedValue() == 0) {
FrameIndex = MI->getOperand(1).getFrameIndex();
return MI->getOperand(0).getReg();
}
break;
}
return 0;
}
static unsigned getUnindexedOpcode(unsigned Opc) {
switch (Opc) {
default: break;
case ARM::LDR_PRE:
case ARM::LDR_POST:
return ARM::LDR;
case ARM::LDRH_PRE:
case ARM::LDRH_POST:
return ARM::LDRH;
case ARM::LDRB_PRE:
case ARM::LDRB_POST:
return ARM::LDRB;
case ARM::LDRSH_PRE:
case ARM::LDRSH_POST:
return ARM::LDRSH;
case ARM::LDRSB_PRE:
case ARM::LDRSB_POST:
return ARM::LDRSB;
case ARM::STR_PRE:
case ARM::STR_POST:
return ARM::STR;
case ARM::STRH_PRE:
case ARM::STRH_POST:
return ARM::STRH;
case ARM::STRB_PRE:
case ARM::STRB_POST:
return ARM::STRB;
}
return 0;
}
MachineInstr *
ARMInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI,
LiveVariables &LV) const {
if (!EnableARM3Addr)
return NULL;
MachineInstr *MI = MBBI;
unsigned TSFlags = MI->getInstrDescriptor()->TSFlags;
bool isPre = false;
switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
default: return NULL;
case ARMII::IndexModePre:
isPre = true;
break;
case ARMII::IndexModePost:
break;
}
// Try splitting an indexed load / store into an un-indexed one plus an add/sub
// operation.
unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
if (MemOpc == 0)
return NULL;
MachineInstr *UpdateMI = NULL;
MachineInstr *MemMI = NULL;
unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
unsigned NumOps = MI->getNumOperands();
bool isLoad = (MI->getInstrDescriptor()->Flags & M_LOAD_FLAG) != 0;
const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
const MachineOperand &Base = MI->getOperand(2);
const MachineOperand &Offset = MI->getOperand(NumOps-2);
unsigned WBReg = WB.getReg();
unsigned BaseReg = Base.getReg();
unsigned OffReg = Offset.getReg();
unsigned OffImm = MI->getOperand(NumOps-1).getImm();
switch (AddrMode) {
default:
assert(false && "Unknown indexed op!");
return NULL;
case ARMII::AddrMode2: {
bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
unsigned Amt = ARM_AM::getAM2Offset(OffImm);
if (OffReg == 0) {
int SOImmVal = ARM_AM::getSOImmVal(Amt);
if (SOImmVal == -1)
// Can't encode it in a so_imm operand. This transformation will
// add more than 1 instruction. Abandon!
return NULL;
UpdateMI = BuildMI(get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
.addReg(BaseReg).addImm(SOImmVal);
} else if (Amt != 0) {
ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
UpdateMI = BuildMI(get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
.addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc);
} else
UpdateMI = BuildMI(get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
.addReg(BaseReg).addReg(OffReg);
break;
}
case ARMII::AddrMode3 : {
bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
unsigned Amt = ARM_AM::getAM3Offset(OffImm);
if (OffReg == 0)
// Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
UpdateMI = BuildMI(get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
.addReg(BaseReg).addImm(Amt);
else
UpdateMI = BuildMI(get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
.addReg(BaseReg).addReg(OffReg);
break;
}
}
std::vector<MachineInstr*> NewMIs;
if (isPre) {
if (isLoad)
MemMI = BuildMI(get(MemOpc), MI->getOperand(0).getReg())
.addReg(WBReg).addReg(0).addImm(0);
else
MemMI = BuildMI(get(MemOpc)).addReg(MI->getOperand(1).getReg())
.addReg(WBReg).addReg(0).addImm(0);
NewMIs.push_back(MemMI);
NewMIs.push_back(UpdateMI);
} else {
if (isLoad)
MemMI = BuildMI(get(MemOpc), MI->getOperand(0).getReg())
.addReg(BaseReg).addReg(0).addImm(0);
else
MemMI = BuildMI(get(MemOpc)).addReg(MI->getOperand(1).getReg())
.addReg(BaseReg).addReg(0).addImm(0);
if (WB.isDead())
UpdateMI->getOperand(0).setIsDead();
NewMIs.push_back(UpdateMI);
NewMIs.push_back(MemMI);
}
// Transfer LiveVariables states, kill / dead info.
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (MO.isRegister() && MO.getReg() &&
MRegisterInfo::isVirtualRegister(MO.getReg())) {
unsigned Reg = MO.getReg();
LiveVariables::VarInfo &VI = LV.getVarInfo(Reg);
if (MO.isDef()) {
MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
if (MO.isDead())
LV.addVirtualRegisterDead(Reg, NewMI);
// Update the defining instruction.
if (VI.DefInst == MI)
VI.DefInst = NewMI;
}
if (MO.isUse() && MO.isKill()) {
for (unsigned j = 0; j < 2; ++j) {
// Look at the two new MI's in reverse order.
MachineInstr *NewMI = NewMIs[j];
MachineOperand *NMO = NewMI->findRegisterUseOperand(Reg);
if (!NMO)
continue;
LV.addVirtualRegisterKilled(Reg, NewMI);
if (VI.removeKill(MI))
VI.Kills.push_back(NewMI);
break;
}
}
}
}
MFI->insert(MBBI, NewMIs[1]);
MFI->insert(MBBI, NewMIs[0]);
return NewMIs[0];
}
// Branch analysis.
bool ARMInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
std::vector<MachineOperand> &Cond) const {
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.end();
if (I == MBB.begin() || !isTerminatorInstr((--I)->getOpcode()))
return false;
// Get the last instruction in the block.
MachineInstr *LastInst = I;
// If there is only one terminator instruction, process it.
unsigned LastOpc = LastInst->getOpcode();
if (I == MBB.begin() || !isTerminatorInstr((--I)->getOpcode())) {
if (LastOpc == ARM::B || LastOpc == ARM::tB) {
TBB = LastInst->getOperand(0).getMachineBasicBlock();
return false;
}
if (LastOpc == ARM::Bcc || LastOpc == ARM::tBcc) {
// Block ends with fall-through condbranch.
TBB = LastInst->getOperand(0).getMachineBasicBlock();
Cond.push_back(LastInst->getOperand(1));
return false;
}
return true; // Can't handle indirect branch.
}
// Get the instruction before it if it is a terminator.
MachineInstr *SecondLastInst = I;
// If there are three terminators, we don't know what sort of block this is.
if (SecondLastInst && I != MBB.begin() &&
isTerminatorInstr((--I)->getOpcode()))
return true;
// If the block ends with ARM::B/ARM::tB and an ARM::Bcc/ARM::tBcc, handle it.
unsigned SecondLastOpc = SecondLastInst->getOpcode();
if ((SecondLastOpc == ARM::Bcc && LastOpc == ARM::B) ||
(SecondLastOpc == ARM::tBcc && LastOpc == ARM::tB)) {
TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
Cond.push_back(SecondLastInst->getOperand(1));
FBB = LastInst->getOperand(0).getMachineBasicBlock();
return false;
}
// Otherwise, can't handle this.
return true;
}
void ARMInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
MachineFunction &MF = *MBB.getParent();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
int BOpc = AFI->isThumbFunction() ? ARM::tB : ARM::B;
int BccOpc = AFI->isThumbFunction() ? ARM::tBcc : ARM::Bcc;
MachineBasicBlock::iterator I = MBB.end();
if (I == MBB.begin()) return;
--I;
if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc)
return;
// Remove the branch.
I->eraseFromParent();
I = MBB.end();
if (I == MBB.begin()) return;
--I;
if (I->getOpcode() != BccOpc)
return;
// Remove the branch.
I->eraseFromParent();
}
void ARMInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
const std::vector<MachineOperand> &Cond) const {
MachineFunction &MF = *MBB.getParent();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
int BOpc = AFI->isThumbFunction() ? ARM::tB : ARM::B;
int BccOpc = AFI->isThumbFunction() ? ARM::tBcc : ARM::Bcc;
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 1 || Cond.size() == 0) &&
"ARM branch conditions have two components!");
if (FBB == 0) {
if (Cond.empty()) // Unconditional branch?
BuildMI(&MBB, get(BOpc)).addMBB(TBB);
else
BuildMI(&MBB, get(BccOpc)).addMBB(TBB).addImm(Cond[0].getImm());
return;
}
// Two-way conditional branch.
BuildMI(&MBB, get(BccOpc)).addMBB(TBB).addImm(Cond[0].getImm());
BuildMI(&MBB, get(BOpc)).addMBB(FBB);
}
bool ARMInstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const {
if (MBB.empty()) return false;
switch (MBB.back().getOpcode()) {
case ARM::B:
case ARM::tB: // Uncond branch.
case ARM::BR_JTr: // Jumptable branch.
case ARM::BR_JTm: // Jumptable branch through mem.
case ARM::BR_JTadd: // Jumptable branch add to pc.
return true;
default: return false;
}
}
bool ARMInstrInfo::
ReverseBranchCondition(std::vector<MachineOperand> &Cond) const {
ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
Cond[0].setImm(ARMCC::getOppositeCondition(CC));
return false;
}
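// Editor's illustration (not part of this commit): a typical client of the
// branch analysis API above, e.g. a simplified branch folder. NextMBB (the
// layout successor) and the TII pointer are assumed names.
//
//   MachineBasicBlock *TBB = 0, *FBB = 0;
//   std::vector<MachineOperand> Cond;
//   if (!TII->AnalyzeBranch(MBB, TBB, FBB, Cond)) { // block is analyzable
//     TII->RemoveBranch(MBB);
//     if (!Cond.empty() && !TII->ReverseBranchCondition(Cond))
//       // Invert the condition and branch to the old false destination.
//       TII->InsertBranch(MBB, FBB ? FBB : NextMBB, 0, Cond);
//   }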


@ -1,4 +1,4 @@
//===- ARMInstrInfo.h - ARM Instruction Information --------------*- C++ -*-===//
//===- ARMInstrInfo.h - ARM Instruction Information -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@ -19,11 +19,56 @@
#include "ARMRegisterInfo.h"
namespace llvm {
class ARMSubtarget;
/// ARMII - This namespace holds all of the target specific flags that
/// instruction info tracks.
///
namespace ARMII {
enum {
//===------------------------------------------------------------------===//
// Instruction Flags.
//===------------------------------------------------------------------===//
// This four-bit field describes the addressing mode used. Zero is unused
// so that we can tell if we forgot to set a value.
AddrModeMask = 0xf,
AddrMode1 = 1,
AddrMode2 = 2,
AddrMode3 = 3,
AddrMode4 = 4,
AddrMode5 = 5,
AddrModeT1 = 6,
AddrModeT2 = 7,
AddrModeT4 = 8,
AddrModeTs = 9, // i8 * 4 for pc and sp relative data
// Size* - Flags to keep track of the size of an instruction.
SizeShift = 4,
SizeMask = 7 << SizeShift,
SizeSpecial = 1, // 0 byte pseudo or special case.
Size8Bytes = 2,
Size4Bytes = 3,
Size2Bytes = 4,
// IndexMode - Unindexed, pre-indexed, or post-indexed. Only valid for load
// and store ops
IndexModeShift = 7,
IndexModeMask = 3 << IndexModeShift,
IndexModePre = 1,
IndexModePost = 2,
// Opcode
OpcodeShift = 9,
OpcodeMask = 0xf << OpcodeShift
};
}
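// Editor's illustration (not in the original header): these fields are packed
// into an instruction's TSFlags word and are recovered with the shift / mask
// pairs above; the TSFlags variable name is assumed.
//
//   unsigned AddrMode  = TSFlags & ARMII::AddrModeMask;
//   unsigned SizeFlag  = (TSFlags & ARMII::SizeMask) >> ARMII::SizeShift;
//   unsigned IndexMode = (TSFlags & ARMII::IndexModeMask)
//                        >> ARMII::IndexModeShift;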
class ARMInstrInfo : public TargetInstrInfo {
const ARMRegisterInfo RI;
public:
ARMInstrInfo();
ARMInstrInfo(const ARMSubtarget &STI);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
@ -35,15 +80,33 @@ public:
/// This is used for addressing modes.
virtual const TargetRegisterClass *getPointerRegClass() const;
/// getDWARF_LABELOpcode - Return the opcode of the target's DWARF_LABEL
/// instruction if it has one. This is used by codegen passes that update
/// DWARF line number info as they modify the code.
virtual unsigned getDWARF_LABELOpcode() const;
/// Return true if the instruction is a register to register move and
/// leave the source and dest operands in the passed parameters.
///
virtual bool isMoveInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg) const;
virtual unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
virtual unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI,
LiveVariables &LV) const;
// Branch analysis.
virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
std::vector<MachineOperand> &Cond) const;
virtual void RemoveBranch(MachineBasicBlock &MBB) const;
virtual void InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
const std::vector<MachineOperand> &Cond) const;
virtual bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const;
virtual bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const;
};
}

File diff suppressed because it is too large.

View File

@ -0,0 +1,513 @@
//===- ARMInstrThumb.td - Thumb support for ARM ---------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Chris Lattner and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the Thumb instruction set.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Thumb specific DAG Nodes.
//
def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
[SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
// TI - Thumb instruction.
// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb];
}
class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb, HasV5T];
}
class ThumbI<dag ops, AddrMode am, SizeFlagVal sz,
string asm, string cstr, list<dag> pattern>
// FIXME: Set all opcodes to 0 for now.
: InstARM<0, am, sz, IndexModeNone, ops, asm, cstr> {
let Pattern = pattern;
list<Predicate> Predicates = [IsThumb];
}
class TI<dag ops, string asm, list<dag> pattern>
: ThumbI<ops, AddrModeNone, Size2Bytes, asm, "", pattern>;
class TI1<dag ops, string asm, list<dag> pattern>
: ThumbI<ops, AddrModeT1, Size2Bytes, asm, "", pattern>;
class TI2<dag ops, string asm, list<dag> pattern>
: ThumbI<ops, AddrModeT2, Size2Bytes, asm, "", pattern>;
class TI4<dag ops, string asm, list<dag> pattern>
: ThumbI<ops, AddrModeT4, Size2Bytes, asm, "", pattern>;
class TIs<dag ops, string asm, list<dag> pattern>
: ThumbI<ops, AddrModeTs, Size2Bytes, asm, "", pattern>;
// Two-address instructions
class TIt<dag ops, string asm, list<dag> pattern>
: ThumbI<ops, AddrModeNone, Size2Bytes, asm, "$lhs = $dst", pattern>;
// BL, BLX(1) are translated by the assembler into two instructions
class TIx2<dag ops, string asm, list<dag> pattern>
: ThumbI<ops, AddrModeNone, Size4Bytes, asm, "", pattern>;
def imm_neg_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(-(int)N->getValue(), MVT::i32);
}]>;
def imm_comp_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(~((uint32_t)N->getValue()), MVT::i32);
}]>;
/// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7].
def imm0_7 : PatLeaf<(i32 imm), [{
return (uint32_t)N->getValue() < 8;
}]>;
def imm0_7_neg : PatLeaf<(i32 imm), [{
return (uint32_t)-N->getValue() < 8;
}], imm_neg_XFORM>;
def imm0_255 : PatLeaf<(i32 imm), [{
return (uint32_t)N->getValue() < 256;
}]>;
def imm0_255_comp : PatLeaf<(i32 imm), [{
return ~((uint32_t)N->getValue()) < 256;
}]>;
def imm8_255 : PatLeaf<(i32 imm), [{
return (uint32_t)N->getValue() >= 8 && (uint32_t)N->getValue() < 256;
}]>;
def imm8_255_neg : PatLeaf<(i32 imm), [{
unsigned Val = -N->getValue();
return Val >= 8 && Val < 256;
}], imm_neg_XFORM>;
// Break imm's up into two pieces: an immediate + a left shift.
// This uses thumb_immshifted to match and thumb_immshifted_val and
// thumb_immshifted_shamt to get the val/shift pieces.
def thumb_immshifted : PatLeaf<(imm), [{
return ARM_AM::isThumbImmShiftedVal((unsigned)N->getValue());
}]>;
def thumb_immshifted_val : SDNodeXForm<imm, [{
unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getValue());
return CurDAG->getTargetConstant(V, MVT::i32);
}]>;
def thumb_immshifted_shamt : SDNodeXForm<imm, [{
unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getValue());
return CurDAG->getTargetConstant(V, MVT::i32);
}]>;
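// Editor's illustration: 0x2A00 is not a valid 8-bit Thumb immediate, but it
// is 0x2A (8 bits) shifted left by 8. thumb_immshifted_val yields 0x2A and
// thumb_immshifted_shamt yields 8, so the two-piece pattern at the end of
// this file materializes it as:
//   mov rD, #42      @ 0x2A
//   lsl rD, rD, #8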
// Define Thumb specific addressing modes.
// t_addrmode_rr := reg + reg
//
def t_addrmode_rr : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
let PrintMethod = "printThumbAddrModeRROperand";
let MIOperandInfo = (ops GPR:$base, GPR:$offsreg);
}
// t_addrmode_ri5_{1|2|4} := reg + imm5 * {1|2|4}
//
def t_addrmode_ri5_1 : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeRI5_1", []> {
let PrintMethod = "printThumbAddrModeRI5_1Operand";
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
def t_addrmode_ri5_2 : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeRI5_2", []> {
let PrintMethod = "printThumbAddrModeRI5_2Operand";
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
def t_addrmode_ri5_4 : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeRI5_4", []> {
let PrintMethod = "printThumbAddrModeRI5_4Operand";
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
// t_addrmode_sp := sp + imm8 * 4
//
def t_addrmode_sp : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
let PrintMethod = "printThumbAddrModeSPOperand";
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
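// Editor's note: in assembly the modes above correspond to
//   t_addrmode_rr       ldr  r0, [r1, r2]
//   t_addrmode_ri5_4    ldr  r0, [r1, #imm5 * 4]
//   t_addrmode_ri5_2    ldrh r0, [r1, #imm5 * 2]
//   t_addrmode_ri5_1    ldrb r0, [r1, #imm5]
//   t_addrmode_sp       ldr  r0, [sp, #imm8 * 4]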
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//
def tPICADD : TIt<(ops GPR:$dst, GPR:$lhs, pclabel:$cp),
"\n$cp:\n\tadd $dst, pc",
[(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>;
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
let isReturn = 1, isTerminator = 1 in
def tBX_RET : TI<(ops), "bx lr", [(ARMretflag)]>;
// FIXME: remove when we have a way of marking an MI with these properties.
let isLoad = 1, isReturn = 1, isTerminator = 1 in
def tPOP_RET : TI<(ops reglist:$dst1, variable_ops),
"pop $dst1", []>;
let isCall = 1, noResults = 1,
Defs = [R0, R1, R2, R3, LR,
D0, D1, D2, D3, D4, D5, D6, D7] in {
def tBL : TIx2<(ops i32imm:$func, variable_ops),
"bl ${func:call}",
[(ARMtcall tglobaladdr:$func)]>;
// ARMv5T and above
def tBLXi : TIx2<(ops i32imm:$func, variable_ops),
"blx ${func:call}",
[(ARMcall tglobaladdr:$func)]>, Requires<[HasV5T]>;
def tBLXr : TI<(ops GPR:$dst, variable_ops),
"blx $dst",
[(ARMtcall GPR:$dst)]>, Requires<[HasV5T]>;
// ARMv4T
def tBX : TIx2<(ops GPR:$dst, variable_ops),
"cpy lr, pc\n\tbx $dst",
[(ARMcall_nolink GPR:$dst)]>;
}
let isBranch = 1, isTerminator = 1, isBarrier = 1 in
def tB : TI<(ops brtarget:$dst), "b $dst", [(br bb:$dst)]>;
let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1 in
def tBcc : TI<(ops brtarget:$dst, CCOp:$cc), "b$cc $dst",
[(ARMbrcond bb:$dst, imm:$cc)]>;
//===----------------------------------------------------------------------===//
// Load Store Instructions.
//
let isLoad = 1 in {
def tLDRri : TI4<(ops GPR:$dst, t_addrmode_ri5_4:$addr),
"ldr $dst, $addr",
[(set GPR:$dst, (load t_addrmode_ri5_4:$addr))]>;
def tLDRrr : TI<(ops GPR:$dst, t_addrmode_rr:$addr),
"ldr $dst, $addr",
[(set GPR:$dst, (load t_addrmode_rr:$addr))]>;
// def tLDRpci
def tLDRspi : TIs<(ops GPR:$dst, t_addrmode_sp:$addr),
"ldr $dst, $addr",
[(set GPR:$dst, (load t_addrmode_sp:$addr))]>;
def tLDRBri : TI1<(ops GPR:$dst, t_addrmode_ri5_1:$addr),
"ldrb $dst, $addr",
[(set GPR:$dst, (zextloadi8 t_addrmode_ri5_1:$addr))]>;
def tLDRBrr : TI1<(ops GPR:$dst, t_addrmode_rr:$addr),
"ldrb $dst, $addr",
[(set GPR:$dst, (zextloadi8 t_addrmode_rr:$addr))]>;
def tLDRHri : TI2<(ops GPR:$dst, t_addrmode_ri5_2:$addr),
"ldrh $dst, $addr",
[(set GPR:$dst, (zextloadi16 t_addrmode_ri5_2:$addr))]>;
def tLDRHrr : TI2<(ops GPR:$dst, t_addrmode_rr:$addr),
"ldrh $dst, $addr",
[(set GPR:$dst, (zextloadi16 t_addrmode_rr:$addr))]>;
def tLDRSBrr : TI1<(ops GPR:$dst, t_addrmode_rr:$addr),
"ldrsb $dst, $addr",
[(set GPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>;
def tLDRSHrr : TI2<(ops GPR:$dst, t_addrmode_rr:$addr),
"ldrsh $dst, $addr",
[(set GPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>;
} // isLoad
let isStore = 1 in {
def tSTRri : TI4<(ops GPR:$src, t_addrmode_ri5_4:$addr),
"str $src, $addr",
[(store GPR:$src, t_addrmode_ri5_4:$addr)]>;
def tSTRrr : TI<(ops GPR:$src, t_addrmode_rr:$addr),
"str $src, $addr",
[(store GPR:$src, t_addrmode_rr:$addr)]>;
def tSTRspi : TIs<(ops GPR:$src, t_addrmode_sp:$addr),
"str $src, $addr",
[(store GPR:$src, t_addrmode_sp:$addr)]>;
def tSTRBri : TI1<(ops GPR:$src, t_addrmode_ri5_1:$addr),
"strb $src, $addr",
[(truncstorei8 GPR:$src, t_addrmode_ri5_1:$addr)]>;
def tSTRBrr : TI1<(ops GPR:$src, t_addrmode_rr:$addr),
"strb $src, $addr",
[(truncstorei8 GPR:$src, t_addrmode_rr:$addr)]>;
def tSTRHri : TI2<(ops GPR:$src, t_addrmode_ri5_2:$addr),
"strh $src, $addr",
[(truncstorei16 GPR:$src, t_addrmode_ri5_2:$addr)]>;
def tSTRHrr : TI2<(ops GPR:$src, t_addrmode_rr:$addr),
"strh $src, $addr",
[(truncstorei16 GPR:$src, t_addrmode_rr:$addr)]>;
}
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
// TODO: A7-44: LDMIA - load multiple
let isLoad = 1 in
def tPOP : TI<(ops reglist:$dst1, variable_ops),
"pop $dst1", []>;
let isStore = 1 in
def tPUSH : TI<(ops reglist:$src1, variable_ops),
"push $src1", []>;
//===----------------------------------------------------------------------===//
// Arithmetic Instructions.
//
def tADDi3 : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
"add $dst, $lhs, $rhs",
[(set GPR:$dst, (add GPR:$lhs, imm0_7:$rhs))]>;
def tADDi8 : TIt<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
"add $dst, $rhs",
[(set GPR:$dst, (add GPR:$lhs, imm8_255:$rhs))]>;
def tADDrr : TI<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"add $dst, $lhs, $rhs",
[(set GPR:$dst, (add GPR:$lhs, GPR:$rhs))]>;
def tADDhirr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"add $dst, $rhs", []>;
def tADDrPCi : TI<(ops GPR:$dst, i32imm:$rhs),
"add $dst, pc, $rhs * 4", []>;
def tADDrSPi : TI<(ops GPR:$dst, GPR:$sp, i32imm:$rhs),
"add $dst, $sp, $rhs * 4", []>;
def tADDspi : TI<(ops GPR:$sp, i32imm:$rhs),
"add $sp, $rhs * 4", []>;
def tAND : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"and $dst, $rhs",
[(set GPR:$dst, (and GPR:$lhs, GPR:$rhs))]>;
def tASRri : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
"asr $dst, $lhs, $rhs",
[(set GPR:$dst, (sra GPR:$lhs, imm:$rhs))]>;
def tASRrr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"asr $dst, $rhs",
[(set GPR:$dst, (sra GPR:$lhs, GPR:$rhs))]>;
def tBIC : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"bic $dst, $rhs",
[(set GPR:$dst, (and GPR:$lhs, (not GPR:$rhs)))]>;
def tCMN : TI<(ops GPR:$lhs, GPR:$rhs),
"cmn $lhs, $rhs",
[(ARMcmp GPR:$lhs, (ineg GPR:$rhs))]>;
def tCMPi8 : TI<(ops GPR:$lhs, i32imm:$rhs),
"cmp $lhs, $rhs",
[(ARMcmp GPR:$lhs, imm0_255:$rhs)]>;
def tCMPr : TI<(ops GPR:$lhs, GPR:$rhs),
"cmp $lhs, $rhs",
[(ARMcmp GPR:$lhs, GPR:$rhs)]>;
// TODO: A7-37: CMP(3) - cmp hi regs
def tEOR : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"eor $dst, $rhs",
[(set GPR:$dst, (xor GPR:$lhs, GPR:$rhs))]>;
def tLSLri : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
"lsl $dst, $lhs, $rhs",
[(set GPR:$dst, (shl GPR:$lhs, imm:$rhs))]>;
def tLSLrr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"lsl $dst, $rhs",
[(set GPR:$dst, (shl GPR:$lhs, GPR:$rhs))]>;
def tLSRri : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
"lsr $dst, $lhs, $rhs",
[(set GPR:$dst, (srl GPR:$lhs, imm:$rhs))]>;
def tLSRrr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"lsr $dst, $rhs",
[(set GPR:$dst, (srl GPR:$lhs, GPR:$rhs))]>;
def tMOVri8 : TI<(ops GPR:$dst, i32imm:$src),
"mov $dst, $src",
[(set GPR:$dst, imm0_255:$src)]>;
// TODO: A7-73: MOV(2) - mov setting flag.
// Note: MOV(2) of two low regs updates the flags, so we emit this as 'cpy',
// which is MOV(3). This also supports high registers.
def tMOVrr : TI<(ops GPR:$dst, GPR:$src),
"cpy $dst, $src", []>;
def tMUL : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"mul $dst, $rhs",
[(set GPR:$dst, (mul GPR:$lhs, GPR:$rhs))]>;
def tMVN : TI<(ops GPR:$dst, GPR:$src),
"mvn $dst, $src",
[(set GPR:$dst, (not GPR:$src))]>;
def tNEG : TI<(ops GPR:$dst, GPR:$src),
"neg $dst, $src",
[(set GPR:$dst, (ineg GPR:$src))]>;
def tORR : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"orr $dst, $rhs",
[(set GPR:$dst, (or GPR:$lhs, GPR:$rhs))]>;
def tREV : TI<(ops GPR:$dst, GPR:$src),
"rev $dst, $src",
[(set GPR:$dst, (bswap GPR:$src))]>,
Requires<[IsThumb, HasV6]>;
def tREV16 : TI<(ops GPR:$dst, GPR:$src),
"rev16 $dst, $src",
[(set GPR:$dst,
(or (and (srl GPR:$src, 8), 0xFF),
(or (and (shl GPR:$src, 8), 0xFF00),
(or (and (srl GPR:$src, 8), 0xFF0000),
(and (shl GPR:$src, 8), 0xFF000000)))))]>,
Requires<[IsThumb, HasV6]>;
def tREVSH : TI<(ops GPR:$dst, GPR:$src),
"revsh $dst, $src",
[(set GPR:$dst,
(sext_inreg
(or (srl (and GPR:$src, 0xFFFF), 8),
(shl GPR:$src, 8)), i16))]>,
Requires<[IsThumb, HasV6]>;
def tROR : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"ror $dst, $rhs",
[(set GPR:$dst, (rotr GPR:$lhs, GPR:$rhs))]>;
def tSBC : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"sbc $dst, $rhs",
[(set GPR:$dst, (sube GPR:$lhs, GPR:$rhs))]>;
// TODO: A7-96: STMIA - store multiple.
def tSUBi3 : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
"sub $dst, $lhs, $rhs",
[(set GPR:$dst, (add GPR:$lhs, imm0_7_neg:$rhs))]>;
def tSUBi8 : TIt<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
"sub $dst, $rhs",
[(set GPR:$dst, (add GPR:$lhs, imm8_255_neg:$rhs))]>;
def tSUBrr : TI<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
"sub $dst, $lhs, $rhs",
[(set GPR:$dst, (sub GPR:$lhs, GPR:$rhs))]>;
def tSUBspi : TI<(ops GPR:$sp, i32imm:$rhs),
"sub $sp, $rhs * 4", []>;
def tSXTB : TI<(ops GPR:$dst, GPR:$src),
"sxtb $dst, $src",
[(set GPR:$dst, (sext_inreg GPR:$src, i8))]>,
Requires<[IsThumb, HasV6]>;
def tSXTH : TI<(ops GPR:$dst, GPR:$src),
"sxth $dst, $src",
[(set GPR:$dst, (sext_inreg GPR:$src, i16))]>,
Requires<[IsThumb, HasV6]>;
// TODO: A7-122: TST - test.
def tUXTB : TI<(ops GPR:$dst, GPR:$src),
"uxtb $dst, $src",
[(set GPR:$dst, (and GPR:$src, 0xFF))]>,
Requires<[IsThumb, HasV6]>;
def tUXTH : TI<(ops GPR:$dst, GPR:$src),
"uxth $dst, $src",
[(set GPR:$dst, (and GPR:$src, 0xFFFF))]>,
Requires<[IsThumb, HasV6]>;
// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC DAG operation.
// Expanded by the scheduler into a branch sequence.
let usesCustomDAGSchedInserter = 1 in // Expanded by the scheduler.
def tMOVCCr :
PseudoInst<(ops GPR:$dst, GPR:$false, GPR:$true, CCOp:$cc),
"@ tMOVCCr $cc",
[(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc))]>;
// tLEApcrel - Load a pc-relative address into a register without offending the
// assembler.
def tLEApcrel : TI<(ops GPR:$dst, i32imm:$label),
!strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
"${:private}PCRELL${:uid}+4))\n"),
!strconcat("${:private}PCRELL${:uid}:\n\t",
"add $dst, pc, #PCRELV${:uid}")),
[]>;
def tLEApcrelCall : TI<(ops GPR:$dst, i32imm:$label),
!strconcat(!strconcat(".set PCRELV${:uid}, (${label:call}-(",
"${:private}PCRELL${:uid}+4))\n"),
!strconcat("${:private}PCRELL${:uid}:\n\t",
"add $dst, pc, #PCRELV${:uid}")),
[]>;
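// Editor's illustration: for a label L and uid 0, tLEApcrel expands to
//   .set PCRELV0, (L-(PCRELL0+4))
//   PCRELL0:
//       add rD, pc, #PCRELV0
// Reading pc at PCRELL0 yields PCRELL0+4, so rD ends up holding the address
// of L without an explicit pc-relative relocation.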
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//
// ConstantPool, GlobalAddress
def : ThumbPat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>;
def : ThumbPat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
def : ThumbPat<(ARMWrapperCall tglobaladdr :$dst),
(tLEApcrelCall tglobaladdr :$dst)>;
def : ThumbPat<(ARMWrapperCall texternalsym:$dst),
(tLEApcrelCall texternalsym:$dst)>;
// Direct calls
def : ThumbPat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>;
def : ThumbV5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>;
// Indirect calls to ARM routines
def : ThumbV5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>;
// zextload i1 -> zextload i8
def : ThumbPat<(zextloadi1 t_addrmode_ri5_1:$addr),
(tLDRBri t_addrmode_ri5_1:$addr)>;
def : ThumbPat<(zextloadi1 t_addrmode_rr:$addr),
(tLDRBrr t_addrmode_rr:$addr)>;
// truncstore i1 -> truncstore i8
def : ThumbPat<(truncstorei1 GPR:$src, t_addrmode_ri5_1:$dst),
(tSTRBri GPR:$src, t_addrmode_ri5_1:$dst)>;
def : ThumbPat<(truncstorei1 GPR:$src, t_addrmode_rr:$dst),
(tSTRBrr GPR:$src, t_addrmode_rr:$dst)>;
// Large immediate handling.
// Two piece imms.
def : ThumbPat<(i32 thumb_immshifted:$src),
(tLSLri (tMOVri8 (thumb_immshifted_val imm:$src)),
(thumb_immshifted_shamt imm:$src))>;
def : ThumbPat<(i32 imm0_255_comp:$src),
(tMVN (tMOVri8 (imm_comp_XFORM imm:$src)))>;
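// Editor's illustration: 0xFFFFFF00 is the bitwise complement of 0xFF, so the
// imm0_255_comp pattern above emits
//   mov rT, #0xFF
//   mvn rD, rT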


@ -0,0 +1,359 @@
//===- ARMInstrVFP.td - VFP support for ARM -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Chris Lattner and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the ARM VFP instruction set.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// ARM VFP Instruction templates.
//
// ARM Float Instruction
class ASI<dag ops, string asm, list<dag> pattern> : AI<ops, asm, pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
}
class ASI5<dag ops, string asm, list<dag> pattern>
: I<ops, AddrMode5, Size4Bytes, IndexModeNone, asm, "", pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
}
// ARM Double Instruction
class ADI<dag ops, string asm, list<dag> pattern> : AI<ops, asm, pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
}
class ADI5<dag ops, string asm, list<dag> pattern>
: I<ops, AddrMode5, Size4Bytes, IndexModeNone, asm, "", pattern> {
// TODO: Mark the instructions with the appropriate subtarget info.
}
def SDT_FTOI :
SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
def SDT_ITOF :
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
def SDT_CMPFP0 :
SDTypeProfile<0, 1, [SDTCisFP<0>]>;
def SDT_FMDRR :
SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
SDTCisSameAs<1, 2>]>;
def arm_ftoui : SDNode<"ARMISD::FTOUI", SDT_FTOI>;
def arm_ftosi : SDNode<"ARMISD::FTOSI", SDT_FTOI>;
def arm_sitof : SDNode<"ARMISD::SITOF", SDT_ITOF>;
def arm_uitof : SDNode<"ARMISD::UITOF", SDT_ITOF>;
def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTRet, [SDNPInFlag,SDNPOutFlag]>;
def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutFlag]>;
def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutFlag]>;
def arm_fmdrr : SDNode<"ARMISD::FMDRR", SDT_FMDRR>;
//===----------------------------------------------------------------------===//
// Load / store Instructions.
//
let isLoad = 1 in {
def FLDD : ADI5<(ops DPR:$dst, addrmode5:$addr),
"fldd $dst, $addr",
[(set DPR:$dst, (load addrmode5:$addr))]>;
def FLDS : ASI5<(ops SPR:$dst, addrmode5:$addr),
"flds $dst, $addr",
[(set SPR:$dst, (load addrmode5:$addr))]>;
} // isLoad
let isStore = 1 in {
def FSTD : ADI5<(ops DPR:$src, addrmode5:$addr),
"fstd $src, $addr",
[(store DPR:$src, addrmode5:$addr)]>;
def FSTS : ASI5<(ops SPR:$src, addrmode5:$addr),
"fsts $src, $addr",
[(store SPR:$src, addrmode5:$addr)]>;
} // isStore
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
let isLoad = 1 in {
def FLDMD : ADI5<(ops addrmode5:$addr, reglist:$dst1, variable_ops),
"fldm${addr:submode}d ${addr:base}, $dst1",
[]>;
def FLDMS : ASI5<(ops addrmode5:$addr, reglist:$dst1, variable_ops),
"fldm${addr:submode}s ${addr:base}, $dst1",
[]>;
} // isLoad
let isStore = 1 in {
def FSTMD : ADI5<(ops addrmode5:$addr, reglist:$src1, variable_ops),
"fstm${addr:submode}d ${addr:base}, $src1",
[]>;
def FSTMS : ASI5<(ops addrmode5:$addr, reglist:$src1, variable_ops),
"fstm${addr:submode}s ${addr:base}, $src1",
[]>;
} // isStore
// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores
//===----------------------------------------------------------------------===//
// FP Binary Operations.
//
def FADDD : ADI<(ops DPR:$dst, DPR:$a, DPR:$b),
"faddd $dst, $a, $b",
[(set DPR:$dst, (fadd DPR:$a, DPR:$b))]>;
def FADDS : ASI<(ops SPR:$dst, SPR:$a, SPR:$b),
"fadds $dst, $a, $b",
[(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>;
def FCMPED : ADI<(ops DPR:$a, DPR:$b),
"fcmped $a, $b",
[(arm_cmpfp DPR:$a, DPR:$b)]>;
def FCMPES : ASI<(ops SPR:$a, SPR:$b),
"fcmpes $a, $b",
[(arm_cmpfp SPR:$a, SPR:$b)]>;
def FDIVD : ADI<(ops DPR:$dst, DPR:$a, DPR:$b),
"fdivd $dst, $a, $b",
[(set DPR:$dst, (fdiv DPR:$a, DPR:$b))]>;
def FDIVS : ASI<(ops SPR:$dst, SPR:$a, SPR:$b),
"fdivs $dst, $a, $b",
[(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>;
def FMULD : ADI<(ops DPR:$dst, DPR:$a, DPR:$b),
"fmuld $dst, $a, $b",
[(set DPR:$dst, (fmul DPR:$a, DPR:$b))]>;
def FMULS : ASI<(ops SPR:$dst, SPR:$a, SPR:$b),
"fmuls $dst, $a, $b",
[(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>;
def FNMULD : ADI<(ops DPR:$dst, DPR:$a, DPR:$b),
"fnmuld $dst, $a, $b",
[(set DPR:$dst, (fneg (fmul DPR:$a, DPR:$b)))]>;
def FNMULS : ASI<(ops SPR:$dst, SPR:$a, SPR:$b),
"fnmuls $dst, $a, $b",
[(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]>;
def FSUBD : ADI<(ops DPR:$dst, DPR:$a, DPR:$b),
"fsubd $dst, $a, $b",
[(set DPR:$dst, (fsub DPR:$a, DPR:$b))]>;
def FSUBS : ASI<(ops SPR:$dst, SPR:$a, SPR:$b),
"fsubs $dst, $a, $b",
[(set SPR:$dst, (fsub SPR:$a, SPR:$b))]>;
//===----------------------------------------------------------------------===//
// FP Unary Operations.
//
def FABSD : ADI<(ops DPR:$dst, DPR:$a),
"fabsd $dst, $a",
[(set DPR:$dst, (fabs DPR:$a))]>;
def FABSS : ASI<(ops SPR:$dst, SPR:$a),
"fabss $dst, $a",
[(set SPR:$dst, (fabs SPR:$a))]>;
def FCMPEZD : ADI<(ops DPR:$a),
"fcmpezd $a",
[(arm_cmpfp0 DPR:$a)]>;
def FCMPEZS : ASI<(ops SPR:$a),
"fcmpezs $a",
[(arm_cmpfp0 SPR:$a)]>;
def FCVTDS : ADI<(ops DPR:$dst, SPR:$a),
"fcvtds $dst, $a",
[(set DPR:$dst, (fextend SPR:$a))]>;
def FCVTSD : ADI<(ops SPR:$dst, DPR:$a),
"fcvtsd $dst, $a",
[(set SPR:$dst, (fround DPR:$a))]>;
def FCPYD : ADI<(ops DPR:$dst, DPR:$a),
"fcpyd $dst, $a",
[/*(set DPR:$dst, DPR:$a)*/]>;
def FCPYS : ASI<(ops SPR:$dst, SPR:$a),
"fcpys $dst, $a",
[/*(set SPR:$dst, SPR:$a)*/]>;
def FNEGD : ADI<(ops DPR:$dst, DPR:$a),
"fnegd $dst, $a",
[(set DPR:$dst, (fneg DPR:$a))]>;
def FNEGS : ASI<(ops SPR:$dst, SPR:$a),
"fnegs $dst, $a",
[(set SPR:$dst, (fneg SPR:$a))]>;
def FSQRTD : ADI<(ops DPR:$dst, DPR:$a),
"fsqrtd $dst, $a",
[(set DPR:$dst, (fsqrt DPR:$a))]>;
def FSQRTS : ASI<(ops SPR:$dst, SPR:$a),
"fsqrts $dst, $a",
[(set SPR:$dst, (fsqrt SPR:$a))]>;
//===----------------------------------------------------------------------===//
// FP <-> GPR Copies. Int <-> FP Conversions.
//
def IMPLICIT_DEF_SPR : PseudoInst<(ops SPR:$rD),
"@ IMPLICIT_DEF_SPR $rD",
[(set SPR:$rD, (undef))]>;
def IMPLICIT_DEF_DPR : PseudoInst<(ops DPR:$rD),
"@ IMPLICIT_DEF_DPR $rD",
[(set DPR:$rD, (undef))]>;
def FMRS : ASI<(ops GPR:$dst, SPR:$src),
"fmrs $dst, $src",
[(set GPR:$dst, (bitconvert SPR:$src))]>;
def FMSR : ASI<(ops SPR:$dst, GPR:$src),
"fmsr $dst, $src",
[(set SPR:$dst, (bitconvert GPR:$src))]>;
def FMRRD : ADI<(ops GPR:$dst1, GPR:$dst2, DPR:$src),
"fmrrd $dst1, $dst2, $src",
[/* FIXME: Can't write pattern for multiple result instr*/]>;
// FMDHR: GPR -> SPR
// FMDLR: GPR -> SPR
def FMDRR : ADI<(ops DPR:$dst, GPR:$src1, GPR:$src2),
"fmdrr $dst, $src1, $src2",
[(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]>;
// FMRDH: SPR -> GPR
// FMRDL: SPR -> GPR
// FMRRS: SPR -> GPR
// FMRX : SPR system reg -> GPR
// FMSRR: GPR -> SPR
def FMSTAT : ASI<(ops), "fmstat", [(arm_fmstat)]>;
// FMXR: GPR -> VFP system reg
// Int to FP:
def FSITOD : ADI<(ops DPR:$dst, SPR:$a),
"fsitod $dst, $a",
[(set DPR:$dst, (arm_sitof SPR:$a))]>;
def FSITOS : ASI<(ops SPR:$dst, SPR:$a),
"fsitos $dst, $a",
[(set SPR:$dst, (arm_sitof SPR:$a))]>;
def FUITOD : ADI<(ops DPR:$dst, SPR:$a),
"fuitod $dst, $a",
[(set DPR:$dst, (arm_uitof SPR:$a))]>;
def FUITOS : ASI<(ops SPR:$dst, SPR:$a),
"fuitos $dst, $a",
[(set SPR:$dst, (arm_uitof SPR:$a))]>;
// FP to Int:
// Always set Z bit in the instruction, i.e. "round towards zero" variants.
def FTOSIZD : ADI<(ops SPR:$dst, DPR:$a),
"ftosizd $dst, $a",
[(set SPR:$dst, (arm_ftosi DPR:$a))]>;
def FTOSIZS : ASI<(ops SPR:$dst, SPR:$a),
"ftosizs $dst, $a",
[(set SPR:$dst, (arm_ftosi SPR:$a))]>;
def FTOUIZD : ADI<(ops SPR:$dst, DPR:$a),
"ftouizd $dst, $a",
[(set SPR:$dst, (arm_ftoui DPR:$a))]>;
def FTOUIZS : ASI<(ops SPR:$dst, SPR:$a),
"ftouizs $dst, $a",
[(set SPR:$dst, (arm_ftoui SPR:$a))]>;
//===----------------------------------------------------------------------===//
// FP FMA Operations.
//
def FMACD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b),
"fmacd $dst, $a, $b",
[(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FMACS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b),
"fmacs $dst, $a, $b",
[(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FMSCD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b),
"fmscd $dst, $a, $b",
[(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FMSCS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b),
"fmscs $dst, $a, $b",
[(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FNMACD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b),
"fnmacd $dst, $a, $b",
[(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FNMACS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b),
"fnmacs $dst, $a, $b",
[(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FNMSCD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b),
"fnmscd $dst, $a, $b",
[(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
def FNMSCS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b),
"fnmscs $dst, $a, $b",
[(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
RegConstraint<"$dstin = $dst">;
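// Editor's note: the "$dstin = $dst" constraint ties the accumulator input to
// the result register, so after register allocation
//   fmacd d0, d1, d2
// computes d0 = d0 + d1 * d2, with d0 serving as both $dstin and $dst.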
//===----------------------------------------------------------------------===//
// FP Conditional moves.
//
def FCPYDcc : ADI<(ops DPR:$dst, DPR:$false, DPR:$true, CCOp:$cc),
"fcpyd$cc $dst, $true",
[(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))]>,
RegConstraint<"$false = $dst">;
def FCPYScc : ASI<(ops SPR:$dst, SPR:$false, SPR:$true, CCOp:$cc),
"fcpys$cc $dst, $true",
[(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))]>,
RegConstraint<"$false = $dst">;
def FNEGDcc : ADI<(ops DPR:$dst, DPR:$false, DPR:$true, CCOp:$cc),
"fnegd$cc $dst, $true",
[(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))]>,
RegConstraint<"$false = $dst">;
def FNEGScc : ASI<(ops SPR:$dst, SPR:$false, SPR:$true, CCOp:$cc),
"fnegs$cc $dst, $true",
[(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))]>,
RegConstraint<"$false = $dst">;


@ -0,0 +1,628 @@
//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Evan Cheng and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
#include "ARMAddressingModes.h"
#include "ARMRegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumFLDMGened, "Number of fldm instructions generated");
STATISTIC(NumFSTMGened, "Number of fstm instructions generated");
namespace {
struct VISIBILITY_HIDDEN ARMLoadStoreOpt : public MachineFunctionPass {
const TargetInstrInfo *TII;
virtual bool runOnMachineFunction(MachineFunction &Fn);
virtual const char *getPassName() const {
return "ARM load / store optimization pass";
}
private:
struct MemOpQueueEntry {
int Offset;
unsigned Position;
MachineBasicBlock::iterator MBBI;
bool Merged;
MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
: Offset(o), Position(p), MBBI(i), Merged(false) {}
};
typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
typedef MemOpQueue::iterator MemOpQueueIter;
SmallVector<MachineBasicBlock::iterator, 4>
MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
int Opcode, unsigned Size, MemOpQueue &MemOps);
bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
};
}
/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
/// optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass() {
return new ARMLoadStoreOpt();
}
static int getLoadStoreMultipleOpcode(int Opcode) {
switch (Opcode) {
case ARM::LDR:
NumLDMGened++;
return ARM::LDM;
case ARM::STR:
NumSTMGened++;
return ARM::STM;
case ARM::FLDS:
NumFLDMGened++;
return ARM::FLDMS;
case ARM::FSTS:
NumFSTMGened++;
return ARM::FSTMS;
case ARM::FLDD:
NumFLDMGened++;
return ARM::FLDMD;
case ARM::FSTD:
NumFSTMGened++;
return ARM::FSTMD;
default: abort();
}
return 0;
}
/// mergeOps - Create and insert an LDM or STM with Base as the base register
/// and the registers in Regs as the register operands to be loaded / stored.
/// It returns true if the transformation is done.
static bool mergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
int Offset, unsigned Base, int Opcode,
SmallVector<unsigned, 8> &Regs,
const TargetInstrInfo *TII) {
// Only a single register to load / store. Don't bother.
unsigned NumRegs = Regs.size();
if (NumRegs <= 1)
return false;
ARM_AM::AMSubMode Mode = ARM_AM::ia;
bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
if (isAM4 && Offset == 4)
Mode = ARM_AM::ib;
else if (isAM4 && Offset == -4 * (int)NumRegs + 4)
Mode = ARM_AM::da;
else if (isAM4 && Offset == -4 * (int)NumRegs)
Mode = ARM_AM::db;
else if (Offset != 0) {
// If starting offset isn't zero, insert a MI to materialize a new base.
// But only do so if it is cost effective, i.e. merging more than two
// loads / stores.
if (NumRegs <= 2)
return false;
unsigned NewBase;
if (Opcode == ARM::LDR)
// If it is a load, then just use one of the destination registers
// as the new base.
NewBase = Regs[NumRegs-1];
else {
// FIXME: Try scavenging a register to use as a new base.
NewBase = ARM::R12;
}
int BaseOpc = ARM::ADDri;
if (Offset < 0) {
BaseOpc = ARM::SUBri;
Offset = - Offset;
}
int ImmedOffset = ARM_AM::getSOImmVal(Offset);
if (ImmedOffset == -1)
return false; // Probably not worth it then.
BuildMI(MBB, MBBI, TII->get(BaseOpc), NewBase).addReg(Base).addImm(ImmedOffset);
Base = NewBase;
}
bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD;
bool isDef = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
Opcode = getLoadStoreMultipleOpcode(Opcode);
MachineInstrBuilder MIB = (isAM4)
? BuildMI(MBB, MBBI, TII->get(Opcode)).addReg(Base)
.addImm(ARM_AM::getAM4ModeImm(Mode))
: BuildMI(MBB, MBBI, TII->get(Opcode)).addReg(Base)
.addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs));
for (unsigned i = 0; i != NumRegs; ++i)
MIB = MIB.addReg(Regs[i], isDef);
return true;
}
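// Editor's illustration (not part of this commit): given the queued ops
//   ldr r4, [r0]
//   ldr r5, [r0, #4]
//   ldr r6, [r0, #8]
// mergeOps is called with Offset == 0, picks the ia sub-mode, and emits
//   ldmia r0, {r4, r5, r6}
// For a non-zero starting offset it first materializes a new base (reusing
// the last destination register for loads, or r12 otherwise).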
SmallVector<MachineBasicBlock::iterator, 4>
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB,
unsigned SIndex, unsigned Base, int Opcode,
unsigned Size, MemOpQueue &MemOps) {
bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
SmallVector<MachineBasicBlock::iterator, 4> Merges;
int Offset = MemOps[SIndex].Offset;
int SOffset = Offset;
unsigned Pos = MemOps[SIndex].Position;
MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
SmallVector<unsigned, 8> Regs;
unsigned PReg = MemOps[SIndex].MBBI->getOperand(0).getReg();
unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg);
Regs.push_back(PReg);
for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
int NewOffset = MemOps[i].Offset;
unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg();
unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
// AM4 - register numbers in ascending order.
// AM5 - consecutive register numbers in ascending order.
if (NewOffset == Offset + (int)Size &&
((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
Offset += Size;
Regs.push_back(Reg);
PRegNum = RegNum;
} else {
// Can't merge this in. Try merging the earlier ones first.
if (mergeOps(MBB, ++Loc, SOffset, Base, Opcode, Regs, TII)) {
Merges.push_back(prior(Loc));
for (unsigned j = SIndex; j < i; ++j) {
MBB.erase(MemOps[j].MBBI);
MemOps[j].Merged = true;
}
}
SmallVector<MachineBasicBlock::iterator, 4> Merges2 =
MergeLDR_STR(MBB, i, Base, Opcode, Size, MemOps);
Merges.append(Merges2.begin(), Merges2.end());
return Merges;
}
if (MemOps[i].Position > Pos) {
Pos = MemOps[i].Position;
Loc = MemOps[i].MBBI;
}
}
if (mergeOps(MBB, ++Loc, SOffset, Base, Opcode, Regs, TII)) {
Merges.push_back(prior(Loc));
for (unsigned i = SIndex, e = MemOps.size(); i != e; ++i) {
MBB.erase(MemOps[i].MBBI);
MemOps[i].Merged = true;
}
}
return Merges;
}
static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
unsigned Bytes) {
return (MI && MI->getOpcode() == ARM::SUBri &&
MI->getOperand(0).getReg() == Base &&
MI->getOperand(1).getReg() == Base &&
ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes);
}
static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
unsigned Bytes) {
return (MI && MI->getOpcode() == ARM::ADDri &&
MI->getOperand(0).getReg() == Base &&
MI->getOperand(1).getReg() == Base &&
ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes);
}
static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
switch (MI->getOpcode()) {
default: return 0;
case ARM::LDR:
case ARM::STR:
case ARM::FLDS:
case ARM::FSTS:
return 4;
case ARM::FLDD:
case ARM::FSTD:
return 8;
case ARM::LDM:
case ARM::STM:
return (MI->getNumOperands() - 2) * 4;
case ARM::FLDMS:
case ARM::FSTMS:
case ARM::FLDMD:
case ARM::FSTMD:
return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
}
}
/// mergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
/// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) {
MachineInstr *MI = MBBI;
unsigned Base = MI->getOperand(0).getReg();
unsigned Bytes = getLSMultipleTransferSize(MI);
int Opcode = MI->getOpcode();
bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::STM;
if (isAM4) {
if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
return false;
// Can't use the updating AM4 sub-mode if the base register is also a dest
// register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i) {
if (MI->getOperand(i).getReg() == Base)
return false;
}
ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
if (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
if (Mode == ARM_AM::ia &&
isMatchingDecrement(PrevMBBI, Base, Bytes)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
MBB.erase(PrevMBBI);
return true;
} else if (Mode == ARM_AM::ib &&
isMatchingDecrement(PrevMBBI, Base, Bytes)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
MBB.erase(PrevMBBI);
return true;
}
}
if (MBBI != MBB.end()) {
MachineBasicBlock::iterator NextMBBI = next(MBBI);
if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
isMatchingIncrement(NextMBBI, Base, Bytes)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
MBB.erase(NextMBBI);
return true;
} else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
isMatchingDecrement(NextMBBI, Base, Bytes)) {
MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
MBB.erase(NextMBBI);
return true;
}
}
} else {
// FLDM{D|S}, FSTM{D|S} addressing mode 5 ops.
if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
return false;
ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
if (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
if (Mode == ARM_AM::ia &&
isMatchingDecrement(PrevMBBI, Base, Bytes)) {
MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
MBB.erase(PrevMBBI);
return true;
}
}
if (MBBI != MBB.end()) {
MachineBasicBlock::iterator NextMBBI = next(MBBI);
if (Mode == ARM_AM::ia &&
isMatchingIncrement(NextMBBI, Base, Bytes)) {
MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
MBB.erase(NextMBBI);
return true;
}
}
}
return false;
}
static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
switch (Opc) {
case ARM::LDR: return ARM::LDR_PRE;
case ARM::STR: return ARM::STR_PRE;
case ARM::FLDS: return ARM::FLDMS;
case ARM::FLDD: return ARM::FLDMD;
case ARM::FSTS: return ARM::FSTMS;
case ARM::FSTD: return ARM::FSTMD;
default: abort();
}
return 0;
}
static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
switch (Opc) {
case ARM::LDR: return ARM::LDR_POST;
case ARM::STR: return ARM::STR_POST;
case ARM::FLDS: return ARM::FLDMS;
case ARM::FLDD: return ARM::FLDMD;
case ARM::FSTS: return ARM::FSTMS;
case ARM::FSTD: return ARM::FSTMD;
default: abort();
}
return 0;
}
/// mergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const TargetInstrInfo *TII) {
MachineInstr *MI = MBBI;
unsigned Base = MI->getOperand(1).getReg();
unsigned Bytes = getLSMultipleTransferSize(MI);
int Opcode = MI->getOpcode();
bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
if ((isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0) ||
(!isAM2 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0))
return false;
bool isLd = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
// Can't do the merge if the destination register is the same as the would-be
// writeback register.
if (isLd && MI->getOperand(0).getReg() == Base)
return false;
bool DoMerge = false;
ARM_AM::AddrOpc AddSub = ARM_AM::add;
unsigned NewOpc = 0;
if (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
if (isMatchingDecrement(PrevMBBI, Base, Bytes)) {
DoMerge = true;
AddSub = ARM_AM::sub;
NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
} else if (isAM2 && isMatchingIncrement(PrevMBBI, Base, Bytes)) {
DoMerge = true;
NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
}
if (DoMerge)
MBB.erase(PrevMBBI);
}
if (!DoMerge && MBBI != MBB.end()) {
MachineBasicBlock::iterator NextMBBI = next(MBBI);
if (isAM2 && isMatchingDecrement(NextMBBI, Base, Bytes)) {
DoMerge = true;
AddSub = ARM_AM::sub;
NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
} else if (isMatchingIncrement(NextMBBI, Base, Bytes)) {
DoMerge = true;
NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
}
if (DoMerge)
MBB.erase(NextMBBI);
}
if (!DoMerge)
return false;
bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD;
unsigned Offset = isAM2 ? ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift)
: ARM_AM::getAM5Opc((AddSub == ARM_AM::sub) ? ARM_AM::db : ARM_AM::ia,
true, isDPR ? 2 : 1);
if (isLd) {
if (isAM2)
BuildMI(MBB, MBBI, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, true).addReg(Base).addReg(0).addImm(Offset);
else
BuildMI(MBB, MBBI, TII->get(NewOpc)).addReg(Base)
.addImm(Offset).addReg(MI->getOperand(0).getReg(), true);
} else {
if (isAM2)
BuildMI(MBB, MBBI, TII->get(NewOpc), Base).addReg(MI->getOperand(0).getReg())
.addReg(Base).addReg(0).addImm(Offset);
else
BuildMI(MBB, MBBI, TII->get(NewOpc)).addReg(Base)
.addImm(Offset).addReg(MI->getOperand(0).getReg(), false);
}
MBB.erase(MBBI);
return true;
}
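// Editor's illustration of the folds performed above:
//   add r0, r0, #4 ; ldr r1, [r0]   =>   ldr r1, [r0, #4]!   (pre-indexed)
//   ldr r1, [r0] ; add r0, r0, #4   =>   ldr r1, [r0], #4    (post-indexed)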
/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
unsigned NumMerges = 0;
unsigned NumMemOps = 0;
MemOpQueue MemOps;
unsigned CurrBase = 0;
int CurrOpc = -1;
unsigned CurrSize = 0;
unsigned Position = 0;
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
bool Advance = false;
bool TryMerge = false;
bool Clobber = false;
int Opcode = MBBI->getOpcode();
bool isMemOp = false;
bool isAM2 = false;
unsigned Size = 4;
switch (Opcode) {
case ARM::LDR:
case ARM::STR:
isMemOp =
(MBBI->getOperand(1).isRegister() && MBBI->getOperand(2).getReg() == 0);
isAM2 = true;
break;
case ARM::FLDS:
case ARM::FSTS:
isMemOp = MBBI->getOperand(1).isRegister();
break;
case ARM::FLDD:
case ARM::FSTD:
isMemOp = MBBI->getOperand(1).isRegister();
Size = 8;
break;
}
if (isMemOp) {
unsigned Base = MBBI->getOperand(1).getReg();
unsigned OffIdx = MBBI->getNumOperands()-1;
unsigned OffField = MBBI->getOperand(OffIdx).getImm();
int Offset = isAM2
? ARM_AM::getAM2Offset(OffField) : ARM_AM::getAM5Offset(OffField) * 4;
if (isAM2) {
if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
Offset = -Offset;
} else {
if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
Offset = -Offset;
}
// Watch out for:
// r4 := ldr [r5]
// r5 := ldr [r5, #4]
// r6 := ldr [r5, #8]
//
// The second ldr has effectively broken the chain even though it
// looks like the later ldr(s) use the same base register. Try to
// merge the ldr's so far, including this one. But don't try to
// combine the following ldr(s).
Clobber = (Opcode == ARM::LDR && Base == MBBI->getOperand(0).getReg());
if (CurrBase == 0 && !Clobber) {
// Start of a new chain.
CurrBase = Base;
CurrOpc = Opcode;
CurrSize = Size;
MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
NumMemOps++;
Advance = true;
} else {
if (Clobber) {
TryMerge = true;
Advance = true;
}
if (CurrOpc == Opcode && CurrBase == Base) {
// Continue adding to the queue.
if (Offset > MemOps.back().Offset) {
MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
NumMemOps++;
Advance = true;
} else {
for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
I != E; ++I) {
if (Offset < I->Offset) {
MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
NumMemOps++;
Advance = true;
break;
} else if (Offset == I->Offset) {
// Collision! This can't be merged!
break;
}
}
}
}
}
}
if (Advance) {
++Position;
++MBBI;
} else
TryMerge = true;
if (TryMerge) {
if (NumMemOps > 1) {
SmallVector<MachineBasicBlock::iterator,4> MBBII =
MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,MemOps);
// Try folding preceding/trailing base inc/dec into the generated
// LDM/STM ops.
for (unsigned i = 0, e = MBBII.size(); i < e; ++i)
if (mergeBaseUpdateLSMultiple(MBB, MBBII[i]))
NumMerges++;
NumMerges += MBBII.size();
}
// Try folding preceding/trailing base inc/dec into those load / store
// ops that were not merged to form LDM/STM ops.
for (unsigned i = 0; i != NumMemOps; ++i)
if (!MemOps[i].Merged)
if (mergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII))
NumMerges++;
CurrBase = 0;
CurrOpc = -1;
if (NumMemOps) {
MemOps.clear();
NumMemOps = 0;
}
// If iterator hasn't been advanced and this is not a memory op, skip it.
// It can't start a new chain anyway.
if (!Advance && !isMemOp && MBBI != E) {
++Position;
++MBBI;
}
}
}
return NumMerges > 0;
}
/// MergeReturnIntoLDM - If this is an exit BB, try merging the return op
/// (bx lr) into the preceding stack restore so it directly restores the
/// value of LR into pc.
/// ldmfd sp!, {r7, lr}
/// bx lr
/// =>
/// ldmfd sp!, {r7, pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
if (MBB.empty()) return false;
MachineBasicBlock::iterator MBBI = prior(MBB.end());
if (MBBI->getOpcode() == ARM::BX_RET && MBBI != MBB.begin()) {
MachineInstr *PrevMI = prior(MBBI);
if (PrevMI->getOpcode() == ARM::LDM) {
MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
if (MO.getReg() == ARM::LR) {
PrevMI->setInstrDescriptor(TII->get(ARM::LDM_RET));
MO.setReg(ARM::PC);
MBB.erase(MBBI);
return true;
}
}
}
return false;
}
bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = Fn.getTarget().getInstrInfo();
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
++MFI) {
MachineBasicBlock &MBB = *MFI;
Modified |= LoadStoreMultipleOpti(MBB);
Modified |= MergeReturnIntoLDM(MBB);
}
return Modified;
}


@ -0,0 +1,136 @@
//===- ARMMachineFunctionInfo.h - ARM machine function info -----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Evan Cheng and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares ARM-specific per-machine-function information.
//
//===----------------------------------------------------------------------===//
#ifndef ARMMACHINEFUNCTIONINFO_H
#define ARMMACHINEFUNCTIONINFO_H
#include "ARMSubtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
/// contains private ARM target-specific information for each MachineFunction.
class ARMFunctionInfo : public MachineFunctionInfo {
/// isThumb - True if this function is compiled under Thumb mode.
///
bool isThumb;
/// VarArgsRegSaveSize - Size of the register save area for vararg functions.
///
unsigned VarArgsRegSaveSize;
/// FramePtrSpilled - True if FP register is spilled. Set by
/// processFunctionBeforeCalleeSavedScan().
bool FramePtrSpilled;
/// FramePtrSpillOffset - If FramePtrSpilled, this records the frame pointer
/// spill stack offset.
unsigned FramePtrSpillOffset;
/// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
/// register spill areas. For Mac OS X:
///
/// GPR callee-saved (1) : r4, r5, r6, r7, lr
/// --------------------------------------------
/// GPR callee-saved (2) : r8, r10, r11
/// --------------------------------------------
/// DPR callee-saved : d8 - d15
unsigned GPRCS1Offset;
unsigned GPRCS2Offset;
unsigned DPRCSOffset;
/// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of the callee saved register
/// spill areas.
unsigned GPRCS1Size;
unsigned GPRCS2Size;
unsigned DPRCSSize;
/// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices
/// which belong to these spill areas.
std::set<int> GPRCS1Frames;
std::set<int> GPRCS2Frames;
std::set<int> DPRCSFrames;
/// JumpTableUId - Unique id for jumptables.
///
unsigned JumpTableUId;
public:
ARMFunctionInfo() :
isThumb(false),
VarArgsRegSaveSize(0), FramePtrSpilled(false), FramePtrSpillOffset(0),
GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), JumpTableUId(0) {}
ARMFunctionInfo(MachineFunction &MF) :
isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
VarArgsRegSaveSize(0), FramePtrSpilled(false), FramePtrSpillOffset(0),
GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), JumpTableUId(0) {}
bool isThumbFunction() const { return isThumb; }
unsigned getVarArgsRegSaveSize() const { return VarArgsRegSaveSize; }
void setVarArgsRegSaveSize(unsigned s) { VarArgsRegSaveSize = s; }
bool isFramePtrSpilled() const { return FramePtrSpilled; }
void setFramePtrSpilled(bool s) { FramePtrSpilled = s; }
unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
unsigned getDPRCalleeSavedAreaOffset() const { return DPRCSOffset; }
void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; }
unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; }
void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
void setDPRCalleeSavedAreaSize(unsigned s) { DPRCSSize = s; }
bool isGPRCalleeSavedArea1Frame(unsigned fi) const {
return GPRCS1Frames.count(fi);
}
bool isGPRCalleeSavedArea2Frame(unsigned fi) const {
return GPRCS2Frames.count(fi);
}
bool isDPRCalleeSavedAreaFrame(unsigned fi) const {
return DPRCSFrames.count(fi);
}
void addGPRCalleeSavedArea1Frame(unsigned fi) {
GPRCS1Frames.insert(fi);
}
void addGPRCalleeSavedArea2Frame(unsigned fi) {
GPRCS2Frames.insert(fi);
}
void addDPRCalleeSavedAreaFrame(unsigned fi) {
DPRCSFrames.insert(fi);
}
unsigned createJumpTableUId() {
return JumpTableUId++;
}
};
} // End llvm namespace
#endif // ARMMACHINEFUNCTIONINFO_H


@ -1,75 +0,0 @@
//===-- ARMMul.cpp - Fix ARM multiply operand constraints ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by the "Instituto Nokia de Tecnologia" and
// is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Modify the ARM multiplication instructions so that Rd{Hi,Lo} and Rm are distinct
//
//===----------------------------------------------------------------------===//
#include "ARM.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Support/Compiler.h"
using namespace llvm;
namespace {
class VISIBILITY_HIDDEN FixMul : public MachineFunctionPass {
virtual bool runOnMachineFunction(MachineFunction &MF);
};
}
FunctionPass *llvm::createARMFixMulPass() { return new FixMul(); }
bool FixMul::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (MachineFunction::iterator BB = MF.begin(), E = MF.end();
BB != E; ++BB) {
MachineBasicBlock &MBB = *BB;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
MachineInstr *MI = I;
int Op = MI->getOpcode();
if (Op == ARM::MUL ||
Op == ARM::SMULL ||
Op == ARM::UMULL) {
MachineOperand &RdOp = MI->getOperand(0);
MachineOperand &RmOp = MI->getOperand(1);
MachineOperand &RsOp = MI->getOperand(2);
unsigned Rd = RdOp.getReg();
unsigned Rm = RmOp.getReg();
unsigned Rs = RsOp.getReg();
if (Rd == Rm) {
Changed = true;
if (Rd != Rs) {
// Rd and Rm must be distinct, but Rd can be equal to Rs.
// Swap Rs and Rm.
RmOp.setReg(Rs);
RsOp.setReg(Rm);
} else {
unsigned scratch = Op == ARM::MUL ? ARM::R12 : ARM::R0;
BuildMI(MBB, I, MF.getTarget().getInstrInfo()->get(ARM::MOV),
scratch).addReg(Rm).addImm(0).addImm(ARMShift::LSL);
RmOp.setReg(scratch);
}
}
}
}
}
return Changed;
}
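// Editor's illustration of the two rewrites performed above:
//   mul r0, r0, r1   =>   mul r0, r1, r0                 (swap Rm and Rs)
//   mul r0, r0, r0   =>   mov r12, r0
//                         mul r0, r12, r0                (copy Rm to scratch)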

File diff suppressed because it is too large.


@ -17,18 +17,36 @@
#include "llvm/Target/MRegisterInfo.h"
#include "ARMGenRegisterInfo.h.inc"
#include <set>
namespace llvm {
class Type;
class TargetInstrInfo;
class TargetInstrInfo;
class ARMSubtarget;
class Type;
struct ARMRegisterInfo : public ARMGenRegisterInfo {
const TargetInstrInfo &TII;
const ARMSubtarget &STI;
private:
/// FramePtr - ARM physical register used as frame ptr.
unsigned FramePtr;
ARMRegisterInfo(const TargetInstrInfo &tii);
public:
ARMRegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
/// getRegisterNumbering - Given the enum value for some register, e.g.
/// ARM::LR, return the number that it corresponds to (e.g. 14).
static unsigned getRegisterNumbering(unsigned RegEnum);
/// Code Generation virtual methods...
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI) const;
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI) const;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned SrcReg, int FrameIndex,
@ -43,9 +61,8 @@ struct ARMRegisterInfo : public ARMGenRegisterInfo {
unsigned DestReg, unsigned SrcReg,
const TargetRegisterClass *RC) const;
virtual MachineInstr* foldMemoryOperand(MachineInstr* MI,
unsigned OpNum,
int FrameIndex) const;
MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
int FrameIndex) const;
const unsigned *getCalleeSavedRegs() const;
@ -57,7 +74,7 @@ struct ARMRegisterInfo : public ARMGenRegisterInfo {
void eliminateFrameIndex(MachineBasicBlock::iterator II) const;
void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF) const;
void emitPrologue(MachineFunction &MF) const;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;

View File

@ -1,4 +1,4 @@
//===- ARMRegisterInfo.td - ARM Register defs ----------*- tablegen -*-===//
//===- ARMRegisterInfo.td - ARM Register defs --------------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
@ -13,129 +13,169 @@
//===----------------------------------------------------------------------===//
// Registers are identified with 4-bit ID numbers.
class ARMReg<string n> : Register<n> {
let Namespace = "ARM";
}

// Ri - 32-bit integer registers
class Ri<bits<4> num, string n> : ARMReg<n> {
field bits<4> Num;
let Num = num;
}

// Rf - 32-bit floating-point registers
class Rf<bits<5> num, string n> : ARMReg<n> {
field bits<5> Num;
let Num = num;
}

// Rd - Slots in the FP register file for 64-bit floating-point values.
class Rd<bits<5> num, string n, list<Register> aliases> : ARMReg<n> {
field bits<5> Num;
let Num = num;
let Aliases = aliases;
}

class ARMReg<bits<4> num, string n, list<Register> aliases = []> : Register<n> {
field bits<4> Num;
let Num = num;
let Namespace = "ARM";
let Aliases = aliases;
}

class ARMFReg<bits<5> num, string n> : Register<n> {
field bits<5> Num;
let Namespace = "ARM";
}
// Integer registers
def R0 : Ri< 0, "R0">, DwarfRegNum<0>;
def R1 : Ri< 1, "R1">, DwarfRegNum<1>;
def R2 : Ri< 2, "R2">, DwarfRegNum<2>;
def R3 : Ri< 3, "R3">, DwarfRegNum<3>;
def R4 : Ri< 4, "R4">, DwarfRegNum<4>;
def R5 : Ri< 5, "R5">, DwarfRegNum<5>;
def R6 : Ri< 6, "R6">, DwarfRegNum<6>;
def R7 : Ri< 7, "R7">, DwarfRegNum<7>;
def R8 : Ri< 8, "R8">, DwarfRegNum<8>;
def R9 : Ri< 9, "R9">, DwarfRegNum<9>;
def R10 : Ri<10, "R10">, DwarfRegNum<10>;
def R11 : Ri<11, "R11">, DwarfRegNum<11>;
def R12 : Ri<12, "R12">, DwarfRegNum<12>;
def R13 : Ri<13, "R13">, DwarfRegNum<13>;
def R14 : Ri<14, "R14">, DwarfRegNum<14>;
def R15 : Ri<15, "R15">, DwarfRegNum<15>;
def R0 : ARMReg< 0, "r0">, DwarfRegNum<0>;
def R1 : ARMReg< 1, "r1">, DwarfRegNum<1>;
def R2 : ARMReg< 2, "r2">, DwarfRegNum<2>;
def R3 : ARMReg< 3, "r3">, DwarfRegNum<3>;
def R4 : ARMReg< 4, "r4">, DwarfRegNum<4>;
def R5 : ARMReg< 5, "r5">, DwarfRegNum<5>;
def R6 : ARMReg< 6, "r6">, DwarfRegNum<6>;
def R7 : ARMReg< 7, "r7">, DwarfRegNum<7>;
def R8 : ARMReg< 8, "r8">, DwarfRegNum<8>;
def R9 : ARMReg< 9, "r9">, DwarfRegNum<9>;
def R10 : ARMReg<10, "r10">, DwarfRegNum<10>;
def R11 : ARMReg<11, "r11">, DwarfRegNum<11>;
def R12 : ARMReg<12, "r12">, DwarfRegNum<12>;
def SP : ARMReg<13, "sp">, DwarfRegNum<13>;
def LR : ARMReg<14, "lr">, DwarfRegNum<14>;
def PC : ARMReg<15, "pc">, DwarfRegNum<15>;
// TODO: update to VFP-v3
// Floating-point registers
def S0 : Rf< 0, "S0">, DwarfRegNum<64>;
def S1 : Rf< 1, "S1">, DwarfRegNum<65>;
def S2 : Rf< 2, "S2">, DwarfRegNum<66>;
def S3 : Rf< 3, "S3">, DwarfRegNum<67>;
def S4 : Rf< 4, "S4">, DwarfRegNum<68>;
def S5 : Rf< 5, "S5">, DwarfRegNum<69>;
def S6 : Rf< 6, "S6">, DwarfRegNum<70>;
def S7 : Rf< 7, "S7">, DwarfRegNum<71>;
def S8 : Rf< 8, "S8">, DwarfRegNum<72>;
def S9 : Rf< 9, "S9">, DwarfRegNum<73>;
def S10 : Rf<10, "S10">, DwarfRegNum<74>;
def S11 : Rf<11, "S11">, DwarfRegNum<75>;
def S12 : Rf<12, "S12">, DwarfRegNum<76>;
def S13 : Rf<13, "S13">, DwarfRegNum<77>;
def S14 : Rf<14, "S14">, DwarfRegNum<78>;
def S15 : Rf<15, "S15">, DwarfRegNum<79>;
def S16 : Rf<16, "S16">, DwarfRegNum<80>;
def S17 : Rf<17, "S17">, DwarfRegNum<81>;
def S18 : Rf<18, "S18">, DwarfRegNum<82>;
def S19 : Rf<19, "S19">, DwarfRegNum<83>;
def S20 : Rf<20, "S20">, DwarfRegNum<84>;
def S21 : Rf<21, "S21">, DwarfRegNum<85>;
def S22 : Rf<22, "S22">, DwarfRegNum<86>;
def S23 : Rf<23, "S23">, DwarfRegNum<87>;
def S24 : Rf<24, "S24">, DwarfRegNum<88>;
def S25 : Rf<25, "S25">, DwarfRegNum<89>;
def S26 : Rf<26, "S26">, DwarfRegNum<90>;
def S27 : Rf<27, "S27">, DwarfRegNum<91>;
def S28 : Rf<28, "S28">, DwarfRegNum<92>;
def S29 : Rf<29, "S29">, DwarfRegNum<93>;
def S30 : Rf<30, "S30">, DwarfRegNum<94>;
def S31 : Rf<31, "S31">, DwarfRegNum<95>;
// Float registers
def S0 : ARMFReg< 0, "s0">; def S1 : ARMFReg< 1, "s1">;
def S2 : ARMFReg< 2, "s2">; def S3 : ARMFReg< 3, "s3">;
def S4 : ARMFReg< 4, "s4">; def S5 : ARMFReg< 5, "s5">;
def S6 : ARMFReg< 6, "s6">; def S7 : ARMFReg< 7, "s7">;
def S8 : ARMFReg< 8, "s8">; def S9 : ARMFReg< 9, "s9">;
def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">;
def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">;
def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">;
def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">;
def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">;
def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">;
def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">;
def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">;
def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">;
def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">;
def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
// Aliases of the S* registers used to hold 64-bit fp values (doubles)
def D0 : Rd< 0, "D0", [S0, S1]>, DwarfRegNum<64>;
def D1 : Rd< 2, "D1", [S2, S3]>, DwarfRegNum<66>;
def D2 : Rd< 4, "D2", [S4, S5]>, DwarfRegNum<68>;
def D3 : Rd< 6, "D3", [S6, S7]>, DwarfRegNum<70>;
def D4 : Rd< 8, "D4", [S8, S9]>, DwarfRegNum<72>;
def D5 : Rd<10, "D5", [S10, S11]>, DwarfRegNum<74>;
def D6 : Rd<12, "D6", [S12, S13]>, DwarfRegNum<76>;
def D7 : Rd<14, "D7", [S14, S15]>, DwarfRegNum<78>;
def D8 : Rd<16, "D8", [S16, S17]>, DwarfRegNum<80>;
def D9 : Rd<18, "D9", [S18, S19]>, DwarfRegNum<82>;
def D10 : Rd<20, "D10", [S20, S21]>, DwarfRegNum<84>;
def D11 : Rd<22, "D11", [S22, S23]>, DwarfRegNum<86>;
def D12 : Rd<24, "D12", [S24, S25]>, DwarfRegNum<88>;
def D13 : Rd<26, "D13", [S26, S27]>, DwarfRegNum<90>;
def D14 : Rd<28, "D14", [S28, S29]>, DwarfRegNum<92>;
def D15 : Rd<30, "D15", [S30, S31]>, DwarfRegNum<94>;
// Aliases of the S* registers used to hold 64-bit fp values (doubles)
def D0 : ARMReg< 0, "d0", [S0, S1]>;
def D1 : ARMReg< 1, "d1", [S2, S3]>;
def D2 : ARMReg< 2, "d2", [S4, S5]>;
def D3 : ARMReg< 3, "d3", [S6, S7]>;
def D4 : ARMReg< 4, "d4", [S8, S9]>;
def D5 : ARMReg< 5, "d5", [S10, S11]>;
def D6 : ARMReg< 6, "d6", [S12, S13]>;
def D7 : ARMReg< 7, "d7", [S14, S15]>;
def D8 : ARMReg< 8, "d8", [S16, S17]>;
def D9 : ARMReg< 9, "d9", [S18, S19]>;
def D10 : ARMReg<10, "d10", [S20, S21]>;
def D11 : ARMReg<11, "d11", [S22, S23]>;
def D12 : ARMReg<12, "d12", [S24, S25]>;
def D13 : ARMReg<13, "d13", [S26, S27]>;
def D14 : ARMReg<14, "d14", [S28, S29]>;
def D15 : ARMReg<15, "d15", [S30, S31]>;
// Register classes.
//
// FIXME: the register order should be defined in terms of the preferred
// allocation order...
// pc == Program Counter
// lr == Link Register
// sp == Stack Pointer
// r12 == ip (scratch)
// r7 == Frame Pointer (thumb-style backtraces)
// r11 == Frame Pointer (arm-style backtraces)
// r10 == Stack Limit
//
def IntRegs : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
R7, R8, R9, R10, R11, R12,
R13, R14, R15]> {
def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
R7, R8, R9, R10, R12, R11,
LR, SP, PC]> {
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
// FIXME: We are reserving r12 in case the PEI needs to use it to
// generate large stack offset. Make it available once we have register
// scavenging.
let MethodBodies = [{
IntRegsClass::iterator
IntRegsClass::allocation_order_end(const MachineFunction &MF) const {
// r15 == Program Counter
// r14 == Link Register
// r13 == Stack Pointer
// r12 == ip (scratch)
// r11 == Frame Pointer
// r10 == Stack Limit
if (hasFP(MF))
return end() - 5;
else
return end() - 4;
// FP is R11, R9 is available.
static const unsigned ARM_GPR_AO_1[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3,
ARM::R4, ARM::R5, ARM::R6, ARM::R7,
ARM::R8, ARM::R9, ARM::R10,
ARM::LR, ARM::R11 };
// FP is R11, R9 is not available.
static const unsigned ARM_GPR_AO_2[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3,
ARM::R4, ARM::R5, ARM::R6, ARM::R7,
ARM::R8, ARM::R10,
ARM::LR, ARM::R11 };
// FP is R7, R9 is available.
static const unsigned ARM_GPR_AO_3[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3,
ARM::R4, ARM::R5, ARM::R6, ARM::R8,
ARM::R9, ARM::R10, ARM::R11,
ARM::LR, ARM::R7 };
// FP is R7, R9 is not available.
static const unsigned ARM_GPR_AO_4[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3,
ARM::R4, ARM::R5, ARM::R6, ARM::R8,
ARM::R10, ARM::R11,
ARM::LR, ARM::R7 };
// FP is R7, only low registers available.
static const unsigned THUMB_GPR_AO[] = {
ARM::R0, ARM::R1, ARM::R2,
ARM::R4, ARM::R5, ARM::R6, ARM::R7 };
GPRClass::iterator
GPRClass::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
if (Subtarget.isThumb())
return THUMB_GPR_AO;
if (Subtarget.useThumbBacktraces()) {
if (Subtarget.isR9Reserved())
return ARM_GPR_AO_4;
else
return ARM_GPR_AO_3;
} else {
if (Subtarget.isR9Reserved())
return ARM_GPR_AO_2;
else
return ARM_GPR_AO_1;
}
}
GPRClass::iterator
GPRClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
GPRClass::iterator I;
if (Subtarget.isThumb())
I = THUMB_GPR_AO + (sizeof(THUMB_GPR_AO)/sizeof(unsigned));
else if (Subtarget.useThumbBacktraces()) {
if (Subtarget.isR9Reserved())
I = ARM_GPR_AO_4 + (sizeof(ARM_GPR_AO_4)/sizeof(unsigned));
else
I = ARM_GPR_AO_3 + (sizeof(ARM_GPR_AO_3)/sizeof(unsigned));
} else {
if (Subtarget.isR9Reserved())
I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned));
else
I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned));
}
return hasFP(MF) ? I-1 : I;
}
}];
}
def FPRegs : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
def DFPRegs : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15]>;
// ARM requires only word alignment for doubles, though double-word
// alignment is more performant.
def DPR : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, D8,
D9, D10, D11, D12, D13, D14, D15]>;

View File

@ -0,0 +1,52 @@
//===-- ARMSubtarget.cpp - ARM Subtarget Information ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Evan Cheng and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARM specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#include "ARMSubtarget.h"
#include "ARMGenSubtarget.inc"
#include "llvm/Module.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
// FIXME: this is temporary.
static cl::opt<bool> Thumb("enable-thumb",
cl::desc("Switch to thumb mode in ARM backend"));
ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS)
: ARMArchVersion(V4T), HasVFP2(false), IsDarwin(false),
UseThumbBacktraces(false), IsR9Reserved(false), stackAlignment(8) {
// Determine default and user specified characteristics
std::string CPU = "generic";
// Parse features string.
ParseSubtargetFeatures(FS, CPU);
IsThumb = Thumb;
// Set IsDarwin from the module's target triple; if the module has no
// triple, fall back to the host platform.
const std::string& TT = M.getTargetTriple();
if (TT.length() > 5) {
IsDarwin = TT.find("-darwin") != std::string::npos;
} else if (TT.empty()) {
#if defined(__APPLE__)
IsDarwin = true;
#endif
}
if (IsDarwin) {
UseThumbBacktraces = true;
IsR9Reserved = true;
stackAlignment = 4;
}
}

View File

@ -0,0 +1,82 @@
//=====---- ARMSubtarget.h - Define Subtarget for the ARM -----*- C++ -*--====//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Evan Cheng and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares the ARM specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#ifndef ARMSUBTARGET_H
#define ARMSUBTARGET_H
#include "llvm/Target/TargetSubtarget.h"
#include <string>
namespace llvm {
class Module;
class ARMSubtarget : public TargetSubtarget {
protected:
enum ARMArchEnum {
V4T, V5T, V5TE, V6
};
/// ARMArchVersion - ARM architecture version: V4T (base), V5T, V5TE,
/// and V6.
ARMArchEnum ARMArchVersion;
/// HasVFP2 - True if the processor supports Vector Floating Point (VFP) V2
/// instructions.
bool HasVFP2;
/// IsThumb - True if we are in thumb mode, false if in ARM mode.
bool IsThumb;
bool IsDarwin;
/// UseThumbBacktraces - True if we use thumb style backtraces.
bool UseThumbBacktraces;
/// IsR9Reserved - True if R9 is not available as a general purpose register.
bool IsR9Reserved;
/// stackAlignment - The minimum alignment known to hold for the stack frame
/// on entry to the function, which must be maintained by every function.
unsigned stackAlignment;
public:
/// This constructor initializes the data members to match those
/// of the specified module.
///
ARMSubtarget(const Module &M, const std::string &FS);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU);
bool hasV4TOps() const { return ARMArchVersion >= V4T; }
bool hasV5TOps() const { return ARMArchVersion >= V5T; }
bool hasV5TEOps() const { return ARMArchVersion >= V5TE; }
bool hasV6Ops() const { return ARMArchVersion >= V6; }
bool hasVFP2() const { return HasVFP2; }
bool isDarwin() const { return IsDarwin; }
bool isThumb() const { return IsThumb; }
bool useThumbBacktraces() const { return UseThumbBacktraces; }
bool isR9Reserved() const { return IsR9Reserved; }
/// getStackAlignment - Returns the minimum alignment known to hold for the
/// stack frame on entry to the function, which must be maintained by every
/// function for this subtarget.
unsigned getStackAlignment() const { return stackAlignment; }
};
} // End llvm namespace
#endif // ARMSUBTARGET_H

View File

@ -12,18 +12,50 @@
//===----------------------------------------------------------------------===//
#include "ARMTargetAsmInfo.h"
#include "ARMTargetMachine.h"
using namespace llvm;
ARMTargetAsmInfo::ARMTargetAsmInfo(const ARMTargetMachine &TM) {
Data16bitsDirective = "\t.half\t";
Data32bitsDirective = "\t.word\t";
const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>();
if (Subtarget->isDarwin()) {
HasDotTypeDotSizeDirective = false;
PrivateGlobalPrefix = "L";
GlobalPrefix = "_";
ZeroDirective = "\t.space\t";
SetDirective = "\t.set";
WeakRefDirective = "\t.weak_reference\t";
JumpTableDataSection = ".const";
CStringSection = "\t.cstring";
StaticCtorsSection = ".mod_init_func";
StaticDtorsSection = ".mod_term_func";
InlineAsmStart = "@ InlineAsm Start";
InlineAsmEnd = "@ InlineAsm End";
LCOMMDirective = "\t.lcomm\t";
COMMDirectiveTakesAlignment = false;
NeedsSet = true;
DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug";
DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug";
DwarfLineSection = ".section __DWARF,__debug_line,regular,debug";
DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug";
DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug";
DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug";
DwarfStrSection = ".section __DWARF,__debug_str,regular,debug";
DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug";
DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug";
DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug";
DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug";
} else {
Data16bitsDirective = "\t.half\t";
Data32bitsDirective = "\t.word\t";
ZeroDirective = "\t.skip\t";
WeakRefDirective = "\t.weak\t";
StaticCtorsSection = "\t.section .ctors,\"aw\",%progbits";
StaticDtorsSection = "\t.section .dtors,\"aw\",%progbits";
}
AlignmentIsInBytes = false;
Data64bitsDirective = 0;
ZeroDirective = "\t.skip\t";
CommentString = "@";
DataSection = "\t.data";
ConstantPoolSection = "\t.text\n";
AlignmentIsInBytes = false;
WeakRefDirective = "\t.weak\t";
StaticCtorsSection = "\t.section .ctors,\"aw\",%progbits";
StaticDtorsSection = "\t.section .dtors,\"aw\",%progbits";
}

View File

@ -11,30 +11,32 @@
//
//===----------------------------------------------------------------------===//
#include "ARMTargetAsmInfo.h"
#include "ARMTargetMachine.h"
#include "ARMTargetAsmInfo.h"
#include "ARMFrameInfo.h"
#include "ARM.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachineRegistry.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
static cl::opt<bool> DisableLdStOpti("disable-arm-loadstore-opti", cl::Hidden,
cl::desc("Disable load store optimization pass"));
namespace {
// Register the target.
RegisterTarget<ARMTargetMachine> X("arm", " ARM");
}
const TargetAsmInfo *ARMTargetMachine::createTargetAsmInfo() const {
return new ARMTargetAsmInfo(*this);
}
/// TargetMachine ctor - Create an ILP32 architecture model
///
ARMTargetMachine::ARMTargetMachine(const Module &M, const std::string &FS)
: DataLayout("e-p:32:32") {
: Subtarget(M, FS), DataLayout("e-p:32:32-d:32"), InstrInfo(Subtarget),
FrameInfo(Subtarget) {
if (Subtarget.isDarwin())
NoFramePointerElim = true;
}
unsigned ARMTargetMachine::getModuleMatchQuality(const Module &M) {
@ -49,14 +51,23 @@ unsigned ARMTargetMachine::getModuleMatchQuality(const Module &M) {
}
const TargetAsmInfo *ARMTargetMachine::createTargetAsmInfo() const {
return new ARMTargetAsmInfo(*this);
}
// Pass Pipeline Configuration
bool ARMTargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) {
PM.add(createARMISelDag(*this));
return false;
}
bool ARMTargetMachine::addPostRegAlloc(FunctionPassManager &PM, bool Fast) {
PM.add(createARMFixMulPass());
bool ARMTargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) {
// FIXME: temporarily disabling load / store optimization pass for Thumb mode.
if (!Fast && !DisableLdStOpti && !Subtarget.isThumb())
PM.add(createARMLoadStoreOptimizationPass());
PM.add(createARMConstantIslandPass());
return true;
}

View File

@ -20,19 +20,17 @@
#include "llvm/Target/TargetFrameInfo.h"
#include "ARMInstrInfo.h"
#include "ARMFrameInfo.h"
#include "ARMSubtarget.h"
namespace llvm {
class Module;
class ARMTargetMachine : public LLVMTargetMachine {
const TargetData DataLayout; // Calculates type size & alignment
ARMInstrInfo InstrInfo;
ARMFrameInfo FrameInfo;
protected:
virtual const TargetAsmInfo *createTargetAsmInfo() const;
ARMSubtarget Subtarget;
const TargetData DataLayout; // Calculates type size & alignment
ARMInstrInfo InstrInfo;
ARMFrameInfo FrameInfo;
public:
ARMTargetMachine(const Module &M, const std::string &FS);
@ -42,11 +40,14 @@ public:
return &InstrInfo.getRegisterInfo();
}
virtual const TargetData *getTargetData() const { return &DataLayout; }
virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; }
static unsigned getModuleMatchQuality(const Module &M);
virtual const TargetAsmInfo *createTargetAsmInfo() const;
// Pass Pipeline Configuration
virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);
virtual bool addPostRegAlloc(FunctionPassManager &PM, bool Fast);
virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast);
virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast,
std::ostream &Out);
};

View File

@ -7,6 +7,7 @@
# License. See LICENSE.TXT for details.
#
##===----------------------------------------------------------------------===##
LEVEL = ../../..
LIBRARYNAME = LLVMARM
TARGET = ARM
@ -15,7 +16,6 @@ TARGET = ARM
BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \
ARMGenRegisterInfo.inc ARMGenInstrNames.inc \
ARMGenInstrInfo.inc ARMGenAsmWriter.inc \
ARMGenDAGISel.inc
ARMGenDAGISel.inc ARMGenSubtarget.inc
include $(LEVEL)/Makefile.common

View File

@ -0,0 +1,17 @@
//===---------------------------------------------------------------------===//
// Random ideas for the ARM backend (Thumb specific).
//===---------------------------------------------------------------------===//
* Add support for compiling functions in both ARM and Thumb mode, then taking
the smallest.
* Add support for compiling individual basic blocks in thumb mode, when in a
larger ARM function. This can be used for presumed cold code, like paths
to abort (failure path of asserts), EH handling code, etc.
* Thumb doesn't have normal pre/post increment addressing modes, but you can
load/store 32-bit integers with pre/postinc by using load/store multiple
instrs with a single register.
* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
and cmp instructions can use high registers. Also, we can use them as
temporaries to spill values into.

View File

@ -2,69 +2,438 @@
// Random ideas for the ARM backend.
//===---------------------------------------------------------------------===//
Consider implementing a select with two conditional moves:

cmp x, y
moveq dst, a
movne dst, b

----------------------------------------------------------

%tmp1 = shl int %b, ubyte %c
%tmp4 = add int %a, %tmp1

compiles to

add r0, r0, r1, lsl r2

but

%tmp1 = shl int %b, ubyte %c
%tmp4 = add int %tmp1, %a

compiles to

mov r1, r1, lsl r2
add r0, r1, r0

---------------------------------------------------------

%tmp1 = shl int %b, ubyte 4
%tmp2 = add int %a, %tmp1

compiles to

mov r2, #4
add r0, r0, r1, lsl r2

should be

add r0, r0, r1, lsl #4

----------------------------------------------------------

add an offset to FLDS/FLDD/FSTD/FSTS addressing mode

----------------------------------------------------------

the function

void %f() {
entry:
call void %g( int 1, int 2, int 3, int 4, int 5 )
ret void
}

declare void %g(int, int, int, int, int)

Only needs 8 bytes of stack space. We currently allocate 16.

//===---------------------------------------------------------------------===//

Reimplement 'select' in terms of 'SEL'.

* We would really like to support UXTAB16, but we need to prove that the
add doesn't need to overflow between the two 16-bit chunks.

* implement predication support
* Implement pre/post increment support. (e.g. PR935)
* Coalesce stack slots!
* Implement smarter constant generation for binops with large immediates.

* Consider materializing FP constants like 0.0f and 1.0f using integer
immediate instructions then copy to FPU. Slower than load into FPU?

//===---------------------------------------------------------------------===//

The constant island pass is extremely naive. If a constant pool entry is
out of range, it *always* splits a block and inserts a copy of the cp
entry inline. It should:

1. Check to see if there is already a copy of this constant nearby. If so,
reuse it.
2. Instead of always splitting blocks to insert the constant, insert it in
nearby 'water'.
3. Constant island references should be ref counted. If a constant reference
is out-of-range, and the last reference to a constant is relocated, the
dead constant should be removed.

This pass has all the framework needed to implement this, but it hasn't
been done.

//===---------------------------------------------------------------------===//

We need to start generating predicated instructions. The .td files have a way
to express this now (see the PPC conditional return instruction), but the
branch folding pass (or a new if-cvt pass) should start producing these, at
least in the trivial case.

Among the obvious wins, doing so can eliminate the need to custom expand
copysign (i.e. we won't need to custom expand it to get the conditional
negate).

//===---------------------------------------------------------------------===//

Implement long long "X-3" with instructions that fold the immediate in. These
were disabled due to badness with the ARM carry flag on subtracts.

//===---------------------------------------------------------------------===//

We currently compile abs:

int foo(int p) { return p < 0 ? -p : p; }

into:

_foo:
rsb r1, r0, #0
cmn r0, #1
movgt r1, r0
mov r0, r1
bx lr

This is very, uh, literal. This could be a 3 operation sequence:

t = (p sra 31);
res = (p xor t) - t

Which would be better. This occurs in png decode.
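As a hedged illustration, a self-contained C++ version of that 3 operation
sequence (the helper name is invented; `>>` on a negative int is strictly
implementation-defined in C++, but it is an arithmetic shift on ARM and on
every mainstream compiler):

#include <cassert>
#include <cstdint>

// t is 0 when p >= 0 and all-ones when p < 0, so (p ^ t) - t yields p
// unchanged in the first case and (~p + 1) == -p in the second.
int32_t absNoBranch(int32_t p) {
  int32_t t = p >> 31;   // arithmetic shift: 0 or -1
  return (p ^ t) - t;
}

int main() {
  assert(absNoBranch(-5) == 5);
  assert(absNoBranch(7) == 7);
  assert(absNoBranch(0) == 0);
  return 0;
}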
//===---------------------------------------------------------------------===//
More load / store optimizations:
1) Look past instructions without side-effects (not load, store, branch, etc.)
when forming the list of loads / stores to optimize.
2) Smarter register allocation?
We are probably missing some opportunities to use ldm / stm. Consider:
ldr r5, [r0]
ldr r4, [r0, #4]
This cannot be merged into a ldm. Perhaps we will need to do the transformation
before register allocation. Then teach the register allocator to allocate a
chunk of consecutive registers.
3) Better representation for block transfer? This is from Olden/power:
fldd d0, [r4]
fstd d0, [r4, #+32]
fldd d0, [r4, #+8]
fstd d0, [r4, #+40]
fldd d0, [r4, #+16]
fstd d0, [r4, #+48]
fldd d0, [r4, #+24]
fstd d0, [r4, #+56]
If we can spare the registers, it would be better to use fldm and fstm here.
Need major register allocator enhancement though.
4) Can we recognize the relative position of constantpool entries? i.e. Treat
ldr r0, LCPI17_3
ldr r1, LCPI17_4
ldr r2, LCPI17_5
as
ldr r0, LCPI17
ldr r1, LCPI17+4
ldr r2, LCPI17+8
Then the ldr's can be combined into a single ldm. See Olden/power.
Note that for ARM v4, gcc uses ldmia to load a pair of 32-bit values that
represent a 64-bit double FP constant:
adr r0, L6
ldmia r0, {r0-r1}
.align 2
L6:
.long -858993459
.long 1074318540
5) Can we make use of ldrd and strd? Instead of generating ldm / stm, use
ldrd/strd instead if there are only two destination registers that form an
odd/even pair. However, we would probably pay a penalty if the address is not
aligned on an 8-byte boundary. This requires more information on load / store
nodes (and MI's?) than we currently carry.
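A hypothetical eligibility check for that ldrd/strd case (names invented for
the sketch; on ARMv5TE the first register must be even-numbered and the
second must be its odd successor):

// ldrd/strd move two registers that must form an even/odd pair (e.g.
// r0/r1), and the access is cheap only when it is 8-byte aligned.
static bool canUseLoadStoreDual(unsigned FirstReg, unsigned SecondReg,
                                unsigned AlignInBytes) {
  return (FirstReg % 2) == 0 &&        // even-numbered first register
         SecondReg == FirstReg + 1 &&  // followed by its odd partner
         AlignInBytes >= 8;            // avoid the misalignment penalty
}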
//===---------------------------------------------------------------------===//
* Consider this silly example:
double bar(double x) {
double r = foo(3.1);
return x+r;
}
_bar:
sub sp, sp, #16
str r4, [sp, #+12]
str r5, [sp, #+8]
str lr, [sp, #+4]
mov r4, r0
mov r5, r1
ldr r0, LCPI2_0
bl _foo
fmsr f0, r0
fcvtsd d0, f0
fmdrr d1, r4, r5
faddd d0, d0, d1
fmrrd r0, r1, d0
ldr lr, [sp, #+4]
ldr r5, [sp, #+8]
ldr r4, [sp, #+12]
add sp, sp, #16
bx lr
Ignore the prologue and epilogue stuff for a second. Note
mov r4, r0
mov r5, r1
the copies to callee-saved registers and the fact that they are only used by
the fmdrr instruction. It would have been better had the fmdrr been scheduled
before the call, placing the result in a callee-saved DPR register. The two
mov ops would not have been necessary.
//===---------------------------------------------------------------------===//
Calling convention related stuff:
* gcc's parameter passing implementation is terrible and we suffer as a result:
e.g.
struct s {
double d1;
int s1;
};
void foo(struct s S) {
printf("%g, %d\n", S.d1, S.s1);
}
'S' is passed via registers r0, r1, r2. But gcc stores them to the stack, and
then reloads them to r1, r2, and r3 before issuing the call (r0 contains the
address of the format string):
stmfd sp!, {r7, lr}
add r7, sp, #0
sub sp, sp, #12
stmia sp, {r0, r1, r2}
ldmia sp, {r1-r2}
ldr r0, L5
ldr r3, [sp, #8]
L2:
add r0, pc, r0
bl L_printf$stub
Instead of a stmia, ldmia, and a ldr, wouldn't it be better to do three moves?
* Returning an aggregate type is even worse:
e.g.
struct s foo(void) {
struct s S = {1.1, 2};
return S;
}
mov ip, r0
ldr r0, L5
sub sp, sp, #12
L2:
add r0, pc, r0
@ lr needed for prologue
ldmia r0, {r0, r1, r2}
stmia sp, {r0, r1, r2}
stmia ip, {r0, r1, r2}
mov r0, ip
add sp, sp, #12
bx lr
r0 (and later ip) is the hidden parameter from the caller: the address to
store the return value in. The first ldmia loads the constants into r0, r1,
r2. The last stmia stores r0, r1, r2 to the address passed in. However, there
is one additional stmia that stores r0, r1, and r2 to some stack location.
That store is dead.
The llvm-gcc generated code looks like this:
csretcc void %foo(%struct.s* %agg.result) {
entry:
%S = alloca %struct.s, align 4 ; <%struct.s*> [#uses=1]
%memtmp = alloca %struct.s ; <%struct.s*> [#uses=1]
cast %struct.s* %S to sbyte* ; <sbyte*>:0 [#uses=2]
call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 )
cast %struct.s* %agg.result to sbyte* ; <sbyte*>:1 [#uses=2]
call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 )
cast %struct.s* %memtmp to sbyte* ; <sbyte*>:2 [#uses=1]
call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 )
ret void
}
llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from the
constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated
into a number of loads and stores, or 2) custom lower memcpy (of small size)
to be ldmia / stmia. I think option 2 is better, but the current register
allocator cannot allocate a chunk of registers at a time.
A feasible temporary solution is to use specific physical registers at the
lowering time for small (<= 4 words?) transfer size.
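A minimal sketch of that size gate, assuming the `<= 4 words` threshold
floated above (helper name invented):

// Lower a small memcpy inline to ldmia/stmia only when it fits in the
// scratch registers we can afford and moves whole, word-aligned words.
static bool shouldLowerMemcpyToLDM(unsigned SizeInBytes,
                                   unsigned AlignInBytes) {
  return SizeInBytes <= 16 &&      // <= 4 words, per the note above
         AlignInBytes >= 4 &&      // word-aligned transfer
         (SizeInBytes % 4) == 0;   // whole words only
}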
----------------------------------------------------------

32 x 32 -> 64 multiplications currently use two instructions. We
should try to declare smull and umull as returning two values.

----------------------------------------------------------

Implement addressing modes 2 (ldrb) and 3 (ldrsb)

----------------------------------------------------------

* ARM CSRet calling convention requires the hidden argument to be returned by
the callee.

//===---------------------------------------------------------------------===//

We can definitely do a better job on BB placements to eliminate some branches.
It's very common to see llvm generated assembly code that looks like this:

LBB3:
...
LBB4:
...
  beq LBB3
  b LBB2

If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
then eliminate the beq and turn the unconditional branch to LBB2 into a bne.

See McCat/18-imp/ComputeBoundingBoxes for an example.
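A toy model of that layout test, with simplified structures standing in for
the real MachineBasicBlock API:

#include <cstddef>

struct Block {
  Block *CondTarget;     // target of the trailing conditional branch (beq)
  Block *UncondTarget;   // target of the trailing unconditional branch (b)
  std::size_t NumPreds;  // number of predecessor blocks
};

// If Succ's only predecessor ends with a conditional branch to it, Succ
// can be emitted right after: the beq becomes a fallthrough and the
// unconditional branch turns into the inverted condition (bne).
static bool canMakeFallThrough(const Block &Pred, const Block &Succ) {
  return Succ.NumPreds == 1 && Pred.CondTarget == &Succ;
}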
//===---------------------------------------------------------------------===//
We need register scavenging. Currently, the 'ip' register is reserved in case
frame indexes are too big. This means that we generate extra code for stuff
like this:
void foo(unsigned x, unsigned y, unsigned z, unsigned *a, unsigned *b, unsigned *c) {
short Rconst = (short) (16384.0f * 1.40200 + 0.5 );
*a = x * Rconst;
*b = y * Rconst;
*c = z * Rconst;
}
we compile it to:
_foo:
*** stmfd sp!, {r4, r7}
*** add r7, sp, #4
mov r4, #186
orr r4, r4, #89, 24 @ 22784
mul r0, r0, r4
str r0, [r3]
mul r0, r1, r4
ldr r1, [sp, #+8]
str r0, [r1]
mul r0, r2, r4
ldr r1, [sp, #+12]
str r0, [r1]
*** sub sp, r7, #4
*** ldmfd sp!, {r4, r7}
bx lr
GCC produces:
_foo:
ldr ip, L4
mul r0, ip, r0
mul r1, ip, r1
str r0, [r3, #0]
ldr r3, [sp, #0]
mul r2, ip, r2
str r1, [r3, #0]
ldr r3, [sp, #4]
str r2, [r3, #0]
bx lr
L4:
.long 22970
This is apparently all because we couldn't use ip here.
//===---------------------------------------------------------------------===//
Pre-/post- indexed load / stores:
1) We should not make the pre/post-indexed load/store transform if the base
ptr is guaranteed to be live beyond the load/store. This can happen if the
base ptr is live out of the block in which we perform the optimization, e.g.:
mov r1, r2
ldr r3, [r1], #4
...
vs.
ldr r3, [r2]
add r1, r2, #4
...
In most cases, this is just a wasted optimization. However, sometimes it can
negatively impact performance because two-address code is more restrictive
when it comes to scheduling.
Unfortunately, liveout information is currently unavailable during DAG combine
time; a sketch of the intended check appears after this list.
2) Consider splitting an indexed load / store into a pair of add/sub + load/store
to solve #1 (in TwoAddressInstructionPass.cpp).
3) Enhance LSR to generate more opportunities for indexed ops.
4) Once we added support for multiple result patterns, write indexed loads
patterns instead of C++ instruction selection code.
5) Use FLDM / FSTM to emulate indexed FP load / store.
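For item 1), the intended check is roughly the following sketch (toy types;
the missing piece is exactly this liveout set):

#include <set>

// Fold the address update into a pre/post-indexed load/store only when
// the original base value is dead afterwards; otherwise the transform
// just forces an extra copy, like the "mov r1, r2" in the example above.
static bool shouldFormIndexedOp(unsigned BaseReg,
                                const std::set<unsigned> &LiveOutRegs) {
  return LiveOutRegs.count(BaseReg) == 0;
}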
//===---------------------------------------------------------------------===//
We should add i64 support to take advantage of the 64-bit load / stores.
We can add a pseudo i64 register class containing pseudo registers that are
register pairs. All other ops (e.g. add, sub) would be expanded as usual.
We need to add pseudo instructions (i.e. gethi / getlo) to extract i32 registers
from the i64 register. These are single moves which can be eliminated if the
destination register is a sub-register of the source. We should implement proper
subreg support in the register allocator to coalesce these away.
There are other minor issues such as multiple instructions for a spill / restore
/ move.
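As a toy model of that expansion (plain C++ standing in for the pseudo
instructions and register pairs):

#include <cstdint>

struct RegPair { uint32_t Lo, Hi; };  // pseudo i64 register: two i32 halves

// i64 add expands onto the halves: ADDS produces the carry out of the
// low word and ADC folds it into the high word.
static RegPair addPair(RegPair A, RegPair B) {
  RegPair R;
  R.Lo = A.Lo + B.Lo;                        // ADDS
  uint32_t Carry = (R.Lo < A.Lo) ? 1u : 0u;  // carry out of the low add
  R.Hi = A.Hi + B.Hi + Carry;                // ADC
  return R;
}

// The gethi / getlo pseudos are just projections of the pair.
static uint32_t getlo(const RegPair &R) { return R.Lo; }
static uint32_t gethi(const RegPair &R) { return R.Hi; }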
//===---------------------------------------------------------------------===//
Implement support for some more tricky ways to materialize immediates. For
example, to get 0xffff8000, we can use:
mov r9, #&3f8000
sub r9, r9, #&400000
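Both halves of that sequence are ordinary rotated 8-bit immediates, and the
subtraction wraps around to the desired constant; a quick self-contained
check (hypothetical helper, not the backend's actual encoder):

#include <cassert>
#include <cstdint>

// An ARM data-processing immediate is an 8-bit value rotated right by an
// even amount; testing all 16 even rotations covers every encoding.
static bool isARMRotatedImmediate(uint32_t V) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2) {
    // Rotating V left by Rot undoes a rotate-right-by-Rot encoding.
    uint32_t Undone = Rot == 0 ? V : ((V << Rot) | (V >> (32 - Rot)));
    if (Undone < 256)
      return true;
  }
  return false;
}

int main() {
  assert(isARMRotatedImmediate(0x3f8000));  // the mov operand
  assert(isARMRotatedImmediate(0x400000));  // the sub operand
  assert(UINT32_C(0x3f8000) - UINT32_C(0x400000) == UINT32_C(0xffff8000));
  return 0;
}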
//===---------------------------------------------------------------------===//
We sometimes generate multiple add / sub instructions to update sp in prologue
and epilogue if the inc / dec value is too large to fit in a single immediate
operand. In some cases, perhaps it might be better to load the value from a
constantpool instead.
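The multi-instruction case follows from the immediate encoding: the
adjustment gets peeled into rotated-immediate chunks, roughly like this
hypothetical splitter:

#include <cstdint>
#include <vector>

// Split Amount into pieces that each fit an ARM rotated immediate: take
// 8 bits at a time, starting at an even bit position, lowest bits first.
static std::vector<uint32_t> splitStackAdjustment(uint32_t Amount) {
  std::vector<uint32_t> Chunks;
  while (Amount != 0) {
    unsigned Shift = 0;
    while ((Amount & (3u << Shift)) == 0)  // find lowest even-aligned bits
      Shift += 2;
    uint32_t Chunk = Amount & (0xFFu << Shift);  // one encodable chunk
    Chunks.push_back(Chunk);
    Amount &= ~Chunk;
  }
  return Chunks;  // one add/sub per chunk
}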
//===---------------------------------------------------------------------===//
GCC generates significantly better code for this function.
int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) {
int i = 0;
if (StackPtr != 0) {
while (StackPtr != 0 && i < (((LineLen) < (32768))? (LineLen) : (32768)))
Line[i++] = Stack[--StackPtr];
if (LineLen > 32768)
{
while (StackPtr != 0 && i < LineLen)
{
i++;
--StackPtr;
}
}
}
return StackPtr;
}
//===---------------------------------------------------------------------===//
This should compile to the mlas instruction:
int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 7 : 13; }
//===---------------------------------------------------------------------===//
At some point, we should triage these to see if they still apply to us:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663
http://www.inf.u-szeged.hu/gcc-arm/
http://citeseer.ist.psu.edu/debus04linktime.html
//===---------------------------------------------------------------------===//