[Sparc] Implement i64 load/store support for 32-bit sparc.

The LDD/STD instructions can load/store a 64-bit quantity from/to
memory to/from a consecutive even/odd pair of (32-bit) registers. They
are part of SparcV8, and are also present in SparcV9 (although
deprecated there, since you can store 64 bits in a single register).
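
For example (an illustrative sketch, not taken from this patch): the
operand names only the even register of the pair, the word at the
effective address goes to that register, and the following word goes
to its odd sibling:

  ldd [%o0], %o2    ! %o2 <- mem[%o0],  %o3 <- mem[%o0+4]
  std %o2, [%o1]    ! mem[%o1] <- %o2,  mem[%o1+4] <- %o3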

As recommended on llvmdev in the thread "How to enable use of 64bit
load/store for 32bit architecture" from Apr 2015, I've modeled the
64-bit load/store operations as working on a v2i32 type, rather than
making i64 a legal type with only a few legal operations. The latter
does not (currently) work, as there is much code in LLVM which assumes
that if i64 is legal, operations like "add" will actually work on it.

The same assumption does not hold for v2i32 -- for vector types, it is
workable to support only load/store, and expand everything else.
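
As a sketch of the intended effect on generated code (the register
choices are illustrative, and an aligned access is assumed), an i64
copy that previously required two ld/st pairs can now use the
doubleword instructions:

  ! before this patch:
  ld [%o0], %o2
  ld [%o0+4], %o3
  st %o2, [%o1]
  st %o3, [%o1+4]

  ! after this patch:
  ldd [%o0], %o2
  std %o2, [%o1]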

This patch:
- Adds a new register class, IntPair, for even/odd pairs of registers.

- Modifies the list of reserved registers, the stack spilling code,
  and register copying code to support the IntPair register class.

- Adds support in AsmParser. (Note that in asm text, you write only
  the name of the first register of the pair, so the parser has to
  morph the single register into the equivalent paired register.)

- Adds the new instructions themselves (LDD/STD/LDDA/STDA).

- Hooks up the instructions and registers as the vector type v2i32.
  Adds a custom legalizer to transform i64 loads/stores into v2i32
  loads/stores and bitcasts, so that the new instructions can actually
  be generated, and marks all operations other than load/store on
  v2i32 as needing to be expanded.

- Copies the unfortunate SelectInlineAsm hack from ARMISelDAGToDAG.
  This hack undoes the transformation of i64 operands into two
  arbitrarily-allocated separate i32 registers in SelectionDAGBuilder,
  and instead passes them in a single IntPair. (Arbitrarily allocated
  registers are not useful; asm code expects to receive a pair, which
  can be passed to ldd/std.)

Also adds a bunch of test cases covering all the bugs I've added along
the way.

Differential Revision: http://reviews.llvm.org/D8713

llvm-svn: 244484
James Y Knight 2015-08-10 19:11:39 +00:00
parent fe0e4e4c87
commit 3994be87de
18 changed files with 841 additions and 56 deletions

View File

@@ -140,6 +140,12 @@ public:
SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27,
SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31};
static unsigned IntPairRegs[] = {
Sparc::G0_G1, Sparc::G2_G3, Sparc::G4_G5, Sparc::G6_G7,
Sparc::O0_O1, Sparc::O2_O3, Sparc::O4_O5, Sparc::O6_O7,
Sparc::L0_L1, Sparc::L2_L3, Sparc::L4_L5, Sparc::L6_L7,
Sparc::I0_I1, Sparc::I2_I3, Sparc::I4_I5, Sparc::I6_I7};
/// SparcOperand - Instances of this class represent a parsed Sparc machine
/// instruction.
class SparcOperand : public MCParsedAsmOperand {
@@ -147,6 +153,7 @@ public:
enum RegisterKind {
rk_None,
rk_IntReg,
rk_IntPairReg,
rk_FloatReg,
rk_DoubleReg,
rk_QuadReg,
@@ -200,6 +207,10 @@ public:
bool isMEMrr() const { return Kind == k_MemoryReg; }
bool isMEMri() const { return Kind == k_MemoryImm; }
bool isIntReg() const {
return (Kind == k_Register && Reg.Kind == rk_IntReg);
}
bool isFloatReg() const {
return (Kind == k_Register && Reg.Kind == rk_FloatReg);
}
@@ -330,6 +341,25 @@ public:
return Op;
}
static bool MorphToIntPairReg(SparcOperand &Op) {
unsigned Reg = Op.getReg();
assert(Op.Reg.Kind == rk_IntReg);
unsigned regIdx = 32;
if (Reg >= Sparc::G0 && Reg <= Sparc::G7)
regIdx = Reg - Sparc::G0;
else if (Reg >= Sparc::O0 && Reg <= Sparc::O7)
regIdx = Reg - Sparc::O0 + 8;
else if (Reg >= Sparc::L0 && Reg <= Sparc::L7)
regIdx = Reg - Sparc::L0 + 16;
else if (Reg >= Sparc::I0 && Reg <= Sparc::I7)
regIdx = Reg - Sparc::I0 + 24;
if (regIdx % 2 || regIdx > 31)
return false;
Op.Reg.RegNum = IntPairRegs[regIdx / 2];
Op.Reg.Kind = rk_IntPairReg;
return true;
}
static bool MorphToDoubleReg(SparcOperand &Op) {
unsigned Reg = Op.getReg();
assert(Op.Reg.Kind == rk_FloatReg);
@@ -1051,5 +1081,9 @@ unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp,
break;
}
}
if (Op.isIntReg() && Kind == MCK_IntPair) {
if (SparcOperand::MorphToIntPairReg(Op))
return MCTargetAsmParser::Match_Success;
}
return Match_InvalidOperand;
}

View File

@@ -117,6 +117,13 @@ static const unsigned ASRRegDecoderTable[] = {
SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27,
SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31};
static const uint16_t IntPairDecoderTable[] = {
SP::G0_G1, SP::G2_G3, SP::G4_G5, SP::G6_G7,
SP::O0_O1, SP::O2_O3, SP::O4_O5, SP::O6_O7,
SP::L0_L1, SP::L2_L3, SP::L4_L5, SP::L6_L7,
SP::I0_I1, SP::I2_I3, SP::I4_I5, SP::I6_I7,
};
static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
@@ -196,9 +203,25 @@ static DecodeStatus DecodeASRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
static DecodeStatus DecodeIntPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
if (RegNo > 31)
return MCDisassembler::Fail;
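// An odd register number cannot begin an even/odd pair; decode it
// anyway, but return SoftFail so the result is flagged as suspect.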
if ((RegNo & 1))
S = MCDisassembler::SoftFail;
unsigned RegisterPair = IntPairDecoderTable[RegNo/2];
Inst.addOperand(MCOperand::createReg(RegisterPair));
return S;
}
static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
@@ -207,6 +230,8 @@ static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
@@ -326,6 +351,12 @@ static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
DecodeIntRegsRegisterClass);
}
static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder) {
return DecodeMem(Inst, insn, Address, Decoder, true,
DecodeIntPairRegisterClass);
}
static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder) {
return DecodeMem(Inst, insn, Address, Decoder, true,
@@ -350,6 +381,12 @@ static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
DecodeIntRegsRegisterClass);
}
static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn,
uint64_t Address, const void *Decoder) {
return DecodeMem(Inst, insn, Address, Decoder, false,
DecodeIntPairRegisterClass);
}
static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address,
const void *Decoder) {
return DecodeMem(Inst, insn, Address, Decoder, false,

View File

@@ -21,7 +21,11 @@ def CC_Sparc32 : CallingConv<[
// i32 f32 arguments get passed in integer registers if there is space.
CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
// f64 arguments are split and passed through registers or through stack.
CCIfType<[f64], CCCustom<"CC_Sparc_Assign_f64">>,
CCIfType<[f64], CCCustom<"CC_Sparc_Assign_Split_64">>,
// As are v2i32 arguments (this would be the default behavior for
// v2i32 if it wasn't allocated to the IntPair register-class)
CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Split_64">>,
// Alternatively, they are assigned to the stack in 4-byte aligned units.
CCAssignToStack<4, 4>
@@ -30,7 +34,8 @@ def CC_Sparc32 : CallingConv<[
def RetCC_Sparc32 : CallingConv<[
CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1]>>
CCIfType<[f64], CCAssignToReg<[D0, D1]>>,
CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Ret_Split_64">>
]>;

View File

@@ -213,23 +213,35 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
}
void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
// Remap %i[0-7] to %o[0-7].
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
if (MRI.reg_nodbg_empty(reg))
continue;
unsigned mapped_reg = (reg - SP::I0 + SP::O0);
unsigned mapped_reg = reg - SP::I0 + SP::O0;
assert(MRI.reg_nodbg_empty(mapped_reg));
// Replace I register with O register.
MRI.replaceRegWith(reg, mapped_reg);
// Also replace register pair super-registers.
if ((reg - SP::I0) % 2 == 0) {
unsigned preg = (reg - SP::I0) / 2 + SP::I0_I1;
unsigned mapped_preg = preg - SP::I0_I1 + SP::O0_O1;
MRI.replaceRegWith(preg, mapped_preg);
}
}
// Rewrite MBB's Live-ins.
for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
for (unsigned reg = SP::I0_I1; reg <= SP::I6_I7; ++reg) {
if (!MBB->isLiveIn(reg))
continue;
MBB->removeLiveIn(reg);
MBB->addLiveIn(reg - SP::I0_I1 + SP::O0_O1);
}
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
if (!MBB->isLiveIn(reg))
continue;

View File

@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "SparcTargetMachine.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Compiler.h"
@@ -62,6 +63,7 @@ public:
private:
SDNode* getGlobalBaseReg();
SDNode *SelectInlineAsm(SDNode *N);
};
} // end anonymous namespace
@@ -141,6 +143,181 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
return true;
}
// Re-assemble i64 arguments split up in SelectionDAGBuilder's
// visitInlineAsm / GetRegistersForValue functions.
//
// Note: This function was copied from, and is essentially identical
// to ARMISelDAGToDAG::SelectInlineAsm. It is very unfortunate that
// such hacking-up is necessary; a rethink of how inline asm operands
// are handled may be in order to make doing this more sane.
//
// TODO: fix inline asm support so I can simply tell it that 'i64'
// inputs to asm need to be allocated to the IntPair register type,
// and have that work. Then, delete this function.
SDNode *SparcDAGToDAGISel::SelectInlineAsm(SDNode *N){
std::vector<SDValue> AsmNodeOperands;
unsigned Flag, Kind;
bool Changed = false;
unsigned NumOps = N->getNumOperands();
// Normally, i64 data is bound to two arbitrary GPRs for the "%r"
// constraint. However, some instructions (e.g. ldd/std) require
// (even/even+1) GPRs.
// So, here, we check for this case, and mutate the inlineasm to use
// a single IntPair register instead, which guarantees such even/odd
// placement.
SDLoc dl(N);
SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
: SDValue(nullptr,0);
SmallVector<bool, 8> OpChanged;
// Glue node will be appended late.
for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) {
SDValue op = N->getOperand(i);
AsmNodeOperands.push_back(op);
if (i < InlineAsm::Op_FirstOperand)
continue;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) {
Flag = C->getZExtValue();
Kind = InlineAsm::getKind(Flag);
}
else
continue;
// Immediate operands to inline asm in the SelectionDAG are modeled with
// two operands. The first is a constant of value InlineAsm::Kind_Imm, and
// the second is a constant with the value of the immediate. If we get here
// and we have a Kind_Imm, skip the next operand, and continue.
if (Kind == InlineAsm::Kind_Imm) {
SDValue op = N->getOperand(++i);
AsmNodeOperands.push_back(op);
continue;
}
unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
if (NumRegs)
OpChanged.push_back(false);
unsigned DefIdx = 0;
bool IsTiedToChangedOp = false;
// If it's a use that is tied with a previous def, it has no
// reg class constraint.
if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
IsTiedToChangedOp = OpChanged[DefIdx];
if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
&& Kind != InlineAsm::Kind_RegDefEarlyClobber)
continue;
unsigned RC;
bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC);
if ((!IsTiedToChangedOp && (!HasRC || RC != SP::IntRegsRegClassID))
|| NumRegs != 2)
continue;
assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
SDValue V0 = N->getOperand(i+1);
SDValue V1 = N->getOperand(i+2);
unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg();
SDValue PairedReg;
MachineRegisterInfo &MRI = MF->getRegInfo();
if (Kind == InlineAsm::Kind_RegDef ||
Kind == InlineAsm::Kind_RegDefEarlyClobber) {
// Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
// the original GPRs.
unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32);
SDValue Chain = SDValue(N,0);
SDNode *GU = N->getGluedUser();
SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::v2i32,
Chain.getValue(1));
// Extract values from a GPRPair reg and copy to the original GPR reg.
SDValue Sub0 = CurDAG->getTargetExtractSubreg(SP::sub_even, dl, MVT::i32,
RegCopy);
SDValue Sub1 = CurDAG->getTargetExtractSubreg(SP::sub_odd, dl, MVT::i32,
RegCopy);
SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0,
RegCopy.getValue(1));
SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1));
// Update the original glue user.
std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1);
Ops.push_back(T1.getValue(1));
CurDAG->UpdateNodeOperands(GU, Ops);
}
else {
// For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a
// GPRPair and then pass the GPRPair to the inline asm.
SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain];
// As REG_SEQ doesn't take RegisterSDNode, we copy them first.
SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32,
Chain.getValue(1));
SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32,
T0.getValue(1));
SDValue Pair = SDValue(
CurDAG->getMachineNode(
TargetOpcode::REG_SEQUENCE, dl, MVT::v2i32,
{
CurDAG->getTargetConstant(SP::IntPairRegClassID, dl,
MVT::i32),
T0,
CurDAG->getTargetConstant(SP::sub_even, dl, MVT::i32),
T1,
CurDAG->getTargetConstant(SP::sub_odd, dl, MVT::i32),
}),
0);
// Copy REG_SEQ into a GPRPair-typed VR and replace the original two
// i32 VRs of inline asm with it.
unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32);
Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
Glue = Chain.getValue(1);
}
Changed = true;
if(PairedReg.getNode()) {
OpChanged[OpChanged.size() -1 ] = true;
Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
if (IsTiedToChangedOp)
Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx);
else
Flag = InlineAsm::getFlagWordForRegClass(Flag, SP::IntPairRegClassID);
// Replace the current flag.
AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
Flag, dl, MVT::i32);
// Add the new register node and skip the original two GPRs.
AsmNodeOperands.push_back(PairedReg);
// Skip the next two GPRs.
i += 2;
}
}
if (Glue.getNode())
AsmNodeOperands.push_back(Glue);
if (!Changed)
return nullptr;
SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
New->setNodeId(-1);
return New.getNode();
}
SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
if (N->isMachineOpcode()) {
@@ -150,6 +327,12 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
default: break;
case ISD::INLINEASM: {
SDNode *ResNode = SelectInlineAsm(N);
if (ResNode)
return ResNode;
break;
}
case SPISD::GLOBAL_BASE_REG:
return getGlobalBaseReg();

View File

@@ -49,9 +49,9 @@ static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT,
return true;
}
static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
MVT &LocVT, CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State)
static bool CC_Sparc_Assign_Split_64(unsigned &ValNo, MVT &ValVT,
MVT &LocVT, CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State)
{
static const MCPhysReg RegList[] = {
SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
@@ -77,6 +77,29 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
return true;
}
static bool CC_Sparc_Assign_Ret_Split_64(unsigned &ValNo, MVT &ValVT,
MVT &LocVT, CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State)
{
static const MCPhysReg RegList[] = {
SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
};
// Try to get first reg.
if (unsigned Reg = State.AllocateReg(RegList))
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
else
return false;
// Try to get second reg.
if (unsigned Reg = State.AllocateReg(RegList))
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
else
return false;
return true;
}
// Allocate a full-sized argument for the 64-bit ABI.
static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT,
MVT &LocVT, CCValAssign::LocInfo &LocInfo,
@@ -202,12 +225,34 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain,
RetOps.push_back(SDValue());
// Copy the result values into the output registers.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
for (unsigned i = 0, realRVLocIdx = 0;
i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(),
OutVals[i], Flag);
SDValue Arg = OutVals[realRVLocIdx];
if (VA.needsCustom()) {
assert(VA.getLocVT() == MVT::v2i32);
// Legalize ret v2i32 -> ret 2 x i32 (Basically: do what would
// happen by default if this wasn't a legal type)
SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
Arg,
DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
Arg,
DAG.getConstant(1, DL, getVectorIdxTy(DAG.getDataLayout())));
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part0, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
VA = RVLocs[++i]; // skip ahead to next loc
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part1,
Flag);
} else
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
// Guarantee that all emitted copies are stuck together with flags.
Flag = Chain.getValue(1);
@@ -375,7 +420,8 @@ LowerFormalArguments_32(SDValue Chain,
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert(VA.getLocVT() == MVT::f64);
assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi);
SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);
@@ -398,7 +444,7 @@ LowerFormalArguments_32(SDValue Chain,
}
SDValue WholeValue =
DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue);
WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), WholeValue);
InVals.push_back(WholeValue);
continue;
}
@@ -422,7 +468,7 @@ LowerFormalArguments_32(SDValue Chain,
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::f64);
assert(VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::v2i32);
// If it is double-word aligned, just load.
if (Offset % 8 == 0) {
int FI = MF.getFrameInfo()->CreateFixedObject(8,
@@ -454,7 +500,7 @@ LowerFormalArguments_32(SDValue Chain,
SDValue WholeValue =
DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue);
WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), WholeValue);
InVals.push_back(WholeValue);
continue;
}
@@ -788,7 +834,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
}
if (VA.needsCustom()) {
assert(VA.getLocVT() == MVT::f64);
assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
if (VA.isMemLoc()) {
unsigned Offset = VA.getLocMemOffset() + StackOffset;
@@ -804,49 +850,54 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
}
}
SDValue StackPtr = DAG.CreateStackTemporary(MVT::f64, MVT::i32);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
Arg, StackPtr, MachinePointerInfo(),
false, false, 0);
// Sparc is big-endian, so the high part comes first.
SDValue Hi = DAG.getLoad(MVT::i32, dl, Store, StackPtr,
MachinePointerInfo(), false, false, false, 0);
// Increment the pointer to the other half.
StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
DAG.getIntPtrConstant(4, dl));
// Load the low part.
SDValue Lo = DAG.getLoad(MVT::i32, dl, Store, StackPtr,
MachinePointerInfo(), false, false, false, 0);
if (VA.getLocVT() == MVT::f64) {
// Move the float value from the float registers into the
// integer registers.
// TODO: this conversion is done in two steps, because
// f64->i64 conversion is done efficiently, and i64->v2i32 is
// basically a no-op. But f64->v2i32 is NOT done efficiently
// for some reason.
Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
Arg = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, Arg);
}
SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
Arg,
DAG.getConstant(0, dl, getVectorIdxTy(DAG.getDataLayout())));
SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
Arg,
DAG.getConstant(1, dl, getVectorIdxTy(DAG.getDataLayout())));
if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Hi));
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Part0));
assert(i+1 != e);
CCValAssign &NextVA = ArgLocs[++i];
if (NextVA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Lo));
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Part1));
} else {
// Store the low part in stack.
// Store the second part in stack.
unsigned Offset = NextVA.getLocMemOffset() + StackOffset;
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff,
MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff,
MachinePointerInfo(),
false, false, 0));
}
} else {
unsigned Offset = VA.getLocMemOffset() + StackOffset;
// Store the high part.
// Store the first part.
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
MemOpChains.push_back(DAG.getStore(Chain, dl, Hi, PtrOff,
MemOpChains.push_back(DAG.getStore(Chain, dl, Part0, PtrOff,
MachinePointerInfo(),
false, false, 0));
// Store the low part.
// Store the second part.
PtrOff = DAG.getIntPtrConstant(Offset + 4, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff,
MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff,
MachinePointerInfo(),
false, false, 0));
}
@@ -1377,8 +1428,45 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM,
addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
if (Subtarget->is64Bit())
if (Subtarget->is64Bit()) {
addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
} else {
// On 32-bit sparc, we also define a register class for even/odd
// pairs of 32-bit registers. This is modeled in LLVM as a 2-vector of i32.
addRegisterClass(MVT::v2i32, &SP::IntPairRegClass);
// ...but almost all operations must be expanded, so set that as
// the default.
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
setOperationAction(Op, MVT::v2i32, Expand);
}
// Truncating/extending stores/loads are also not supported.
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, VT, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, VT, Expand);
setTruncStoreAction(VT, MVT::v2i32, Expand);
setTruncStoreAction(MVT::v2i32, VT, Expand);
}
// However, load and store *are* legal.
setOperationAction(ISD::LOAD, MVT::v2i32, Legal);
setOperationAction(ISD::STORE, MVT::v2i32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Legal);
// And we need to promote i64 loads/stores into vector load/store
setOperationAction(ISD::LOAD, MVT::i64, Custom);
setOperationAction(ISD::STORE, MVT::i64, Custom);
// Sadly, this doesn't work:
// AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
// AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
}
// Turn FP extload into load/fextend
for (MVT VT : MVT::fp_valuetypes()) {
@@ -2604,6 +2692,17 @@ static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
return DAG.getMergeValues(Ops, dl);
}
static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG)
{
LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
EVT MemVT = LdNode->getMemoryVT();
if (MemVT == MVT::f128)
return LowerF128Load(Op, DAG);
return Op;
}
// Lower a f128 store into two f64 stores.
static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -2648,6 +2747,29 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG)
{
SDLoc dl(Op);
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
EVT MemVT = St->getMemoryVT();
if (MemVT == MVT::f128)
return LowerF128Store(Op, DAG);
if (MemVT == MVT::i64) {
// Custom handling for i64 stores: turn it into a bitcast and a
// v2i32 store.
SDValue Val = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, St->getValue());
SDValue Chain = DAG.getStore(
St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(),
St->isVolatile(), St->isNonTemporal(), St->getAlignment(),
St->getAAInfo());
return Chain;
}
return SDValue();
}
static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS)
&& "invalid opcode");
@@ -2786,7 +2908,6 @@ static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
return SDValue();
}
SDValue SparcTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -2821,8 +2942,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG,
Subtarget);
case ISD::LOAD: return LowerF128Load(Op, DAG);
case ISD::STORE: return LowerF128Store(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::FADD: return LowerF128Op(Op, DAG,
getLibcallName(RTLIB::ADD_F128), 2);
case ISD::FSUB: return LowerF128Op(Op, DAG,
@@ -3152,9 +3273,12 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
return std::make_pair(0U, &SP::IntRegsRegClass);
if (VT == MVT::v2i32)
return std::make_pair(0U, &SP::IntPairRegClass);
else
return std::make_pair(0U, &SP::IntRegsRegClass);
}
} else if (!Constraint.empty() && Constraint.size() <= 5
&& Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
// constraint = '{r<d>}'
// Remove the braces from around the name.
@@ -3230,5 +3354,24 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
getLibcallName(libCall),
1));
return;
case ISD::LOAD: {
LoadSDNode *Ld = cast<LoadSDNode>(N);
// Custom handling only for i64: turn i64 load into a v2i32 load,
// and a bitcast.
if (Ld->getValueType(0) != MVT::i64 || Ld->getMemoryVT() != MVT::i64)
return;
SDLoc dl(N);
SDValue LoadRes = DAG.getExtLoad(
Ld->getExtensionType(), dl, MVT::v2i32,
Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
MVT::v2i32, Ld->isVolatile(), Ld->isNonTemporal(),
Ld->isInvariant(), Ld->getAlignment(), Ld->getAAInfo());
SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes);
Results.push_back(Res);
Results.push_back(LoadRes.getValue(1));
return;
}
}
}

View File

@@ -167,8 +167,8 @@ namespace llvm {
}
void ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>& Results,
SelectionDAG &DAG) const override;
MachineBasicBlock *expandSelectCC(MachineInstr *MI, MachineBasicBlock *BB,
unsigned BROpcode) const;

View File

@@ -284,7 +284,9 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned numSubRegs = 0;
unsigned movOpc = 0;
const unsigned *subRegIdx = nullptr;
bool ExtraG0 = false;
const unsigned DW_SubRegsIdx[] = { SP::sub_even, SP::sub_odd };
const unsigned DFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd };
const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 };
const unsigned QFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd,
@@ -294,7 +296,12 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (SP::IntRegsRegClass.contains(DestReg, SrcReg))
BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0)
.addReg(SrcReg, getKillRegState(KillSrc));
else if (SP::FPRegsRegClass.contains(DestReg, SrcReg))
else if (SP::IntPairRegClass.contains(DestReg, SrcReg)) {
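// A pair copy becomes two integer moves (dst = or %g0, src), one per
// 32-bit half; ExtraG0 tells the loop below to add the %g0 operand
// that ORrr requires.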
subRegIdx = DW_SubRegsIdx;
numSubRegs = 2;
movOpc = SP::ORrr;
ExtraG0 = true;
} else if (SP::FPRegsRegClass.contains(DestReg, SrcReg))
BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) {
@@ -347,7 +354,11 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]);
assert(Dst && Src && "Bad sub-register");
MovMI = BuildMI(MBB, I, DL, get(movOpc), Dst).addReg(Src);
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(movOpc), Dst);
if (ExtraG0)
MIB.addReg(SP::G0);
MIB.addReg(Src);
MovMI = MIB.getInstr();
}
// Add implicit super-register defs and kills to the last MovMI.
MovMI->addRegisterDefined(DestReg, TRI);
@@ -372,12 +383,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MFI.getObjectAlignment(FI));
// On the order of operands here: think "[FrameIdx + 0] = SrcReg".
if (RC == &SP::I64RegsRegClass)
BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
else if (RC == &SP::IntRegsRegClass)
BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
else if (RC == &SP::IntPairRegClass)
BuildMI(MBB, I, DL, get(SP::STDri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
else if (RC == &SP::FPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
@@ -415,6 +429,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
else if (RC == &SP::IntRegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0)
.addMemOperand(MMO);
else if (RC == &SP::IntPairRegClass)
BuildMI(MBB, I, DL, get(SP::LDDri), DestReg).addFrameIndex(FI).addImm(0)
.addMemOperand(MMO);
else if (RC == &SP::FPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0)
.addMemOperand(MMO);

View File

@@ -408,6 +408,9 @@ let DecoderMethod = "DecodeLoadInt" in {
defm LD : LoadA<"ld", 0b000000, 0b010000, load, IntRegs, i32>;
}
let DecoderMethod = "DecodeLoadIntPair" in
defm LDD : LoadA<"ldd", 0b000011, 0b010011, load, IntPair, v2i32>;
// Section B.2 - Load Floating-point Instructions, p. 92
let DecoderMethod = "DecodeLoadFP" in
defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>;
@@ -424,6 +427,9 @@ let DecoderMethod = "DecodeStoreInt" in {
defm ST : StoreA<"st", 0b000100, 0b010100, store, IntRegs, i32>;
}
let DecoderMethod = "DecodeStoreIntPair" in
defm STD : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32>;
// Section B.5 - Store Floating-point Instructions, p. 97
let DecoderMethod = "DecodeStoreFP" in
defm STF : Store<"st", 0b100100, store, FPRegs, f32>;
@@ -1327,6 +1333,18 @@ def : Pat<(i32 (atomic_load ADDRri:$src)), (LDri ADDRri:$src)>;
def : Pat<(atomic_store ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>;
def : Pat<(atomic_store ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>;
// extract_vector
def : Pat<(vector_extract (v2i32 IntPair:$Rn), 0),
(i32 (EXTRACT_SUBREG IntPair:$Rn, sub_even))>;
def : Pat<(vector_extract (v2i32 IntPair:$Rn), 1),
(i32 (EXTRACT_SUBREG IntPair:$Rn, sub_odd))>;
// build_vector
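// (Intent: insert $a1 into the even half and $a2 into the odd half
// of an otherwise-undefined pair.)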
def : Pat<(build_vector (i32 IntRegs:$a1), (i32 IntRegs:$a2)),
(INSERT_SUBREG
(INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 IntRegs:$a1), sub_even),
(i32 IntRegs:$a2), sub_odd)>;
include "SparcInstr64Bit.td"
include "SparcInstrVIS.td"

View File

@@ -75,6 +75,18 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(SP::G6);
Reserved.set(SP::G7);
// Also reserve the register pair aliases covering the above
// registers, with the same conditions.
Reserved.set(SP::G0_G1);
if (ReserveAppRegisters)
Reserved.set(SP::G2_G3);
if (ReserveAppRegisters || !Subtarget.is64Bit())
Reserved.set(SP::G4_G5);
Reserved.set(SP::O6_O7);
Reserved.set(SP::I6_I7);
Reserved.set(SP::G6_G7);
// Unaliased double registers are not available in non-V9 targets.
if (!Subtarget.isV9()) {
for (unsigned n = 0; n != 16; ++n) {
@@ -210,4 +222,3 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return SP::I6;
}

View File

@@ -32,6 +32,12 @@ def sub_odd64 : SubRegIndex<64, 64>;
// Ri - 32-bit integer registers
class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>;
// Rdi - pairs of 32-bit integer registers
class Rdi<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
let SubRegs = subregs;
let SubRegIndices = [sub_even, sub_odd];
let CoveredBySubRegs = 1;
}
// Rf - 32-bit floating-point registers
class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>;
@@ -217,6 +223,24 @@ def Q13 : Rq<21, "F52", [D26, D27]>;
def Q14 : Rq<25, "F56", [D28, D29]>;
def Q15 : Rq<29, "F60", [D30, D31]>;
// Aliases of the integer registers used for LDD/STD double-word operations
def G0_G1 : Rdi<0, "G0", [G0, G1]>;
def G2_G3 : Rdi<2, "G2", [G2, G3]>;
def G4_G5 : Rdi<4, "G4", [G4, G5]>;
def G6_G7 : Rdi<6, "G6", [G6, G7]>;
def O0_O1 : Rdi<8, "O0", [O0, O1]>;
def O2_O3 : Rdi<10, "O2", [O2, O3]>;
def O4_O5 : Rdi<12, "O4", [O4, O5]>;
def O6_O7 : Rdi<14, "O6", [O6, O7]>;
def L0_L1 : Rdi<16, "L0", [L0, L1]>;
def L2_L3 : Rdi<18, "L2", [L2, L3]>;
def L4_L5 : Rdi<20, "L4", [L4, L5]>;
def L6_L7 : Rdi<22, "L6", [L6, L7]>;
def I0_I1 : Rdi<24, "I0", [I0, I1]>;
def I2_I3 : Rdi<26, "I2", [I2, I3]>;
def I4_I5 : Rdi<28, "I4", [I4, I5]>;
def I6_I7 : Rdi<30, "I6", [I6, I7]>;
// Register classes.
//
// FIXME: the register order should be defined in terms of the preferred
@@ -231,6 +255,13 @@ def IntRegs : RegisterClass<"SP", [i32, i64], 32,
(sequence "L%u", 0, 7),
(sequence "O%u", 0, 7))>;
// Should be in the same order as IntRegs.
def IntPair : RegisterClass<"SP", [v2i32], 64,
(add I0_I1, I2_I3, I4_I5, I6_I7,
G0_G1, G2_G3, G4_G5, G6_G7,
L0_L1, L2_L3, L4_L5, L6_L7,
O0_O1, O2_O3, O4_O5, O6_O7)>;
// Register class for 64-bit mode, with a 64-bit spill slot size.
// These are the same as the 32-bit registers, so TableGen will consider this
// to be a sub-class of IntRegs. That works out because requiring a 64-bit

View File

@@ -65,3 +65,4 @@ rm -rf %{buildroot}
* Mon Feb 09 2003 Brian R. Gaeke
- Initial working version of RPM spec file.

View File

@@ -84,3 +84,16 @@ define i64 @unsigned_multiply_32x32_64(i32 %a, i32 %b) {
ret i64 %r
}
; CHECK-LABEL: load_store_64bit:
; CHECK: ldd [%o0], %o2
; CHECK: addcc %o3, 3, %o5
; CHECK: addxcc %o2, 0, %o4
; CHECK: retl
; CHECK: std %o4, [%o1]
define void @load_store_64bit(i64* %x, i64* %y) {
entry:
%0 = load i64, i64* %x
%add = add nsw i64 %0, 3
store i64 %add, i64* %y
ret void
}

View File

@@ -1,4 +1,4 @@
; RUN: llc -march=sparc -no-integrated-as <%s | FileCheck %s
; RUN: llc -march=sparc <%s | FileCheck %s
; CHECK-LABEL: test_constraint_r
; CHECK: add %o1, %o0, %o0
@@ -8,7 +8,7 @@ entry:
ret i32 %0
}
; CHECK-LABEL: test_constraint_I
; CHECK-LABEL: test_constraint_I:
; CHECK: add %o0, 1023, %o0
define i32 @test_constraint_I(i32 %a) {
entry:
@@ -16,7 +16,7 @@ entry:
ret i32 %0
}
; CHECK-LABEL: test_constraint_I_neg
; CHECK-LABEL: test_constraint_I_neg:
; CHECK: add %o0, -4096, %o0
define i32 @test_constraint_I_neg(i32 %a) {
entry:
@@ -24,7 +24,7 @@ entry:
ret i32 %0
}
; CHECK-LABEL: test_constraint_I_largeimm
; CHECK-LABEL: test_constraint_I_largeimm:
; CHECK: sethi 9, [[R0:%[gilo][0-7]]]
; CHECK: or [[R0]], 784, [[R1:%[gilo][0-7]]]
; CHECK: add %o0, [[R1]], %o0
@@ -34,12 +34,51 @@ entry:
ret i32 %0
}
; CHECK-LABEL: test_constraint_reg
; CHECK-LABEL: test_constraint_reg:
; CHECK: ldda [%o1] 43, %g2
; CHECK: ldda [%o1] 43, %g3
; CHECK: ldda [%o1] 43, %g4
define void @test_constraint_reg(i32 %s, i32* %ptr) {
entry:
%0 = tail call i64 asm sideeffect "ldda [$1] $2, $0", "={r2},r,n"(i32* %ptr, i32 43)
%1 = tail call i64 asm sideeffect "ldda [$1] $2, $0", "={g3},r,n"(i32* %ptr, i32 43)
%1 = tail call i64 asm sideeffect "ldda [$1] $2, $0", "={g4},r,n"(i32* %ptr, i32 43)
ret void
}
;; Ensure that i64 args to asm are allocated to the IntPair register class.
;; Also checks that register renaming for leaf proc works.
; CHECK-LABEL: test_constraint_r_i64:
; CHECK: mov %o0, %o5
; CHECK: sra %o5, 31, %o4
; CHECK: std %o4, [%o1]
define i32 @test_constraint_r_i64(i32 %foo, i64* %out, i32 %o) {
entry:
%conv = sext i32 %foo to i64
tail call void asm sideeffect "std $0, [$1]", "r,r,~{memory}"(i64 %conv, i64* %out)
ret i32 %o
}
;; Same test without leaf-proc opt
; CHECK-LABEL: test_constraint_r_i64_noleaf:
; CHECK: mov %i0, %i5
; CHECK: sra %i5, 31, %i4
; CHECK: std %i4, [%i1]
define i32 @test_constraint_r_i64_noleaf(i32 %foo, i64* %out, i32 %o) #0 {
entry:
%conv = sext i32 %foo to i64
tail call void asm sideeffect "std $0, [$1]", "r,r,~{memory}"(i64 %conv, i64* %out)
ret i32 %o
}
attributes #0 = { "no-frame-pointer-elim"="true" }
;; Ensures that tied input and output operands get allocated properly.
; CHECK-LABEL: test_i64_inout:
; CHECK: sethi 0, %o2
; CHECK: mov 5, %o3
; CHECK: xor %o2, %g0, %o2
; CHECK: mov %o2, %o0
; CHECK: ret
define i64 @test_i64_inout() {
entry:
%0 = call i64 asm sideeffect "xor $1, %g0, $0", "=r,0,~{i1}"(i64 5);
ret i64 %0
}

View File

@@ -0,0 +1,135 @@
; RUN: llc -march=sparc < %s | FileCheck %s
@g = common global [32 x i32] zeroinitializer, align 16
@h = common global [16 x i64] zeroinitializer, align 16
;; Ensures that we don't use registers which are supposed to be reserved.
; CHECK-LABEL: use_all_i32_regs:
; CHECK-NOT: %g0
; CHECK-NOT: %g1
; CHECK-NOT: %g5
; CHECK-NOT: %g6
; CHECK-NOT: %g7
; CHECK-NOT: %o6
; CHECK-NOT: %i6
; CHECK-NOT: %i7
; CHECK: ret
define void @use_all_i32_regs() {
entry:
%0 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 0), align 16
%1 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 1), align 4
%2 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 2), align 8
%3 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 3), align 4
%4 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 4), align 16
%5 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 5), align 4
%6 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 6), align 8
%7 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 7), align 4
%8 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 8), align 16
%9 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 9), align 4
%10 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 10), align 8
%11 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 11), align 4
%12 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 12), align 16
%13 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 13), align 4
%14 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 14), align 8
%15 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 15), align 4
%16 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 16), align 16
%17 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 17), align 4
%18 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 18), align 8
%19 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 19), align 4
%20 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 20), align 16
%21 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 21), align 4
%22 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 22), align 8
%23 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 23), align 4
%24 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 24), align 16
%25 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 25), align 4
%26 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 26), align 8
%27 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 27), align 4
%28 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 28), align 16
%29 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 29), align 4
%30 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 30), align 8
%31 = load volatile i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 31), align 4
store volatile i32 %1, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 0), align 16
store volatile i32 %2, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 1), align 4
store volatile i32 %3, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 2), align 8
store volatile i32 %4, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 3), align 4
store volatile i32 %5, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 4), align 16
store volatile i32 %6, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 5), align 4
store volatile i32 %7, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 6), align 8
store volatile i32 %8, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 7), align 4
store volatile i32 %9, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 8), align 16
store volatile i32 %10, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 9), align 4
store volatile i32 %11, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 10), align 8
store volatile i32 %12, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 11), align 4
store volatile i32 %13, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 12), align 16
store volatile i32 %14, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 13), align 4
store volatile i32 %15, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 14), align 8
store volatile i32 %16, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 15), align 4
store volatile i32 %17, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 16), align 16
store volatile i32 %18, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 17), align 4
store volatile i32 %19, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 18), align 8
store volatile i32 %20, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 19), align 4
store volatile i32 %21, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 20), align 16
store volatile i32 %22, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 21), align 4
store volatile i32 %23, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 22), align 8
store volatile i32 %24, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 23), align 4
store volatile i32 %25, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 24), align 16
store volatile i32 %26, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 25), align 4
store volatile i32 %27, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 26), align 8
store volatile i32 %28, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 27), align 4
store volatile i32 %29, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 28), align 16
store volatile i32 %30, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 29), align 4
store volatile i32 %31, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 30), align 8
store volatile i32 %0, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @g, i64 0, i64 31), align 4
ret void
}
; CHECK-LABEL: use_all_i64_regs:
; CHECK-NOT: %g0
; CHECK-NOT: %g1
; CHECK-NOT: %g4
; CHECK-NOT: %g5
; CHECK-NOT: %g6
; CHECK-NOT: %g7
; CHECK-NOT: %o6
; CHECK-NOT: %o7
; CHECK-NOT: %i6
; CHECK-NOT: %i7
; CHECK: ret
define void @use_all_i64_regs() {
entry:
%0 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 0), align 16
%1 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 1), align 4
%2 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 2), align 8
%3 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 3), align 4
%4 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 4), align 16
%5 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 5), align 4
%6 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 6), align 8
%7 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 7), align 4
%8 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 8), align 16
%9 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 9), align 4
%10 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 10), align 8
%11 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 11), align 4
%12 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 12), align 16
%13 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 13), align 4
%14 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 14), align 8
%15 = load volatile i64, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 15), align 4
store volatile i64 %1, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 0), align 16
store volatile i64 %2, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 1), align 4
store volatile i64 %3, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 2), align 8
store volatile i64 %4, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 3), align 4
store volatile i64 %5, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 4), align 16
store volatile i64 %6, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 5), align 4
store volatile i64 %7, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 6), align 8
store volatile i64 %8, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 7), align 4
store volatile i64 %9, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 8), align 16
store volatile i64 %10, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 9), align 4
store volatile i64 %11, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 10), align 8
store volatile i64 %12, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 11), align 4
store volatile i64 %13, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 12), align 16
store volatile i64 %14, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 13), align 4
store volatile i64 %15, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 14), align 8
store volatile i64 %0, i64* getelementptr inbounds ([16 x i64], [16 x i64]* @h, i64 0, i64 15), align 4
ret void
}

View File

@@ -0,0 +1,64 @@
; RUN: llc -march=sparc < %s | FileCheck %s
;; Ensure that spills and reloads work for various types on
;; sparcv8.
;; For i32/i64 tests, use an asm statement which clobbers most
;; registers to ensure the spill will happen.
; CHECK-LABEL: test_i32_spill:
; CHECK: and %i0, %i1, %o0
; CHECK: st %o0, [%fp+{{.+}}]
; CHECK: add %o0, %o0, %g0
; CHECK: ld [%fp+{{.+}}], %i0
define i32 @test_i32_spill(i32 %a, i32 %b) {
entry:
%r0 = and i32 %a, %b
; The clobber list has all registers except g0/o0. (Only o0 is usable.)
%0 = call i32 asm sideeffect "add $0,$1,%g0", "=r,0,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g1},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6},~{o7}"(i32 %r0)
ret i32 %r0
}
; CHECK-LABEL: test_i64_spill:
; CHECK: and %i0, %i2, %o0
; CHECK: and %i1, %i3, %o1
; CHECK: std %o0, [%fp+{{.+}}]
; CHECK: add %o0, %o0, %g0
; CHECK: ldd [%fp+{{.+}}], %i0
define i64 @test_i64_spill(i64 %a, i64 %b) {
entry:
%r0 = and i64 %a, %b
; The clobber list has all registers except g0,g1,o0,o1. (Only o0/o1 are a usable pair)
; So, o0/o1 must be used.
%0 = call i64 asm sideeffect "add $0,$1,%g0", "=r,0,~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{g2},~{g3},~{g4},~{g5},~{g6},~{g7},~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{o2},~{o3},~{o4},~{o5},~{o7}"(i64 %r0)
ret i64 %r0
}
;; For float/double tests, a call is a suitable clobber as *all* FPU
;; registers are caller-save on sparcv8.
; CHECK-LABEL: test_float_spill:
; CHECK: fadds %f1, %f0, [[R:%[f][0-31]]]
; CHECK: st [[R]], [%fp+{{.+}}]
; CHECK: call
; CHECK: ld [%fp+{{.+}}], %f0
declare float @foo_float(float)
define float @test_float_spill(float %a, float %b) {
entry:
%r0 = fadd float %a, %b
%0 = call float @foo_float(float %r0)
ret float %r0
}
; CHECK-LABEL: test_double_spill:
; CHECK: faddd %f2, %f0, [[R:%[f][0-31]]]
; CHECK: std [[R]], [%fp+{{.+}}]
; CHECK: call
; CHECK: ldd [%fp+{{.+}}], %f0
declare double @foo_double(double)
define double @test_double_spill(double %a, double %b) {
entry:
%r0 = fadd double %a, %b
%0 = call double @foo_double(double %r0)
ret double %r0
}

View File

@@ -221,3 +221,27 @@
# CHECK: swapa [%g1] 131, %o2
0xd4 0xf8 0x50 0x60
# CHECK: ldd [%i0+%l6], %o2
0xd4 0x1e 0x00 0x16
# CHECK: ldd [%i0+32], %o2
0xd4 0x1e 0x20 0x20
# CHECK: ldd [%g1], %o2
0xd4 0x18 0x60 0x00
# CHECK: ldd [%g1], %o2
0xd4 0x18 0x40 0x00
# CHECK: std %o2, [%i0+%l6]
0xd4 0x3e 0x00 0x16
# CHECK: std %o2, [%i0+32]
0xd4 0x3e 0x20 0x20
# CHECK: std %o2, [%g1]
0xd4 0x38 0x60 0x00
# CHECK: std %o2, [%g1]
0xd4 0x38 0x40 0x00

View File

@@ -46,6 +46,15 @@
! CHECK: lda [%i0+%l6] 131, %o2 ! encoding: [0xd4,0x86,0x10,0x76]
lda [%i0 + %l6] 131, %o2
! CHECK: ldd [%i0+%l6], %o2 ! encoding: [0xd4,0x1e,0x00,0x16]
ldd [%i0 + %l6], %o2
! CHECK: ldd [%i0+32], %o2 ! encoding: [0xd4,0x1e,0x20,0x20]
ldd [%i0 + 32], %o2
! CHECK: ldd [%g1], %o2 ! encoding: [0xd4,0x18,0x40,0x00]
ldd [%g1], %o2
! CHECK: ldda [%i0+%l6] 131, %o2 ! encoding: [0xd4,0x9e,0x10,0x76]
ldda [%i0 + %l6] 131, %o2
! CHECK: stb %o2, [%i0+%l6] ! encoding: [0xd4,0x2e,0x00,0x16]
stb %o2, [%i0 + %l6]
! CHECK: stb %o2, [%i0+32] ! encoding: [0xd4,0x2e,0x20,0x20]
@@ -73,6 +82,15 @@
! CHECK: sta %o2, [%i0+%l6] 131 ! encoding: [0xd4,0xa6,0x10,0x76]
sta %o2, [%i0 + %l6] 131
! CHECK: std %o2, [%i0+%l6] ! encoding: [0xd4,0x3e,0x00,0x16]
std %o2, [%i0 + %l6]
! CHECK: std %o2, [%i0+32] ! encoding: [0xd4,0x3e,0x20,0x20]
std %o2, [%i0 + 32]
! CHECK: std %o2, [%g1] ! encoding: [0xd4,0x38,0x40,0x00]
std %o2, [%g1]
! CHECK: stda %o2, [%i0+%l6] 131 ! encoding: [0xd4,0xbe,0x10,0x76]
stda %o2, [%i0 + %l6] 131
! CHECK: flush %g1+%g2 ! encoding: [0x81,0xd8,0x40,0x02]
flush %g1 + %g2
! CHECK: flush %g1+8 ! encoding: [0x81,0xd8,0x60,0x08]