[AVR] Rewrite the function calling convention.

Summary:
The previous version relied on the standard calling convention, using
std::reverse() to try to force the AVR ABI. But this only works for
simple cases; it fails, for example, with aggregate types.

This patch rewrites the calling convention with custom C++ code that
implements the ABI defined in https://gcc.gnu.org/wiki/avr-gcc.

To do that, it adds a few 16-bit pseudo registers for unaligned argument
passing, such as R24R23. For example, this function:

    define void @fun({ i8, i16 } %a)

will pass %a.0 in R22 and %a.1 in R24R23.
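
As a rough, self-contained sketch (not code from the patch), the allocation
rule can be stated as: each argument's size is rounded up to an even number
of bytes, registers are taken downwards starting from R25, and the lowest
byte of an argument lands in the lowest register allocated to it. As in the
new analyzeArguments code, once one argument spills to the stack, all later
arguments stay on the stack. The ArgSizes input below is made up; the sketch
mirrors only the register-versus-stack decision, not the full lowering.

    #include <cstdio>
    #include <vector>

    int main() {
      // Argument sizes in bytes; a single { i8, i16 } argument is 3 bytes.
      std::vector<int> ArgSizes = {3};
      int NextFreeReg = 26;  // R25 down to R8 are available for arguments.
      bool UseStack = false; // Once an argument spills, the rest spill too.
      for (int Size : ArgSizes) {
        int Rounded = (Size + 1) & ~1; // Round up to an even size.
        NextFreeReg -= Rounded;
        if (UseStack || NextFreeReg < 8) {
          UseStack = true;
          std::printf("%d-byte argument passed on the stack\n", Size);
          continue;
        }
        // Prints "3-byte argument in R22..R24" for the example above:
        // the i8 goes in R22 and the i16 in R24:R23.
        std::printf("%d-byte argument in R%d..R%d\n", Size, NextFreeReg,
                    NextFreeReg + Size - 1);
      }
      return 0;
    }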

There are no instructions that can use these pseudo registers (MOVW, for
instance, only copies pairs starting at an even-numbered register), so a new
register class, DREGSMOVW, is defined to tell the MOVW-capable pairs apart
from them.

Also, ArgCC_AVR_BUILTIN_DIV is no longer necessary, as it is identical to
the behavior of the C++ code (strictly speaking, the clobber list of the
__div* functions is more restrictive, but that is currently unimplemented).

Reviewers: dylanmckay

Subscribers: Gaelan, Sh4rK, indirect, jwagen, efriedma, dsprenkels, hiraditya, Jim, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68524

Patch by Rodrigo Rivas Costa.
Committed by Dylan McKay on 2020-06-19 23:26:00 +12:00
parent 82a882db08
commit b9c26a9cfe
9 changed files with 433 additions and 208 deletions

llvm/lib/Target/AVR/AVRCallingConv.td

@@ -6,21 +6,13 @@
//
//===----------------------------------------------------------------------===//
// This describes the calling conventions for AVR architecture.
// Normal functions use a special calling convention, solved in code.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// AVR Return Value Calling Convention
//===----------------------------------------------------------------------===//
def RetCC_AVR : CallingConv
<[
// i8 is returned in R24.
CCIfType<[i8], CCAssignToReg<[R24]>>,
// i16 are returned in R25:R24, R23:R22, R21:R20 and R19:R18.
CCIfType<[i16], CCAssignToReg<[R25R24, R23R22, R21R20, R19R18]>>
]>;
// Special return value calling convention for runtime functions.
def RetCC_AVR_BUILTIN : CallingConv
<[
@@ -41,14 +33,6 @@ def ArgCC_AVR_Vararg : CallingConv
CCAssignToStack<2, 1>
]>;
// Special argument calling convention for
// division runtime functions.
def ArgCC_AVR_BUILTIN_DIV : CallingConv
<[
CCIfType<[i8], CCAssignToReg<[R24,R22]>>,
CCIfType<[i16], CCAssignToReg<[R25R24, R23R22]>>
]>;
//===----------------------------------------------------------------------===//
// Callee-saved register lists.
//===----------------------------------------------------------------------===//

llvm/lib/Target/AVR/AVRISelLowering.cpp

@@ -14,6 +14,7 @@
#include "AVRISelLowering.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -881,172 +882,145 @@ bool AVRTargetLowering::isOffsetFoldingLegal(
#include "AVRGenCallingConv.inc"
/// For each argument in a function store the number of pieces it is composed
/// of.
static void parseFunctionArgs(const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<unsigned> &Out) {
for (const ISD::InputArg &Arg : Ins) {
if(Arg.PartOffset > 0) continue;
unsigned Bytes = ((Arg.ArgVT.getSizeInBits()) + 7) / 8;
/// Registers for calling conventions, ordered in reverse as required by ABI.
/// Both arrays must be of the same length.
static const MCPhysReg RegList8[] = {
AVR::R25, AVR::R24, AVR::R23, AVR::R22, AVR::R21, AVR::R20,
AVR::R19, AVR::R18, AVR::R17, AVR::R16, AVR::R15, AVR::R14,
AVR::R13, AVR::R12, AVR::R11, AVR::R10, AVR::R9, AVR::R8};
static const MCPhysReg RegList16[] = {
AVR::R26R25, AVR::R25R24, AVR::R24R23, AVR::R23R22,
AVR::R22R21, AVR::R21R20, AVR::R20R19, AVR::R19R18,
AVR::R18R17, AVR::R17R16, AVR::R16R15, AVR::R15R14,
AVR::R14R13, AVR::R13R12, AVR::R12R11, AVR::R11R10,
AVR::R10R9, AVR::R9R8};
Out.push_back((Bytes + 1) / 2);
}
}
/// For external symbols there is no function prototype information so we
/// have to rely directly on argument sizes.
static void parseExternFuncCallArgs(const SmallVectorImpl<ISD::OutputArg> &In,
SmallVectorImpl<unsigned> &Out) {
for (unsigned i = 0, e = In.size(); i != e;) {
unsigned Size = 0;
unsigned Offset = 0;
while ((i != e) && (In[i].PartOffset == Offset)) {
Offset += In[i].VT.getStoreSize();
++i;
++Size;
}
Out.push_back(Size);
}
}
static StringRef getFunctionName(TargetLowering::CallLoweringInfo &CLI) {
SDValue Callee = CLI.Callee;
if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) {
return G->getSymbol();
}
if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
return G->getGlobal()->getName();
}
llvm_unreachable("don't know how to get the name for this callee");
}
static_assert(array_lengthof(RegList8) == array_lengthof(RegList16),
"8-bit and 16-bit register arrays must be of equal length");
/// Analyze incoming and outgoing function arguments. We need custom C++ code
/// to handle special constraints in the ABI like reversing the order of the
/// pieces of splitted arguments. In addition, all pieces of a certain argument
/// have to be passed either using registers or the stack but never mixing both.
static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
const Function *F, const DataLayout *TD,
const SmallVectorImpl<ISD::OutputArg> *Outs,
const SmallVectorImpl<ISD::InputArg> *Ins,
CallingConv::ID CallConv,
SmallVectorImpl<CCValAssign> &ArgLocs,
CCState &CCInfo, bool IsCall, bool IsVarArg) {
static const MCPhysReg RegList8[] = {AVR::R24, AVR::R22, AVR::R20,
AVR::R18, AVR::R16, AVR::R14,
AVR::R12, AVR::R10, AVR::R8};
static const MCPhysReg RegList16[] = {AVR::R25R24, AVR::R23R22, AVR::R21R20,
AVR::R19R18, AVR::R17R16, AVR::R15R14,
AVR::R13R12, AVR::R11R10, AVR::R9R8};
if (IsVarArg) {
// Variadic functions do not need all the analysis below.
if (IsCall) {
CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_Vararg);
} else {
CCInfo.AnalyzeFormalArguments(*Ins, ArgCC_AVR_Vararg);
/// to handle special constraints in the ABI.
/// In addition, all pieces of a certain argument have to be passed either
/// using registers or the stack but never mixing both.
template <typename ArgT>
static void
analyzeArguments(TargetLowering::CallLoweringInfo *CLI, const Function *F,
const DataLayout *TD, const SmallVectorImpl<ArgT> &Args,
SmallVectorImpl<CCValAssign> &ArgLocs, CCState &CCInfo) {
unsigned NumArgs = Args.size();
// This is the index of the last used register, in RegList*.
// -1 means R26 (R26 is never actually used in CC).
int RegLastIdx = -1;
// Once a value is passed to the stack it will always be used
bool UseStack = false;
for (unsigned i = 0; i != NumArgs;) {
MVT VT = Args[i].VT;
// We have to count the number of bytes for each function argument, that is
// those Args with the same OrigArgIndex. This is important in case the
// function takes an aggregate type.
// Current argument will be between [i..j).
unsigned ArgIndex = Args[i].OrigArgIndex;
unsigned TotalBytes = VT.getStoreSize();
unsigned j = i + 1;
for (; j != NumArgs; ++j) {
if (Args[j].OrigArgIndex != ArgIndex)
break;
TotalBytes += Args[j].VT.getStoreSize();
}
return;
}
// Round up to even number of bytes.
TotalBytes = alignTo(TotalBytes, 2);
// Skip zero sized arguments
if (TotalBytes == 0)
continue;
// The index of the first register to be used
unsigned RegIdx = RegLastIdx + TotalBytes;
RegLastIdx = RegIdx;
// If there are not enough registers, use the stack
if (RegIdx >= array_lengthof(RegList8)) {
UseStack = true;
}
for (; i != j; ++i) {
MVT VT = Args[i].VT;
// Fill in the Args array which will contain original argument sizes.
SmallVector<unsigned, 8> Args;
if (IsCall) {
parseExternFuncCallArgs(*Outs, Args);
} else {
assert(F != nullptr && "function should not be null");
parseFunctionArgs(*Ins, Args);
}
unsigned RegsLeft = array_lengthof(RegList8), ValNo = 0;
// Variadic functions always use the stack.
bool UsesStack = false;
for (unsigned i = 0, pos = 0, e = Args.size(); i != e; ++i) {
unsigned Size = Args[i];
// If we have a zero-sized argument, don't attempt to lower it.
// AVR-GCC does not support zero-sized arguments and so we need not
// worry about ABI compatibility.
if (Size == 0) continue;
MVT LocVT = (IsCall) ? (*Outs)[pos].VT : (*Ins)[pos].VT;
// If we have plenty of regs to pass the whole argument do it.
if (!UsesStack && (Size <= RegsLeft)) {
const MCPhysReg *RegList = (LocVT == MVT::i16) ? RegList16 : RegList8;
for (unsigned j = 0; j != Size; ++j) {
unsigned Reg = CCInfo.AllocateReg(
ArrayRef<MCPhysReg>(RegList, array_lengthof(RegList8)));
if (UseStack) {
auto evt = EVT(VT).getTypeForEVT(CCInfo.getContext());
unsigned Offset = CCInfo.AllocateStack(TD->getTypeAllocSize(evt),
TD->getABITypeAlign(evt));
CCInfo.addLoc(
CCValAssign::getReg(ValNo++, LocVT, Reg, LocVT, CCValAssign::Full));
--RegsLeft;
}
// Reverse the order of the pieces to agree with the "big endian" format
// required in the calling convention ABI.
std::reverse(ArgLocs.begin() + pos, ArgLocs.begin() + pos + Size);
} else {
// Pass the rest of arguments using the stack.
UsesStack = true;
for (unsigned j = 0; j != Size; ++j) {
unsigned Offset = CCInfo.AllocateStack(
TD->getTypeAllocSize(EVT(LocVT).getTypeForEVT(CCInfo.getContext())),
TD->getABITypeAlign(EVT(LocVT).getTypeForEVT(CCInfo.getContext())));
CCInfo.addLoc(CCValAssign::getMem(ValNo++, LocVT, Offset, LocVT,
CCValAssign::Full));
CCValAssign::getMem(i, VT, Offset, VT, CCValAssign::Full));
} else {
unsigned Reg;
if (VT == MVT::i8) {
Reg = CCInfo.AllocateReg(RegList8[RegIdx]);
} else if (VT == MVT::i16) {
Reg = CCInfo.AllocateReg(RegList16[RegIdx]);
} else {
llvm_unreachable(
"calling convention can only manage i8 and i16 types");
}
assert(Reg && "register not available in calling convention");
CCInfo.addLoc(CCValAssign::getReg(i, VT, Reg, VT, CCValAssign::Full));
// Registers inside a particular argument are sorted in increasing order
// (remember the array is reversed).
RegIdx -= VT.getStoreSize();
}
}
pos += Size;
}
}
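
For reference only (this trace is not part of the patch), running the new
analyzeArguments logic over the first case in the new struct-argument test
below, define void @ret_void_args_struct_i8_i32({ i8, i32 } %a), where the
single aggregate argument is split into an i8 part and two i16 parts sharing
one OrigArgIndex, gives:

    TotalBytes = 1 + 2 + 2 = 5, rounded up to 6
    RegIdx     = RegLastIdx + TotalBytes = -1 + 6 = 5
    i8  part  -> RegList8[5]  = R20,    then RegIdx -= 1 -> 4
    i16 part  -> RegList16[4] = R22R21, then RegIdx -= 2 -> 2
    i16 part  -> RegList16[2] = R24R23

which matches the sts expectations in that test (r20 for the i8, r21..r24
for the i32).
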
static void analyzeBuiltinArguments(TargetLowering::CallLoweringInfo &CLI,
const Function *F, const DataLayout *TD,
const SmallVectorImpl<ISD::OutputArg> *Outs,
const SmallVectorImpl<ISD::InputArg> *Ins,
CallingConv::ID CallConv,
SmallVectorImpl<CCValAssign> &ArgLocs,
CCState &CCInfo, bool IsCall, bool IsVarArg) {
StringRef FuncName = getFunctionName(CLI);
/// Count the total number of bytes needed to pass or return these arguments.
template <typename ArgT>
static unsigned getTotalArgumentsSizeInBytes(const SmallVectorImpl<ArgT> &Args) {
unsigned TotalBytes = 0;
if (FuncName.startswith("__udivmod") || FuncName.startswith("__divmod")) {
CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_BUILTIN_DIV);
for (const ArgT& Arg : Args) {
TotalBytes += Arg.VT.getStoreSize();
}
return TotalBytes;
}
/// Analyze incoming and outgoing value of returning from a function.
/// The algorithm is similar to analyzeArguments, but there can only be
/// one value, possibly an aggregate, and it is limited to 8 bytes.
template <typename ArgT>
static void analyzeReturnValues(const SmallVectorImpl<ArgT> &Args,
CCState &CCInfo) {
unsigned NumArgs = Args.size();
unsigned TotalBytes = getTotalArgumentsSizeInBytes(Args);
// CanLowerReturn() guarantees this assertion.
assert(TotalBytes <= 8 && "return values greater than 8 bytes cannot be lowered");
// GCC-ABI says that the size is rounded up to the next even number,
// but actually once it is more than 4 it will always round up to 8.
if (TotalBytes > 4) {
TotalBytes = 8;
} else {
analyzeStandardArguments(&CLI, F, TD, Outs, Ins,
CallConv, ArgLocs, CCInfo,
IsCall, IsVarArg);
TotalBytes = alignTo(TotalBytes, 2);
}
}
static void analyzeArguments(TargetLowering::CallLoweringInfo *CLI,
const Function *F, const DataLayout *TD,
const SmallVectorImpl<ISD::OutputArg> *Outs,
const SmallVectorImpl<ISD::InputArg> *Ins,
CallingConv::ID CallConv,
SmallVectorImpl<CCValAssign> &ArgLocs,
CCState &CCInfo, bool IsCall, bool IsVarArg) {
switch (CallConv) {
case CallingConv::AVR_BUILTIN: {
analyzeBuiltinArguments(*CLI, F, TD, Outs, Ins,
CallConv, ArgLocs, CCInfo,
IsCall, IsVarArg);
return;
}
default: {
analyzeStandardArguments(CLI, F, TD, Outs, Ins,
CallConv, ArgLocs, CCInfo,
IsCall, IsVarArg);
return;
// The index of the first register to use.
int RegIdx = TotalBytes - 1;
for (unsigned i = 0; i != NumArgs; ++i) {
MVT VT = Args[i].VT;
unsigned Reg;
if (VT == MVT::i8) {
Reg = CCInfo.AllocateReg(RegList8[RegIdx]);
} else if (VT == MVT::i16) {
Reg = CCInfo.AllocateReg(RegList16[RegIdx]);
} else {
llvm_unreachable("calling convention can only manage i8 and i16 types");
}
assert(Reg && "register not available in calling convention");
CCInfo.addLoc(CCValAssign::getReg(i, VT, Reg, VT, CCValAssign::Full));
// Registers sort in increasing order
RegIdx -= VT.getStoreSize();
}
}
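
Again for reference only (not part of the patch), analyzeReturnValues handles
the { i8, i16, i8 } return from the new return-value test below as follows:
TotalBytes is 4, which is even and not above 4, so it stays 4; RegIdx starts
at TotalBytes - 1 = 3, so the first i8 goes in RegList8[3] = R22, the i16 in
RegList16[2] = R24R23, and the last i8 in RegList8[0] = R25. A 6-byte return
such as { i32, i16 } trips the "more than 4" rule, is widened to 8 bytes, and
therefore starts at R18 instead of R20, as the same test checks.
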
SDValue AVRTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
auto DL = DAG.getDataLayout();
@@ -1056,8 +1030,12 @@ SDValue AVRTargetLowering::LowerFormalArguments(
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
analyzeArguments(nullptr, &MF.getFunction(), &DL, 0, &Ins, CallConv, ArgLocs, CCInfo,
false, isVarArg);
// Variadic functions do not need all the analysis below.
if (isVarArg) {
CCInfo.AnalyzeFormalArguments(Ins, ArgCC_AVR_Vararg);
} else {
analyzeArguments(nullptr, &MF.getFunction(), &DL, Ins, ArgLocs, CCInfo);
}
SDValue ArgValue;
for (CCValAssign &VA : ArgLocs) {
@@ -1178,8 +1156,12 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
getPointerTy(DAG.getDataLayout()));
}
analyzeArguments(&CLI, F, &DAG.getDataLayout(), &Outs, 0, CallConv, ArgLocs, CCInfo,
true, isVarArg);
// Variadic functions do not need all the analysis below.
if (isVarArg) {
CCInfo.AnalyzeCallOperands(Outs, ArgCC_AVR_Vararg);
} else {
analyzeArguments(&CLI, F, &DAG.getDataLayout(), Outs, ArgLocs, CCInfo);
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -1316,13 +1298,10 @@ SDValue AVRTargetLowering::LowerCallResult(
*DAG.getContext());
// Handle runtime calling convs.
auto CCFunction = CCAssignFnForReturn(CallConv);
CCInfo.AnalyzeCallResult(Ins, CCFunction);
if (CallConv != CallingConv::AVR_BUILTIN && RVLocs.size() > 1) {
// Reverse splitted return values to get the "big endian" format required
// to agree with the calling convention ABI.
std::reverse(RVLocs.begin(), RVLocs.end());
if (CallConv == CallingConv::AVR_BUILTIN) {
CCInfo.AnalyzeCallResult(Ins, RetCC_AVR_BUILTIN);
} else {
analyzeReturnValues(Ins, CCInfo);
}
// Copy all of the result registers out of their specified physreg.
@@ -1341,26 +1320,17 @@ SDValue AVRTargetLowering::LowerCallResult(
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
CCAssignFn *AVRTargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
switch (CC) {
case CallingConv::AVR_BUILTIN:
return RetCC_AVR_BUILTIN;
default:
return RetCC_AVR;
bool AVRTargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
if (CallConv == CallingConv::AVR_BUILTIN) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_AVR_BUILTIN);
}
}
bool
AVRTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const
{
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
auto CCFunction = CCAssignFnForReturn(CallConv);
return CCInfo.CheckReturn(Outs, CCFunction);
unsigned TotalBytes = getTotalArgumentsSizeInBytes(Outs);
return TotalBytes <= 8;
}
SDValue
@@ -1376,25 +1346,19 @@ AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Analyze return values.
auto CCFunction = CCAssignFnForReturn(CallConv);
CCInfo.AnalyzeReturn(Outs, CCFunction);
// If this is the first return lowered for this function, add the regs to
// the liveout set for the function.
MachineFunction &MF = DAG.getMachineFunction();
unsigned e = RVLocs.size();
// Reverse splitted return values to get the "big endian" format required
// to agree with the calling convention ABI.
if (e > 1) {
std::reverse(RVLocs.begin(), RVLocs.end());
// Analyze return values.
if (CallConv == CallingConv::AVR_BUILTIN) {
CCInfo.AnalyzeReturn(Outs, RetCC_AVR_BUILTIN);
} else {
analyzeReturnValues(Outs, CCInfo);
}
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
for (unsigned i = 0; i != e; ++i) {
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");

llvm/lib/Target/AVR/AVRISelLowering.h

@@ -146,10 +146,8 @@ private:
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const;
bool CanLowerReturn(CallingConv::ID CallConv,
MachineFunction &MF, bool isVarArg,
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;

llvm/lib/Target/AVR/AVRInstrInfo.cpp

@@ -48,7 +48,7 @@ void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Not all AVR devices support the 16-bit `MOVW` instruction.
if (AVR::DREGSRegClass.contains(DestReg, SrcReg)) {
if (STI.hasMOVW()) {
if (STI.hasMOVW() && AVR::DREGSMOVWRegClass.contains(DestReg, SrcReg)) {
BuildMI(MBB, MI, DL, get(AVR::MOVWRdRr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {

llvm/lib/Target/AVR/AVRRegisterInfo.td

@@ -103,6 +103,17 @@ CoveredBySubRegs = 1 in
def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>;
// Pseudo registers for unaligned i16
def R26R25 : AVRReg<25, "r26:r25", [R25, R26]>, DwarfRegNum<[25]>;
def R24R23 : AVRReg<23, "r24:r23", [R23, R24]>, DwarfRegNum<[23]>;
def R22R21 : AVRReg<21, "r22:r21", [R21, R22]>, DwarfRegNum<[21]>;
def R20R19 : AVRReg<19, "r20:r19", [R19, R20]>, DwarfRegNum<[19]>;
def R18R17 : AVRReg<17, "r18:r17", [R17, R18]>, DwarfRegNum<[17]>;
def R16R15 : AVRReg<15, "r16:r15", [R15, R16]>, DwarfRegNum<[15]>;
def R14R13 : AVRReg<13, "r14:r13", [R13, R14]>, DwarfRegNum<[13]>;
def R12R11 : AVRReg<11, "r12:r11", [R11, R12]>, DwarfRegNum<[11]>;
def R10R9 : AVRReg<9, "r10:r9", [R9, R10]>, DwarfRegNum<[9]>;
}
//===----------------------------------------------------------------------===//
@@ -146,6 +157,22 @@ def LD8lo : RegisterClass<"AVR", [i8], 8,
// Main 16-bit pair register class.
def DREGS : RegisterClass<"AVR", [i16], 8,
(
// Return value and arguments.
add R25R24, R19R18, R21R20, R23R22,
// Scratch registers.
R31R30, R27R26,
// Callee saved registers.
R29R28, R17R16, R15R14, R13R12, R11R10,
R9R8, R7R6, R5R4, R3R2, R1R0,
// Pseudo regs for unaligned 16-bits
R26R25, R24R23, R22R21,
R20R19, R18R17, R16R15,
R14R13, R12R11, R10R9
)>;
// 16-bit pair register class for movw
def DREGSMOVW : RegisterClass<"AVR", [i16], 8,
(
// Return value and arguments.
add R25R24, R19R18, R21R20, R23R22,


@@ -0,0 +1,84 @@
; RUN: llc < %s -march=avr | FileCheck %s
; CHECK-LABEL: ret_void_args_struct_i8_i32
define void @ret_void_args_struct_i8_i32({ i8, i32 } %a) {
start:
; CHECK: sts 4, r20
%0 = extractvalue { i8, i32 } %a, 0
store volatile i8 %0, i8* inttoptr (i64 4 to i8*)
; CHECK-NEXT: sts 8, r24
; CHECK-NEXT: sts 7, r23
; CHECK-NEXT: sts 6, r22
; CHECK-NEXT: sts 5, r21
%1 = extractvalue { i8, i32 } %a, 1
store volatile i32 %1, i32* inttoptr (i64 5 to i32*)
ret void
}
; CHECK-LABEL: ret_void_args_struct_i8_i8_i8_i8
define void @ret_void_args_struct_i8_i8_i8_i8({ i8, i8, i8, i8 } %a) {
start:
; CHECK: sts 4, r22
%0 = extractvalue { i8, i8, i8, i8 } %a, 0
store volatile i8 %0, i8* inttoptr (i64 4 to i8*)
; CHECK-NEXT: sts 5, r23
%1 = extractvalue { i8, i8, i8, i8 } %a, 1
store volatile i8 %1, i8* inttoptr (i64 5 to i8*)
; CHECK-NEXT: sts 6, r24
%2 = extractvalue { i8, i8, i8, i8 } %a, 2
store volatile i8 %2, i8* inttoptr (i64 6 to i8*)
; CHECK-NEXT: sts 7, r25
%3 = extractvalue { i8, i8, i8, i8 } %a, 3
store volatile i8 %3, i8* inttoptr (i64 7 to i8*)
ret void
}
; CHECK-LABEL: ret_void_args_struct_i32_16_i8
define void @ret_void_args_struct_i32_16_i8({ i32, i16, i8} %a) {
start:
; CHECK: sts 7, r21
; CHECK-NEXT: sts 6, r20
; CHECK-NEXT: sts 5, r19
; CHECK-NEXT: sts 4, r18
%0 = extractvalue { i32, i16, i8 } %a, 0
store volatile i32 %0, i32* inttoptr (i64 4 to i32*)
; CHECK-NEXT: sts 5, r23
; CHECK-NEXT: sts 4, r22
%1 = extractvalue { i32, i16, i8 } %a, 1
store volatile i16 %1, i16* inttoptr (i64 4 to i16*)
; CHECK-NEXT: sts 4, r24
%2 = extractvalue { i32, i16, i8 } %a, 2
store volatile i8 %2, i8* inttoptr (i64 4 to i8*)
ret void
}
; CHECK-LABEL: ret_void_args_struct_i8_i32_struct_i32_i8
define void @ret_void_args_struct_i8_i32_struct_i32_i8({ i8, i32 } %a, { i32, i8 } %b) {
start:
; CHECK: sts 4, r20
%0 = extractvalue { i8, i32 } %a, 0
store volatile i8 %0, i8* inttoptr (i64 4 to i8*)
; CHECK-NEXT: sts 8, r24
; CHECK-NEXT: sts 7, r23
; CHECK-NEXT: sts 6, r22
; CHECK-NEXT: sts 5, r21
%1 = extractvalue { i8, i32 } %a, 1
store volatile i32 %1, i32* inttoptr (i64 5 to i32*)
; CHECK-NEXT: sts 9, r17
; CHECK-NEXT: sts 8, r16
; CHECK-NEXT: sts 7, r15
; CHECK-NEXT: sts 6, r14
%2 = extractvalue { i32, i8 } %b, 0
store volatile i32 %2, i32* inttoptr (i64 6 to i32*)
; CHECK-NEXT: sts 7, r18
%3 = extractvalue { i32, i8 } %b, 1
store volatile i8 %3, i8* inttoptr (i64 7 to i8*)
ret void
}


@@ -0,0 +1,89 @@
; RUN: llc < %s -march=avr | FileCheck %s
declare void @ret_void_args_i8(i8 %a)
declare void @ret_void_args_i8_i32(i8 %a, i32 %b)
declare void @ret_void_args_i8_i8_i8_i8(i8 %a, i8 %b, i8 %c, i8 %d)
declare void @ret_void_args_i32_i16_i8(i32 %a, i16 %b, i8 %c)
declare void @ret_void_args_i64(i64 %a)
declare void @ret_void_args_i64_i64(i64 %a, i64 %b)
declare void @ret_void_args_i64_i64_i16(i64 %a, i64 %b, i16 %c)
; CHECK-LABEL: call_void_args_i8
define void @call_void_args_i8() {
; CHECK: ldi r24, 64
call void @ret_void_args_i8 (i8 64)
ret void
}
; CHECK-LABEL: call_void_args_i8_i32
define void @call_void_args_i8_i32() {
; CHECK: ldi r20, 4
; CHECK-NEXT: ldi r21, 3
; CHECK-NEXT: ldi r22, 2
; CHECK-NEXT: ldi r23, 1
; CHECK-NEXT: ldi r24, 64
call void @ret_void_args_i8_i32 (i8 64, i32 16909060)
ret void
}
; CHECK-LABEL: call_void_args_i8_i8_i8_i8
define void @call_void_args_i8_i8_i8_i8() {
; CHECK: ldi r24, 1
; CHECK-NEXT: ldi r22, 2
; CHECK-NEXT: ldi r20, 3
; CHECK-NEXT: ldi r18, 4
call void @ret_void_args_i8_i8_i8_i8(i8 1, i8 2, i8 3, i8 4)
ret void
}
; CHECK-LABEL: call_void_args_i32_i16_i8
define void @call_void_args_i32_i16_i8() {
; CHECK: ldi r22, 4
; CHECK-NEXT: ldi r23, 3
; CHECK-NEXT: ldi r24, 2
; CHECK-NEXT: ldi r25, 1
; CHECK-NEXT: ldi r20, 1
; CHECK-NEXT: ldi r21, 4
; CHECK-NEXT: ldi r18, 64
call void @ret_void_args_i32_i16_i8(i32 16909060, i16 1025, i8 64)
ret void
}
; CHECK-LABEL: call_void_args_i64
define void @call_void_args_i64() {
; CHECK: ldi r18, 8
; CHECK-NEXT: ldi r19, 7
; CHECK-NEXT: ldi r20, 6
; CHECK-NEXT: ldi r21, 5
; CHECK-NEXT: ldi r22, 4
; CHECK-NEXT: ldi r23, 3
; CHECK-NEXT: ldi r24, 2
; CHECK-NEXT: ldi r25, 1
call void @ret_void_args_i64(i64 72623859790382856)
ret void
}
; CHECK-LABEL: call_void_args_i64_i64
define void @call_void_args_i64_i64() {
; CHECK: ldi r18, 8
; CHECK-NEXT: ldi r19, 7
; CHECK-NEXT: ldi r20, 6
; CHECK-NEXT: ldi r21, 5
; CHECK-NEXT: ldi r22, 4
; CHECK-NEXT: ldi r23, 3
; CHECK-NEXT: ldi r24, 2
; CHECK-NEXT: ldi r25, 1
; the second arg is in r10:r17, but unordered
; CHECK: r17,
; CHECK: r10,
call void @ret_void_args_i64_i64(i64 72623859790382856, i64 651345242494996224)
ret void
}
; CHECK-LABEL: call_void_args_i64_i64_i16
define void @call_void_args_i64_i64_i16() {
; CHECK: r8,
; CHECK: r9,
call void @ret_void_args_i64_i64_i16(i64 72623859790382856, i64 651345242494996224, i16 5655)
ret void
}


@@ -0,0 +1,48 @@
; RUN: llc < %s -march=avr | FileCheck %s
declare void @ret_void_args_struct_i8_i32({ i8, i32 } %a)
declare void @ret_void_args_struct_i8_i8_i8_i8({ i8, i8, i8, i8 } %a)
declare void @ret_void_args_struct_i32_i16_i8({ i32, i16, i8} %a)
declare void @ret_void_args_struct_i8_i32_struct_i32_i8({ i8, i32 } %a, { i32, i8 } %b)
; CHECK-LABEL: call_void_args_struct_i8_i32
define void @call_void_args_struct_i8_i32() {
; CHECK: ldi r20, 64
; CHECK-NEXT: r21,
; CHECK-NEXT: r22,
; CHECK-NEXT: r23,
; CHECK-NEXT: r24,
call void @ret_void_args_struct_i8_i32({ i8, i32 } { i8 64, i32 16909060 })
ret void
}
; CHECK-LABEL: @call_void_args_struct_i8_i8_i8_i8
define void @call_void_args_struct_i8_i8_i8_i8() {
; CHECK: ldi r22, 1
; CHECK-NEXT: ldi r23, 2
; CHECK-NEXT: ldi r24, 3
; CHECK-NEXT: ldi r25, 4
call void @ret_void_args_struct_i8_i8_i8_i8({ i8, i8, i8, i8 } { i8 1, i8 2, i8 3, i8 4 })
ret void
}
; CHECK-LABEL: @call_void_args_struct_i32_i16_i8
define void @call_void_args_struct_i32_i16_i8() {
; CHECK: ldi r18, 4
; CHECK-NEXT: ldi r19, 3
; CHECK-NEXT: ldi r20, 2
; CHECK-NEXT: ldi r21, 1
; CHECK-NEXT: ldi r22, 23
; CHECK-NEXT: ldi r23, 22
; CHECK-NEXT: ldi r24, 64
call void @ret_void_args_struct_i32_i16_i8({ i32, i16, i8 } { i32 16909060, i16 5655, i8 64 })
ret void
}
; CHECK-LABEL: @call_void_args_struct_i8_i32_struct_i32_i8
define void @call_void_args_struct_i8_i32_struct_i32_i8() {
; CHECK: ldi r20, 64
; CHECK: ldi r18, 65
call void @ret_void_args_struct_i8_i32_struct_i32_i8({ i8, i32 } { i8 64, i32 16909060 }, { i32, i8 } { i32 287454020, i8 65 })
ret void
}


@@ -0,0 +1,31 @@
; RUN: llc < %s -march=avr | FileCheck %s
; CHECK-LABEL: ret_struct_i8_i16_i8
define { i8, i16, i8 } @ret_struct_i8_i16_i8() {
start:
; for some reason the i16 is loaded to r24:r25
; and then moved to r23:r24
; CHECK: ldi r22, 64
; CHECK-NEXT: r23,
; CHECK-NEXT: r24,
; CHECK-NEXT: r25, 11
%0 = insertvalue {i8, i16, i8} undef, i8 64, 0
%1 = insertvalue {i8, i16, i8} %0, i16 1024, 1
%2 = insertvalue {i8, i16, i8} %1, i8 11, 2
ret {i8, i16, i8} %2
}
; CHECK-LABEL: ret_struct_i32_i16
define { i32, i16 } @ret_struct_i32_i16() {
start:
; CHECK: ldi r18, 4
; CHECK-NEXT: ldi r19, 3
; CHECK-NEXT: ldi r20, 2
; CHECK-NEXT: ldi r21, 1
; CHECK-NEXT: ldi r22, 0
; CHECK-NEXT: ldi r23, 8
%0 = insertvalue { i32, i16 } undef, i32 16909060, 0
%1 = insertvalue { i32, i16 } %0, i16 2048, 1
ret { i32, i16} %1
}