From aa3519f178fc6ea563f950a4077b34d8dc6c4470 Mon Sep 17 00:00:00 2001
From: Anirudh Prasad <anirudh_prasad@hotmail.com>
Date: Thu, 21 Oct 2021 09:48:21 -0400
Subject: [PATCH] [SystemZ][z/OS] Initial implementation for lowerCall on z/OS

- This patch provides the initial implementation for lowering a call on z/OS according to the XPLINK64 calling convention
- A series of changes have been made to SystemZCallingConv.td to account for these additional XPLINK64 changes including adding a new helper function to shadow the stack along with allocation of a register wherever appropriate
- For the cases of copying an f64 to a gr64 and an f128 / 128-bit vector type to a gr64, a `CCBitConvertToType` has been added and the value is bitcast appropriately in the lowering phase
- Support for the ADA register (R5) will be provided in a later patch.

Reviewed By: uweigand

Differential Revision: https://reviews.llvm.org/D111662
---
 .../lib/Target/SystemZ/SystemZCallingConv.cpp |   4 +
 llvm/lib/Target/SystemZ/SystemZCallingConv.h  |  71 +++++--
 llvm/lib/Target/SystemZ/SystemZCallingConv.td |  33 +--
 .../Target/SystemZ/SystemZISelLowering.cpp    |  69 +++++--
 llvm/lib/Target/SystemZ/SystemZRegisterInfo.h |  14 ++
 llvm/lib/Target/TargetMachine.cpp             |   3 +
 llvm/test/CodeGen/SystemZ/call-zos-01.ll      | 191 +++++++++++++++++
 llvm/test/CodeGen/SystemZ/call-zos-vararg.ll  | 195 ++++++++++++++++++
 llvm/test/CodeGen/SystemZ/call-zos-vec.ll     |  66 ++++++
 9 files changed, 601 insertions(+), 45 deletions(-)
 create mode 100644 llvm/test/CodeGen/SystemZ/call-zos-01.ll
 create mode 100644 llvm/test/CodeGen/SystemZ/call-zos-vararg.ll
 create mode 100644 llvm/test/CodeGen/SystemZ/call-zos-vec.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp b/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
index 86eb8365d527..9c73757d7f5c 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -28,3 +28,7 @@ const MCPhysReg SystemZ::XPLINK64ArgGPRs[SystemZ::XPLINK64NumArgGPRs] = {
 const MCPhysReg SystemZ::XPLINK64ArgFPRs[SystemZ::XPLINK64NumArgFPRs] = {
     SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
 };
+
+const MCPhysReg SystemZ::XPLINK64ArgVRs[SystemZ::XPLINK64NumArgVRs] = {
+    SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27,
+    SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31};
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
index 96c1080d5237..f82c61c0f344 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -27,6 +27,9 @@ namespace SystemZ {
 
   const unsigned XPLINK64NumArgFPRs = 4;
   extern const MCPhysReg XPLINK64ArgFPRs[XPLINK64NumArgFPRs];
+
+  const unsigned XPLINK64NumArgVRs = 8;
+  extern const MCPhysReg XPLINK64ArgVRs[XPLINK64NumArgVRs];
 } // end namespace SystemZ
 
 class SystemZCCState : public CCState {
@@ -124,7 +127,9 @@ inline bool CC_SystemZ_I128Indirect(unsigned &ValNo, MVT &ValVT,
   else
     llvm_unreachable("Unknown Calling Convention!");
 
-  unsigned Offset = Reg ? 0 : State.AllocateStack(8, Align(8));
+  unsigned Offset = Reg && !Subtarget.isTargetXPLINK64()
+                        ? 0
+                        : State.AllocateStack(8, Align(8));
 
   // Use that same location for all the pending parts.
   for (auto &It : PendingMembers) {
@@ -167,12 +172,6 @@ inline bool CC_XPLINK64_Allocate128BitVararg(unsigned &ValNo, MVT &ValVT,
                                              CCValAssign::LocInfo &LocInfo,
                                              ISD::ArgFlagsTy &ArgFlags,
                                              CCState &State) {
-  if (LocVT.getSizeInBits() < 128)
-    return false;
-
-  if (static_cast<SystemZCCState *>(&State)->IsFixed(ValNo))
-    return false;
-
   // For any C or C++ program, this should always be
   // false, since it is illegal to have a function
   // where the first argument is variadic. Therefore
@@ -185,21 +184,59 @@ inline bool CC_XPLINK64_Allocate128BitVararg(unsigned &ValNo, MVT &ValVT,
   bool AllocGPR3 = State.AllocateReg(SystemZ::R3D);
 
   // If GPR2 and GPR3 are available, then we may pass vararg in R2Q.
-  if (AllocGPR2 && AllocGPR3) {
-    State.addLoc(
-        CCValAssign::getReg(ValNo, ValVT, SystemZ::R2Q, LocVT, LocInfo));
+  // If only GPR3 is available, we need to set custom handling to copy
+  // hi bits into GPR3.
+  // Either way, we allocate on the stack.
+  if (AllocGPR3) {
+    // For f128 and vector var arg case, set the bitcast flag to bitcast to
+    // i128.
+    LocVT = MVT::i128;
+    LocInfo = CCValAssign::BCvt;
+    auto Offset = State.AllocateStack(16, Align(8));
+    if (AllocGPR2)
+      State.addLoc(
+          CCValAssign::getReg(ValNo, ValVT, SystemZ::R2Q, LocVT, LocInfo));
+    else
+      State.addLoc(
+          CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
     return true;
   }
 
-  // If only GPR3 is available, we allocate on stack but need to
-  // set custom handling to copy hi bits into GPR3.
-  if (!AllocGPR2 && AllocGPR3) {
-    auto Offset = State.AllocateStack(16, Align(8));
-    State.addLoc(
-        CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-    return true;
+  return false;
+}
+
+inline bool CC_XPLINK64_Shadow_Stack(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                     CCValAssign::LocInfo &LocInfo,
+                                     ISD::ArgFlagsTy &ArgFlags,
+                                     CCState &State) {
+  ArrayRef<MCPhysReg> RegList;
+
+  switch (LocVT.SimpleTy) {
+  case MVT::i64:
+    RegList = SystemZ::XPLINK64ArgGPRs;
+    break;
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v4f32:
+  case MVT::v2f64:
+    RegList = SystemZ::XPLINK64ArgVRs;
+    break;
+  case MVT::f32:
+  case MVT::f64:
+  case MVT::f128:
+    RegList = SystemZ::XPLINK64ArgFPRs;
+    break;
+  default:
+    return false;
   }
 
+  unsigned UnallocatedRegisterIndex = State.getFirstUnallocated(RegList);
+  // Whenever a register is still available, also reserve a stack slot for it.
+  if (UnallocatedRegisterIndex < RegList.size())
+    State.AllocateStack(LocVT.getSizeInBits() / 8, Align(8));
+
   return false;
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index c606e78b69b6..373023effb4a 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -224,6 +224,17 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
   // XPLINK64 ABI compliant code widens integral types smaller than i64
   // to i64 before placing the parameters either on the stack or in registers.
   CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+  // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRs.
+  CCIfType<[f32], CCIfNotFixed<CCPromoteToType<f64>>>,
+  CCIfType<[f64], CCIfNotFixed<CCBitConvertToType<i64>>>,
+  // long double, can only be passed in GPR2 and GPR3, if available,
+  // hence R2Q
+  CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
+  // Non fixed vector arguments are treated in the same way as long
+  // doubles.
+  CCIfSubtarget<"hasVector()",
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+      CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
 
   // A SwiftSelf is passed in callee-saved R10.
   CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>,
@@ -238,7 +249,7 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
   // The first 3 integer arguments are passed in registers R1D-R3D.
   // The rest will be passed in the user area. The address offset of the user
   // area can be found in register R4D.
-  CCIfType<[i32], CCAssignToReg<[R1L, R2L, R3L]>>,
+  CCIfType<[i64], CCCustom<"CC_XPLINK64_Shadow_Stack">>,
   CCIfType<[i64], CCAssignToReg<[R1D, R2D, R3D]>>,
 
   // The first 8 named vector arguments are passed in V24-V31.  Sub-128 vectors
@@ -247,6 +258,9 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
   CCIfSubtarget<"hasVector()",
     CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
              CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
+  CCIfSubtarget<"hasVector()",
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+             CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>>,
   CCIfSubtarget<"hasVector()",
     CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
              CCIfFixed<CCAssignToReg<[V24, V25, V26, V27,
@@ -255,28 +269,15 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
   // The first 4 named  float and double arguments are passed in registers FPR0-FPR6.
   // The rest will be passed in the user area.
   CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+  CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>,
   CCIfType<[f32], CCIfFixed<CCAssignToReg<[F0S, F2S, F4S, F6S]>>>,
   CCIfType<[f64], CCIfFixed<CCAssignToReg<[F0D, F2D, F4D, F6D]>>>,
   // The first 2 long double arguments are passed in register FPR0/FPR2
   // and FPR4/FPR6. The rest will be passed in the user area.
   CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
+  CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>,
   CCIfType<[f128], CCIfFixed<CCAssignToReg<[F0Q, F4Q]>>>,
 
-  // Non fixed floats are passed in GPRs
-  // Promote f32 to f64, if it needs to be passed in GPRs.
-  CCIfType<[f32], CCIfNotFixed<CCPromoteToType<f64>>>,
-  // Assign f64 varargs to their proper GPRs.
-  CCIfType<[f64], CCIfNotFixed<CCAssignToReg<[R1D, R2D, R3D]>>>,
-  // long double, can only be passed in GPR2 and GPR3, if available,
-  // hence R2Q
-  CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
-
-  // Non fixed vector arguments are treated in the same way as long
-  // doubles.
-  CCIfSubtarget<"hasVector()",
-    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-      CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>>,
-
   // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
   CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
   // Other f128 arguments are passed in 8-byte-aligned 16-byte stack slots.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index e3a79e6d1a99..a9bc4f30fff6 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1358,14 +1358,21 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
     return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
   case CCValAssign::AExt:
     return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
-  case CCValAssign::BCvt:
-    // If this is a short vector argument to be stored to the stack,
+  case CCValAssign::BCvt: {
+    assert(VA.getLocVT() == MVT::i64 || VA.getLocVT() == MVT::i128);
+    assert(VA.getValVT().isVector() || VA.getValVT() == MVT::f64 ||
+           VA.getValVT() == MVT::f128);
+    MVT BitCastToType = VA.getValVT().isVector() && VA.getLocVT() == MVT::i64
+                            ? MVT::v2i64
+                            : VA.getLocVT();
+    Value = DAG.getNode(ISD::BITCAST, DL, BitCastToType, Value);
+    // For ELF, this is a short vector argument to be stored to the stack,
     // bitcast to v2i64 and then extract first element.
-    assert(VA.getLocVT() == MVT::i64);
-    assert(VA.getValVT().isVector());
-    Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
-    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
-                       DAG.getConstant(0, DL, MVT::i32));
+    if (BitCastToType == MVT::v2i64)
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
+                         DAG.getConstant(0, DL, MVT::i32));
+    return Value;
+  }
   case CCValAssign::Full:
     return Value;
   default:
@@ -1472,6 +1479,10 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
         NumFixedFPRs += 1;
         RC = &SystemZ::FP64BitRegClass;
         break;
+      case MVT::f128:
+        NumFixedFPRs += 2;
+        RC = &SystemZ::FP128BitRegClass;
+        break;
       case MVT::v16i8:
       case MVT::v8i16:
       case MVT::v4i32:
@@ -1525,7 +1536,8 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
       InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
   }
 
-  if (IsVarArg) {
+  // FIXME: Add support for lowering varargs for XPLINK64 in a later patch.
+  if (IsVarArg && Subtarget.isTargetELF()) {
     // Save the number of non-varargs registers for later use by va_start, etc.
     FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
     FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
@@ -1564,6 +1576,8 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
     }
   }
 
+  // FIXME: For XPLINK64, add support for handling the incoming "ADA" special
+  // register (R5).
   return Chain;
 }
 
@@ -1604,6 +1618,11 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
   MachineFunction &MF = DAG.getMachineFunction();
   EVT PtrVT = getPointerTy(MF.getDataLayout());
   LLVMContext &Ctx = *DAG.getContext();
+  SystemZCallingConventionRegisters *Regs = Subtarget.getSpecialRegisters();
+
+  // FIXME: Tail call support for z/OS to be added later.
+  if (Subtarget.isTargetXPLINK64())
+    IsTailCall = false;
 
   // Detect unsupported vector argument and return types.
   if (Subtarget.hasVector()) {
@@ -1624,6 +1643,13 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = ArgCCInfo.getNextStackOffset();
 
+  if (Subtarget.isTargetXPLINK64())
+    // Although the XPLINK specifications for AMODE64 state that the minimum
+    // size of the param area is 32 bytes and no rounding is otherwise
+    // specified, we round this area up in 64-byte increments to be compatible
+    // with existing compilers.
+    NumBytes = std::max(64U, (unsigned)alignTo(NumBytes, 64));
+
   // Mark the start of the call.
   if (!IsTailCall)
     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
@@ -1674,17 +1700,24 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
     } else
       ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
 
-    if (VA.isRegLoc())
+    if (VA.isRegLoc()) {
+      // In XPLINK64, for the 128-bit vararg case, ArgValue is bitcast to an
+      // MVT::i128 type. We decompose the 128-bit type to a pair of its high
+      // and low values.
+      if (VA.getLocVT() == MVT::i128)
+        ArgValue = lowerI128ToGR128(DAG, ArgValue);
       // Queue up the argument copies and emit them at the end.
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
-    else {
+    } else {
       assert(VA.isMemLoc() && "Argument not register or memory");
 
       // Work out the address of the stack slot.  Unpromoted ints and
       // floats are passed as right-justified 8-byte values.
       if (!StackPtr.getNode())
-        StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
-      unsigned Offset = SystemZMC::ELFCallFrameSize + VA.getLocMemOffset();
+        StackPtr = DAG.getCopyFromReg(Chain, DL,
+                                      Regs->getStackPointerRegister(), PtrVT);
+      unsigned Offset = Regs->getStackPointerBias() + Regs->getCallFrameSize() +
+                        VA.getLocMemOffset();
       if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
         Offset += 4;
       SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
@@ -1693,6 +1726,17 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
       // Emit the store.
       MemOpChains.push_back(
           DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
+
+      // Although long doubles or vectors are passed through the stack when
+      // they are vararg (non-fixed arguments), if a long double or vector
+      // occupies the third and fourth slots of the argument list, GPR3 should
+      // still shadow the third slot of the argument list.
+      if (Subtarget.isTargetXPLINK64() && VA.needsCustom()) {
+        SDValue ShadowArgValue =
+            DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, ArgValue,
+                        DAG.getIntPtrConstant(1, DL));
+        RegsToPass.push_back(std::make_pair(SystemZ::R3D, ShadowArgValue));
+      }
     }
   }
 
@@ -1704,6 +1748,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
   // associated Target* opcodes.  Force %r1 to be used for indirect
   // tail calls.
   SDValue Glue;
+  // FIXME: Add support for XPLINK using the ADA register.
   if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
     Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
index 2a4253e2deaf..8ce01074873a 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -66,6 +66,12 @@ public:
   virtual const uint32_t *getCallPreservedMask(const MachineFunction &MF,
                                                CallingConv::ID CC) const = 0;
 
+  /// \returns the offset to the locals area.
+  virtual int getCallFrameSize() = 0;
+
+  /// \returns the stack pointer bias.
+  virtual int getStackPointerBias() = 0;
+
   /// Destroys the object. Bogus destructor allowing derived classes
   /// to override it.
   virtual ~SystemZCallingConventionRegisters(){};
@@ -91,6 +97,10 @@ public:
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
                                        CallingConv::ID CC) const override final;
 
+  int getCallFrameSize() override final { return 128; }
+
+  int getStackPointerBias() override final { return 2048; }
+
   /// Destroys the object. Bogus destructor overriding base class destructor
   ~SystemZXPLINK64Registers(){};
 };
@@ -113,6 +123,10 @@ public:
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
                                        CallingConv::ID CC) const override final;
 
+  int getCallFrameSize() override final { return SystemZMC::ELFCallFrameSize; }
+
+  int getStackPointerBias() override final { return 0; }
+
   /// Destroys the object. Bogus destructor overriding base class destructor
   ~SystemZELFRegisters(){};
 };
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 08295df376e1..390457dbb2bc 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -135,6 +135,9 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
     return true;
   }
 
+  if (TT.isOSBinFormatGOFF())
+    return true;
+
   if (TT.isOSBinFormatMachO()) {
     if (RM == Reloc::Static)
       return true;
diff --git a/llvm/test/CodeGen/SystemZ/call-zos-01.ll b/llvm/test/CodeGen/SystemZ/call-zos-01.ll
new file mode 100644
index 000000000000..7194d09cba16
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/call-zos-01.ll
@@ -0,0 +1,191 @@
+; Test the passing of scalar values in GPRs, FPRs in 64-bit calls on z/OS.
+;
+; RUN: llc < %s -mtriple=s390x-ibm-zos -mcpu=z10 | FileCheck %s
+
+; CHECK-LABEL: call_char:
+; CHECK: lghi  1, 8
+define i8 @call_char(){
+  %retval = call i8 (i8) @pass_char(i8 8)
+  ret i8 %retval
+}
+
+; CHECK-LABEL: call_short:
+; CHECK: lghi  1, 16
+define i16 @call_short() {
+entry:
+  %retval = call i16 (i16) @pass_short(i16 16)
+  ret i16 %retval
+}
+
+; CHECK-LABEL: call_int:
+; CHECK: lghi  1, 32
+; CHECK: lghi  2, 33
+define i32 @call_int() {
+entry:
+  %retval = call i32 (i32, i32) @pass_int(i32 32, i32 33)
+  ret i32 %retval
+}
+
+; CHECK-LABEL: call_long:
+; CHECK: lghi  1, 64
+; CHECK: lghi  2, 65
+; CHECK: lghi  3, 66
+define i64 @call_long() {
+entry:
+  %retval = call i64 (i64, i64, i64) @pass_long(i64 64, i64 65, i64 66)
+  ret i64 %retval
+}
+
+; CHECK-LABEL: call_ptr:
+; CHECK: lgr 1, 2
+define i32 @call_ptr(i32* %p1, i32* %p2) {
+entry:
+  %retval = call i32 (i32*) @pass_ptr(i32* %p2)
+  ret i32 %retval
+}
+
+; CHECK-LABEL: call_integrals:
+; CHECK: lghi  1, 64
+; CHECK: lghi  2, 32
+; CHECK: lghi  3, 16
+define i64 @call_integrals() {
+entry:
+  %retval = call i64 (i64, i32, i16, i64) @pass_integrals0(i64 64, i32 32, i16 16, i64 128)
+  ret i64 %retval
+}
+
+; CHECK-LABEL: pass_char:
+; CHECK: lgr 3, 1
+define signext i8 @pass_char(i8 signext %arg) {
+entry:
+  ret i8 %arg
+}
+
+; CHECK-LABEL: pass_short:
+; CHECK: lgr 3, 1
+define signext i16 @pass_short(i16 signext %arg) {
+entry:
+  ret i16 %arg
+}
+
+; CHECK-LABEL: pass_int:
+; CHECK: lgr 3, 2
+define signext i32 @pass_int(i32 signext %arg0, i32 signext %arg1) {
+entry:
+  ret i32 %arg1
+}
+
+; CHECK-LABEL: pass_long:
+; CHECK: agr 1, 2
+; CHECK: agr 3, 1
+define signext i64 @pass_long(i64 signext %arg0, i64 signext %arg1, i64 signext %arg2) {
+entry:
+  %N = add i64 %arg0, %arg1
+  %M = add i64 %N, %arg2
+  ret i64 %M
+}
+
+; CHECK-LABEL: pass_integrals0:
+; CHECK: ag  2, -{{[0-9]+}}(4)
+; CHECK-NEXT: lgr 3, 2
+define signext i64 @pass_integrals0(i64 signext %arg0, i32 signext %arg1, i16 signext %arg2, i64 signext %arg3) {
+entry:
+  %N = sext i32 %arg1 to i64
+  %M = add i64 %arg3, %N
+  ret i64 %M
+}
+
+; CHECK-LABEL: call_float:
+; CHECK: le 0, 0({{[0-9]}})
+define float @call_float() {
+entry:
+  %ret = call float (float) @pass_float(float 0x400921FB60000000)
+  ret float %ret
+}
+
+; CHECK-LABEL: call_double:
+; CHECK: larl  [[GENREG:[0-9]+]], @{{CPI[0-9]+_[0-9]+}}
+; CHECK-NEXT: ld  0, 0([[GENREG]])
+define double @call_double() {
+entry:
+  %ret = call double (double) @pass_double(double 3.141000e+00)
+  ret double %ret
+}
+
+; CHECK-LABEL: call_longdouble:
+; CHECK: larl  [[GENREG:[0-9]+]], @{{CPI[0-9]+_[0-9]+}}
+; CHECK-NEXT: ld  0, 0([[GENREG]])
+; CHECK-NEXT: ld  2, 8([[GENREG]])
+define fp128 @call_longdouble() {
+entry:
+  %ret = call fp128 (fp128) @pass_longdouble(fp128 0xLE0FC1518450562CD4000921FB5444261)
+  ret fp128 %ret
+}
+
+; CHECK-LABEL: call_floats0
+; CHECK: larl  [[GENREG:[0-9]+]], @{{CPI[0-9]+_[0-9]+}}
+; CHECK-NEXT: ld  1, 0([[GENREG]])
+; CHECK-NEXT: ld  3, 8([[GENREG]])
+; CHECK: lxr 5, 0
+; CHECK: lxr 0, 1
+; CHECK: lxr 4, 5
+define i64 @call_floats0(fp128 %arg0, double %arg1) {
+entry:
+  %ret = call i64 (fp128, fp128, double) @pass_floats0(fp128 0xLE0FC1518450562CD4000921FB5444261, fp128 %arg0, double %arg1)
+  ret i64 %ret
+}
+
+; CHECK-LABEL: call_floats1
+; CHECK: lxr 1, 0
+; CHECK: ldr 0, 4
+; CHECK: lxr 4, 1
+define i64 @call_floats1(fp128 %arg0, double %arg1) {
+entry:
+  %ret = call i64 (double, fp128) @pass_floats1(double %arg1, fp128 %arg0)
+  ret i64 %ret
+}
+
+; CHECK-LABEL: pass_float:
+; CHECK: larl  1, @{{CPI[0-9]+_[0-9]+}}
+; CHECK: aeb 0, 0(1)
+define float @pass_float(float %arg) {
+entry:
+  %X = fadd float %arg, 0x400821FB60000000
+  ret float %X
+}
+
+; CHECK-LABEL: pass_double:
+; CHECK: larl  1, @{{CPI[0-9]+_[0-9]+}}
+; CHECK: adb 0, 0(1)
+define double @pass_double(double %arg) {
+entry:
+  %X = fadd double %arg, 1.414213e+00
+  ret double %X
+}
+
+; CHECK-LABEL: pass_longdouble
+; CHECK: larl  1, @{{CPI[0-9]+_[0-9]+}}
+; CHECK: lxdb  1, 0(1)
+; CHECK: axbr  0, 1
+define fp128 @pass_longdouble(fp128 %arg) {
+entry:
+  %X = fadd fp128 %arg, 0xL10000000000000004000921FB53C8D4F
+  ret fp128 %X
+}
+
+; CHECK-LABEL: pass_floats0
+; CHECK: larl  1, @{{CPI[0-9]+_[0-9]+}}
+; CHECK: axbr  0, 4
+; CHECK: axbr  1, 0
+; CHECK: cxbr  1, 5
+define i64 @pass_floats0(fp128 %arg0, fp128 %arg1, double %arg2) {
+  %X = fadd fp128 %arg0, %arg1
+  %arg2_ext = fpext double %arg2 to fp128
+  %Y = fadd fp128 %X, %arg2_ext
+  %ret_bool = fcmp ueq fp128 %Y, 0xLE0FC1518450562CD4000921FB5444261
+  %ret = sext i1 %ret_bool to i64
+  ret i64 %ret
+}
+
+declare i64 @pass_floats1(double %arg0, fp128 %arg1)
+declare i32 @pass_ptr(i32* %arg)
diff --git a/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll b/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll
new file mode 100644
index 000000000000..2efe27172efc
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll
@@ -0,0 +1,195 @@
+; Test passing variable argument lists in 64-bit calls on z/OS.
+; RUN: llc < %s -mtriple=s390x-ibm-zos -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-ibm-zos -mcpu=z14 | FileCheck %s -check-prefix=ARCH12
+; CHECK-LABEL: call_vararg_double0
+; CHECK:       llihf 3, 1074118262
+; CHECK-NEXT:  oilf  3, 3367254360
+; CHECK:       lghi  1, 1
+; CHECK:       lghi  2, 2
+define i64 @call_vararg_double0() {
+entry:
+  %retval = call i64 (i64, i64, ...) @pass_vararg0(i64 1, i64 2, double 2.718000e+00)
+  ret i64 %retval
+}
+
+; CHECK-LABEL:  call_vararg_double1
+; CHECK:        llihf 0, 1074118262
+; CHECK-NEXT:   oilf  0, 3367254360
+; CHECK:        llihf 3, 1074340036
+; CHECK-NEXT:   oilf  3, 2611340116
+; CHECK:        lghi  1, 1
+; CHECK:        lghi  2, 2
+; CHECK:        stg 0, 2200(4)
+define i64 @call_vararg_double1() {
+entry:
+  %retval = call i64 (i64, i64, ...) @pass_vararg0(i64 1, i64 2, double 3.141000e+00, double 2.718000e+00)
+  ret i64 %retval
+}
+
+; CHECK-LABEL: call_vararg_double2
+; CHECK-NOT:   llihf 0
+; CHECK-NOT:   oilf 0
+; CHECK:       llihf 2, 1074118262
+; CHECK-NEXT:  oilf  2, 3367254360
+; CHECK:       lghi  1, 8200
+define i64 @call_vararg_double2() {
+entry:
+  %retval = call i64 (i64, ...) @pass_vararg2(i64 8200, double 2.718000e+00)
+  ret i64 %retval
+}
+
+; CHECK-LABEL: call_vararg_double3
+; CHECK:       llihf   0, 1072703839
+; CHECK-NEXT:  oilf    0, 2861204133
+; CHECK:       llihf   1, 1074118262
+; CHECK-NEXT:  oilf    1, 3367254360
+; CHECK:       llihf   2, 1074340036
+; CHECK-NEXT:  oilf    2, 2611340116
+; CHECK:       llihf   3, 1073127358
+; CHECK-NEXT:  oilf    3, 1992864825
+; CHECK:       stg     0, 2200(4)
+define i64 @call_vararg_double3() {
+entry:
+  %retval = call i64 (...) @pass_vararg3(double 2.718000e+00, double 3.141000e+00, double 1.414000e+00, double 1.010101e+00)
+  ret i64 %retval
+}
+
+; CHECK-LABEL: call_vararg_both0
+; CHECK:       lgr   2, 1
+; CHECK:       lgdr  1, 0
+define i64 @call_vararg_both0(i64 %arg0, double %arg1) {
+  %retval  = call i64(...) @pass_vararg3(double %arg1, i64 %arg0)
+  ret i64 %retval
+}
+
+; CHECK-LABEL: call_vararg_long_double0
+; CHECK:       larl  1, @CPI5_0
+; CHECK-NEXT:  ld    0, 0(1)
+; CHECK-NEXT:  ld    2, 8(1)
+; CHECK-NEXT:  lgdr  3, 0
+; CHECK:       lghi  1, 1
+; CHECK:       lghi  2, 2
+; CHECK:       std   0, 2192(4)
+; CHECK-NEXT:  std   2, 2200(4)
+define i64 @call_vararg_long_double0() {
+entry:
+  %retval = call i64 (i64, i64, ...) @pass_vararg0(i64 1, i64 2, fp128 0xLE0FC1518450562CD4000921FB5444261)
+  ret i64 %retval
+}
+
+; CHECK-LABEL: call_vararg_long_double1
+; CHECK:       lgdr  3, 0
+; CHECK:       lghi  1, 1
+; CHECK:       lghi  2, 2
+; CHECK:       std   0, 2192(4)
+; CHECK-NEXT:  std   2, 2200(4)
+define i64 @call_vararg_long_double1(fp128 %arg0) {
+entry:
+  %retval = call i64 (i64, i64, ...) @pass_vararg0(i64 1, i64 2, fp128 %arg0)
+  ret i64 %retval
+}
+
+; CHECK-LABEL: call_vararg_long_double2
+; CHECK:      std   4, 2208(4)
+; CHECK-NEXT: std   6, 2216(4)
+; CHECK:      lgdr  3, 0
+; CHECK:      lghi  1, 1
+; CHECK:      lghi  2, 2
+; CHECK:      std   0, 2192(4)
+; CHECK-NEXT: std   2, 2200(4)
+define i64 @call_vararg_long_double2(fp128 %arg0, fp128 %arg1) {
+entry:
+  %retval = call i64 (i64, i64, ...) @pass_vararg0(i64 1, i64 2, fp128 %arg0, fp128 %arg1)
+  ret i64 %retval
+}
+
+; CHECK-LABEL: call_vararg_long_double3
+; CHECK:       lgdr 3, 2
+; CHECK-NEXT:  lgdr 2, 0
+define i64 @call_vararg_long_double3(fp128 %arg0) {
+entry:
+  %retval = call i64 (...) @pass_vararg3(fp128 %arg0)
+  ret i64 %retval
+}
+
+; ARCH12-LABEL: call_vec_vararg_test0
+; ARCH12: vlgvg 3, 24, 1
+; ARCH12: vlgvg 2, 24, 0
+; ARCH12: lghi  1, 1
+define void @call_vec_vararg_test0(<2 x double> %v) {
+  %retval = call i64(i64, ...) @pass_vararg2(i64 1, <2 x double> %v)
+  ret void
+}
+
+; ARCH12-LABEL: call_vec_vararg_test1
+; ARCH12: larl  1, @CPI10_0
+; ARCH12: vl    0, 0(1), 3
+; ARCH12: vlgvg 3, 24, 0
+; ARCH12: vrepg 2, 0, 1
+; ARCH12: vst   25, 2208(4), 3
+; ARCH12: vst   24, 2192(4), 3
+define void @call_vec_vararg_test1(<4 x i32> %v, <2 x i64> %w) {
+  %retval = call i64(fp128, ...) @pass_vararg1(fp128 0xLE0FC1518450562CD4000921FB5444261, <4 x i32> %v, <2 x i64> %w)
+  ret void
+}
+
+; ARCH12-LABEL: call_vec_char_vararg_straddle
+; ARCH12: vlgvg 3, 24, 0
+; ARCH12: lghi  1, 1
+; ARCH12: lghi  2, 2
+; ARCH12: vst   24, 2192(4), 3
+define void @call_vec_char_vararg_straddle(<16 x i8> %v) {
+  %retval = call i64(i64, i64, ...) @pass_vararg0(i64 1, i64 2, <16 x i8> %v)
+  ret void
+}
+
+; ARCH12-LABEL: call_vec_short_vararg_straddle
+; ARCH12: vlgvg 3, 24, 0
+; ARCH12: lghi  1, 1
+; ARCH12: lghi  2, 2
+; ARCH12: vst   24, 2192(4), 3
+define void @call_vec_short_vararg_straddle(<8 x i16> %v) {
+  %retval = call i64(i64, i64, ...) @pass_vararg0(i64 1, i64 2, <8 x i16> %v)
+  ret void
+}
+
+; ARCH12-LABEL: call_vec_int_vararg_straddle
+; ARCH12: vlgvg 3, 24, 0
+; ARCH12: lghi  1, 1
+; ARCH12: lghi  2, 2
+; ARCH12: vst 24, 2192(4), 3
+define void @call_vec_int_vararg_straddle(<4 x i32> %v) {
+  %retval = call i64(i64, i64, ...) @pass_vararg0(i64 1, i64 2, <4 x i32> %v)
+  ret void
+}
+
+; ARCH12-LABEL: call_vec_double_vararg_straddle
+; ARCH12: vlgvg 3, 24, 0
+; ARCH12: lghi  1, 1
+; ARCH12: lghi  2, 2
+; ARCH12: vst 24, 2192(4), 3
+define void @call_vec_double_vararg_straddle(<2 x double> %v) {
+  %retval = call i64(i64, i64, ...) @pass_vararg0(i64 1, i64 2, <2 x double> %v)
+  ret void
+}
+
+; CHECK-LABEL: call_vararg_integral0
+; Since arguments 0, 1, and 2 are already in the correct
+; registers, we should have no loads of any sort into
+; GPRs 1, 2, and 3.
+; CHECK-NOT: lg  1
+; CHECK-NOT: lgr  1
+; CHECK-NOT: lg  2
+; CHECK-NOT: lgr  2
+; CHECK-NOT: lg  3
+; CHECK-NOT: lgr  3
+define i64 @call_vararg_integral0(i32 signext %arg0, i16 signext %arg1, i64 signext %arg2, i8 signext %arg3) {
+entry:
+  %retval = call i64(...) @pass_vararg3(i32 signext %arg0, i16 signext %arg1, i64 signext %arg2, i8 signext %arg3)
+  ret i64 %retval
+}
+
+declare i64 @pass_vararg0(i64 %arg0, i64 %arg1, ...)
+declare i64 @pass_vararg1(fp128 %arg0, ...)
+declare i64 @pass_vararg2(i64 %arg0, ...)
+declare i64 @pass_vararg3(...)
diff --git a/llvm/test/CodeGen/SystemZ/call-zos-vec.ll b/llvm/test/CodeGen/SystemZ/call-zos-vec.ll
new file mode 100644
index 000000000000..8d6b93387330
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/call-zos-vec.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mtriple=s390x-ibm-zos -mcpu=z13 | FileCheck %s
+
+; CHECK-LABEL: sum_vecs0
+; CHECK: vag 24, 24, 25
+define <2 x i64> @sum_vecs0(<2 x i64> %v1, <2 x i64> %v2) {
+entry:
+  %add0 = add <2 x i64> %v1, %v2
+  ret <2 x i64> %add0
+}
+
+; CHECK-LABEL: sum_vecs1
+; CHECK: vaf 1, 24, 25
+; CHECK: vaf 1, 1, 26
+; CHECK: vaf 1, 1, 27
+; CHECK: vaf 1, 1, 28
+; CHECK: vaf 1, 1, 29
+; CHECK: vl  0, 32(4), 4
+; CHECK: vaf 1, 1, 30
+; CHECK: vaf 1, 1, 31
+; CHECK: vaf 24, 1, 0
+define <4 x i32> @sum_vecs1(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4, <4 x i32> %v5, <4 x i32> %v6, <4 x i32> %v7, <4 x i32> %v8, <4 x i32> %v9) {
+entry:
+  %add0 = add <4 x i32> %v1, %v2
+  %add1 = add <4 x i32> %add0, %v3
+  %add2 = add <4 x i32> %add1, %v4
+  %add3 = add <4 x i32> %add2, %v5
+  %add4 = add <4 x i32> %add3, %v6
+  %add5 = add <4 x i32> %add4, %v7
+  %add6 = add <4 x i32> %add5, %v8
+  %add7 = add <4 x i32> %add6, %v9
+  ret <4 x i32> %add7
+}
+
+; Verify that 3 is used for passing integral types if
+; only 24 is used.
+; CHECK-LABEL: call_vecs0
+; CHECK: lgr 3, 1
+define i64 @call_vecs0(i64 %n, <2 x i64> %v1) {
+entry:
+  %ret = call i64 (<2 x i64>, i64) @pass_vecs0(<2 x i64> %v1, i64 %n)
+  ret i64 %ret
+}
+
+; Verify that 3 is not allocated for passing integral types
+; if 24 and %f0 are used.
+; CHECK-LABEL: call_vecs1
+; CHECK: vlr 24, 25
+; CHECK: stg 1, 2200(4)
+define i64 @call_vecs1(i64 %n, <2 x i64> %v1, double %x, <2 x i64> %v2) {
+entry:
+  %ret = call i64 (<2 x i64>, double, i64) @pass_vecs1(<2 x i64> %v2, double %x, i64 %n)
+  ret i64 %ret
+}
+
+; Verify that 3 is not allocated for passing integral types
+; if 24 and 25 are used.
+; CHECK-LABEL: call_vecs2
+; CHECK: mvghi 2208(4), 55
+define i64 @call_vecs2(<2 x i64> %v1, <2 x i64> %v2) {
+  %ret = call i64 (<2 x i64>, <2 x i64>, i64) @pass_vecs2(<2 x i64> %v1, <2 x i64> %v2, i64 55)
+  ret i64 %ret
+}
+
+declare i64 @pass_vecs0(<2 x i64> %v1, i64 %n)
+declare i64 @pass_vecs1(<2 x i64> %v1, double %x, i64 %n)
+declare i64 @pass_vecs2(<2 x i64> %v1, <2 x i64> %v2, i64 %n)