From 3b95157090110e8e9f41903ec2d04db372470b03 Mon Sep 17 00:00:00 2001
From: Oren Ben Simhon
Date: Wed, 21 Dec 2016 08:31:45 +0000
Subject: [PATCH] [X86] Vectorcall Calling Convention - Adding CodeGen Complete Support

The vectorcall calling convention specifies that arguments to functions are
to be passed in registers, when possible. vectorcall uses more registers for
arguments than fastcall or the default x64 calling convention use. The
vectorcall calling convention is only supported in native code on x86 and x64
processors that include Streaming SIMD Extensions 2 (SSE2) and above.

The current implementation does not handle Homogeneous Vector Aggregates
(HVAs) correctly, and this patch fixes it. This commit also includes
additional lit tests to better cover HVA corner cases.

Differential Revision: https://reviews.llvm.org/D27392

llvm-svn: 290240
---
 llvm/include/llvm/CodeGen/CallingConvLower.h  |  48 ++++++
 llvm/include/llvm/Target/TargetCallingConv.h  |  18 ++
 llvm/lib/CodeGen/CallingConvLower.cpp         |  18 ++
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  26 ++-
 llvm/lib/Target/X86/X86CallingConv.cpp        | 154 +++++++++++++++++-
 llvm/lib/Target/X86/X86CallingConv.h          |  31 ++--
 llvm/lib/Target/X86/X86CallingConv.td         |  64 +++-----
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  42 ++++-
 llvm/test/CodeGen/X86/vectorcall.ll           | 142 +++++++++++++++-
 9 files changed, 471 insertions(+), 72 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/CallingConvLower.h b/llvm/include/llvm/CodeGen/CallingConvLower.h
index f7983c574793..bfbd22823eb8 100644
--- a/llvm/include/llvm/CodeGen/CallingConvLower.h
+++ b/llvm/include/llvm/CodeGen/CallingConvLower.h
@@ -296,6 +296,12 @@ public:
   void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
                               CCAssignFn Fn);
 
+  /// The function will invoke AnalyzeFormalArguments.
+  void AnalyzeArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                        CCAssignFn Fn) {
+    AnalyzeFormalArguments(Ins, Fn);
+  }
+
   /// AnalyzeReturn - Analyze the returned values of a return,
   /// incorporating info about the result values into this state.
   void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -318,11 +324,22 @@ public:
                            SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
                            CCAssignFn Fn);
 
+  /// The function will invoke AnalyzeCallOperands.
+  void AnalyzeArguments(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        CCAssignFn Fn) {
+    AnalyzeCallOperands(Outs, Fn);
+  }
+
   /// AnalyzeCallResult - Analyze the return values of a call,
   /// incorporating info about the passed values into this state.
   void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
                          CCAssignFn Fn);
 
+  /// A shadow allocated register is a register that was allocated
+  /// but wasn't added to the location list (Locs).
+  /// \returns true if the register was allocated as shadow or false otherwise.
+  bool IsShadowAllocatedReg(unsigned Reg) const;
+
   /// AnalyzeCallResult - Same as above except it's specialized for calls which
   /// produce a single value.
   void AnalyzeCallResult(MVT VT, CCAssignFn Fn);
@@ -521,6 +538,37 @@ public:
                          const SmallVectorImpl<ISD::InputArg> &Ins,
                          CCAssignFn CalleeFn, CCAssignFn CallerFn);
 
+  /// The function runs an additional analysis pass over function arguments.
+  /// It will mark each argument with the attribute flag SecArgPass.
+  /// After running, it will sort the locs list.
+  template <class T>
+  void AnalyzeArgumentsSecondPass(const SmallVectorImpl<T> &Args,
+                                  CCAssignFn Fn) {
+    unsigned NumFirstPassLocs = Locs.size();
+
+    /// Creates similar argument list to \p Args in which each argument is
+    /// marked using SecArgPass flag.
+ SmallVector SecPassArg; + // SmallVector SecPassArg; + for (auto Arg : Args) { + Arg.Flags.setSecArgPass(); + SecPassArg.push_back(Arg); + } + + // Run the second argument pass + AnalyzeArguments(SecPassArg, Fn); + + // Sort the locations of the arguments according to their original position. + SmallVector TmpArgLocs; + std::swap(TmpArgLocs, Locs); + auto B = TmpArgLocs.begin(), E = TmpArgLocs.end(); + std::merge(B, B + NumFirstPassLocs, B + NumFirstPassLocs, E, + std::back_inserter(Locs), + [](const CCValAssign &A, const CCValAssign &B) -> bool { + return A.getValNo() < B.getValNo(); + }); + } + private: /// MarkAllocated - Mark a register and all of its aliases as allocated. void MarkAllocated(unsigned Reg); diff --git a/llvm/include/llvm/Target/TargetCallingConv.h b/llvm/include/llvm/Target/TargetCallingConv.h index 19d8917f17d3..be09236cdab0 100644 --- a/llvm/include/llvm/Target/TargetCallingConv.h +++ b/llvm/include/llvm/Target/TargetCallingConv.h @@ -51,6 +51,15 @@ namespace ISD { static const uint64_t SwiftSelfOffs = 14; static const uint64_t SwiftError = 1ULL<<15; ///< Swift error parameter static const uint64_t SwiftErrorOffs = 15; + static const uint64_t Hva = 1ULL << 16; ///< HVA field for + ///< vectorcall + static const uint64_t HvaOffs = 16; + static const uint64_t HvaStart = 1ULL << 17; ///< HVA structure start + ///< for vectorcall + static const uint64_t HvaStartOffs = 17; + static const uint64_t SecArgPass = 1ULL << 18; ///< Second argument + ///< pass for vectorcall + static const uint64_t SecArgPassOffs = 18; static const uint64_t OrigAlign = 0x1FULL<<27; static const uint64_t OrigAlignOffs = 27; static const uint64_t ByValSize = 0x3fffffffULL<<32; ///< Struct size @@ -91,6 +100,15 @@ namespace ISD { bool isSwiftError() const { return Flags & SwiftError; } void setSwiftError() { Flags |= One << SwiftErrorOffs; } + bool isHva() const { return Flags & Hva; } + void setHva() { Flags |= One << HvaOffs; } + + bool isHvaStart() const { return Flags & HvaStart; } + void setHvaStart() { Flags |= One << HvaStartOffs; } + + bool isSecArgPass() const { return Flags & SecArgPass; } + void setSecArgPass() { Flags |= One << SecArgPassOffs; } + bool isNest() const { return Flags & Nest; } void setNest() { Flags |= One << NestOffs; } diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp index 7d67bcfe5469..2e33f14c7ee3 100644 --- a/llvm/lib/CodeGen/CallingConvLower.cpp +++ b/llvm/lib/CodeGen/CallingConvLower.cpp @@ -23,6 +23,8 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include + using namespace llvm; CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, @@ -64,6 +66,22 @@ void CCState::MarkAllocated(unsigned Reg) { UsedRegs[*AI/32] |= 1 << (*AI&31); } +bool CCState::IsShadowAllocatedReg(unsigned Reg) const { + if (!isAllocated(Reg)) + return false; + + for (auto const &ValAssign : Locs) { + if (ValAssign.isRegLoc()) { + for (MCRegAliasIterator AI(ValAssign.getLocReg(), &TRI, true); + AI.isValid(); ++AI) { + if (*AI == Reg) + return false; + } + } + } + return true; +} + /// Analyze an array of argument values, /// incorporating info about the formals into this state. 
void diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 50ddc4bfd463..da68fe87a321 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7732,8 +7732,19 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { Flags.setZExt(); if (Args[i].isSExt) Flags.setSExt(); - if (Args[i].isInReg) + if (Args[i].isInReg) { + // If we are using vectorcall calling convention, a structure that is + // passed InReg - is surely an HVA + if (CLI.CallConv == CallingConv::X86_VectorCall && + isa(FinalType)) { + // The first value of a structure is marked + if (0 == Value) + Flags.setHvaStart(); + Flags.setHva(); + } + // Set InReg Flag Flags.setInReg(); + } if (Args[i].isSRet) Flags.setSRet(); if (Args[i].isSwiftSelf) @@ -8019,8 +8030,19 @@ void SelectionDAGISel::LowerArguments(const Function &F) { Flags.setZExt(); if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) Flags.setSExt(); - if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) + if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) { + // If we are using vectorcall calling convention, a structure that is + // passed InReg - is surely an HVA + if (F.getCallingConv() == CallingConv::X86_VectorCall && + isa(I->getType())) { + // The first value of a structure is marked + if (0 == Value) + Flags.setHvaStart(); + Flags.setHva(); + } + // Set InReg Flag Flags.setInReg(); + } if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet)) Flags.setSRet(); if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf)) diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index 1bfe225a2ff9..ae3f5b58b5f7 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86Subtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/IR/CallingConv.h" @@ -39,14 +40,14 @@ bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, if (AvailableRegs.size() < RequiredGprsUponSplit) return false; // Not enough free registers - continue the search. - // Allocating the available registers + // Allocating the available registers. for (unsigned I = 0; I < RequiredGprsUponSplit; I++) { - // Marking the register as located + // Marking the register as located. unsigned Reg = State.AllocateReg(AvailableRegs[I]); // Since we previously made sure that 2 registers are available - // we expect that a real register number will be returned + // we expect that a real register number will be returned. 
assert(Reg && "Expecting a register will be available"); // Assign the value to the allocated register @@ -57,4 +58,151 @@ bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } +static ArrayRef CC_X86_VectorCallGetSSEs(const MVT &ValVT) { + if (ValVT.is512BitVector()) { + static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2, + X86::ZMM3, X86::ZMM4, X86::ZMM5}; + return RegListZMM; + } + + if (ValVT.is256BitVector()) { + static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2, + X86::YMM3, X86::YMM4, X86::YMM5}; + return RegListYMM; + } + + static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2, + X86::XMM3, X86::XMM4, X86::XMM5}; + return RegListXMM; +} + +static ArrayRef CC_X86_64_VectorCallGetGPRs() { + static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9}; + return RegListGPR; +} + +static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + + ArrayRef RegList = CC_X86_VectorCallGetSSEs(ValVT); + bool Is64bit = static_cast( + State.getMachineFunction().getSubtarget()) + .is64Bit(); + + for (auto Reg : RegList) { + // If the register is not marked as allocated - assign to it. + if (!State.isAllocated(Reg)) { + unsigned AssigedReg = State.AllocateReg(Reg); + assert(AssigedReg == Reg && "Expecting a valid register allocation"); + State.addLoc( + CCValAssign::getReg(ValNo, ValVT, AssigedReg, LocVT, LocInfo)); + return true; + } + // If the register is marked as shadow allocated - assign to it. + if (Is64bit && State.IsShadowAllocatedReg(Reg)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + } + + llvm_unreachable("Clang should ensure that hva marked vectors will have " + "an available register."); + return false; +} + +bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + // On the second pass, go through the HVAs only. + if (ArgFlags.isSecArgPass()) { + if (ArgFlags.isHva()) + return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo, + ArgFlags, State); + return true; + } + + // Process only vector types as defined by vectorcall spec: + // "A vector type is either a floating-point type, for example, + // a float or double, or an SIMD vector type, for example, __m128 or __m256". + if (!(ValVT.isFloatingPoint() || + (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) { + // If R9 was already assigned it means that we are after the fourth element + // and because this is not an HVA / Vector type, we need to allocate + // shadow XMM register. + if (State.isAllocated(X86::R9)) { + // Assign shadow XMM register. + (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT)); + } + + return false; + } + + if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) { + // Assign shadow GPR register. + (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs()); + + // Assign XMM register - (shadow for HVA and non-shadow for non HVA). + if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) { + // In Vectorcall Calling convention, additional shadow stack can be + // created on top of the basic 32 bytes of win64. + // It can happen if the fifth or sixth argument is vector type or HVA. + // At that case for each argument a shadow stack of 8 bytes is allocated. 
+ if (Reg == X86::XMM4 || Reg == X86::XMM5) + State.AllocateStack(8, 8); + + if (!ArgFlags.isHva()) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; // Allocated a register - Stop the search. + } + } + } + + // If this is an HVA - Stop the search, + // otherwise continue the search. + return ArgFlags.isHva(); +} + +bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + // On the second pass, go through the HVAs only. + if (ArgFlags.isSecArgPass()) { + if (ArgFlags.isHva()) + return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo, + ArgFlags, State); + return true; + } + + // Process only vector types as defined by vectorcall spec: + // "A vector type is either a floating point type, for example, + // a float or double, or an SIMD vector type, for example, __m128 or __m256". + if (!(ValVT.isFloatingPoint() || + (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) { + return false; + } + + if (ArgFlags.isHva()) + return true; // If this is an HVA - Stop the search. + + // Assign XMM register. + if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + + // In case we did not find an available XMM register for a vector - + // pass it indirectly. + // It is similar to CCPassIndirect, with the addition of inreg. + if (!ValVT.isFloatingPoint()) { + LocVT = MVT::i32; + LocInfo = CCValAssign::Indirect; + ArgFlags.setInReg(); + } + + return false; // No register was assigned - Continue the search. +} + } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86CallingConv.h b/llvm/lib/Target/X86/X86CallingConv.h index 2e93ec9c78ca..c49a6838fa44 100644 --- a/llvm/lib/Target/X86/X86CallingConv.h +++ b/llvm/lib/Target/X86/X86CallingConv.h @@ -24,22 +24,29 @@ namespace llvm { /// When regcall calling convention compiled to 32 bit arch, special treatment /// is required for 64 bit masks. /// The value should be assigned to two GPRs. -/// @return true if registers were allocated and false otherwise +/// \return true if registers were allocated and false otherwise. bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State); -inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - // Similar to CCPassIndirect, with the addition of inreg. - LocVT = MVT::i32; - LocInfo = CCValAssign::Indirect; - ArgFlags.setInReg(); - return false; // Continue the search, but now for i32. -} +/// Vectorcall calling convention has special handling for vector types or +/// HVA for 64 bit arch. +/// For HVAs shadow registers might be allocated on the first pass +/// and actual XMM registers are allocated on the second pass. +/// For vector types, actual XMM registers are allocated on the first pass. +/// \return true if registers were allocated and false otherwise. +bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State); + +/// Vectorcall calling convention has special handling for vector types or +/// HVA for 32 bit arch. +/// For HVAs actual XMM registers are allocated on the second pass. +/// For vector types, actual XMM registers are allocated on the first pass. 
+/// \return true if registers were allocated and false otherwise. +bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State); inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, CCValAssign::LocInfo &, ISD::ArgFlagsTy &, diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index a0c822ff0ab4..cf7bc981b8a5 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -308,20 +308,12 @@ def RetCC_X86_32_HiPE : CallingConv<[ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>> ]>; -// X86-32 HiPE return-value convention. +// X86-32 Vectorcall return-value convention. def RetCC_X86_32_VectorCall : CallingConv<[ - // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + // Floating Point types are returned in XMM0,XMM1,XMMM2 and XMM3. + CCIfType<[f32, f64, f128], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, - // 256-bit FP vectors - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, - - // 512-bit FP vectors - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], - CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, - // Return integers in the standard way. CCDelegateTo ]>; @@ -350,6 +342,16 @@ def RetCC_X86_Win64_C : CallingConv<[ CCDelegateTo ]>; +// X86-64 vectorcall return-value convention. +def RetCC_X86_64_Vectorcall : CallingConv<[ + // Vectorcall calling convention always returns FP values in XMMs. + CCIfType<[f32, f64, f128], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // Otherwise, everything is the same as Windows X86-64 C CC. + CCDelegateTo +]>; + // X86-64 HiPE return-value convention. def RetCC_X86_64_HiPE : CallingConv<[ // Promote all types to i64 @@ -447,6 +449,9 @@ def RetCC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo>, + // Handle Vectorcall CC + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, + // Handle HHVM calls. CCIfCC<"CallingConv::HHVM", CCDelegateTo>, @@ -626,18 +631,7 @@ def CC_X86_Win64_C : CallingConv<[ ]>; def CC_X86_Win64_VectorCall : CallingConv<[ - // The first 6 floating point and vector types of 128 bits or less use - // XMM0-XMM5. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, - - // 256-bit vectors use YMM registers. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, - - // 512-bit vectors use ZMM registers. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], - CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, + CCCustom<"CC_X86_64_VectorCall">, // Delegate to fastcall to handle integer types. CCDelegateTo @@ -847,25 +841,9 @@ def CC_X86_32_FastCall : CallingConv<[ CCDelegateTo ]>; -def CC_X86_32_VectorCall : CallingConv<[ - // The first 6 floating point and vector types of 128 bits or less use - // XMM0-XMM5. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, - - // 256-bit vectors use YMM registers. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, - - // 512-bit vectors use ZMM registers. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], - CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, - - // Otherwise, pass it indirectly. 
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, - v32i8, v16i16, v8i32, v4i64, v8f32, v4f64, - v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], - CCCustom<"CC_X86_32_VectorCallIndirect">>, +def CC_X86_Win32_VectorCall : CallingConv<[ + // Pass floating point in XMMs + CCCustom<"CC_X86_32_VectorCall">, // Delegate to fastcall to handle integer types. CCDelegateTo @@ -999,7 +977,7 @@ def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_INTR", CCDelegateTo>, CCIfSubtarget<"isTargetMCU()", CCDelegateTo>, CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo>, - CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo>, CCIfCC<"CallingConv::Fast", CCDelegateTo>, CCIfCC<"CallingConv::GHC", CCDelegateTo>, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a2eb654bcd25..a816c062acc2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -17,6 +17,7 @@ #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" +#include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" #include "X86ShuffleDecodeConstantPool.h" #include "X86TargetMachine.h" @@ -53,10 +54,10 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" -#include "X86IntrinsicsInfo.h" +#include #include -#include #include +#include using namespace llvm; #define DEBUG_TYPE "x86-isel" @@ -2781,6 +2782,13 @@ static ArrayRef get64BitArgumentXMMs(MachineFunction &MF, return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } +static bool isSortedByValueNo(const SmallVectorImpl &ArgLocs) { + return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), + [](const CCValAssign &A, const CCValAssign &B) -> bool { + return A.getValNo() < B.getValNo(); + }); +} + SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, @@ -2815,11 +2823,22 @@ SDValue X86TargetLowering::LowerFormalArguments( SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); - // Allocate shadow area for Win64 + // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); - CCInfo.AnalyzeFormalArguments(Ins, CC_X86); + CCInfo.AnalyzeArguments(Ins, CC_X86); + + // In vectorcall calling convention a second pass is required for the HVA + // types. + if (CallingConv::X86_VectorCall == CallConv) { + CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); + } + + // The next loop assumes that the locations are in the same order of the + // input arguments. + assert(isSortedByValueNo(ArgLocs) && + "Argument Location list must be sorted before lowering"); SDValue ArgValue; for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; @@ -3263,11 +3282,17 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); - // Allocate shadow area for Win64 + // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); - CCInfo.AnalyzeCallOperands(Outs, CC_X86); + CCInfo.AnalyzeArguments(Outs, CC_X86); + + // In vectorcall calling convention a second pass is required for the HVA + // types. 
+ if (CallingConv::X86_VectorCall == CallConv) { + CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); + } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); @@ -3322,6 +3347,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector MemOpChains; SDValue StackPtr; + // The next loop assumes that the locations are in the same order of the + // input arguments. + assert(isSortedByValueNo(ArgLocs) && + "Argument Location list must be sorted before lowering"); + // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); diff --git a/llvm/test/CodeGen/X86/vectorcall.ll b/llvm/test/CodeGen/X86/vectorcall.ll index 1e52654e99fe..376e2c0c9df6 100644 --- a/llvm/test/CodeGen/X86/vectorcall.ll +++ b/llvm/test/CodeGen/X86/vectorcall.ll @@ -6,14 +6,12 @@ define x86_vectorcallcc i32 @test_int_1() { ret i32 0 } - ; CHECK-LABEL: {{^}}test_int_1@@0: ; CHECK: xorl %eax, %eax define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) { ret i32 %a } - ; X86-LABEL: {{^}}test_int_2@@4: ; X64-LABEL: {{^}}test_int_2@@8: ; CHECK: movl %ecx, %eax @@ -22,7 +20,6 @@ define x86_vectorcallcc i32 @test_int_3(i64 inreg %a) { %at = trunc i64 %a to i32 ret i32 %at } - ; X86-LABEL: {{^}}test_int_3@@8: ; X64-LABEL: {{^}}test_int_3@@8: ; CHECK: movl %ecx, %eax @@ -31,10 +28,8 @@ define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) { %s = add i32 %a, %b ret i32 %s } - ; X86-LABEL: {{^}}test_int_4@@8: ; X86: leal (%ecx,%edx), %eax - ; X64-LABEL: {{^}}test_int_4@@16: ; X64: leal (%rcx,%rdx), %eax @@ -90,4 +85,139 @@ define x86_vectorcallcc <16 x i8> @test_vec_2( ret <16 x i8> %r } ; CHECK-LABEL: {{^}}test_vec_2@@104: -; CHECK: movaps (%{{[re]}}cx), %xmm0 +; x64: movq {{[0-9]*}}(%rsp), %rax +; CHECK: movaps (%{{rax|ecx}}), %xmm0 + +%struct.HVA5 = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float> } +%struct.HVA4 = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> } +%struct.HVA3 = type { <4 x float>, <4 x float>, <4 x float> } +%struct.HVA2 = type { <4 x float>, <4 x float> } + +define x86_vectorcallcc <4 x float> @test_mixed_1(i32 %a, %struct.HVA4 inreg %bb, i32 %c) { +entry: + %b = alloca %struct.HVA4, align 16 + store %struct.HVA4 %bb, %struct.HVA4* %b, align 16 + %w1 = getelementptr inbounds %struct.HVA4, %struct.HVA4* %b, i32 0, i32 1 + %0 = load <4 x float>, <4 x float>* %w1, align 16 + ret <4 x float> %0 +} +; CHECK-LABEL: test_mixed_1 +; CHECK: movaps %xmm1, 16(%{{(e|r)}}sp) +; CHECK: movaps 16(%{{(e|r)}}sp), %xmm0 +; CHECK: ret{{q|l}} + +define x86_vectorcallcc <4 x float> @test_mixed_2(%struct.HVA4 inreg %a, %struct.HVA4* %b, <4 x float> %c) { +entry: + %c.addr = alloca <4 x float>, align 16 + store <4 x float> %c, <4 x float>* %c.addr, align 16 + %0 = load <4 x float>, <4 x float>* %c.addr, align 16 + ret <4 x float> %0 +} +; CHECK-LABEL: test_mixed_2 +; X86: movaps %xmm0, (%esp) +; X64: movaps %xmm2, %xmm0 +; CHECK: ret{{[ql]}} + +define x86_vectorcallcc <4 x float> @test_mixed_3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* %f) { +entry: + %x = getelementptr inbounds %struct.HVA2, %struct.HVA2* %f, i32 0, i32 0 + %0 = load <4 x float>, <4 x float>* %x, align 16 + ret <4 x float> %0 +} +; CHECK-LABEL: test_mixed_3 +; CHECK: movaps (%{{[re][ac]}}x), %xmm0 +; CHECK: ret{{[ql]}} + +define x86_vectorcallcc <4 x 
float> @test_mixed_4(%struct.HVA4 inreg %a, %struct.HVA2* %bb, <4 x float> %c) { +entry: + %y4 = getelementptr inbounds %struct.HVA2, %struct.HVA2* %bb, i32 0, i32 1 + %0 = load <4 x float>, <4 x float>* %y4, align 16 + ret <4 x float> %0 +} +; CHECK-LABEL: test_mixed_4 +; X86: movaps 16(%eax), %xmm0 +; X64: movaps 16(%rdx), %xmm0 +; CHECK: ret{{[ql]}} + +define x86_vectorcallcc <4 x float> @test_mixed_5(%struct.HVA3 inreg %a, %struct.HVA3* %b, <4 x float> %c, %struct.HVA2 inreg %dd) { +entry: + %d = alloca %struct.HVA2, align 16 + store %struct.HVA2 %dd, %struct.HVA2* %d, align 16 + %y5 = getelementptr inbounds %struct.HVA2, %struct.HVA2* %d, i32 0, i32 1 + %0 = load <4 x float>, <4 x float>* %y5, align 16 + ret <4 x float> %0 +} +; CHECK-LABEL: test_mixed_5 +; CHECK: movaps %xmm5, 16(%{{(e|r)}}sp) +; CHECK: movaps 16(%{{(e|r)}}sp), %xmm0 +; CHECK: ret{{[ql]}} + +define x86_vectorcallcc %struct.HVA4 @test_mixed_6(%struct.HVA4 inreg %a, %struct.HVA4* %b) { +entry: + %retval = alloca %struct.HVA4, align 16 + %0 = bitcast %struct.HVA4* %retval to i8* + %1 = bitcast %struct.HVA4* %b to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 64, i32 16, i1 false) + %2 = load %struct.HVA4, %struct.HVA4* %retval, align 16 + ret %struct.HVA4 %2 +} +; CHECK-LABEL: test_mixed_6 +; CHECK: movaps (%{{[re]}}sp), %xmm0 +; CHECK: movaps 16(%{{[re]}}sp), %xmm1 +; CHECK: movaps 32(%{{[re]}}sp), %xmm2 +; CHECK: movaps 48(%{{[re]}}sp), %xmm3 +; CHECK: ret{{[ql]}} + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1) + +define x86_vectorcallcc void @test_mixed_7(%struct.HVA5* noalias sret %agg.result) { +entry: + %a = alloca %struct.HVA5, align 16 + %0 = bitcast %struct.HVA5* %a to i8* + call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 80, i32 16, i1 false) + %1 = bitcast %struct.HVA5* %agg.result to i8* + %2 = bitcast %struct.HVA5* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 80, i32 16, i1 false) + ret void +} +; CHECK-LABEL: test_mixed_7 +; CHECK: movaps %xmm{{[0-9]}}, 64(%{{rcx|eax}}) +; CHECK: movaps %xmm{{[0-9]}}, 48(%{{rcx|eax}}) +; CHECK: movaps %xmm{{[0-9]}}, 32(%{{rcx|eax}}) +; CHECK: movaps %xmm{{[0-9]}}, 16(%{{rcx|eax}}) +; CHECK: movaps %xmm{{[0-9]}}, (%{{rcx|eax}}) +; X64: mov{{[ql]}} %rcx, %rax +; CHECK: ret{{[ql]}} + +define x86_vectorcallcc <4 x float> @test_mixed_8(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 %e, <4 x float> %f) { +entry: + %f.addr = alloca <4 x float>, align 16 + store <4 x float> %f, <4 x float>* %f.addr, align 16 + %0 = load <4 x float>, <4 x float>* %f.addr, align 16 + ret <4 x float> %0 +} +; CHECK-LABEL: test_mixed_8 +; X86: movaps %xmm4, %xmm0 +; X64: movaps %xmm5, %xmm0 +; CHECK: ret{{[ql]}} + +%struct.HFA4 = type { double, double, double, double } +declare x86_vectorcallcc double @test_mixed_9_callee(%struct.HFA4 %x, double %y) + +define x86_vectorcallcc double @test_mixed_9_caller(%struct.HFA4 inreg %b) { +entry: + %call = call x86_vectorcallcc double @test_mixed_9_callee(%struct.HFA4 inreg %b, double 3.000000e+00) + %add = fadd double 1.000000e+00, %call + ret double %add +} +; CHECK-LABEL: test_mixed_9_caller +; CHECK: movaps %xmm3, %xmm4 +; CHECK: movaps %xmm2, %xmm3 +; CHECK: movaps %xmm1, %xmm2 +; X32: movasd %xmm0, %xmm1 +; X64: movapd %xmm5, %xmm1 +; CHECK: call{{l|q}} 
test_mixed_9_callee@@40 +; CHECK: addsd {{.*}}, %xmm0 +; CHECK: ret{{l|q}}
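
Editorial note (not part of the patch): the HVA handling exercised by the tests above is easiest to see from the source-language side. The following standalone C++ sketch is an illustration only; the names HVA4 and takes_hva are hypothetical. It shows the kind of Homogeneous Vector Aggregate that the front end passes as an inreg struct under __vectorcall, which the second analysis pass added in this patch then spreads across the remaining XMM registers. It is expected to build with clang-cl or MSVC targeting x64 with SSE2.

// Illustration only: an HVA under the Microsoft __vectorcall convention.
#include <xmmintrin.h>

struct HVA4 {
  // Four fields of the same vector type make this a Homogeneous Vector
  // Aggregate; it is passed as an inreg struct, which the lowering above
  // marks with the Hva/HvaStart argument flags.
  __m128 x, y, z, w;
};

float __vectorcall takes_hva(int a, HVA4 h, int b) {
  // The HVA members receive XMM registers on the second CC pass, after
  // ordinary arguments such as 'a' and 'b' have taken their GPR/shadow slots
  // on the first pass.
  return _mm_cvtss_f32(_mm_add_ps(h.x, h.w)) + static_cast<float>(a + b);
}

At the IR level this corresponds to the %struct.HVA4 inreg arguments checked by test_mixed_1 and test_mixed_2 above.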