From 30eeb742f1d11d7a7036e3b8a3bffc1dfd252082 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 5 May 2020 20:24:53 -0400 Subject: [PATCH] clang: Use byref for aggregate kernel arguments Add address space to indirect abi info and use it for kernels. Previously, indirect arguments assumed assumed a stack passed object in the alloca address space using byval. A stack pointer is unsuitable for kernel arguments, which are passed in a separate, constant buffer with a different address space. Start using the new byref for aggregate kernel arguments. Previously these were emitted as raw struct arguments, and turned into loads in the backend. These will lower identically, although with byref you now have the option of applying an explicit alignment. In the future, a reasonable implementation would use byref for all kernel arguments (this would be a practical problem at the moment due to losing things like noalias on pointer arguments). This is mostly to avoid fighting the optimizer's treatment of aggregate load/store. SROA and instcombine both turn aggregate loads and stores into a long sequence of element loads and stores, rather than the optimizable memcpy I would expect in this situation. Now an explicit memcpy will be introduced up-front which is better understood and helps eliminate the alloca in more situations. This skips using byref in the case where HIP kernel pointer arguments in structs are promoted to global pointers. At minimum an additional patch is needed to allow coercion with indirect arguments. This also skips using it for OpenCL due to the current workaround used to support kernels calling kernels. Distinct function bodies would need to be generated up front instead of emitting an illegal call. --- clang/include/clang/CodeGen/CGFunctionInfo.h | 60 +++++++++++++++---- clang/lib/CodeGen/CGCall.cpp | 40 ++++++++++--- clang/lib/CodeGen/TargetInfo.cpp | 31 ++++++++-- clang/test/CodeGenCUDA/kernel-args.cu | 8 +-- .../CodeGenOpenCL/amdgpu-abi-struct-coerce.cl | 3 +- 5 files changed, 113 insertions(+), 29 deletions(-) diff --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h index eaf5a3d5aad7..253ef946ce15 100644 --- a/clang/include/clang/CodeGen/CGFunctionInfo.h +++ b/clang/include/clang/CodeGen/CGFunctionInfo.h @@ -44,10 +44,23 @@ public: /// but also emit a zero/sign extension attribute. Extend, - /// Indirect - Pass the argument indirectly via a hidden pointer - /// with the specified alignment (0 indicates default alignment). + /// Indirect - Pass the argument indirectly via a hidden pointer with the + /// specified alignment (0 indicates default alignment) and address space. Indirect, + /// IndirectAliased - Similar to Indirect, but the pointer may be to an + /// object that is otherwise referenced. The object is known to not be + /// modified through any other references for the duration of the call, and + /// the callee must not itself modify the object. Because C allows + /// parameter variables to be modified and guarantees that they have unique + /// addresses, the callee must defensively copy the object into a local + /// variable if it might be modified or its address might be compared. + /// Since those are uncommon, in principle this convention allows programs + /// to avoid copies in more situations. However, it may introduce *extra* + /// copies if the callee fails to prove that a copy is unnecessary and the + /// caller naturally produces an unaliased object for the argument. + IndirectAliased, + /// Ignore - Ignore the argument (treat as void). Useful for void and /// empty structs. Ignore, @@ -86,6 +99,7 @@ private: unsigned AllocaFieldIndex; // isInAlloca() }; Kind TheKind; + unsigned IndirectAddrSpace : 24; // isIndirect() bool PaddingInReg : 1; bool InAllocaSRet : 1; // isInAlloca() bool InAllocaIndirect : 1;// isInAlloca() @@ -97,7 +111,8 @@ private: bool SignExt : 1; // isExtend() bool canHavePaddingType() const { - return isDirect() || isExtend() || isIndirect() || isExpand(); + return isDirect() || isExtend() || isIndirect() || isIndirectAliased() || + isExpand(); } void setPaddingType(llvm::Type *T) { assert(canHavePaddingType()); @@ -112,9 +127,10 @@ private: public: ABIArgInfo(Kind K = Direct) : TypeData(nullptr), PaddingType(nullptr), DirectOffset(0), TheKind(K), - PaddingInReg(false), InAllocaSRet(false), InAllocaIndirect(false), - IndirectByVal(false), IndirectRealign(false), SRetAfterThis(false), - InReg(false), CanBeFlattened(false), SignExt(false) {} + IndirectAddrSpace(0), PaddingInReg(false), InAllocaSRet(false), + InAllocaIndirect(false), IndirectByVal(false), IndirectRealign(false), + SRetAfterThis(false), InReg(false), CanBeFlattened(false), + SignExt(false) {} static ABIArgInfo getDirect(llvm::Type *T = nullptr, unsigned Offset = 0, llvm::Type *Padding = nullptr, @@ -180,6 +196,19 @@ public: AI.setPaddingType(Padding); return AI; } + + /// Pass this in memory using the IR byref attribute. + static ABIArgInfo getIndirectAliased(CharUnits Alignment, unsigned AddrSpace, + bool Realign = false, + llvm::Type *Padding = nullptr) { + auto AI = ABIArgInfo(IndirectAliased); + AI.setIndirectAlign(Alignment); + AI.setIndirectRealign(Realign); + AI.setPaddingType(Padding); + AI.setIndirectAddrSpace(AddrSpace); + return AI; + } + static ABIArgInfo getIndirectInReg(CharUnits Alignment, bool ByVal = true, bool Realign = false) { auto AI = getIndirect(Alignment, ByVal, Realign); @@ -259,6 +288,7 @@ public: bool isExtend() const { return TheKind == Extend; } bool isIgnore() const { return TheKind == Ignore; } bool isIndirect() const { return TheKind == Indirect; } + bool isIndirectAliased() const { return TheKind == IndirectAliased; } bool isExpand() const { return TheKind == Expand; } bool isCoerceAndExpand() const { return TheKind == CoerceAndExpand; } @@ -338,11 +368,11 @@ public: // Indirect accessors CharUnits getIndirectAlign() const { - assert(isIndirect() && "Invalid kind!"); + assert((isIndirect() || isIndirectAliased()) && "Invalid kind!"); return CharUnits::fromQuantity(IndirectAlign); } void setIndirectAlign(CharUnits IA) { - assert(isIndirect() && "Invalid kind!"); + assert((isIndirect() || isIndirectAliased()) && "Invalid kind!"); IndirectAlign = IA.getQuantity(); } @@ -355,12 +385,22 @@ public: IndirectByVal = IBV; } + unsigned getIndirectAddrSpace() const { + assert(isIndirectAliased() && "Invalid kind!"); + return IndirectAddrSpace; + } + + void setIndirectAddrSpace(unsigned AddrSpace) { + assert(isIndirectAliased() && "Invalid kind!"); + IndirectAddrSpace = AddrSpace; + } + bool getIndirectRealign() const { - assert(isIndirect() && "Invalid kind!"); + assert((isIndirect() || isIndirectAliased()) && "Invalid kind!"); return IndirectRealign; } void setIndirectRealign(bool IR) { - assert(isIndirect() && "Invalid kind!"); + assert((isIndirect() || isIndirectAliased()) && "Invalid kind!"); IndirectRealign = IR; } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 3f27e1bb8955..9d225b23e3c3 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1470,6 +1470,7 @@ void ClangToLLVMArgMapping::construct(const ASTContext &Context, break; } case ABIArgInfo::Indirect: + case ABIArgInfo::IndirectAliased: IRArgs.NumberOfArgs = 1; break; case ABIArgInfo::Ignore: @@ -1560,6 +1561,7 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) { const ABIArgInfo &retAI = FI.getReturnInfo(); switch (retAI.getKind()) { case ABIArgInfo::Expand: + case ABIArgInfo::IndirectAliased: llvm_unreachable("Invalid ABI kind for return argument"); case ABIArgInfo::Extend: @@ -1637,7 +1639,12 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) { CGM.getDataLayout().getAllocaAddrSpace()); break; } - + case ABIArgInfo::IndirectAliased: { + assert(NumIRArgs == 1); + llvm::Type *LTy = ConvertTypeForMem(it->type); + ArgTypes[FirstIRArg] = LTy->getPointerTo(ArgInfo.getIndirectAddrSpace()); + break; + } case ABIArgInfo::Extend: case ABIArgInfo::Direct: { // Fast-isel and the optimizer generally like scalar values better than @@ -2101,6 +2108,7 @@ void CodeGenModule::ConstructAttributeList( break; case ABIArgInfo::Expand: + case ABIArgInfo::IndirectAliased: llvm_unreachable("Invalid ABI kind for return argument"); } @@ -2184,6 +2192,9 @@ void CodeGenModule::ConstructAttributeList( if (AI.getIndirectByVal()) Attrs.addByValAttr(getTypes().ConvertTypeForMem(ParamType)); + // TODO: We could add the byref attribute if not byval, but it would + // require updating many testcases. + CharUnits Align = AI.getIndirectAlign(); // In a byval argument, it is important that the required @@ -2206,6 +2217,13 @@ void CodeGenModule::ConstructAttributeList( // byval disables readnone and readonly. FuncAttrs.removeAttribute(llvm::Attribute::ReadOnly) .removeAttribute(llvm::Attribute::ReadNone); + + break; + } + case ABIArgInfo::IndirectAliased: { + CharUnits Align = AI.getIndirectAlign(); + Attrs.addByRefAttr(getTypes().ConvertTypeForMem(ParamType)); + Attrs.addAlignmentAttr(Align.getQuantity()); break; } case ABIArgInfo::Ignore: @@ -2434,16 +2452,19 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, break; } - case ABIArgInfo::Indirect: { + case ABIArgInfo::Indirect: + case ABIArgInfo::IndirectAliased: { assert(NumIRArgs == 1); Address ParamAddr = Address(Fn->getArg(FirstIRArg), ArgI.getIndirectAlign()); if (!hasScalarEvaluationKind(Ty)) { - // Aggregates and complex variables are accessed by reference. All we - // need to do is realign the value, if requested. + // Aggregates and complex variables are accessed by reference. All we + // need to do is realign the value, if requested. Also, if the address + // may be aliased, copy it to ensure that the parameter variable is + // mutable and has a unique adress, as C requires. Address V = ParamAddr; - if (ArgI.getIndirectRealign()) { + if (ArgI.getIndirectRealign() || ArgI.isIndirectAliased()) { Address AlignedTemp = CreateMemTemp(Ty, "coerce"); // Copy from the incoming argument pointer to the temporary with the @@ -3285,8 +3306,8 @@ void CodeGenFunction::EmitFunctionEpilog(const CGFunctionInfo &FI, } break; } - case ABIArgInfo::Expand: + case ABIArgInfo::IndirectAliased: llvm_unreachable("Invalid ABI kind for return argument"); } @@ -4413,7 +4434,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, break; } - case ABIArgInfo::Indirect: { + case ABIArgInfo::Indirect: + case ABIArgInfo::IndirectAliased: { assert(NumIRArgs == 1); if (!I->isAggregate()) { // Make a temporary alloca to pass the argument. @@ -4668,12 +4690,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, break; } - case ABIArgInfo::Expand: + case ABIArgInfo::Expand: { unsigned IRArgPos = FirstIRArg; ExpandTypeToArgs(I->Ty, *I, IRFuncTy, IRCallArgs, IRArgPos); assert(IRArgPos == FirstIRArg + NumIRArgs); break; } + } } const CGCallee &ConcreteCallee = Callee.prepareConcreteCallee(*this); @@ -5084,6 +5107,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, } case ABIArgInfo::Expand: + case ABIArgInfo::IndirectAliased: llvm_unreachable("Invalid ABI kind for return argument"); } diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 8c89e6bc2a64..e94945592123 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -257,6 +257,11 @@ LLVM_DUMP_METHOD void ABIArgInfo::dump() const { << " ByVal=" << getIndirectByVal() << " Realign=" << getIndirectRealign(); break; + case IndirectAliased: + OS << "Indirect Align=" << getIndirectAlign().getQuantity() + << " AadrSpace=" << getIndirectAddrSpace() + << " Realign=" << getIndirectRealign(); + break; case Expand: OS << "Expand"; break; @@ -1989,6 +1994,7 @@ static bool isArgInAlloca(const ABIArgInfo &Info) { case ABIArgInfo::InAlloca: return true; case ABIArgInfo::Ignore: + case ABIArgInfo::IndirectAliased: return false; case ABIArgInfo::Indirect: case ABIArgInfo::Direct: @@ -8790,18 +8796,31 @@ ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const { // TODO: Can we omit empty structs? - llvm::Type *LTy = nullptr; if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) - LTy = CGT.ConvertType(QualType(SeltTy, 0)); + Ty = QualType(SeltTy, 0); + llvm::Type *OrigLTy = CGT.ConvertType(Ty); + llvm::Type *LTy = OrigLTy; if (getContext().getLangOpts().HIP) { - if (!LTy) - LTy = CGT.ConvertType(Ty); LTy = coerceKernelArgumentType( - LTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default), + OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default), /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device)); } + // FIXME: Should also use this for OpenCL, but it requires addressing the + // problem of kernels being called. + // + // FIXME: This doesn't apply the optimization of coercing pointers in structs + // to global address space when using byref. This would require implementing a + // new kind of coercion of the in-memory type when for indirect arguments. + if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy && + isAggregateTypeForABI(Ty)) { + return ABIArgInfo::getIndirectAliased( + getContext().getTypeAlignInChars(Ty), + getContext().getTargetAddressSpace(LangAS::opencl_constant), + false /*Realign*/, nullptr /*Padding*/); + } + // If we set CanBeFlattened to true, CodeGen will expand the struct to its // individual elements, which confuses the Clover OpenCL backend; therefore we // have to set it to false here. Other args of getDirect() are just defaults. @@ -9377,6 +9396,7 @@ Address SparcV9ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, } case ABIArgInfo::Indirect: + case ABIArgInfo::IndirectAliased: Stride = SlotSize; ArgAddr = Builder.CreateElementBitCast(Addr, ArgPtrTy, "indirect"); ArgAddr = Address(Builder.CreateLoad(ArgAddr, "indirect.arg"), @@ -9742,6 +9762,7 @@ Address XCoreABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, ArgSize = ArgSize.alignTo(SlotSize); break; case ABIArgInfo::Indirect: + case ABIArgInfo::IndirectAliased: Val = Builder.CreateElementBitCast(AP, ArgPtrTy); Val = Address(Builder.CreateLoad(Val), TypeAlign); ArgSize = SlotSize; diff --git a/clang/test/CodeGenCUDA/kernel-args.cu b/clang/test/CodeGenCUDA/kernel-args.cu index 74d91b4d57f3..1bc613c699f2 100644 --- a/clang/test/CodeGenCUDA/kernel-args.cu +++ b/clang/test/CodeGenCUDA/kernel-args.cu @@ -8,14 +8,14 @@ struct A { int a[32]; }; -// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A %x.coerce) +// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}) // NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) align 4 %x) __global__ void kernel(A x) { } class Kernel { public: - // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}) // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 4 %x) static __global__ void memberKernel(A x){} template static __global__ void templateMemberKernel(T x) {} @@ -29,11 +29,11 @@ void launch(void*); void test() { Kernel K; - // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}} // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 4 %x) launch((void*)templateKernel); - // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}} // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 4 %x) launch((void*)Kernel::templateMemberKernel); } diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl index fd46d3cce22e..8cae24db4540 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl @@ -67,7 +67,6 @@ typedef struct struct_of_structs_arg int i2; } struct_of_structs_arg_t; -// CHECK: %union.transparent_u = type { i32 } typedef union { int b1; @@ -237,7 +236,7 @@ __kernel void kernel_test_struct_of_arrays_arg(struct_of_arrays_arg_t arg1) { } // CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg %arg1.coerce) __kernel void kernel_struct_of_structs_arg(struct_of_structs_arg_t arg1) { } -// CHECK: void @test_kernel_transparent_union_arg(%union.transparent_u %u.coerce) +// CHECK: void @test_kernel_transparent_union_arg(i32 %u.coerce) __kernel void test_kernel_transparent_union_arg(transparent_u u) { } // CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg %arg1.coerce)