AMDGPU/GlobalISel: Handle most function return types
handleAssignments gives up pretty easily on structs, and i8 values for some reason. The other case that doesn't work is when an implicit sret needs to be inserted if the return size exceeds the number of return registers.

llvm-svn: 367082
parent 51d795d941
commit a9ea8a9aae
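For context, the kind of return this patch targets can be seen in the ret_struct test updated further down: an amdgpu_vs shader returning a two-member struct, which the previous handleAssignments-based path rejected. The sketch below is adapted from that test; the trailing insertvalue/ret lines are reconstructed for illustration and may not match the actual test verbatim.

; Illustrative only: a multi-register aggregate return. With this change,
; lowerReturnVal splits the struct via splitToValueTypes and assigns each
; member its own return register through OutgoingValueHandler.
define amdgpu_vs <{ i32, i32 }> @ret_struct(i32 inreg %arg0, i32 inreg %arg1) {
main_body:
  %tmp0 = insertvalue <{ i32, i32 }> undef, i32 %arg0, 0
  %tmp1 = insertvalue <{ i32, i32 }> %tmp0, i32 %arg1, 1
  ret <{ i32, i32 }> %tmp1
}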
@@ -30,9 +30,9 @@ using namespace llvm;
 
 namespace {
 
-struct OutgoingArgHandler : public CallLowering::ValueHandler {
-  OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
-                     MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+struct OutgoingValueHandler : public CallLowering::ValueHandler {
+  OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
     : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
 
   MachineInstrBuilder MIB;
@@ -49,8 +49,16 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
 
   void assignValueToReg(Register ValVReg, Register PhysReg,
                         CCValAssign &VA) override {
-    MIB.addUse(PhysReg);
-    MIRBuilder.buildCopy(PhysReg, ValVReg);
+    Register ExtReg;
+    if (VA.getLocVT().getSizeInBits() < 32) {
+      // 16-bit types are reported as legal for 32-bit registers. We need to
+      // extend and do a 32-bit copy to avoid the verifier complaining about it.
+      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
+    } else
+      ExtReg = extendRegister(ValVReg, VA);
+
+    MIRBuilder.buildCopy(PhysReg, ExtReg);
+    MIB.addUse(PhysReg, RegState::Implicit);
   }
 
   bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
@@ -193,6 +201,90 @@ void AMDGPUCallLowering::splitToValueTypes(
   }
 }
 
+// Get the appropriate type to make \p OrigTy \p Factor times bigger.
+static LLT getMultipleType(LLT OrigTy, int Factor) {
+  if (OrigTy.isVector()) {
+    return LLT::vector(OrigTy.getNumElements() * Factor,
+                       OrigTy.getElementType());
+  }
+
+  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
+}
+
+// TODO: Move to generic code
+static void unpackRegsToOrigType(MachineIRBuilder &MIRBuilder,
+                                 ArrayRef<Register> DstRegs,
+                                 Register SrcReg,
+                                 LLT SrcTy,
+                                 LLT PartTy) {
+  assert(DstRegs.size() > 1 && "Nothing to unpack");
+
+  MachineFunction &MF = MIRBuilder.getMF();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  const unsigned SrcSize = SrcTy.getSizeInBits();
+  const unsigned PartSize = PartTy.getSizeInBits();
+
+  if (SrcTy.isVector() && !PartTy.isVector() &&
+      PartSize > SrcTy.getElementType().getSizeInBits()) {
+    // Vector was scalarized, and the elements extended.
+    auto UnmergeToEltTy = MIRBuilder.buildUnmerge(SrcTy.getElementType(),
+                                                  SrcReg);
+    for (int i = 0, e = DstRegs.size(); i != e; ++i)
+      MIRBuilder.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
+    return;
+  }
+
+  if (SrcSize % PartSize == 0) {
+    MIRBuilder.buildUnmerge(DstRegs, SrcReg);
+    return;
+  }
+
+  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;
+
+  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
+  auto ImpDef = MIRBuilder.buildUndef(BigTy);
+
+  Register BigReg = MRI.createGenericVirtualRegister(BigTy);
+  MIRBuilder.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0);
+
+  int64_t Offset = 0;
+  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
+    MIRBuilder.buildExtract(DstRegs[i], BigReg, Offset);
+}
+
+/// Lower the return value for the already existing \p Ret. This assumes that
+/// \p MIRBuilder's insertion point is correct.
+bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
+                                        const Value *Val, ArrayRef<Register> VRegs,
+                                        MachineInstrBuilder &Ret) const {
+  if (!Val)
+    return true;
+
+  auto &MF = MIRBuilder.getMF();
+  const auto &F = MF.getFunction();
+  const DataLayout &DL = MF.getDataLayout();
+
+  CallingConv::ID CC = F.getCallingConv();
+  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  ArgInfo OrigRetInfo(VRegs, Val->getType());
+  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
+  SmallVector<ArgInfo, 4> SplitRetInfos;
+
+  splitToValueTypes(
+    OrigRetInfo, SplitRetInfos, DL, MRI, CC,
+    [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
+      unpackRegsToOrigType(MIRBuilder, Regs, VRegs[VTSplitIdx], LLTy, PartLLT);
+    });
+
+  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
+
+  OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret, AssignFn);
+  return handleAssignments(MIRBuilder, SplitRetInfos, RetHandler);
+}
+
 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
                                      const Value *Val,
                                      ArrayRef<Register> VRegs) const {
@@ -202,38 +294,43 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MFI->setIfReturnsVoid(!Val);
 
-  if (!Val) {
-    MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+  assert(!Val == VRegs.empty() && "Return value without a vreg");
+
+  CallingConv::ID CC = MIRBuilder.getMF().getFunction().getCallingConv();
+  const bool IsShader = AMDGPU::isShader(CC);
+  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
+                         AMDGPU::isKernel(CC);
+  if (IsWaveEnd) {
+    MIRBuilder.buildInstr(AMDGPU::S_ENDPGM)
+      .addImm(0);
     return true;
   }
 
-  Register VReg = VRegs[0];
+  auto const &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
 
-  const Function &F = MF.getFunction();
-  auto &DL = F.getParent()->getDataLayout();
-  if (!AMDGPU::isShader(F.getCallingConv()))
-    return false;
+  unsigned ReturnOpc = IsShader ?
+    AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
 
-  const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
-  SmallVector<EVT, 4> SplitVTs;
-  SmallVector<uint64_t, 4> Offsets;
-  ArgInfo OrigArg{VReg, Val->getType()};
-  setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
-  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
-
-  SmallVector<ArgInfo, 8> SplitArgs;
-  CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false);
-  for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
-    Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext());
-    SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed});
+  auto Ret = MIRBuilder.buildInstrNoInsert(ReturnOpc);
+  Register ReturnAddrVReg;
+  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
+    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
+    Ret.addUse(ReturnAddrVReg);
   }
-  auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG);
-  OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn);
-  if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
-    return false;
-  MIRBuilder.insertInstr(RetInstr);
+
+  if (!lowerReturnVal(MIRBuilder, Val, VRegs, Ret))
+    return false;
+
+  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
+                                         &AMDGPU::SGPR_64RegClass);
+    MIRBuilder.buildCopy(ReturnAddrVReg, LiveInReturn);
+  }
+
+  // TODO: Handle CalleeSavedRegsViaCopy.
+
+  MIRBuilder.insertInstr(Ret);
   return true;
 }
@@ -386,6 +483,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   return true;
 }
 
+// TODO: Move this to generic code
 static void packSplitRegsToOrigType(MachineIRBuilder &MIRBuilder,
                                     ArrayRef<Register> OrigRegs,
                                     ArrayRef<Register> Regs,
@@ -476,6 +574,14 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
 
+  if (!IsEntryFunc) {
+    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
+    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
+                                         &AMDGPU::SGPR_64RegClass);
+    MBB.addLiveIn(ReturnAddrReg);
+    MIRBuilder.buildCopy(LiveInReturn, ReturnAddrReg);
+  }
+
   if (Info->hasImplicitBufferPtr()) {
     Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
@@ -497,9 +603,7 @@
     if (!IsShader && InReg)
       return false;
 
-    // TODO: Handle sret.
-    if (Arg.hasAttribute(Attribute::StructRet) ||
-        Arg.hasAttribute(Attribute::SwiftSelf) ||
+    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
         Arg.hasAttribute(Attribute::SwiftError) ||
         Arg.hasAttribute(Attribute::Nest))
       return false;
@@ -20,6 +20,7 @@
 namespace llvm {
 
 class AMDGPUTargetLowering;
+class MachineInstrBuilder;
 
 class AMDGPUCallLowering: public CallLowering {
   Register lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
@@ -38,6 +39,10 @@ class AMDGPUCallLowering: public CallLowering {
                          CallingConv::ID CallConv,
                          SplitArgTy SplitArg) const;
 
+  bool lowerReturnVal(MachineIRBuilder &MIRBuilder,
+                      const Value *Val, ArrayRef<Register> VRegs,
+                      MachineInstrBuilder &Ret) const;
+
 public:
   AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
 
File diff suppressed because it is too large
@@ -56,7 +56,7 @@ define amdgpu_vs void @test_order(float inreg %arg0, float inreg %arg1, float %a
 ; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
 ; CHECK: $sgpr0 = COPY [[S0]]
 ; CHECK: $sgpr1 = COPY [[S1]]
-; CHECK: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
 define amdgpu_vs <{ i32, i32 }> @ret_struct(i32 inreg %arg0, i32 inreg %arg1) {
 main_body:
   %tmp0 = insertvalue <{ i32, i32 }> undef, i32 %arg0, 0
File diff suppressed because it is too large
@@ -0,0 +1,3 @@
+; Runs original SDAG test with -global-isel
+; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %S/../ret.ll | FileCheck -check-prefix=GCN %S/../ret.ll
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %S/../ret.ll | FileCheck -check-prefix=GCN %S/../ret.ll
@@ -79,7 +79,7 @@ bb:
 ; GCN-LABEL: {{^}}ps_input_ena_pos_w:
 ; GCN-DAG: v_mov_b32_e32 v0, v4
 ; GCN-DAG: v_mov_b32_e32 v1, v2
-; GCN: v_mov_b32_e32 v2, v3
+; GCN-DAG: v_mov_b32_e32 v2, v3
 ; GCN-NOT: s_endpgm
 define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
 bb:
@@ -177,8 +177,8 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}sgpr:
-; GCN: s_mov_b32 s2, s3
-; GCN: s_add_i32 s0, s3, 2
+; GCN-DAG: s_mov_b32 s2, s3
+; GCN-DAG: s_add_{{i|u}}32 s0, s3, 2
 ; GCN-NOT: s_endpgm
 define amdgpu_vs { i32, i32, i32 } @sgpr([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
 bb:
@@ -206,9 +206,9 @@ bb:
 ; GCN-DAG: exp mrt0 v0, v0, v0, v0 done vm
 ; GCN-DAG: v_mov_b32_e32 v1, v0
 ; GCN-DAG: s_mov_b32 s1, s2
-; GCN: s_waitcnt expcnt(0)
-; GCN: v_add_f32_e32 v0, 1.0, v1
-; GCN-DAG: s_add_i32 s0, s3, 2
+; GCN-DAG: s_waitcnt expcnt(0)
+; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
+; GCN-DAG: s_add_{{i|u}}32 s0, s3, 2
 ; GCN-DAG: s_mov_b32 s2, s3
 ; GCN-NOT: s_endpgm
 define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(4)* inreg %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {