forked from OSchip/llvm-project
AMDGPU: Partially fix handling of packed amdgpu_ps arguments
Fixes annoying limitations when writing tests.

Also remove more leftover code for manually scalarizing arguments and return values.

llvm-svn: 338618
This commit is contained in:
parent
b3724b7169
commit
55ab9213d3
|
@ -19,7 +19,7 @@ class CCIfExtend<CCAction A>
|
|||
// Calling convention for SI
|
||||
def CC_SI : CallingConv<[
|
||||
|
||||
CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
|
||||
CCIfInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
|
||||
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
|
||||
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
|
||||
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
|
||||
|
@ -33,7 +33,7 @@ def CC_SI : CallingConv<[
|
|||
CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
|
||||
|
||||
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
|
||||
CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
|
||||
CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
|
||||
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
|
||||
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
|
||||
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
|
||||
|
@ -64,7 +64,7 @@ def RetCC_SI_Shader : CallingConv<[
|
|||
]>>,
|
||||
|
||||
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
|
||||
CCIfType<[f32, f16] , CCAssignToReg<[
|
||||
CCIfType<[f32, f16, v2f16] , CCAssignToReg<[
|
||||
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
|
||||
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
|
||||
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
|
||||
|
|
|
@ -1349,7 +1349,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
|
|||
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
|
||||
const ISD::InputArg *Arg = &Ins[I];
|
||||
|
||||
assert(!Arg->VT.isVector() && "vector type argument should have been split");
|
||||
assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
|
||||
"vector type argument should have been split");
|
||||
|
||||
// First check if it's a PS input addr.
|
||||
if (CallConv == CallingConv::AMDGPU_PS &&
|
||||
|
@ -1951,29 +1952,6 @@ SDValue SITargetLowering::LowerFormalArguments(
|
|||
llvm_unreachable("Unknown loc info!");
|
||||
}
|
||||
|
||||
if (IsShader && Arg.VT.isVector()) {
|
||||
// Build a vector from the registers
|
||||
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
|
||||
unsigned NumElements = ParamType->getVectorNumElements();
|
||||
|
||||
SmallVector<SDValue, 4> Regs;
|
||||
Regs.push_back(Val);
|
||||
for (unsigned j = 1; j != NumElements; ++j) {
|
||||
Reg = ArgLocs[ArgIdx++].getLocReg();
|
||||
Reg = MF.addLiveIn(Reg, RC);
|
||||
|
||||
SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
|
||||
Regs.push_back(Copy);
|
||||
}
|
||||
|
||||
// Fill up the missing vector elements
|
||||
NumElements = Arg.VT.getVectorNumElements() - NumElements;
|
||||
Regs.append(NumElements, DAG.getUNDEF(VT));
|
||||
|
||||
InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
|
||||
continue;
|
||||
}
|
||||
|
||||
InVals.push_back(Val);
|
||||
}
|
||||
|
||||
|
@ -2037,48 +2015,19 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
|||
|
||||
bool IsShader = AMDGPU::isShader(CallConv);
|
||||
|
||||
Info->setIfReturnsVoid(Outs.size() == 0);
|
||||
Info->setIfReturnsVoid(Outs.empty());
|
||||
bool IsWaveEnd = Info->returnsVoid() && IsShader;
|
||||
|
||||
SmallVector<ISD::OutputArg, 48> Splits;
|
||||
SmallVector<SDValue, 48> SplitVals;
|
||||
|
||||
// Split vectors into their elements.
|
||||
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
|
||||
const ISD::OutputArg &Out = Outs[i];
|
||||
|
||||
if (IsShader && Out.VT.isVector()) {
|
||||
MVT VT = Out.VT.getVectorElementType();
|
||||
ISD::OutputArg NewOut = Out;
|
||||
NewOut.Flags.setSplit();
|
||||
NewOut.VT = VT;
|
||||
|
||||
// We want the original number of vector elements here, e.g.
|
||||
// three or five, not four or eight.
|
||||
unsigned NumElements = Out.ArgVT.getVectorNumElements();
|
||||
|
||||
for (unsigned j = 0; j != NumElements; ++j) {
|
||||
SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
|
||||
DAG.getConstant(j, DL, MVT::i32));
|
||||
SplitVals.push_back(Elem);
|
||||
Splits.push_back(NewOut);
|
||||
NewOut.PartOffset += NewOut.VT.getStoreSize();
|
||||
}
|
||||
} else {
|
||||
SplitVals.push_back(OutVals[i]);
|
||||
Splits.push_back(Out);
|
||||
}
|
||||
}
|
||||
|
||||
// CCValAssign - represent the assignment of the return value to a location.
|
||||
SmallVector<CCValAssign, 48> RVLocs;
|
||||
SmallVector<ISD::OutputArg, 48> Splits;
|
||||
|
||||
// CCState - Info about the registers and stack slots.
|
||||
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
|
||||
*DAG.getContext());
|
||||
|
||||
// Analyze outgoing return values.
|
||||
CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
|
||||
CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
|
||||
|
||||
SDValue Flag;
|
||||
SmallVector<SDValue, 48> RetOps;
|
||||
|
@ -2103,14 +2052,12 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
|||
}
|
||||
|
||||
// Copy the result values into the output registers.
|
||||
for (unsigned i = 0, realRVLocIdx = 0;
|
||||
i != RVLocs.size();
|
||||
++i, ++realRVLocIdx) {
|
||||
CCValAssign &VA = RVLocs[i];
|
||||
for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
|
||||
++I, ++RealRVLocIdx) {
|
||||
CCValAssign &VA = RVLocs[I];
|
||||
assert(VA.isRegLoc() && "Can only return in registers!");
|
||||
// TODO: Partially return in registers if return values don't fit.
|
||||
|
||||
SDValue Arg = SplitVals[realRVLocIdx];
|
||||
SDValue Arg = OutVals[RealRVLocIdx];
|
||||
|
||||
// Copied from other backends.
|
||||
switch (VA.getLocInfo()) {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
|
||||
; Make sure we don't crash or assert on spir_kernel calling convention.
|
||||
|
||||
|
@ -88,8 +88,8 @@ define amdgpu_cs half @cs_mesa(half %arg0) {
|
|||
; Mesa pixel shader: check for 45096 (SPI_SHADER_PGM_RSRC1_PS) in .AMDGPU.config
|
||||
; GCN-LABEL: .AMDGPU.config
|
||||
; GCN: .long 45096
|
||||
; GCN-LABEL: {{^}}ps_mesa:
|
||||
define amdgpu_ps half @ps_mesa(half %arg0) {
|
||||
; GCN-LABEL: {{^}}ps_mesa_f16:
|
||||
define amdgpu_ps half @ps_mesa_f16(half %arg0) {
|
||||
%add = fadd half %arg0, 1.0
|
||||
ret half %add
|
||||
}
|
||||
|
@ -121,4 +121,83 @@ define amdgpu_hs half @hs_mesa(half %arg0) {
|
|||
ret half %add
|
||||
}
|
||||
|
||||
; FIXME: Inconsistent ABI between targets
|
||||
; GCN-LABEL: {{^}}ps_mesa_v2f16:
|
||||
; VI: v_mov_b32_e32 v1, 0x3c00
|
||||
; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; VI-NEXT: ; return
|
||||
|
||||
; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], v0
|
||||
; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], v1
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]]
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]]
|
||||
; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]]
|
||||
; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]]
|
||||
; SI: ; return to shader part epilog
|
||||
define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) {
|
||||
%add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
|
||||
ret <2 x half> %add
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_inreg_v2f16:
|
||||
; VI: s_lshr_b32 s1, s0, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
|
||||
; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: v_add_f16_e64 v1, s0, 1.0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: ; return to shader part epilog
|
||||
|
||||
; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], s0
|
||||
; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], s1
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]]
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]]
|
||||
; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]]
|
||||
; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]]
|
||||
; SI: ; return to shader part epilog
|
||||
define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) {
|
||||
%add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
|
||||
ret <2 x half> %add
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_v2i16:
|
||||
; VI: v_mov_b32_e32 v2, 1
|
||||
; VI: v_add_u16_e32 v1, 1, v0
|
||||
; VI: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI: v_or_b32_e32 v0, v1, v0
|
||||
|
||||
|
||||
; SI: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; SI: v_add_i32_e32 v0, vcc, 1, v0
|
||||
; SI: v_add_i32_e32 v1, vcc, 0x10000, v1
|
||||
; SI: v_and_b32
|
||||
; SI: v_or_b32
|
||||
define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
|
||||
%add = add <2 x i16> %arg0, <i16 1, i16 1>
|
||||
store <2 x i16> %add, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_inreg_v2i16:
|
||||
; VI: s_lshr_b32 s1, s0, 16
|
||||
; VI: s_add_i32 s1, s1, 1
|
||||
; VI: s_add_i32 s0, s0, 1
|
||||
; VI: s_and_b32 s0, s0, 0xffff
|
||||
; VI: s_lshl_b32 s1, s1, 16
|
||||
; VI: s_or_b32 s0, s0, s1
|
||||
; VI: v_mov_b32_e32 v0, s0
|
||||
|
||||
; SI: s_lshl_b32 s1, s1, 16
|
||||
; SI: s_add_i32 s0, s0, 1
|
||||
; SI: s_add_i32 s1, s1, 0x10000
|
||||
; SI: s_and_b32 s0, s0, 0xffff
|
||||
; SI: s_or_b32 s0, s0, s1
|
||||
define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
|
||||
%add = add <2 x i16> %arg0, <i16 1, i16 1>
|
||||
store <2 x i16> %add, <2 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind noinline }
|
||||
|
|
Loading…
Reference in New Issue