AMDGPU: Use better alignment for kernarg lowering

This was just emitting loads with the ABI alignment for the raw type. The
true alignment is often better, especially when an illegal vector type was
scalarized. The better alignment allows using a scalar load more often.

llvm-svn: 333558
parent ebaaa2ddae
commit 7b4826e6ce
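For context, here is a minimal standalone sketch of the alignment computation this patch switches to. It is illustrative rather than part of the patch: MinAlign mirrors the LLVM helper of the same name, and kernargAlign is a hypothetical wrapper around the two lines added to LowerFormalArguments below.

#include <cstdint>

// Mirrors llvm::MinAlign: the largest power of two dividing both values,
// i.e. the lowest set bit of A | B.
static uint64_t MinAlign(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B));
}

// Hypothetical wrapper for the new logic: instead of the ABI alignment of
// the raw type (e.g. 2 for an i16 scalarized out of an illegal vector),
// derive the alignment from the 16-byte kernarg base alignment and the
// argument's byte offset within the kernarg segment.
static unsigned kernargAlign(uint64_t Offset) {
  const unsigned KernelArgBaseAlign = 16; // minimum kernarg segment alignment
  return static_cast<unsigned>(MinAlign(KernelArgBaseAlign, Offset));
}

For example, kernargAlign(44) == 4, so the 4-byte argument at byte offset 44 in the load_v2f16_arg test below can be fetched with a single 4-byte scalar load instead of two 2-byte buffer loads.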
@@ -1068,15 +1068,12 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
 SDValue SITargetLowering::lowerKernargMemParameter(
   SelectionDAG &DAG, EVT VT, EVT MemVT,
   const SDLoc &SL, SDValue Chain,
-  uint64_t Offset, bool Signed,
+  uint64_t Offset, unsigned Align, bool Signed,
   const ISD::InputArg *Arg) const {
-  const DataLayout &DL = DAG.getDataLayout();
   Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
 
-  unsigned Align = DL.getABITypeAlignment(Ty);
-
   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
                              MachineMemOperand::MODereferenceable |
@@ -1663,7 +1660,15 @@ SDValue SITargetLowering::LowerFormalArguments(
 
   SmallVector<SDValue, 16> Chains;
 
-  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+  // FIXME: This is the minimum kernel argument alignment. We should improve
+  // this to the maximum alignment of the arguments.
+  //
+  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
+  // kern arg offset.
+  const unsigned KernelArgBaseAlign = 16;
+  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn);
+
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
     if (Skipped[i]) {
       InVals.push_back(DAG.getUNDEF(Arg.VT));
@@ -1677,14 +1682,14 @@ SDValue SITargetLowering::LowerFormalArguments(
       VT = Ins[i].VT;
       EVT MemVT = VA.getLocVT();
 
-      const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(Fn) +
-                              VA.getLocMemOffset();
+      const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();
       Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+      unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
 
       // The first 36 bytes of the input buffer contains information about
-      // thread group and global sizes.
+      // thread group and global sizes for clover.
       SDValue Arg = lowerKernargMemParameter(
-        DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+        DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
       Chains.push_back(Arg.getValue(1));
 
       auto *ParamTy =
@@ -4303,7 +4308,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                  unsigned Offset) const {
   SDLoc SL(Op);
   SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
-                                           DAG.getEntryNode(), Offset, false);
+                                           DAG.getEntryNode(), Offset, 4, false);
   // The local size values will have the hi 16-bits as zero.
   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                      DAG.getValueType(VT));
@@ -4404,37 +4409,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_X, false);
+                                    SI::KernelInputOffsets::NGROUPS_X, 4, false);
   case Intrinsic::r600_read_ngroups_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_Y, false);
+                                    SI::KernelInputOffsets::NGROUPS_Y, 4, false);
   case Intrinsic::r600_read_ngroups_z:
     if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);
 
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_Z, false);
+                                    SI::KernelInputOffsets::NGROUPS_Z, 4, false);
   case Intrinsic::r600_read_global_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
   case Intrinsic::r600_read_global_size_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
   case Intrinsic::r600_read_global_size_z:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
   case Intrinsic::r600_read_local_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -27,7 +27,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
   SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                    const SDLoc &SL, SDValue Chain,
-                                   uint64_t Offset, bool Signed,
+                                   uint64_t Offset, unsigned Align, bool Signed,
                                    const ISD::InputArg *Arg = nullptr) const;
 
   SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
@@ -88,11 +88,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(<2 x half> addrspace(1)*
 ; Combine turns this into integer op when bitcast source (from load)
 
 ; GCN-LABEL: {{^}}s_fneg_fabs_v2f16_bc_src:
-; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
 
 ; FIXME: Random commute
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
 define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %out, <2 x half> %in) {
@@ -103,16 +101,12 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %ou
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
-; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR0:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
-; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
 
 ; FIXME: Random commute
-; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
 
 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
@@ -13,17 +13,10 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
   ret void
 }
 
-; FIXME: Should always be the same
 ; GCN-LABEL: {{^}}load_v2f16_arg:
-; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
-; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
-; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-
-; VI: s_load_dword [[ARG:s[0-9]+]]
-; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
-; VI: buffer_store_dword [[V_ARG]]
+; GCN: s_load_dword [[ARG:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
+; GCN: buffer_store_dword [[V_ARG]]
 define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
   store <2 x half> %arg, <2 x half> addrspace(1)* %out
   ret void
@@ -31,8 +24,8 @@ define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x ha
 
 ; GCN-LABEL: {{^}}load_v3f16_arg:
 ; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+
 ; GCN-NOT: buffer_load
 ; GCN-DAG: buffer_store_dword
 ; GCN-DAG: buffer_store_short
@@ -43,19 +36,14 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha
   ret void
 }
 
-; GCN-LABEL: {{^}}load_v4f16_arg:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_store_dwordx2
 
 ; FIXME: Why not one load?
-; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
-; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
+; GCN-LABEL: {{^}}load_v4f16_arg:
+; GCN-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
 define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
   store <4 x half> %arg, <4 x half> addrspace(1)* %out
   ret void
@@ -162,10 +162,11 @@ entry:
 
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; HSA: flat_load_ushort
 define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 entry:
   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
@@ -179,10 +180,9 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-
-; VI: s_load_dword s
+; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
+; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
 define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
@@ -226,11 +226,14 @@ entry:
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; MESA-VI: buffer_load_ushort
+; MESA-VI: buffer_load_ubyte
+
+; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ubyte
 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
@@ -245,12 +248,9 @@ entry:
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
-; MESA-GCN: buffer_load_ushort
-; MESA-GCN: buffer_load_ushort
-; MESA-GCN: buffer_load_ushort
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ushort
+
+; GCN-DAG: s_load_dword s
+; GCN-DAG: {{buffer|flat}}_load_ushort
 define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
 entry:
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
@@ -293,14 +293,13 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dword s
 define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
@@ -315,13 +314,14 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
 
-; VI: s_load_dword s
-; VI: s_load_dword s
+; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x30
+
+; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x8
+; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
@@ -372,21 +372,17 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
 define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
@@ -405,15 +401,11 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
 
-; VI: s_load_dwordx2
+; VI: s_load_dword s
+; VI: s_load_dword s
+; VI: s_load_dword s
@@ -481,38 +473,27 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
 define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
@@ -539,22 +520,13 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
 
 ; VI: s_load_dword s
 ; VI: s_load_dword s
@@ -39,10 +39,8 @@ define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)*
 }
 
 ; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+; GCN: s_load_dwordx2 s
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
 define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
   %x.bc = bitcast <4 x i16> %x to <2 x i32>