AMDGPU: Use better alignment for kernarg lowering

This was just emitting loads with the ABI alignment
for the raw type. The true alignment is often better,
especially when an illegal vector type was scalarized.
The better alignment allows using a scalar load
more often.

llvm-svn: 333558
This commit is contained in:
Matt Arsenault 2018-05-30 16:17:51 +00:00
parent ebaaa2ddae
commit 7b4826e6ce
6 changed files with 116 additions and 159 deletions

View File

@ -1068,15 +1068,12 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDValue SITargetLowering::lowerKernargMemParameter(
SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
uint64_t Offset, bool Signed,
uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg) const {
const DataLayout &DL = DAG.getDataLayout();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned Align = DL.getABITypeAlignment(Ty);
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
MachineMemOperand::MODereferenceable |
@ -1663,7 +1660,15 @@ SDValue SITargetLowering::LowerFormalArguments(
SmallVector<SDValue, 16> Chains;
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
// FIXME: This is the minimum kernel argument alignment. We should improve
// this to the maximum alignment of the arguments.
//
// FIXME: Alignment of explicit arguments totally broken with non-0 explicit
// kern arg offset.
const unsigned KernelArgBaseAlign = 16;
const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn);
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
if (Skipped[i]) {
InVals.push_back(DAG.getUNDEF(Arg.VT));
@ -1677,14 +1682,14 @@ SDValue SITargetLowering::LowerFormalArguments(
VT = Ins[i].VT;
EVT MemVT = VA.getLocVT();
const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(Fn) +
VA.getLocMemOffset();
const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();
Info->setABIArgOffset(Offset + MemVT.getStoreSize());
unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
// thread group and global sizes for clover.
SDValue Arg = lowerKernargMemParameter(
DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
@ -4303,7 +4308,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
unsigned Offset) const {
SDLoc SL(Op);
SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
DAG.getEntryNode(), Offset, false);
DAG.getEntryNode(), Offset, 4, false);
// The local size values will have the hi 16-bits as zero.
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
DAG.getValueType(VT));
@ -4404,37 +4409,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_X, false);
SI::KernelInputOffsets::NGROUPS_X, 4, false);
case Intrinsic::r600_read_ngroups_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_Y, false);
SI::KernelInputOffsets::NGROUPS_Y, 4, false);
case Intrinsic::r600_read_ngroups_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_Z, false);
SI::KernelInputOffsets::NGROUPS_Z, 4, false);
case Intrinsic::r600_read_global_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
case Intrinsic::r600_read_global_size_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
case Intrinsic::r600_read_global_size_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
case Intrinsic::r600_read_local_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);

View File

@ -27,7 +27,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
uint64_t Offset, bool Signed,
uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg = nullptr) const;
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,

View File

@ -88,11 +88,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(<2 x half> addrspace(1)*
; Combine turns this into integer op when bitcast source (from load)
; GCN-LABEL: {{^}}s_fneg_fabs_v2f16_bc_src:
; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
; FIXME: Random commute
; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %out, <2 x half> %in) {
@ -103,16 +101,12 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %ou
}
; GCN-LABEL: {{^}}fneg_fabs_v4f16:
; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, v{{[0-9]+}}
; CI: v_or_b32_e32 [[OR0:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, v{{[0-9]+}}
; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
; FIXME: Random commute
; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]

View File

@ -13,17 +13,10 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
ret void
}
; FIXME: Should always be the same
; GCN-LABEL: {{^}}load_v2f16_arg:
; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; VI: s_load_dword [[ARG:s[0-9]+]]
; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
; VI: buffer_store_dword [[V_ARG]]
; GCN: s_load_dword [[ARG:s[0-9]+]]
; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
; GCN: buffer_store_dword [[V_ARG]]
define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
store <2 x half> %arg, <2 x half> addrspace(1)* %out
ret void
@ -31,8 +24,8 @@ define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x ha
; GCN-LABEL: {{^}}load_v3f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: s_load_dword s
; GCN-NOT: buffer_load
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_short
@ -43,19 +36,14 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha
ret void
}
; GCN-LABEL: {{^}}load_v4f16_arg:
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_store_dwordx2
; FIXME: Why not one load?
; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
; GCN-LABEL: {{^}}load_v4f16_arg:
; GCN-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
store <4 x half> %arg, <4 x half> addrspace(1)* %out
ret void

View File

@ -162,10 +162,11 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; HSA: flat_load_ushort
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
store <2 x i8> %in, <2 x i8> addrspace(1)* %out
@ -179,10 +180,9 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; VI: s_load_dword s
; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
store <2 x i16> %in, <2 x i16> addrspace(1)* %out
@ -226,11 +226,14 @@ entry:
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; MESA-VI: buffer_load_ushort
; MESA-VI: buffer_load_ubyte
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
@ -245,12 +248,9 @@ entry:
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; GCN-DAG: s_load_dword s
; GCN-DAG: {{buffer|flat}}_load_ushort
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
@ -293,14 +293,13 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; VI: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
store <4 x i8> %in, <4 x i8> addrspace(1)* %out
@ -315,13 +314,14 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
; VI: s_load_dword s
; VI: s_load_dword s
; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x30
; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x8
; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
store <4 x i16> %in, <4 x i16> addrspace(1)* %out
@ -372,21 +372,17 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-GCN: flat_load_ubyte
; HSA-GCN: flat_load_ubyte
; HSA-GCN: flat_load_ubyte
; HSA-GCN: flat_load_ubyte
; HSA-GCN: flat_load_ubyte
; HSA-GCN: flat_load_ubyte
; HSA-GCN: flat_load_ubyte
; HSA-GCN: flat_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; VI: s_load_dwordx2
; VI: s_load_dwordx2
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
store <8 x i8> %in, <8 x i8> addrspace(1)* %out
@ -405,15 +401,11 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; VI: s_load_dwordx2
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
@ -481,38 +473,27 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; VI: s_load_dwordx2
; VI: s_load_dwordx2
; VI: s_load_dwordx2
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
store <16 x i8> %in, <16 x i8> addrspace(1)* %out
@ -539,22 +520,13 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; VI: s_load_dword s
; VI: s_load_dword s

View File

@ -39,10 +39,8 @@ define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)*
}
; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: s_load_dword s
; GCN: s_load_dwordx2 s
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
%x.bc = bitcast <4 x i16> %x to <2 x i32>