[AMDGPU] Fix v3f16 interaction with image store workaround

In some cases, the wrong number of registers was reserved.

Also enable more v3f16 tests.

Differential Revision: https://reviews.llvm.org/D90847
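
The pattern at issue is a d16 image store of a three-component half vector. A minimal trigger, lifted from the tests updated below (the declare line is assumed to match the call):

declare void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half>, i32, i32, i32, <8 x i32>, i32, i32)

define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
  ; On gfx8.1 the image store workaround packs the <3 x half> data into
  ; whole dwords; before this fix, four dwords were reserved instead of three.
  call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}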
Sebastian Neubauer 2020-11-05 15:09:56 +01:00
parent 3abaf6cde7
commit 72ccec1bbc
6 changed files with 418 additions and 76 deletions


@@ -3560,9 +3560,9 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
       auto Unmerge = B.buildUnmerge(S16, Reg);
       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
         PackedRegs.push_back(Unmerge.getReg(I));
-      PackedRegs.resize(8, B.buildUndef(S16).getReg(0));
-      Reg = B.buildBuildVector(LLT::vector(8, S16), PackedRegs).getReg(0);
-      return B.buildBitcast(LLT::vector(4, S32), Reg).getReg(0);
+      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
+      Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
+      return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
     }
 
     if (StoreVT.getNumElements() == 4) {
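Net effect of the hunk above: under the gfx8.1 image store workaround, a three-element d16 store is now padded to six s16 values and bitcast to <3 x s32> (three registers) instead of eight s16 values and <4 x s32> (four registers); the legalizer test below checks the new shape.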


@@ -7455,17 +7455,6 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
         EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
     return DAG.UnrollVectorOp(ZExt.getNode());
-  } else if (NumElements == 3) {
-    EVT IntStoreVT =
-        EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
-    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
-    EVT WidenedStoreVT = EVT::getVectorVT(
-        *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
-    EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
-                                         WidenedStoreVT.getStoreSizeInBits());
-    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
-    return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
-  }
 
   // The sq block of gfx8.1 does not estimate register use correctly for d16
@@ -7488,9 +7477,17 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
       SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
       PackedElts.push_back(IntPair);
     }
+    if ((NumElements % 2) == 1) {
+      // Handle v3i16
+      unsigned I = Elts.size() / 2;
+      SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
+                                        {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
+      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
+      PackedElts.push_back(IntPair);
+    }
 
     // Pad using UNDEF
-    PackedElts.resize(PackedElts.size() * 2, DAG.getUNDEF(MVT::i32));
+    PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
 
     // Build final vector
     EVT VecVT =
@@ -7498,6 +7495,19 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
     return DAG.getBuildVector(VecVT, DL, PackedElts);
   }
+
+  if (NumElements == 3) {
+    EVT IntStoreVT =
+        EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
+    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+    EVT WidenedStoreVT = EVT::getVectorVT(
+        *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
+    EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
+                                         WidenedStoreVT.getStoreSizeInBits());
+    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
+    return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
+  }
+
   assert(isTypeLegal(StoreVT));
   return VData;
 }
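The NumElements == 3 widening path is moved rather than removed: it now runs only after the gfx8.1 packed-store workaround, which itself pairs the odd third element with an undef i16. A v3f16 store on gfx8.1 therefore keeps three data registers, matching the GFX81 check "image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16" in the tests below.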


@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=UNPACKED %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX81 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
; PACKED-LABEL: name: image_store_f16
@@ -60,6 +62,44 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
; GFX81: S_ENDPGM 0
; GFX9-LABEL: name: image_store_f16
; GFX9: bb.1 (%ir-block.0):
; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
; GFX9: S_ENDPGM 0
; GFX10-LABEL: name: image_store_f16
; GFX10: bb.1 (%ir-block.0):
; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
; GFX10: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@@ -128,6 +168,42 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[DEF]](s32)
; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
; GFX81: S_ENDPGM 0
; GFX9-LABEL: name: image_store_v2f16
; GFX9: bb.1 (%ir-block.0):
; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
; GFX9: S_ENDPGM 0
; GFX10-LABEL: name: image_store_v2f16
; GFX10: bb.1 (%ir-block.0):
; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
; GFX10: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@@ -245,12 +321,78 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX81: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; GFX81: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]]
; GFX81: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
; GFX81: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
; GFX81: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
; GFX81: [[CONCAT_VECTORS1:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
; GFX81: [[BITCAST5:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<8 x s16>)
; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST5]](<4 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; GFX81: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
; GFX81: [[BITCAST4:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<6 x s16>)
; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST4]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; GFX81: S_ENDPGM 0
; GFX9-LABEL: name: image_store_v3f16
; GFX9: bb.1 (%ir-block.0):
; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; GFX9: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF1]](s32)
; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF]](<2 x s16>)
; GFX9: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; GFX9: S_ENDPGM 0
; GFX10-LABEL: name: image_store_v3f16
; GFX10: bb.1 (%ir-block.0):
; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; GFX10: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
; GFX10: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF1]](s32)
; GFX10: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF]](<2 x s16>)
; GFX10: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
; GFX10: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
@@ -329,6 +471,46 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[DEF]](s32), [[DEF]](s32)
; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
; GFX81: S_ENDPGM 0
; GFX9-LABEL: name: image_store_v4f16
; GFX9: bb.1 (%ir-block.0):
; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; GFX9: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
; GFX9: S_ENDPGM 0
; GFX10-LABEL: name: image_store_v4f16
; GFX10: bb.1 (%ir-block.0):
; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
; GFX10: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
; GFX10: S_ENDPGM 0
call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}


@@ -504,11 +504,95 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_yz(<8 x i32> inreg %rsrc, i32 %s) {
ret <2 x half> %v
}
; FIXME:
; define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
; %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
; ret <3 x half> %v
; }
define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-LABEL: load_1d_v3f16_xyz:
; GFX8-UNPACKED: ; %bb.0:
; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2
; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3
; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4
; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9
; GFX8-UNPACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm d16
; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0xffff
; GFX8-UNPACKED-NEXT: s_and_b32 s1, s0, s0
; GFX8-UNPACKED-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, s1
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: v_and_b32_e32 v4, s0, v1
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-UNPACKED-NEXT: ; return to shader part epilog
;
; GFX8-PACKED-LABEL: load_1d_v3f16_xyz:
; GFX8-PACKED: ; %bb.0:
; GFX8-PACKED-NEXT: s_mov_b32 s0, s2
; GFX8-PACKED-NEXT: s_mov_b32 s1, s3
; GFX8-PACKED-NEXT: s_mov_b32 s2, s4
; GFX8-PACKED-NEXT: s_mov_b32 s3, s5
; GFX8-PACKED-NEXT: s_mov_b32 s4, s6
; GFX8-PACKED-NEXT: s_mov_b32 s5, s7
; GFX8-PACKED-NEXT: s_mov_b32 s6, s8
; GFX8-PACKED-NEXT: s_mov_b32 s7, s9
; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16
; GFX8-PACKED-NEXT: s_mov_b32 s0, 0xffff
; GFX8-PACKED-NEXT: s_and_b32 s0, s0, s0
; GFX8-PACKED-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-PACKED-NEXT: v_mov_b32_e32 v2, s0
; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-PACKED-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX8-PACKED-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-PACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX8-PACKED-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-PACKED-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: load_1d_v3f16_xyz:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s0, s2
; GFX9-NEXT: s_mov_b32 s1, s3
; GFX9-NEXT: s_mov_b32 s2, s4
; GFX9-NEXT: s_mov_b32 s3, s5
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s8
; GFX9-NEXT: s_mov_b32 s7, s9
; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s0
; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_v3f16_xyz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_lshl_b32 s0, s0, 16
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v2
; GFX10-NEXT: ; return to shader part epilog
%v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x half> %v
}
define amdgpu_ps <4 x half> @load_1d_v4f16_xyzw(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-LABEL: load_1d_v4f16_xyzw:
@@ -712,13 +796,72 @@ define amdgpu_ps float @load_1d_v2f16_tfe_dmask_xy(<8 x i32> inreg %rsrc, i32 %s
ret float %vv
}
; FIXME:
; define amdgpu_ps float @load_1d_v3f16_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %s) {
; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
; %v.err = extractvalue { <3 x half>, i32 } %v, 1
; %vv = bitcast i32 %v.err to float
; ret float %vv
; }
define amdgpu_ps float @load_1d_v3f16_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask_xyz:
; GFX8-UNPACKED: ; %bb.0:
; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2
; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3
; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4
; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9
; GFX8-UNPACKED-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v3
; GFX8-UNPACKED-NEXT: ; return to shader part epilog
;
; GFX8-PACKED-LABEL: load_1d_v3f16_tfe_dmask_xyz:
; GFX8-PACKED: ; %bb.0:
; GFX8-PACKED-NEXT: s_mov_b32 s0, s2
; GFX8-PACKED-NEXT: s_mov_b32 s1, s3
; GFX8-PACKED-NEXT: s_mov_b32 s2, s4
; GFX8-PACKED-NEXT: s_mov_b32 s3, s5
; GFX8-PACKED-NEXT: s_mov_b32 s4, s6
; GFX8-PACKED-NEXT: s_mov_b32 s5, s7
; GFX8-PACKED-NEXT: s_mov_b32 s6, s8
; GFX8-PACKED-NEXT: s_mov_b32 s7, s9
; GFX8-PACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm tfe d16
; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v2
; GFX8-PACKED-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: load_1d_v3f16_tfe_dmask_xyz:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s0, s2
; GFX9-NEXT: s_mov_b32 s1, s3
; GFX9-NEXT: s_mov_b32 s2, s4
; GFX9-NEXT: s_mov_b32 s3, s5
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s8
; GFX9-NEXT: s_mov_b32 s7, s9
; GFX9-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_v3f16_tfe_dmask_xyz:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: ; return to shader part epilog
%v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue { <3 x half>, i32 } %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}
define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask_xyzw:


@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -o - %s | FileCheck -check-prefix=UNPACKED %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=GFX81 %s
; FIXME: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; FIXME: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
; UNPACKED-LABEL: image_store_f16:
@@ -16,19 +18,6 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16
; UNPACKED-NEXT: s_endpgm
;
; PACKED-LABEL: image_store_f16:
; PACKED: ; %bb.0:
; PACKED-NEXT: s_mov_b32 s0, s2
; PACKED-NEXT: s_mov_b32 s1, s3
; PACKED-NEXT: s_mov_b32 s2, s4
; PACKED-NEXT: s_mov_b32 s3, s5
; PACKED-NEXT: s_mov_b32 s4, s6
; PACKED-NEXT: s_mov_b32 s5, s7
; PACKED-NEXT: s_mov_b32 s6, s8
; PACKED-NEXT: s_mov_b32 s7, s9
; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16
; PACKED-NEXT: s_endpgm
;
; GFX81-LABEL: image_store_f16:
; GFX81: ; %bb.0:
; GFX81-NEXT: s_mov_b32 s0, s2
@@ -60,19 +49,6 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16
; UNPACKED-NEXT: s_endpgm
;
; PACKED-LABEL: image_store_v2f16:
; PACKED: ; %bb.0:
; PACKED-NEXT: s_mov_b32 s0, s2
; PACKED-NEXT: s_mov_b32 s1, s3
; PACKED-NEXT: s_mov_b32 s2, s4
; PACKED-NEXT: s_mov_b32 s3, s5
; PACKED-NEXT: s_mov_b32 s4, s6
; PACKED-NEXT: s_mov_b32 s5, s7
; PACKED-NEXT: s_mov_b32 s6, s8
; PACKED-NEXT: s_mov_b32 s7, s9
; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16
; PACKED-NEXT: s_endpgm
;
; GFX81-LABEL: image_store_v2f16:
; GFX81: ; %bb.0:
; GFX81-NEXT: s_mov_b32 s0, s2
@@ -89,11 +65,44 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
ret void
}
; FIXME: Broken
; define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
; call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
; ret void
; }
define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
; UNPACKED-LABEL: image_store_v3f16:
; UNPACKED: ; %bb.0:
; UNPACKED-NEXT: v_mov_b32_e32 v5, v1
; UNPACKED-NEXT: v_mov_b32_e32 v1, v2
; UNPACKED-NEXT: s_mov_b32 s0, s2
; UNPACKED-NEXT: s_mov_b32 s1, s3
; UNPACKED-NEXT: s_mov_b32 s2, s4
; UNPACKED-NEXT: s_mov_b32 s3, s5
; UNPACKED-NEXT: s_mov_b32 s4, s6
; UNPACKED-NEXT: s_mov_b32 s5, s7
; UNPACKED-NEXT: s_mov_b32 s6, s8
; UNPACKED-NEXT: s_mov_b32 s7, s9
; UNPACKED-NEXT: v_mov_b32_e32 v4, v0
; UNPACKED-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; UNPACKED-NEXT: image_store v[1:3], v[4:5], s[0:7] dmask:0x7 unorm d16
; UNPACKED-NEXT: s_endpgm
;
; GFX81-LABEL: image_store_v3f16:
; GFX81: ; %bb.0:
; GFX81-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX81-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX81-NEXT: s_mov_b32 s0, s2
; GFX81-NEXT: s_mov_b32 s1, s3
; GFX81-NEXT: s_mov_b32 s2, s4
; GFX81-NEXT: s_mov_b32 s3, s5
; GFX81-NEXT: s_mov_b32 s4, s6
; GFX81-NEXT: s_mov_b32 s5, s7
; GFX81-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX81-NEXT: s_mov_b32 s6, s8
; GFX81-NEXT: s_mov_b32 s7, s9
; GFX81-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX81-NEXT: v_mov_b32_e32 v4, 0
; GFX81-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16
; GFX81-NEXT: s_endpgm
call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void
}
define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) {
; UNPACKED-LABEL: image_store_v4f16:
@@ -114,19 +123,6 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm d16
; UNPACKED-NEXT: s_endpgm
;
; PACKED-LABEL: image_store_v4f16:
; PACKED: ; %bb.0:
; PACKED-NEXT: s_mov_b32 s0, s2
; PACKED-NEXT: s_mov_b32 s1, s3
; PACKED-NEXT: s_mov_b32 s2, s4
; PACKED-NEXT: s_mov_b32 s3, s5
; PACKED-NEXT: s_mov_b32 s4, s6
; PACKED-NEXT: s_mov_b32 s5, s7
; PACKED-NEXT: s_mov_b32 s6, s8
; PACKED-NEXT: s_mov_b32 s7, s9
; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16
; PACKED-NEXT: s_endpgm
;
; GFX81-LABEL: image_store_v4f16:
; GFX81: ; %bb.0:
; GFX81-NEXT: s_mov_b32 s0, s2


@@ -72,6 +72,12 @@ main_body:
ret float %x
}
; GCN-LABEL: {{^}}image_load_3d_v3f16:
; UNPACKED: image_load v[0:2], v[0:2], s[0:7] dmask:0x7 unorm d16
; PACKED: image_load v[0:1], v[0:2], s[0:7] dmask:0x7 unorm d16
; GFX81: image_load v[0:1], v[0:2], s[0:7] dmask:0x7 unorm d16
; GFX10: image_load v[0:1], v[0:2], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm d16{{$}}
define amdgpu_ps <2 x float> @image_load_3d_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
main_body:
%tex = call <3 x half> @llvm.amdgcn.image.load.3d.v3f16.i32(i32 7, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -103,6 +109,11 @@ main_body:
ret void
}
; GCN-LABEL: {{^}}image_store_v3f16:
; UNPACKED: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16
; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 unorm d16
; GFX81: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16
; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {
main_body:
%r = bitcast <2 x float> %in to <4 x half>