[AMDGPU] Implement hardware bug workaround for image instructions
Summary:
This implements a workaround for a hardware bug in gfx8 and gfx9,
where register usage is not estimated correctly for image_store and
image_gather4 instructions when D16 is used.

Change-Id: I4e30744da6796acac53a9b5ad37ac1c2035c8899

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D81172
parent dce03e3059
commit f71f5f39f6
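The bug affects the d16 (16-bit data) variants of the image instructions, which are reached through the llvm.amdgcn.image.* intrinsics. As a concrete example, the workaround changes how the data operand of a call like the following (taken from the tests updated below) is laid out in registers:

    call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)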
@@ -234,6 +234,18 @@ def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
   "Branch offset of 3f hardware bug"
 >;
 
+def FeatureImageStoreD16Bug : SubtargetFeature<"image-store-d16-bug",
+  "HasImageStoreD16Bug",
+  "true",
+  "Image Store D16 hardware bug"
+>;
+
+def FeatureImageGather4D16Bug : SubtargetFeature<"image-gather4-d16-bug",
+  "HasImageGather4D16Bug",
+  "true",
+  "Image Gather4 D16 hardware bug"
+>;
+
 class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
   "ldsbankcount"#Value,
   "LDSBankCount",
@@ -810,7 +822,9 @@ def FeatureISAVersion8_1_0 : FeatureSet<
   [FeatureVolcanicIslands,
    FeatureLDSBankCount16,
    FeatureXNACK,
-   FeatureCodeObjectV3]>;
+   FeatureCodeObjectV3,
+   FeatureImageStoreD16Bug,
+   FeatureImageGather4D16Bug]>;
 
 def FeatureISAVersion9_0_0 : FeatureSet<
   [FeatureGFX9,
@@ -818,7 +832,8 @@ def FeatureISAVersion9_0_0 : FeatureSet<
    FeatureLDSBankCount32,
    FeatureCodeObjectV3,
    FeatureDoesNotSupportXNACK,
-   FeatureDoesNotSupportSRAMECC]>;
+   FeatureDoesNotSupportSRAMECC,
+   FeatureImageGather4D16Bug]>;
 
 def FeatureISAVersion9_0_2 : FeatureSet<
   [FeatureGFX9,
@@ -826,7 +841,8 @@ def FeatureISAVersion9_0_2 : FeatureSet<
    FeatureLDSBankCount32,
    FeatureXNACK,
    FeatureDoesNotSupportSRAMECC,
-   FeatureCodeObjectV3]>;
+   FeatureCodeObjectV3,
+   FeatureImageGather4D16Bug]>;
 
 def FeatureISAVersion9_0_4 : FeatureSet<
   [FeatureGFX9,
@@ -834,7 +850,8 @@ def FeatureISAVersion9_0_4 : FeatureSet<
    FeatureFmaMixInsts,
    FeatureDoesNotSupportXNACK,
    FeatureDoesNotSupportSRAMECC,
-   FeatureCodeObjectV3]>;
+   FeatureCodeObjectV3,
+   FeatureImageGather4D16Bug]>;
 
 def FeatureISAVersion9_0_6 : FeatureSet<
   [FeatureGFX9,
@@ -845,7 +862,8 @@ def FeatureISAVersion9_0_6 : FeatureSet<
    FeatureDot1Insts,
    FeatureDot2Insts,
    FeatureDoesNotSupportXNACK,
-   FeatureCodeObjectV3]>;
+   FeatureCodeObjectV3,
+   FeatureImageGather4D16Bug]>;
 
 def FeatureISAVersion9_0_8 : FeatureSet<
   [FeatureGFX9,
@@ -864,14 +882,16 @@ def FeatureISAVersion9_0_8 : FeatureSet<
    FeatureAtomicFaddInsts,
    FeatureSRAMECC,
    FeatureMFMAInlineLiteralBug,
-   FeatureCodeObjectV3]>;
+   FeatureCodeObjectV3,
+   FeatureImageGather4D16Bug]>;
 
 def FeatureISAVersion9_0_9 : FeatureSet<
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
    FeatureXNACK,
-   FeatureCodeObjectV3]>;
+   FeatureCodeObjectV3,
+   FeatureImageGather4D16Bug]>;
 
 // TODO: Organize more features into groups.
 def FeatureGroup {
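These subtarget features are attached to the ISA feature sets above rather than set by users directly, so the workaround is enabled implicitly by CPU selection: gfx810 gets both bugs, and the gfx9 feature sets get the gather4 bug. A minimal invocation that exercises the gfx8.1 path, mirroring the RUN lines in the tests later in this diff (test.ll is a placeholder input):

    llc -march=amdgcn -mcpu=gfx810 -verify-machineinstrs < test.ll

Like any SubtargetFeature, the names above should also be toggleable explicitly, e.g. -mattr=+image-store-d16-bug, though the tests here rely only on -mcpu.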
@@ -1539,6 +1539,16 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
   DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
   DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
 
+  // One memoperand is mandatory, except for getresinfo.
+  // FIXME: Check this in verifier.
+  if (!MI.memoperands_empty()) {
+    const MachineMemOperand *MMO = *MI.memoperands_begin();
+
+    // Infer d16 from the memory size, as the register type will be mangled by
+    // unpacked subtargets, or by TFE.
+    IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
+  }
+
   if (BaseOpcode->Store) {
     VDataIn = MI.getOperand(1).getReg();
     VDataTy = MRI->getType(VDataIn);
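Note the effect of hoisting the IsD16 inference ahead of the store branch: the memory-operand size check now applies to image stores as well as loads, so GlobalISel selects the d16 forms for stores. The llc tests below change accordingly, for example:

    -; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm
    +; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16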
@@ -1548,18 +1558,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
     VDataTy = MRI->getType(VDataOut);
     NumVDataDwords = DMaskLanes;
 
-    // One memoperand is mandatory, except for getresinfo.
-    // FIXME: Check this in verifier.
-    if (!MI.memoperands_empty()) {
-      const MachineMemOperand *MMO = *MI.memoperands_begin();
-
-      // Infer d16 from the memory size, as the register type will be mangled by
-      // unpacked subtargets, or by TFE.
-      IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
-
-      if (IsD16 && !STI.hasUnpackedD16VMem())
-        NumVDataDwords = (DMaskLanes + 1) / 2;
-    }
+    if (IsD16 && !STI.hasUnpackedD16VMem())
+      NumVDataDwords = (DMaskLanes + 1) / 2;
   }
 }
 
@@ -3528,24 +3528,58 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
 /// Handle register layout difference for f16 images for some subtargets.
 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                              MachineRegisterInfo &MRI,
-                                             Register Reg) const {
-  if (!ST.hasUnpackedD16VMem())
-    return Reg;
-
+                                             Register Reg,
+                                             bool ImageStore) const {
   const LLT S16 = LLT::scalar(16);
   const LLT S32 = LLT::scalar(32);
   LLT StoreVT = MRI.getType(Reg);
   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
 
-  auto Unmerge = B.buildUnmerge(S16, Reg);
-
-  SmallVector<Register, 4> WideRegs;
-  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
-    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
-
-  int NumElts = StoreVT.getNumElements();
-
-  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+  if (ST.hasUnpackedD16VMem()) {
+    auto Unmerge = B.buildUnmerge(S16, Reg);
+
+    SmallVector<Register, 4> WideRegs;
+    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
+
+    int NumElts = StoreVT.getNumElements();
+
+    return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+  }
+
+  if (ImageStore && ST.hasImageStoreD16Bug()) {
+    if (StoreVT.getNumElements() == 2) {
+      SmallVector<Register, 4> PackedRegs;
+      Reg = B.buildBitcast(S32, Reg).getReg(0);
+      PackedRegs.push_back(Reg);
+      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
+      return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
+    }
+
+    if (StoreVT.getNumElements() == 3) {
+      SmallVector<Register, 4> PackedRegs;
+      auto Unmerge = B.buildUnmerge(S16, Reg);
+      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+        PackedRegs.push_back(Unmerge.getReg(I));
+      PackedRegs.resize(8, B.buildUndef(S16).getReg(0));
+      Reg = B.buildBuildVector(LLT::vector(8, S16), PackedRegs).getReg(0);
+      return B.buildBitcast(LLT::vector(4, S32), Reg).getReg(0);
+    }
+
+    if (StoreVT.getNumElements() == 4) {
+      SmallVector<Register, 4> PackedRegs;
+      Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
+      auto Unmerge = B.buildUnmerge(S32, Reg);
+      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+        PackedRegs.push_back(Unmerge.getReg(I));
+      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
+      return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
+    }
+
+    llvm_unreachable("invalid data type");
+  }
+
+  return Reg;
 }
 
 Register AMDGPULegalizerInfo::fixStoreSourceType(
@@ -4215,7 +4249,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
       if (!Ty.isVector() || Ty.getElementType() != S16)
         return true;
 
-      Register RepackedReg = handleD16VData(B, *MRI, VData);
+      Register RepackedReg = handleD16VData(B, *MRI, VData, true);
       if (RepackedReg != VData) {
         MI.getOperand(1).setReg(RepackedReg);
       }
@@ -146,7 +146,7 @@ public:
   splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
 
   Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
-                          Register Reg) const;
+                          Register Reg, bool ImageStore = false) const;
   bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
                               MachineIRBuilder &B, bool IsFormat) const;
   bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -271,6 +271,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasNSAtoVMEMBug(false),
     HasOffset3fBug(false),
     HasFlatSegmentOffsetBug(false),
+    HasImageStoreD16Bug(false),
+    HasImageGather4D16Bug(false),
 
     FeatureDisable(false),
     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
@@ -411,6 +411,8 @@ protected:
   bool HasNSAtoVMEMBug;
   bool HasOffset3fBug;
   bool HasFlatSegmentOffsetBug;
+  bool HasImageStoreD16Bug;
+  bool HasImageGather4D16Bug;
 
   // Dummy feature to use for assembler in tablegen.
   bool FeatureDisable;
@@ -1025,9 +1027,11 @@ public:
     return HasOffset3fBug;
   }
 
-  bool hasNSAEncoding() const {
-    return HasNSAEncoding;
-  }
+  bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
+
+  bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
+
+  bool hasNSAEncoding() const { return HasNSAEncoding; }
 
   bool hasGFX10_BEncoding() const {
     return GFX10_BEncoding;
@@ -5851,7 +5851,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
   SDValue Data(Result, 0);
   SDValue TexFail;
 
-  if (IsTexFail) {
+  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
     SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
     if (MaskPopVT.isVector()) {
       Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
@@ -5860,10 +5860,6 @@ static SDValue constructRetValue(SelectionDAG &DAG,
       Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
                          SDValue(Result, 0), ZeroIdx);
     }
-
-    TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
-                          SDValue(Result, 0),
-                          DAG.getConstant(MaskPopDwords, DL, MVT::i32));
   }
 
   if (DataDwordVT.isVector())
@@ -5887,8 +5883,13 @@ static SDValue constructRetValue(SelectionDAG &DAG,
   }
   Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
 
-  if (TexFail)
+  if (IsTexFail) {
+    TexFail =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
+                    DAG.getConstant(MaskPopDwords, DL, MVT::i32));
+
     return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
+  }
 
   if (Result->getNumValues() == 1)
     return Data;
@@ -6007,7 +6008,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
         return Op; // D16 is unsupported for this instruction
 
       IsD16 = true;
-      VData = handleD16VData(VData, DAG);
+      VData = handleD16VData(VData, DAG, true);
     }
 
     NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
@@ -6027,7 +6028,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
         (!LoadVT.isVector() && DMaskLanes > 1))
       return Op;
 
-    if (IsD16 && !Subtarget->hasUnpackedD16VMem())
+    // The sq block of gfx8 and gfx9 do not estimate register use correctly
+    // for d16 image_gather4, image_gather4_l, and image_gather4_lz
+    // instructions.
+    if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
+        !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
       NumVDataDwords = (DMaskLanes + 1) / 2;
     else
       NumVDataDwords = DMaskLanes;
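In effect, on the affected subtargets a d16 gather4 keeps NumVDataDwords equal to DMaskLanes (4) instead of halving it, so the destination occupies four registers as the SQ expects. The gather4 test updates at the end of this diff show the difference against gfx10, which has no such bug:

    ; GFX9:  image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16
    ; GFX10: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D d16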
@@ -7401,8 +7406,8 @@ SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
   return NewOp;
 }
 
-SDValue SITargetLowering::handleD16VData(SDValue VData,
-                                         SelectionDAG &DAG) const {
+SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
+                                         bool ImageStore) const {
   EVT StoreVT = VData.getValueType();
 
   // No change for f16 and legal vector D16 types.
@@ -7434,6 +7439,36 @@ SDValue SITargetLowering::handleD16VData(SDValue VData,
     return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
   }
 
+  // The sq block of gfx8.1 does not estimate register use correctly for d16
+  // image store instructions. The data operand is computed as if it were not a
+  // d16 image instruction.
+  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
+    // Bitcast to i16
+    EVT IntStoreVT = StoreVT.changeTypeToInteger();
+    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+    // Decompose into scalars
+    SmallVector<SDValue, 4> Elts;
+    DAG.ExtractVectorElements(IntVData, Elts);
+
+    // Group pairs of i16 into v2i16 and bitcast to i32
+    SmallVector<SDValue, 4> PackedElts;
+    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
+      SDValue Pair =
+          DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
+      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
+      PackedElts.push_back(IntPair);
+    }
+
+    // Pad using UNDEF
+    PackedElts.resize(PackedElts.size() * 2, DAG.getUNDEF(MVT::i32));
+
+    // Build final vector
+    EVT VecVT =
+        EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
+    return DAG.getBuildVector(VecVT, DL, PackedElts);
+  }
+
   assert(isTypeLegal(StoreVT));
   return VData;
 }
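For illustration, a <4 x half> store normally packs into two 32-bit registers on packed subtargets, but with the gfx8.1 workaround the data is padded with undef up to four registers so the SQ's non-d16 register-use estimate is satisfied. The llc tests updated below show exactly this (gfx900 is packed and unaffected):

    ; gfx900: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16
    ; gfx810: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16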
@@ -108,7 +108,8 @@ private:
                               ArrayRef<SDValue> Ops, EVT MemVT,
                               MachineMemOperand *MMO, SelectionDAG &DAG) const;
 
-  SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
+  SDValue handleD16VData(SDValue VData, SelectionDAG &DAG,
+                         bool ImageStore = false) const;
 
   /// Converts \p Op, which must be of floating point type, to the
   /// floating point type \p VT, by either extending or truncating it.
@@ -1,27 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX81 %s
 
 define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
 ; UNPACKED-LABEL: name: image_store_f16
 ; UNPACKED: bb.1 (%ir-block.0):
 ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
 ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
 ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
 ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
 ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
 ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
 ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
 ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
 ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; UNPACKED: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
 ; UNPACKED: S_ENDPGM 0
-; PACKED-LABEL: name: image_store_f16
-; PACKED: bb.1 (%ir-block.0):
-; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
@@ -41,11 +22,67 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
-; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
-; PACKED: S_ENDPGM 0
+; GFX81-LABEL: name: image_store_f16
+; GFX81: bb.1 (%ir-block.0):
+; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
+; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX81: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+; GFX81: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
+; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8")
+; GFX81: S_ENDPGM 0
   call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
 }
 
 define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x half> %in) {
-; PACKED-LABEL: name: image_store_v2f16
-; PACKED: bb.1 (%ir-block.0):
-; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
-; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
-; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
-; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
-; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
-; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
-; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
-; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
-; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
-; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
-; PACKED: S_ENDPGM 0
 ; UNPACKED-LABEL: name: image_store_v2f16
 ; UNPACKED: bb.1 (%ir-block.0):
 ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
@@ -70,59 +107,32 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY11]](s32), [[COPY12]](s32)
 ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
 ; UNPACKED: S_ENDPGM 0
+; GFX81-LABEL: name: image_store_v2f16
+; GFX81: bb.1 (%ir-block.0):
+; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
+; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX81: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+; GFX81: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
+; GFX81: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[DEF]](s32)
+; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8")
+; GFX81: S_ENDPGM 0
   call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
 }
 
 define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) {
 ; UNPACKED-LABEL: name: image_store_v3f16
 ; UNPACKED: bb.1 (%ir-block.0):
 ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
 ; UNPACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
 ; UNPACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
 ; UNPACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
 ; UNPACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
 ; UNPACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
 ; UNPACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
 ; UNPACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
 ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; UNPACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
 ; UNPACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
 ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; UNPACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
 ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
 ; UNPACKED: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
 ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
 ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; UNPACKED: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
 ; UNPACKED: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
 ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; UNPACKED: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
 ; UNPACKED: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
 ; UNPACKED: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
 ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32)
 ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
 ; UNPACKED: S_ENDPGM 0
-; PACKED-LABEL: name: image_store_v3f16
-; PACKED: bb.1 (%ir-block.0):
-; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -165,11 +175,107 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
-; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
-; PACKED: S_ENDPGM 0
+; GFX81-LABEL: name: image_store_v3f16
+; GFX81: bb.1 (%ir-block.0):
+; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX81: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+; GFX81: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+; GFX81: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+; GFX81: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
+; GFX81: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
+; GFX81: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96)
+; GFX81: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+; GFX81: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+; GFX81: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+; GFX81: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+; GFX81: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
+; GFX81: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]]
+; GFX81: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+; GFX81: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]]
+; GFX81: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+; GFX81: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+; GFX81: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+; GFX81: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32)
+; GFX81: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C1]]
+; GFX81: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+; GFX81: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)
+; GFX81: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
+; GFX81: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
+; GFX81: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]]
+; GFX81: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
+; GFX81: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR2]](s32)
+; GFX81: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
+; GFX81: [[CONCAT_VECTORS1:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
+; GFX81: [[BITCAST5:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<8 x s16>)
+; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST5]](<4 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8)
+; GFX81: S_ENDPGM 0
   call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
 }
 
 define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) {
-; PACKED-LABEL: name: image_store_v4f16
-; PACKED: bb.1 (%ir-block.0):
-; PACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
-; PACKED: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
-; PACKED: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
-; PACKED: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
-; PACKED: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
-; PACKED: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
-; PACKED: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
-; PACKED: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
-; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
-; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-; PACKED: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-; PACKED: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
-; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
-; PACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
-; PACKED: S_ENDPGM 0
 ; UNPACKED-LABEL: name: image_store_v4f16
 ; UNPACKED: bb.1 (%ir-block.0):
 ; UNPACKED: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -199,26 +305,30 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; UNPACKED: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)
 ; UNPACKED: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
 ; UNPACKED: S_ENDPGM 0
+; GFX81-LABEL: name: image_store_v4f16
+; GFX81: bb.1 (%ir-block.0):
+; GFX81: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+; GFX81: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+; GFX81: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+; GFX81: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+; GFX81: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
+; GFX81: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
+; GFX81: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
+; GFX81: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
+; GFX81: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+; GFX81: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
+; GFX81: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+; GFX81: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+; GFX81: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+; GFX81: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+; GFX81: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
+; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+; GFX81: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
+; GFX81: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+; GFX81: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[DEF]](s32), [[DEF]](s32)
+; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8")
+; GFX81: S_ENDPGM 0
   call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
 }
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -o - %s | FileCheck -check-prefix=UNPACKED %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=GFX81 %s
 
 define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) {
 ; UNPACKED-LABEL: image_store_f16:
@@ -13,7 +13,7 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
 ; UNPACKED-NEXT: s_mov_b32 s5, s7
 ; UNPACKED-NEXT: s_mov_b32 s6, s8
 ; UNPACKED-NEXT: s_mov_b32 s7, s9
-; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm
+; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16
 ; UNPACKED-NEXT: s_endpgm
 ;
 ; PACKED-LABEL: image_store_f16:
@@ -26,8 +26,21 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
 ; PACKED-NEXT: s_mov_b32 s5, s7
 ; PACKED-NEXT: s_mov_b32 s6, s8
 ; PACKED-NEXT: s_mov_b32 s7, s9
-; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm
+; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16
 ; PACKED-NEXT: s_endpgm
 ;
+; GFX81-LABEL: image_store_f16:
+; GFX81: ; %bb.0:
+; GFX81-NEXT: s_mov_b32 s0, s2
+; GFX81-NEXT: s_mov_b32 s1, s3
+; GFX81-NEXT: s_mov_b32 s2, s4
+; GFX81-NEXT: s_mov_b32 s3, s5
+; GFX81-NEXT: s_mov_b32 s4, s6
+; GFX81-NEXT: s_mov_b32 s5, s7
+; GFX81-NEXT: s_mov_b32 s6, s8
+; GFX81-NEXT: s_mov_b32 s7, s9
+; GFX81-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16
+; GFX81-NEXT: s_endpgm
   call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
 }
@@ -44,7 +57,7 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; UNPACKED-NEXT: s_mov_b32 s6, s8
 ; UNPACKED-NEXT: s_mov_b32 s7, s9
 ; UNPACKED-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm
+; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16
 ; UNPACKED-NEXT: s_endpgm
 ;
 ; PACKED-LABEL: image_store_v2f16:
@@ -57,8 +70,21 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; PACKED-NEXT: s_mov_b32 s5, s7
 ; PACKED-NEXT: s_mov_b32 s6, s8
 ; PACKED-NEXT: s_mov_b32 s7, s9
-; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm
+; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16
 ; PACKED-NEXT: s_endpgm
 ;
+; GFX81-LABEL: image_store_v2f16:
+; GFX81: ; %bb.0:
+; GFX81-NEXT: s_mov_b32 s0, s2
+; GFX81-NEXT: s_mov_b32 s1, s3
+; GFX81-NEXT: s_mov_b32 s2, s4
+; GFX81-NEXT: s_mov_b32 s3, s5
+; GFX81-NEXT: s_mov_b32 s4, s6
+; GFX81-NEXT: s_mov_b32 s5, s7
+; GFX81-NEXT: s_mov_b32 s6, s8
+; GFX81-NEXT: s_mov_b32 s7, s9
+; GFX81-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16
+; GFX81-NEXT: s_endpgm
   call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
 }
@@ -85,7 +111,7 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; UNPACKED-NEXT: v_mov_b32_e32 v5, v0
 ; UNPACKED-NEXT: v_lshrrev_b32_e32 v2, 16, v1
 ; UNPACKED-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm
+; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm d16
 ; UNPACKED-NEXT: s_endpgm
 ;
 ; PACKED-LABEL: image_store_v4f16:
@@ -98,8 +124,21 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
 ; PACKED-NEXT: s_mov_b32 s5, s7
 ; PACKED-NEXT: s_mov_b32 s6, s8
 ; PACKED-NEXT: s_mov_b32 s7, s9
-; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm
+; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16
 ; PACKED-NEXT: s_endpgm
 ;
+; GFX81-LABEL: image_store_v4f16:
+; GFX81: ; %bb.0:
+; GFX81-NEXT: s_mov_b32 s0, s2
+; GFX81-NEXT: s_mov_b32 s1, s3
+; GFX81-NEXT: s_mov_b32 s2, s4
+; GFX81-NEXT: s_mov_b32 s3, s5
+; GFX81-NEXT: s_mov_b32 s4, s6
+; GFX81-NEXT: s_mov_b32 s5, s7
+; GFX81-NEXT: s_mov_b32 s6, s8
+; GFX81-NEXT: s_mov_b32 s7, s9
+; GFX81-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16
+; GFX81-NEXT: s_endpgm
   call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
 }
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
-; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
 
@@ -15,6 +15,7 @@ main_body:
 ; GCN-LABEL: {{^}}image_load_v2f16:
 ; UNPACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
 ; PACKED: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
+; GFX81: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
 ; GFX10: image_load v0, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
 define amdgpu_ps float @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
 main_body:
@@ -38,6 +39,7 @@ main_body:
 ; GCN-LABEL: {{^}}image_load_v4f16:
 ; UNPACKED: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
 ; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
+; GFX81: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
 ; GFX10: image_load v[0:1], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}
 define amdgpu_ps <2 x float> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
 main_body:
@@ -49,6 +51,7 @@ main_body:
 ; GCN-LABEL: {{^}}image_load_mip_v4f16:
 ; UNPACKED: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm d16{{$}}
 ; PACKED: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf unorm d16{{$}}
+; GFX81: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf unorm d16{{$}}
 ; GFX10: image_load_mip v[0:1], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}
 define amdgpu_ps <2 x float> @image_load_mip_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
 main_body:
@@ -60,6 +63,7 @@ main_body:
 ; GCN-LABEL: {{^}}image_load_3d_v2f16:
 ; UNPACKED: image_load v[0:1], v[0:2], s[0:7] dmask:0x3 unorm d16{{$}}
 ; PACKED: image_load v0, v[0:2], s[0:7] dmask:0x3 unorm d16{{$}}
+; GFX81: image_load v0, v[0:2], s[0:7] dmask:0x3 unorm d16{{$}}
 ; GFX10: image_load v0, v[0:2], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm d16{{$}}
 define amdgpu_ps float @image_load_3d_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
 main_body:
@@ -90,6 +94,7 @@ main_body:
 ; UNPACKED: v_and_b32_e32
 ; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
 ; PACKED: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
+; GFX81: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
 ; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
 define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %in) {
 main_body:
@@ -113,6 +118,7 @@ main_body:
 ; UNPACKED: v_and_b32_e32
 ; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
 ; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
+; GFX81: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
 ; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}
 define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {
 main_body:
@@ -128,6 +134,7 @@ main_body:
 ; UNPACKED: v_and_b32_e32
 ; UNPACKED: image_store_mip v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
 ; PACKED: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
+; GFX81: image_store_mip v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
 ; GFX10: image_store_mip v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm d16{{$}}
 define amdgpu_ps void @image_store_mip_1d_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %mip, <2 x float> %in) {
 main_body:
@@ -1,11 +1,13 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
-; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s
-; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,GFX89 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s
 
 ; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16:
 ; UNPACKED: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
-; PACKED: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
+; GFX810: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
+; GFX9: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
 ; GFX10: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D d16{{$}}
 define amdgpu_ps <2 x float> @image_gather4_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
 main_body: