[AMDGPU] Add support for TFE/LWE in image intrinsics. 2nd try

TFE and LWE support requires extra result registers that are written in the
event of a failure in order to detect that failure case.
The specific use-case that initiated these changes is sparse texture support.

This means that if image intrinsics are used with either option turned on, the
programmer must ensure that the return type can contain all of the expected
results. This can result in redundant registers since the vector size must be a
power-of-2.

This change takes roughly 6 parts:
1. Modify the instruction defs in tablegen to add new instruction variants that
can accomodate the extra return values.
2. Updates to lowerImage in SIISelLowering.cpp to accomodate setting TFE or LWE
(where the bulk of the work for these instruction types is now done)
3. Extra verification code to catch cases where intrinsics have been used but
insufficient return registers are used.
4. Modification to the adjustWritemask optimisation to account for TFE/LWE being
enabled (requires extra registers to be maintained for error return value).
5. An extra pass to zero initialize the error value return - this is because if
the error does not occur, the register is not written and thus must be zeroed
before use. Also added a new (on by default) option to ensure ALL return values
are zero-initialized that is required for sparse texture support.
6. Disable the inst_combine optimization in the presence of tfe/lwe (later TODO
for this to re-enable and handle correctly).

There's an additional fix now to avoid a dmask=0

For an image intrinsic with tfe where all result channels except tfe
were unused, I was getting an image instruction with dmask=0 and only a
single vgpr result for tfe. That is incorrect because the hardware
assumes there is at least one vgpr result, plus the one for tfe.

Fixed by forcing dmask to 1, which gives the desired two vgpr result
with tfe in the second one.

The TFE or LWE result is returned from the intrinsics using an aggregate
type. Look in the test code provided to see how this works, but in essence IR
code to invoke the intrinsic looks as follows:

%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15,
                                      i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1

This re-submit of the change also includes a slight modification in
SIISelLowering.cpp to work-around a compiler bug for the powerpc_le
platform that caused a buildbot failure on a previous submission.

Differential revision: https://reviews.llvm.org/D48826

Change-Id: If222bc03642e76cf98059a6bef5d5bffeda38dda


Work around for ppcle compiler bug

Change-Id: Ie284cf24b2271215be1b9dc95b485fd15000e32b
llvm-svn: 351054
This commit is contained in:
David Stuttard 2019-01-14 11:55:24 +00:00
parent d1986d1b5a
commit f77079f892
20 changed files with 1275 additions and 77 deletions

View File

@ -590,7 +590,7 @@ class AMDGPUDimSampleProfile<string opmod,
AMDGPUDimProps dim,
AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
let IsSample = 1;
let RetTypes = [llvm_anyfloat_ty];
let RetTypes = [llvm_any_ty];
let ExtraAddrArgs = sample.ExtraAddrArgs;
let Gradients = sample.Gradients;
let LodClampMip = sample.LodOrClamp;
@ -683,11 +683,11 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
}
defm int_amdgcn_image_load
: AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_anyfloat_ty], [], [IntrReadMem],
: AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
[SDNPMemOperand]>,
AMDGPUImageDMaskIntrinsic;
defm int_amdgcn_image_load_mip
: AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_anyfloat_ty], [],
: AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
[IntrReadMem], [SDNPMemOperand], 1>,
AMDGPUImageDMaskIntrinsic;

View File

@ -43,6 +43,7 @@ FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIFixupVectorISelPass();
FunctionPass *createSIAddIMGInitPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
@ -158,6 +159,9 @@ extern char &AMDGPUSimplifyLibCallsID;
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
void initializeSIAddIMGInitPass(PassRegistry &);
extern char &SIAddIMGInitID;
void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
extern char &AMDGPUPerfHintAnalysisID;

View File

@ -377,6 +377,16 @@ def FeatureEnableDS128 : SubtargetFeature<"enable-ds128",
"Use ds_{read|write}_b128"
>;
// Sparse texture support requires that all result registers are zeroed when
// PRTStrictNull is set to true. This feature is turned on for all architectures
// but is enabled as a feature in case there are situations where PRTStrictNull
// is disabled by the driver.
def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null",
"EnablePRTStrictNull",
"true",
"Enable zeroing of result registers for sparse texture fetches"
>;
// Unless +-flat-for-global is specified, turn on FlatForGlobal for
// all OS-es on VI and newer hardware to avoid assertion failures due
// to missing ADDR64 variants of MUBUF instructions.

View File

@ -74,6 +74,9 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// We want to be able to turn these off, but making this a subtarget feature
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
//
// Similarly we want enable-prt-strict-null to be on by default and not to
// unset everything else if it is disabled
SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
@ -89,6 +92,8 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += "-fp32-denormals,";
}
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
@ -175,6 +180,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
EnableDS128(false),
EnablePRTStrictNull(false),
DumpCode(false),
FP64(false),

View File

@ -326,6 +326,7 @@ protected:
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
bool EnableDS128;
bool EnablePRTStrictNull;
bool DumpCode;
// Subtarget statically properties set by tablegen
@ -577,6 +578,12 @@ public:
return getGeneration() < AMDGPUSubtarget::GFX9;
}
/// \returns If target requires PRT Struct NULL support (zero result registers
/// for sparse texture support).
bool usePRTStrictNull() const {
return EnablePRTStrictNull;
}
bool hasAutoWaitcntBeforeBarrier() const {
return AutoWaitcntBeforeBarrier;
}

View File

@ -825,6 +825,7 @@ bool GCNPassConfig::addInstSelector() {
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
addPass(createSIFixupVectorISelPass());
addPass(createSIAddIMGInitPass());
return false;
}

View File

@ -93,6 +93,7 @@ add_llvm_target(AMDGPUCodeGen
R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
SIAddIMGInit.cpp
SIAnnotateControlFlow.cpp
SIDebuggerInsertNops.cpp
SIFixSGPRCopies.cpp

View File

@ -29,6 +29,7 @@ class MIMGBaseOpcode {
bit Atomic = 0;
bit AtomicX2 = 0; // (f)cmpswap
bit Sampler = 0;
bit Gather4 = 0;
bits<8> NumExtraArgs = 0;
bit Gradients = 0;
bit Coordinates = 1;
@ -43,7 +44,7 @@ def MIMGBaseOpcode : GenericEnum {
def MIMGBaseOpcodesTable : GenericTable {
let FilterClass = "MIMGBaseOpcode";
let CppTypeName = "MIMGBaseOpcodeInfo";
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4",
"NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
"HasD16"];
GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
@ -179,6 +180,8 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
let VDataDwords = 4 in
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
let VDataDwords = 8 in
defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
}
}
@ -411,6 +414,8 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
let VDataDwords = 8 in
defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
}
}
@ -421,6 +426,7 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
string asm = "image_gather4"#sample.LowerCaseMod> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = 1;
let Gather4 = 1;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@ -429,6 +435,8 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
let VDataDwords = 8 in
defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
}
}

View File

@ -0,0 +1,181 @@
//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Any MIMG instructions that use tfe or lwe require an initialization of the
/// result register that will be written in the case of a memory access failure
/// The required code is also added to tie this init code to the result of the
/// img instruction
///
//===----------------------------------------------------------------------===//
//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "si-img-init"
using namespace llvm;
namespace {
class SIAddIMGInit : public MachineFunctionPass {
public:
static char ID;
public:
SIAddIMGInit() : MachineFunctionPass(ID) {
initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // End anonymous namespace.
INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false)
char SIAddIMGInit::ID = 0;
char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); }
bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *RI = ST.getRegisterInfo();
bool Changed = false;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
++BI) {
MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
auto Opcode = MI.getOpcode();
if (TII->isMIMG(Opcode) && !MI.mayStore()) {
MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
// Check for instructions that don't have tfe or lwe fields
// There shouldn't be any at this point.
assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction");
unsigned TFEVal = TFE->getImm();
unsigned LWEVal = LWE->getImm();
unsigned D16Val = D16 ? D16->getImm() : 0;
if (TFEVal || LWEVal) {
// At least one of TFE or LWE are non-zero
// We have to insert a suitable initialization of the result value and
// tie this to the dest of the image instruction.
const DebugLoc &DL = MI.getDebugLoc();
int DstIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
// Calculate which dword we have to initialize to 0.
MachineOperand *MO_Dmask =
TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
// check that dmask operand is found.
assert(MO_Dmask && "Expected dmask operand in instruction");
unsigned dmask = MO_Dmask->getImm();
// Determine the number of active lanes taking into account the
// Gather4 special case
unsigned ActiveLanes =
TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
// Subreg indices are counted from 1
// When D16 then we want next whole VGPR after write data.
static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected");
bool Packed = !ST.hasUnpackedD16VMem();
unsigned InitIdx =
D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
// Abandon attempt if the dst size isn't large enough
// - this is in fact an error but this is picked up elsewhere and
// reported correctly.
uint32_t DstSize =
RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
if (DstSize < InitIdx)
continue;
// Create a register for the intialization value.
unsigned PrevDst =
MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
unsigned NewDst = 0; // Final initialized value will be in here
// If PRTStrictNull feature is enabled (the default) then initialize
// all the result registers to 0, otherwise just the error indication
// register (VGPRn+1)
unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx;
if (DstSize == 1) {
// In this case we can just initialize the result directly
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst)
.addImm(0);
NewDst = PrevDst;
} else {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
for (; SizeLeft; SizeLeft--, CurrIdx++) {
NewDst =
MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
// Initialize dword
unsigned SubReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
.addImm(0);
// Insert into the super-reg
BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
.addReg(PrevDst)
.addReg(SubReg)
.addImm(CurrIdx);
PrevDst = NewDst;
}
}
// Add as an implicit operand
MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit);
// Tie the just added implicit operand to the dst
MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
Changed = true;
}
}
}
}
return Changed;
}

View File

@ -216,6 +216,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@ -813,6 +814,47 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
static MVT memVTFromAggregate(Type *Ty) {
// Only limited forms of aggregate type currently expected.
assert(Ty->isStructTy() && "Expected struct type");
Type *ElementType = nullptr;
unsigned NumElts;
if (Ty->getContainedType(0)->isVectorTy()) {
VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
ElementType = VecComponent->getElementType();
NumElts = VecComponent->getNumElements();
} else {
ElementType = Ty->getContainedType(0);
NumElts = 1;
}
assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
// Calculate the size of the memVT type from the aggregate
unsigned Pow2Elts = 0;
unsigned ElementSize;
switch (ElementType->getTypeID()) {
default:
llvm_unreachable("Unknown type!");
case Type::IntegerTyID:
ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
break;
case Type::HalfTyID:
ElementSize = 16;
break;
case Type::FloatTyID:
ElementSize = 32;
break;
}
unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
return MVT::getVectorVT(MVT::getVT(ElementType, false),
Pow2Elts);
}
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@ -840,7 +882,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MODereferenceable;
if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.memVT = MVT::getVT(CI.getType(), true);
if (Info.memVT == MVT::Other) {
// Some intrinsics return an aggregate type - special case to work out
// the correct memVT
Info.memVT = memVTFromAggregate(CI.getType());
}
Info.flags |= MachineMemOperand::MOLoad;
} else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
Info.opc = ISD::INTRINSIC_VOID;
@ -4613,6 +4660,109 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
return Value == 0;
}
// Re-construct the required return value for a image load intrinsic.
// This is more complicated due to the optional use TexFailCtrl which means the required
// return type is an aggregate
static SDValue constructRetValue(SelectionDAG &DAG,
MachineSDNode *Result,
ArrayRef<EVT> ResultTypes,
bool IsTexFail, bool Unpacked, bool IsD16,
int DMaskPop, int NumVDataDwords,
const SDLoc &DL, LLVMContext &Context) {
// Determine the required return type. This is the same regardless of IsTexFail flag
EVT ReqRetVT = ResultTypes[0];
EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
: AdjEltVT
: ReqRetVT;
// Extract data part of the result
// Bitcast the result to the same type as the required return type
int NumElts;
if (IsD16 && !Unpacked)
NumElts = NumVDataDwords << 1;
else
NumElts = NumVDataDwords;
EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
: AdjEltVT;
// Special case for v8f16. Rather than add support for this, use v4i32 to
// extract the data elements
bool V8F16Special = false;
if (CastVT == MVT::v8f16) {
CastVT = MVT::v4i32;
DMaskPop >>= 1;
ReqRetNumElts >>= 1;
V8F16Special = true;
AdjVT = MVT::v2i32;
}
SDValue N = SDValue(Result, 0);
SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
// Iterate over the result
SmallVector<SDValue, 4> BVElts;
if (CastVT.isVector()) {
DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
} else {
BVElts.push_back(CastRes);
}
int ExtraElts = ReqRetNumElts - DMaskPop;
while(ExtraElts--)
BVElts.push_back(DAG.getUNDEF(AdjEltVT));
SDValue PreTFCRes;
if (ReqRetNumElts > 1) {
SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
if (IsD16 && Unpacked)
PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
else
PreTFCRes = NewVec;
} else {
PreTFCRes = BVElts[0];
}
if (V8F16Special)
PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
if (!IsTexFail) {
if (Result->getNumValues() > 1)
return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
else
return PreTFCRes;
}
// Extract the TexFail result and insert into aggregate return
SmallVector<SDValue, 1> TFCElt;
DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
}
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
SDValue *LWE, bool &IsTexFail) {
auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
if (!TexFailCtrlConst)
return false;
uint64_t Value = TexFailCtrlConst->getZExtValue();
if (Value) {
IsTexFail = true;
}
SDLoc DL(TexFailCtrlConst);
*TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
Value &= ~(uint64_t)0x1;
*LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
Value &= ~(uint64_t)0x2;
return Value == 0;
}
SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const {
@ -4626,13 +4776,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
bool IsD16 = false;
bool IsA16 = false;
SDValue VData;
int NumVDataDwords;
bool AdjustRetType = false;
unsigned AddrIdx; // Index of first address argument
unsigned DMask;
unsigned DMaskLanes = 0;
if (BaseOpcode->Atomic) {
VData = Op.getOperand(2);
@ -4655,7 +4809,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
AddrIdx = 3;
}
} else {
unsigned DMaskIdx;
unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
if (!DMaskConst)
return Op;
DMask = DMaskConst->getZExtValue();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
if (BaseOpcode->Store) {
VData = Op.getOperand(2);
@ -4671,37 +4830,32 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
DMaskIdx = 3;
} else {
MVT LoadVT = Op.getSimpleValueType();
// Work out the num dwords based on the dmask popcount and underlying type
// and whether packing is supported.
MVT LoadVT = ResultTypes[0].getSimpleVT();
if (LoadVT.getScalarType() == MVT::f16) {
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
!BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction
IsD16 = true;
if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
}
NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
}
// Confirm that the return type is large enough for the dmask specified
if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
(!LoadVT.isVector() && DMaskLanes > 1))
return Op;
auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
if (!DMaskConst)
return Op;
if (IsD16 && !Subtarget->hasUnpackedD16VMem())
NumVDataDwords = (DMaskLanes + 1) / 2;
else
NumVDataDwords = DMaskLanes;
AdjustRetType = true;
}
AddrIdx = DMaskIdx + 1;
DMask = DMaskConst->getZExtValue();
if (!DMask && !BaseOpcode->Store) {
// Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
// store the channels' default values.
SDValue Undef = DAG.getUNDEF(Op.getValueType());
if (isa<MemSDNode>(Op))
return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
return Undef;
}
}
unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
@ -4780,11 +4934,53 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
CtrlIdx = AddrIdx + NumVAddrs + 3;
}
SDValue TFE;
SDValue LWE;
SDValue TexFail = Op.getOperand(CtrlIdx);
auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
if (!TexFailConst || TexFailConst->getZExtValue() != 0)
bool IsTexFail = false;
if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
return Op;
if (IsTexFail) {
if (!DMaskLanes) {
// Expecting to get an error flag since TFC is on - and dmask is 0
// Force dmask to be at least 1 otherwise the instruction will fail
DMask = 0x1;
DMaskLanes = 1;
NumVDataDwords = 1;
}
NumVDataDwords += 1;
AdjustRetType = true;
}
// Has something earlier tagged that the return type needs adjusting
// This happens if the instruction is a load or has set TexFailCtrl flags
if (AdjustRetType) {
// NumVDataDwords reflects the true number of dwords required in the return type
if (DMaskLanes == 0 && !BaseOpcode->Store) {
// This is a no-op load. This can be eliminated
SDValue Undef = DAG.getUNDEF(Op.getValueType());
if (isa<MemSDNode>(Op))
return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
return Undef;
}
// Have to use a power of 2 number of dwords
NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
EVT NewVT = NumVDataDwords > 1 ?
EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
: MVT::f32;
ResultTypes[0] = NewVT;
if (ResultTypes.size() == 3) {
// Original result was aggregate type used for TexFailCtrl results
// The actual instruction returns as a vector type which has now been
// created. Remove the aggregate result.
ResultTypes.erase(&ResultTypes[1]);
}
}
SDValue GLC;
SDValue SLC;
if (BaseOpcode->Atomic) {
@ -4809,8 +5005,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Ops.push_back(SLC);
Ops.push_back(IsA16 && // a16 or r128
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
Ops.push_back(False); // tfe
Ops.push_back(False); // lwe
Ops.push_back(TFE); // tfe
Ops.push_back(LWE); // lwe
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
Ops.push_back(IsD16 ? True : False);
@ -4838,11 +5034,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<SDValue, 1> Elt;
DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
} else if (IsD16 && !BaseOpcode->Store) {
MVT LoadVT = Op.getSimpleValueType();
SDValue Adjusted = adjustLoadValueTypeImpl(
SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
} else if (!BaseOpcode->Store) {
return constructRetValue(DAG, NewNode,
OrigResultTypes, IsTexFail,
Subtarget->hasUnpackedD16VMem(), IsD16,
DMaskLanes, NumVDataDwords, DL,
*DAG.getContext());
}
return SDValue(NewNode, 0);
@ -8753,6 +8950,7 @@ static unsigned SubIdx2Lane(unsigned Idx) {
case AMDGPU::sub1: return 1;
case AMDGPU::sub2: return 2;
case AMDGPU::sub3: return 3;
case AMDGPU::sub4: return 4; // Possible with TFE/LWE
}
}
@ -8766,11 +8964,16 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
return Node; // not implemented for D16
SDNode *Users[4] = { nullptr };
SDNode *Users[5] = { nullptr };
unsigned Lane = 0;
unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
if (OldDmask == 0) {
@ -8778,6 +8981,12 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
return Node;
}
unsigned OldBitsSet = countPopulation(OldDmask);
// Work out which is the TFE/LWE lane if that is enabled.
if (UsesTFC) {
TFCLane = OldBitsSet;
}
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
I != E; ++I) {
@ -8797,28 +9006,49 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// set, etc.
Lane = SubIdx2Lane(I->getConstantOperandVal(1));
// Set which texture component corresponds to the lane.
unsigned Comp;
for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
Comp = countTrailingZeros(Dmask);
Dmask &= ~(1 << Comp);
// Check if the use is for the TFE/LWE generated result at VGPRn+1.
if (UsesTFC && Lane == TFCLane) {
Users[Lane] = *I;
} else {
// Set which texture component corresponds to the lane.
unsigned Comp;
for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
Comp = countTrailingZeros(Dmask);
Dmask &= ~(1 << Comp);
}
// Abort if we have more than one user per component.
if (Users[Lane])
return Node;
Users[Lane] = *I;
NewDmask |= 1 << Comp;
}
// Abort if we have more than one user per component
if (Users[Lane])
return Node;
Users[Lane] = *I;
NewDmask |= 1 << Comp;
}
// Don't allow 0 dmask, as hardware assumes one channel enabled.
bool NoChannels = !NewDmask;
if (NoChannels) {
// If the original dmask has one channel - then nothing to do
if (OldBitsSet == 1)
return Node;
// Use an arbitrary dmask - required for the instruction to work
NewDmask = 1;
}
// Abort if there's no change
if (NewDmask == OldDmask)
return Node;
unsigned BitsSet = countPopulation(NewDmask);
int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
// Check for TFE or LWE - increase the number of channels by one to account
// for the extra return value
// This will need adjustment for D16 if this is also included in
// adjustWriteMask (this function) but at present D16 are excluded.
unsigned NewChannels = BitsSet + UsesTFC;
int NewOpcode =
AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
assert(NewOpcode != -1 &&
NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
"failed to find equivalent MIMG op");
@ -8831,8 +9061,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
MVT ResultVT = BitsSet == 1 ?
SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
MVT ResultVT = NewChannels == 1 ?
SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
NewChannels == 5 ? 8 : NewChannels);
SDVTList NewVTList = HasChain ?
DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
@ -8846,7 +9077,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
}
if (BitsSet == 1) {
if (NewChannels == 1) {
assert(Node->hasNUsesOfValue(1, 0));
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
SDLoc(Node), Users[Lane]->getValueType(0),
@ -8856,19 +9087,24 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
}
// Update the users of the node with the new indices
for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
SDNode *User = Users[i];
if (!User)
continue;
SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
if (!User) {
// Handle the special case of NoChannels. We set NewDmask to 1 above, but
// Users[0] is still nullptr because channel 0 doesn't really have a use.
if (i || !NoChannels)
continue;
} else {
SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
}
switch (Idx) {
default: break;
case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
}
}

View File

@ -2972,6 +2972,42 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
// Verify MIMG
if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
// Ensure that the return type used is large enough for all the options
// being used TFE/LWE require an extra result register.
const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
if (DMask) {
uint64_t DMaskImm = DMask->getImm();
uint32_t RegCount =
isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
// Adjust for packed 16 bit values
if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
RegCount >>= 1;
// Adjust if using LWE or TFE
if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
RegCount += 1;
const uint32_t DstIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
const MachineOperand &Dst = MI.getOperand(DstIdx);
if (Dst.isReg()) {
const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
if (RegCount > DstSize) {
ErrInfo = "MIMG instruction returns too many registers for dst "
"register class";
return false;
}
}
}
}
// Verify VOP*. Ignore multiple sgpr operands on writelane.
if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
&& (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {

View File

@ -184,6 +184,7 @@ struct MIMGBaseOpcodeInfo {
bool Atomic;
bool AtomicX2;
bool Sampler;
bool Gather4;
uint8_t NumExtraArgs;
bool Gradients;

View File

@ -802,7 +802,8 @@ private:
Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
int DmaskIdx = -1);
int DmaskIdx = -1,
int TFCIdx = -1);
Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts, unsigned Depth = 0);

View File

@ -969,11 +969,24 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
int DMaskIdx) {
int DMaskIdx,
int TFCIdx) {
unsigned VWidth = II->getType()->getVectorNumElements();
if (VWidth == 1)
return nullptr;
// Need to change to new instruction format
ConstantInt *TFC = nullptr;
bool TFELWEEnabled = false;
if (TFCIdx > 0) {
TFC = dyn_cast<ConstantInt>(II->getArgOperand(TFCIdx));
TFELWEEnabled = TFC->getZExtValue() & 0x1 // TFE
|| TFC->getZExtValue() & 0x2; // LWE
}
if (TFELWEEnabled)
return nullptr; // TFE not yet supported
ConstantInt *NewDMask = nullptr;
if (DMaskIdx < 0) {
@ -1626,7 +1639,8 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
default: {
if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
return simplifyAMDGCNMemoryIntrinsicDemanded(
II, DemandedElts, 0, II->getNumArgOperands() - 2);
break;
}

View File

@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SIVI,PRT %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SIVI,PRT %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,PRT %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,NOPRT %s
; GCN-LABEL: {{^}}load_1d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
@ -10,6 +11,52 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}load_1d_tfe:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_1d_lwe:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
%v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_2d:
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
@ -18,6 +65,29 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}load_2d_tfe:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_3d:
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
@ -26,6 +96,29 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}load_3d_tfe_lwe:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_cube:
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
@ -34,6 +127,29 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}load_cube_lwe:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_1darray:
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}}
define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
@ -42,6 +158,29 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}load_1darray_tfe:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_2darray:
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
@ -50,6 +189,29 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}load_2darray_lwe:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_2dmsaa:
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
@ -58,6 +220,29 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}load_2dmsaa_both:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_2darraymsaa:
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
@ -66,6 +251,29 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}load_2darraymsaa_tfe:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_mip_1d:
; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
@ -74,6 +282,29 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}load_mip_1d_lwe:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_mip_2d:
; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
@ -82,6 +313,191 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}load_mip_2d_tfe:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v4, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT-NOT: v_mov_b32_e32 v3
; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
; SIVI: buffer_store_dword v4, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; Make sure that error flag is returned even with dmask 0
; GCN-LABEL: {{^}}load_1d_V2_tfe_dmask0:
; GCN: v_mov_b32_e32 v1, 0
; PRT-DAG: v_mov_b32_e32 v2, v1
; PRT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
define amdgpu_ps float @load_1d_V2_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
main_body:
%v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {<2 x float>, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}
; GCN-LABEL: {{^}}load_1d_V1_tfe_dmask0:
; GCN: v_mov_b32_e32 v1, 0
; PRT-DAG: v_mov_b32_e32 v2, v1
; PRT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
define amdgpu_ps float @load_1d_V1_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
main_body:
%v = call {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {float, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}
; GCN-LABEL: {{^}}load_mip_2d_tfe_dmask0:
; GCN: v_mov_b32_e32 v3, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
define amdgpu_ps float @load_mip_2d_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 0, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {<4 x float>, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}
; Do not make dmask 0 even if no result (other than tfe) is used.
; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse:
; GCN: v_mov_b32_e32 v3, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
define amdgpu_ps float @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {<4 x float>, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}
; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse_V2:
; GCN: v_mov_b32_e32 v3, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
define amdgpu_ps float @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
main_body:
%v = call {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 6, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {<2 x float>, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}
; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse_V1:
; GCN: v_mov_b32_e32 v3, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 unorm tfe{{$}}
define amdgpu_ps float @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
main_body:
%v = call {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32 2, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {float, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}
; Check for dmask being materially smaller than return type
; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask3:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v3, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x7 unorm tfe{{$}}
; SIVI: buffer_store_dword v3, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v3
define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask2:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v2, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}}
; SIVI: buffer_store_dword v2, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v2
define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask1:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v1, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}}
; SIVI: buffer_store_dword v1, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1
define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}load_1d_tfe_V2_dmask1:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v1, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}}
; SIVI: buffer_store_dword v1, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1
define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
%v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<2 x float>, i32} %v, 0
%v.err = extractvalue {<2 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <2 x float> %v.vec
}
; GCN-LABEL: {{^}}load_mip_3d:
; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
@ -404,23 +820,37 @@ define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)
store float 0.000000e+00, float addrspace(3)* %lds
%c0 = extractelement <2 x i32> %c, i32 0
%c1 = extractelement <2 x i32> %c, i32 1
%tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 15, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
%tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
%tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
store float 0.000000e+00, float addrspace(3)* %tmp2
ret float %tex
}
declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {float,i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1

View File

@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
; GCN-LABEL: {{^}}load.f16.1d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -10,7 +10,7 @@ main_body:
}
; GCN-LABEL: {{^}}load.v2f16.1d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
; GCN: image_load v0, v0, s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -37,7 +37,7 @@ main_body:
}
; GCN-LABEL: {{^}}load.f16.2d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -47,7 +47,7 @@ main_body:
}
; GCN-LABEL: {{^}}load.v2f16.2d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
; GCN: image_load v0, v0, s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -77,7 +77,7 @@ main_body:
}
; GCN-LABEL: {{^}}load.f16.3d:
; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x1 unorm a16 d16
; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
@ -88,7 +88,7 @@ main_body:
}
; GCN-LABEL: {{^}}load.v2f16.3d:
; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 d16
; GCN: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0

View File

@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
; GCN-LABEL: {{^}}load.f32.1d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -10,7 +10,7 @@ main_body:
}
; GCN-LABEL: {{^}}load.v2f32.1d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -37,7 +37,7 @@ main_body:
}
; GCN-LABEL: {{^}}load.f32.2d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -47,7 +47,7 @@ main_body:
}
; GCN-LABEL: {{^}}load.v2f32.2d:
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -77,7 +77,7 @@ main_body:
}
; GCN-LABEL: {{^}}load.f32.3d:
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x1 unorm a16
; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
@ -88,7 +88,7 @@ main_body:
}
; GCN-LABEL: {{^}}load.v2f32.3d:
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x3 unorm a16
; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0

View File

@ -10,6 +10,19 @@ main_body:
ret half %tex
}
; GCN-LABEL: {{^}}image_sample_2d_f16_tfe:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
; PACKED: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}}
; UNPACKED: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}}
define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) {
main_body:
%tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {half, i32} %tex, 0
%tex.err = extractvalue {half, i32} %tex, 1
store i32 %tex.err, i32 addrspace(1)* %out, align 4
ret half %tex.vec
}
; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16:
; UNPACKED: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
; PACKED: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
@ -20,6 +33,22 @@ main_body:
ret float %r
}
; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16_tfe:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
; UNPACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}}
; PACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}}
define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
main_body:
%tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {<2 x half>, i32} %tex, 0
%tex.err = extractvalue {<2 x half>, i32} %tex, 1
%tex.vecf = bitcast <2 x half> %tex.vec to float
%r.0 = insertelement <2 x float> undef, float %tex.vecf, i32 0
%tex.errf = bitcast i32 %tex.err to float
%r = insertelement <2 x float> %r.0, float %tex.errf, i32 1
ret <2 x float> %r
}
; GCN-LABEL: {{^}}image_sample_b_2d_v4f16:
; UNPACKED: image_sample_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
; PACKED: image_sample_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
@ -30,9 +59,33 @@ main_body:
ret <2 x float> %r
}
; GCN-LABEL: {{^}}image_sample_b_2d_v4f16_tfe:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
; UNPACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}}
; PACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}}
define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
main_body:
%tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {<4 x half>, i32} %tex, 0
%tex.err = extractvalue {<4 x half>, i32} %tex, 1
%tex.vecf = bitcast <4 x half> %tex.vec to <2 x float>
%tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
%tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
%r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
%r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
%tex.errf = bitcast i32 %tex.err to float
%r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
ret <4 x float> %r
}
declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }

View File

@ -9,6 +9,162 @@ main_body:
ret <4 x float> %v
}
; GCN-LABEL: {{^}}sample_1d_tfe:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v2, v0
; GCN: v_mov_b32_e32 v3, v0
; GCN: v_mov_b32_e32 v4, v0
; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_1:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe{{$}}
define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f = extractelement <4 x float> %res.vec, i32 0
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
%res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
ret <2 x float> %res
}
; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_2:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe{{$}}
define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f = extractelement <4 x float> %res.vec, i32 1
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
%res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
ret <2 x float> %res
}
; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_3:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe{{$}}
define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f = extractelement <4 x float> %res.vec, i32 2
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
%res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
ret <2 x float> %res
}
; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_4:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe{{$}}
define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f = extractelement <4 x float> %res.vec, i32 3
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
%res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
ret <2 x float> %res
}
; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_12:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v2, v0
; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe{{$}}
define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f1 = extractelement <4 x float> %res.vec, i32 0
%res.f2 = extractelement <4 x float> %res.vec, i32 1
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
%res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
%res = insertelement <4 x float> %res.tmp2, float %res.errf, i32 2
ret <4 x float> %res
}
; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_24:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v2, v0
; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe{{$}}
define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f1 = extractelement <4 x float> %res.vec, i32 1
%res.f2 = extractelement <4 x float> %res.vec, i32 3
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
%res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
%res = insertelement <4 x float> %res.tmp2, float %res.errf, i32 2
ret <4 x float> %res
}
; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_134:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v2, v0
; GCN: v_mov_b32_e32 v3, v0
; GCN: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe{{$}}
define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f1 = extractelement <4 x float> %res.vec, i32 0
%res.f2 = extractelement <4 x float> %res.vec, i32 2
%res.f3 = extractelement <4 x float> %res.vec, i32 3
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
%res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
%res.tmp3 = insertelement <4 x float> %res.tmp2, float %res.f3, i32 2
%res = insertelement <4 x float> %res.tmp3, float %res.errf, i32 3
ret <4 x float> %res
}
; GCN-LABEL: {{^}}sample_1d_lwe:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v2, v0
; GCN: v_mov_b32_e32 v3, v0
; GCN: v_mov_b32_e32 v4, v0
; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}
; GCN-LABEL: {{^}}sample_2d:
; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}}
define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
@ -361,6 +517,17 @@ main_body:
ret float %v
}
; GCN-LABEL: {{^}}sample_c_d_o_2darray_V1_tfe:
; GCN: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da{{$}}
define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, i32 addrspace(1)* inreg %out) {
main_body:
%v = call {float,i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%v.vec = extractvalue {float, i32} %v, 0
%v.err = extractvalue {float, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret float %v.vec
}
; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2:
; GCN: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da{{$}}
define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
@ -369,6 +536,22 @@ main_body:
ret <2 x float> %v
}
; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2_tfe:
; GCN: image_sample_c_d_o v[9:12], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}}
define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
main_body:
%v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%v.vec = extractvalue {<2 x float>, i32} %v, 0
%v.f1 = extractelement <2 x float> %v.vec, i32 0
%v.f2 = extractelement <2 x float> %v.vec, i32 1
%v.err = extractvalue {<2 x float>, i32} %v, 1
%v.errf = bitcast i32 %v.err to float
%res.0 = insertelement <4 x float> undef, float %v.f1, i32 0
%res.1 = insertelement <4 x float> %res.0, float %v.f2, i32 1
%res.2 = insertelement <4 x float> %res.1, float %v.errf, i32 2
ret <4 x float> %res.2
}
; GCN-LABEL: {{^}}sample_1d_unorm:
; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
@ -491,6 +674,7 @@ main_body:
}
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
@ -542,7 +726,9 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, floa
declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {float, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }

View File

@ -1288,6 +1288,28 @@ define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32(float %vaddr, <8
ret float %elt0
}
; Check that the intrinsic remains unchanged in the presence of TFE or LWE
; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe(
; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
; CHECK: ret float %elt0
define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
%data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
%data.vec = extractvalue {<4 x float>,i32} %data, 0
%elt0 = extractelement <4 x float> %data.vec, i32 0
ret float %elt0
}
; Check that the intrinsic remains unchanged in the presence of TFE or LWE
; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe(
; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
; CHECK: ret float %elt0
define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_lwe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
%data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
%data.vec = extractvalue {<4 x float>,i32} %data, 0
%elt0 = extractelement <4 x float> %data.vec, i32 0
ret float %elt0
}
; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32(
; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
; CHECK-NEXT: ret float %data
@ -1466,6 +1488,7 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_
}
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1