AMDGPU/GlobalISel: Adjust image load register type based on dmask

Trim elements that won't be written. The equivalent still needs to be
done for writes. Also start widening 3 elements to 4
elements. Selection will get the count from the dmask.
Matt Arsenault 2020-01-28 09:05:11 -05:00 committed by Matt Arsenault
parent 83ffbf2618
commit d9a012ed8a
4 changed files with 1983 additions and 146 deletions
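As a rough illustration of the bookkeeping this change introduces (a standalone sketch, not the LLVM code; the helper name and plain-integer model are invented for illustration): the number of result registers is the popcount of the dmask (gather4 always produces four components), a dmask of 0 still yields one register, and the loaded data is rounded up to whole dwords, plus one extra status dword when TFE is enabled. This models the packed-D16 path; unpacked-D16 hardware instead uses one s32 register per component.

#include <bitset>
#include <cstdio>

// Illustrative model of the dmask accounting in legalizeImageIntrinsic.
struct LoadShape {
  unsigned DataLanes;   // components actually loaded by the instruction
  unsigned LoadBits;    // dword-rounded size of the loaded data
  unsigned TotalDwords; // including the extra TFE status dword, if any
};

static LoadShape shapeForDMask(unsigned DMask, unsigned EltBits,
                               bool IsGather4, bool IsTFE) {
  // Gather4 always returns four components; otherwise count the dmask bits.
  unsigned Lanes = IsGather4 ? 4 : std::bitset<4>(DMask).count();
  if (Lanes == 0)
    Lanes = 1; // a dmask of 0 still produces one register
  unsigned RoundedBits = ((Lanes * EltBits + 31) / 32) * 32;
  return {Lanes, RoundedBits, RoundedBits / 32 + (IsTFE ? 1u : 0u)};
}

int main() {
  // <4 x float> load with dmask=0b0001: only one dword is actually loaded.
  LoadShape A = shapeForDMask(0x1, 32, false, false);
  std::printf("lanes=%u bits=%u dwords=%u\n", A.DataLanes, A.LoadBits, A.TotalDwords);
  // Packed d16 load with dmask=0b0111: <3 x s16> of data rounds up to 64 bits.
  LoadShape B = shapeForDMask(0x7, 16, false, false);
  std::printf("lanes=%u bits=%u dwords=%u\n", B.DataLanes, B.LoadBits, B.TotalDwords);
  return 0;
}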


@@ -3374,69 +3374,6 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
return true;
}
// Produce a vector of s16 elements from s32 pieces.
static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
ArrayRef<Register> UnmergeParts) {
const LLT S16 = LLT::scalar(16);
SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
B.buildBuildVector(DstReg, RemergeParts);
}
/// Convert a set of s32 registers to a result vector with s16 elements.
static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
ArrayRef<Register> UnmergeParts) {
MachineRegisterInfo &MRI = *B.getMRI();
const LLT V2S16 = LLT::vector(2, 16);
LLT TargetTy = MRI.getType(DstReg);
int NumElts = UnmergeParts.size();
if (NumElts == 1) {
assert(TargetTy == V2S16);
B.buildBitcast(DstReg, UnmergeParts[0]);
return;
}
SmallVector<Register, 4> RemergeParts(NumElts);
for (int I = 0; I != NumElts; ++I)
RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
if (TargetTy.getSizeInBits() == 32u * NumElts) {
B.buildConcatVectors(DstReg, RemergeParts);
return;
}
const LLT V3S16 = LLT::vector(3, 16);
const LLT V6S16 = LLT::vector(6, 16);
// Widen to v6s16 and unpack v3 parts.
assert(TargetTy == V3S16);
RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
}
// FIXME: Just vector trunc should be sufficient, but legalization is currently
// broken.
static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
Register WideDstReg) {
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
auto Unmerge = B.buildUnmerge(S32, WideDstReg);
int NumOps = Unmerge->getNumOperands() - 1;
SmallVector<Register, 4> RemergeParts(NumOps);
for (int I = 0; I != NumOps; ++I)
RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
B.buildBuildVector(DstReg, RemergeParts);
}
/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
/// vector with s16 typed elements.
static void packImageA16AddressToDwords(MachineIRBuilder &B,
@@ -3493,14 +3430,18 @@ static int getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
return BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
}
static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
int NumDefs) {
assert(!BaseOpcode->Atomic);
return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
}
/// Return first address operand index in an image intrinsic.
static int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
int NumDefs) {
if (BaseOpcode->Atomic)
return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
int DMaskIdx = NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
return DMaskIdx + 1;
return getDMaskIdx(BaseOpcode, NumDefs) + 1;
}
/// Rewrite image intrinsics to use register layouts expected by the subtarget.
@@ -3544,6 +3485,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MachineRegisterInfo *MRI = B.getMRI();
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
const LLT V2S16 = LLT::vector(2, 16);
// Index of first address argument
const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
@@ -3603,10 +3545,15 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
convertImageAddrToPacked(B, MI, DimIdx, NumVAddrs);
}
if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
return true;
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
const int DMaskIdx = getDMaskIdx(BaseOpcode, NumDefs);
unsigned DMask = MI.getOperand(DMaskIdx).getImm();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
}
if (BaseOpcode->Store) { // No TFE for stores?
// TODO: Handle dmask trim
Register VData = MI.getOperand(1).getReg();
LLT Ty = MRI->getType(VData);
if (!Ty.isVector() || Ty.getElementType() != S16)
@@ -3626,91 +3573,162 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
LLT Ty = MRI->getType(DstReg);
const LLT EltTy = Ty.getScalarType();
const bool IsD16 = Ty.getScalarType() == S16;
const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
// Confirm that the return type is large enough for the dmask specified
if (NumElts < DMaskLanes)
return false;
if (NumElts > 4 || DMaskLanes > 4)
return false;
const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
// The raw dword aligned data component of the load. The only legal cases
// where this matters should be when using the packed D16 format, for
// s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
LLT RoundedTy;
// S32 vector to cover all data, plus TFE result element.
LLT TFETy;
// Register type to use for each loaded component. Will be S32 or V2S16.
LLT RegTy;
if (IsD16 && ST.hasUnpackedD16VMem()) {
RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
TFETy = LLT::vector(AdjustedNumElts + 1, 32);
RegTy = S32;
} else {
unsigned EltSize = EltTy.getSizeInBits();
unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
unsigned RoundedSize = 32 * RoundedElts;
RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
}
// The return type does not need adjustment.
// TODO: Should we change s16 case to s32 or <2 x s16>?
if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
return true;
Register Dst1Reg;
// Insert after the instruction.
B.setInsertPt(*MI.getParent(), ++MI.getIterator());
// TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
// s16> instead of s32, we would only need 1 bitcast instead of multiple.
const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
MI.getOperand(0).setReg(NewResultReg);
// In the IR, TFE is supposed to be used with a 2 element struct return
// type. The instruction really returns these two values in one contiguous
// register, with one additional dword beyond the loaded data. Rewrite the
// return type to use a single register result.
if (IsTFE) {
// In the IR, TFE is supposed to be used with a 2 element struct return
// type. The instruction really returns these two values in one contiguous
// register, with one additional dword beyond the loaded data. Rewrite the
// return type to use a single register result.
Register Dst1Reg = MI.getOperand(1).getReg();
Dst1Reg = MI.getOperand(1).getReg();
if (MRI->getType(Dst1Reg) != S32)
return false;
// TODO: Make sure the TFE operand bit is set.
// The raw dword aligned data component of the load. The only legal cases
// where this matters should be when using the packed D16 format, for
// s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
LLT RoundedTy;
LLT TFETy;
if (IsD16 && ST.hasUnpackedD16VMem()) {
RoundedTy = LLT::scalarOrVector(NumElts, 32);
TFETy = LLT::vector(NumElts + 1, 32);
} else {
unsigned EltSize = Ty.getScalarSizeInBits();
unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
unsigned RoundedSize = 32 * RoundedElts;
RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
}
Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
MI.getOperand(0).setReg(TFEReg);
MI.RemoveOperand(1);
// Insert after the instruction.
B.setInsertPt(*MI.getParent(), ++MI.getIterator());
// Now figure out how to copy the new result register back into the old
// result.
SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
int NumDataElts = TFETy.getNumElements() - 1;
if (!Ty.isVector()) {
// Simplest case is a trivial unmerge (plus a truncate for d16).
UnmergeResults[0] = Ty == S32 ?
DstReg : MRI->createGenericVirtualRegister(S32);
B.buildUnmerge(UnmergeResults, TFEReg);
if (Ty != S32)
B.buildTrunc(DstReg, UnmergeResults[0]);
// Handle the easy case that requires no repack instructions.
if (Ty == S32) {
B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
return true;
}
}
// Now figure out how to copy the new result register back into the old
// result.
SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
if (ResultNumRegs == 1) {
assert(!IsTFE);
ResultRegs[0] = NewResultReg;
} else {
// We have to repack into a new vector of some kind.
for (int I = 0; I != NumDataElts; ++I)
UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
B.buildUnmerge(UnmergeResults, TFEReg);
for (int I = 0; I != NumDataRegs; ++I)
ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
B.buildUnmerge(ResultRegs, NewResultReg);
// Drop the final TFE element.
ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
if (EltTy == S32)
B.buildBuildVector(DstReg, DataPart);
else if (ST.hasUnpackedD16VMem())
truncToS16Vector(B, DstReg, DataPart);
else
bitcastToS16Vector(B, DstReg, DataPart);
// Drop the final TFE element to get the data part. The TFE result is
// directly written to the right place already.
if (IsTFE)
ResultRegs.resize(NumDataRegs);
}
// For an s16 scalar result, we form an s32 result with a truncate regardless
// of packed vs. unpacked.
if (IsD16 && !Ty.isVector()) {
B.buildTrunc(DstReg, ResultRegs[0]);
return true;
}
// Must be an image load.
if (!ST.hasUnpackedD16VMem() || !Ty.isVector() || Ty.getElementType() != S16)
// Avoid a build/concat_vector of 1 entry.
if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
B.buildBitcast(DstReg, ResultRegs[0]);
return true;
}
B.setInsertPt(*MI.getParent(), ++MI.getIterator());
assert(Ty.isVector());
LLT WidenedTy = Ty.changeElementType(S32);
Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
if (IsD16) {
// For packed D16 results with TFE enabled, all the data components are
// S32. Cast back to the expected type.
//
// TODO: We don't really need to load s32 elements. We would only need one
// cast for the TFE result if a multiple of v2s16 was used.
if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
for (Register &Reg : ResultRegs)
Reg = B.buildBitcast(V2S16, Reg).getReg(0);
} else if (ST.hasUnpackedD16VMem()) {
for (Register &Reg : ResultRegs)
Reg = B.buildTrunc(S16, Reg).getReg(0);
}
}
MI.getOperand(0).setReg(WideDstReg);
auto padWithUndef = [&](LLT Ty, int NumElts) {
if (NumElts == 0)
return;
Register Undef = B.buildUndef(Ty).getReg(0);
for (int I = 0; I != NumElts; ++I)
ResultRegs.push_back(Undef);
};
repackUnpackedD16Load(B, DstReg, WideDstReg);
// Pad out any elements eliminated due to the dmask.
LLT ResTy = MRI->getType(ResultRegs[0]);
if (!ResTy.isVector()) {
padWithUndef(ResTy, NumElts - ResultRegs.size());
B.buildBuildVector(DstReg, ResultRegs);
return true;
}
assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
// Deal with the one annoying legal case.
const LLT V3S16 = LLT::vector(3, 16);
if (Ty == V3S16) {
padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
return true;
}
padWithUndef(ResTy, RegsToCover - ResultRegs.size());
B.buildConcatVectors(DstReg, ResultRegs);
return true;
}
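The test diffs below show the padding behavior directly; as a small standalone model of that final step (an assumed simplification, not the LLVM builder API): after the dmask trim, the legalizer rebuilds the vector the IR expects by appending implicit-def ("undef") registers until the original element count is reached, with the lone <3 x s16> case instead widened to <6 x s16> and split back.

#include <cstdio>
#include <string>
#include <vector>

// Model registers by name; "undef" stands in for a G_IMPLICIT_DEF result that
// fills the lanes removed by the dmask trim.
static std::vector<std::string> padResult(std::vector<std::string> Regs,
                                          unsigned WantedElts) {
  while (Regs.size() < WantedElts)
    Regs.push_back("undef");
  return Regs;
}

int main() {
  // A v4f32 load with dmask=0b0001 keeps one loaded dword and pads the other
  // three lanes, matching the image_load_v4f32_dmask_1000 check lines below.
  for (const std::string &R : padResult({"%load0"}, 4))
    std::printf("%s ", R.c_str());
  std::printf("\n");
  return 0;
}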


@@ -2906,12 +2906,12 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX9: $vgpr0 = COPY [[UV]](s32)
; GFX9: $vgpr1 = COPY [[UV1]](s32)
; GFX9: $vgpr2 = COPY [[UV2]](s32)
; GFX9: $vgpr3 = COPY [[UV3]](s32)
; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX9: $vgpr1 = COPY [[DEF]](s32)
; GFX9: $vgpr2 = COPY [[DEF]](s32)
; GFX9: $vgpr3 = COPY [[DEF]](s32)
; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
; GFX10NSA-LABEL: name: getresinfo_dmask0
; GFX10NSA: bb.1.main_body:
@@ -2928,12 +2928,12 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa
; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GFX10NSA: $vgpr0 = COPY [[UV]](s32)
; GFX10NSA: $vgpr1 = COPY [[UV1]](s32)
; GFX10NSA: $vgpr2 = COPY [[UV2]](s32)
; GFX10NSA: $vgpr3 = COPY [[UV3]](s32)
; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0
; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10NSA: $vgpr1 = COPY [[DEF]](s32)
; GFX10NSA: $vgpr2 = COPY [[DEF]](s32)
; GFX10NSA: $vgpr3 = COPY [[DEF]](s32)
; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
main_body:
%mip = extractelement <2 x i16> %coords, i32 0


@@ -224,6 +224,587 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32(<8 x i32> inreg %rsrc, i32 %s
ret <4 x float> %tex
}
define amdgpu_ps float @image_load_f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
%tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret float %tex
}
define amdgpu_ps <2 x float> @image_load_v2f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v2f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <2 x float> %tex
}
define amdgpu_ps <2 x float> @image_load_v2f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v2f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <2 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v3f32_dmask_1100
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v3f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_v3f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v3f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1110(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v4f32_dmask_1110
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[UV2]](s32)
; GCN: $vgpr3 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v4f32_dmask_1100
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: $vgpr3 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v4f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: $vgpr3 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_v4f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_v4f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GCN: $vgpr1 = COPY [[DEF]](s32)
; GCN: $vgpr2 = COPY [[DEF]](s32)
; GCN: $vgpr3 = COPY [[DEF]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
}
define amdgpu_ps float @image_load_tfe_f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
%res = call { float, i32 } @llvm.amdgcn.image.load.2d.sl_f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { float, i32 } %res, 0
%tfe = extractvalue { float, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret float %tex
}
define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v2f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <2 x float>, i32 } %res, 0
%tfe = extractvalue { <2 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <2 x float> %tex
}
define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v2f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%res = call { <2 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v2f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <2 x float>, i32 } %res, 0
%tfe = extractvalue { <2 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <2 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v3f32_dmask_1100
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <3 x float>, i32 } %res, 0
%tfe = extractvalue { <3 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <3 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v3f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <3 x float>, i32 } %res, 0
%tfe = extractvalue { <3 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <3 x float> %tex
}
define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v3f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16)
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = call { <3 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v3f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <3 x float>, i32 } %res, 0
%tfe = extractvalue { <3 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <3 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1110(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v4f32_dmask_1110
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV3]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[UV2]](s32)
; GCN: $vgpr3 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
%tfe = extractvalue { <4 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1100(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v4f32_dmask_1100
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV2]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[UV1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: $vgpr3 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
%tfe = extractvalue { <4 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v4f32_dmask_1000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: $vgpr3 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
%tfe = extractvalue { <4 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <4 x float> %tex
}
define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
; GCN-LABEL: name: image_load_tfe_v4f32_dmask_0000
; GCN: bb.1 (%ir-block.0):
; GCN: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8")
; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; GCN: $vgpr0 = COPY [[UV]](s32)
; GCN: $vgpr1 = COPY [[DEF1]](s32)
; GCN: $vgpr2 = COPY [[DEF1]](s32)
; GCN: $vgpr3 = COPY [[DEF1]](s32)
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
%tex = extractvalue { <4 x float>, i32 } %res, 0
%tfe = extractvalue { <4 x float>, i32 } %res, 1
store i32 %tfe, i32 addrspace(1)* undef
ret <4 x float> %tex
}
declare float @llvm.amdgcn.image.load.2d.f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0