AMDGPU: Fix interaction of tfe and d16

This was using the wrong result register, and dropping the result entirely
for v2f16. This would fail to select on the scalar case. I believe it
was also mishandling packed/unpacked subtargets.
Matt Arsenault 2020-01-17 15:40:15 -05:00 committed by Matt Arsenault
parent 4481eefbe8
commit 9c928649a0
2 changed files with 468 additions and 60 deletions
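
Background on the register accounting involved: with d16, packed subtargets (gfx9/gfx10) return two half elements per 32-bit VGPR, while unpacked subtargets (gfx8) return one element per VGPR with the upper 16 bits unused; with tfe, one extra TexFail status dword follows the data. The following standalone C++ sketch of that dword math (illustrative names, not the LLVM API) matches the new tests below, where a dmask:0xf v4f16 tfe load writes v[1:3] on gfx9 but v[1:5] on gfx8:

// Standalone sketch (illustrative names, not the LLVM API) of the d16/tfe
// dword accounting: how many 32-bit registers hold the data, and where the
// TexFail status dword lands.
#include <cstdio>

// Number of dwords occupied by NumElts half-precision elements.
static int dataDwords(int NumElts, bool IsD16, bool Unpacked) {
  if (!IsD16 || Unpacked)
    return NumElts;            // one element per dword
  return (NumElts + 1) / 2;    // packed d16: two halves per dword
}

int main() {
  // <4 x half> load with tfe on a packed subtarget (gfx9):
  // 2 data dwords, TexFail status in the following dword (index 2).
  int PackedDwords = dataDwords(4, /*IsD16=*/true, /*Unpacked=*/false);
  printf("packed:   %d data dwords, status at index %d\n",
         PackedDwords, PackedDwords);

  // Same load on an unpacked subtarget (gfx8):
  // 4 data dwords, status at index 4.
  int UnpackedDwords = dataDwords(4, /*IsD16=*/true, /*Unpacked=*/true);
  printf("unpacked: %d data dwords, status at index %d\n",
         UnpackedDwords, UnpackedDwords);
  return 0;
}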


@@ -5215,6 +5215,24 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
   return Value == 0;
 }
 
+static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
+                              SDValue Src, int ExtraElts) {
+  EVT SrcVT = Src.getValueType();
+
+  SmallVector<SDValue, 8> Elts;
+
+  if (SrcVT.isVector())
+    DAG.ExtractVectorElements(Src, Elts);
+  else
+    Elts.push_back(Src);
+
+  SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
+  while (ExtraElts--)
+    Elts.push_back(Undef);
+
+  return DAG.getBuildVector(CastVT, DL, Elts);
+}
+
 // Re-construct the required return value for a image load intrinsic.
 // This is more complicated due to the optional use TexFailCtrl which means the required
 // return type is an aggregate
@@ -5226,76 +5244,56 @@ static SDValue constructRetValue(SelectionDAG &DAG,
                                  const SDLoc &DL, LLVMContext &Context) {
   // Determine the required return type. This is the same regardless of IsTexFail flag
   EVT ReqRetVT = ResultTypes[0];
-  EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
   int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
-  EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
-  EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
-                                           : AdjEltVT
-                       : ReqRetVT;
+  int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
+    ReqRetNumElts : (ReqRetNumElts + 1) / 2;
 
-  // Extract data part of the result
-  // Bitcast the result to the same type as the required return type
-  int NumElts;
-  if (IsD16 && !Unpacked)
-    NumElts = NumVDataDwords << 1;
-  else
-    NumElts = NumVDataDwords;
+  int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
+    DMaskPop : (DMaskPop + 1) / 2;
 
-  EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
-                           : AdjEltVT;
+  MVT DataDwordVT = NumDataDwords == 1 ?
+    MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
 
-  // Special case for v6f16. Rather than add support for this, use v3i32 to
-  // extract the data elements
-  bool V6F16Special = false;
-  if (NumElts == 6) {
-    CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
-    DMaskPop >>= 1;
-    ReqRetNumElts >>= 1;
-    V6F16Special = true;
-    AdjVT = MVT::v2i32;
-  }
+  MVT MaskPopVT = MaskPopDwords == 1 ?
+    MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
+
+  SDValue Data(Result, 0);
+  SDValue TexFail;
+
+  if (IsTexFail) {
+    SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
+    if (MaskPopVT.isVector()) {
+      Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
+                         SDValue(Result, 0), ZeroIdx);
+    } else {
+      Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
+                         SDValue(Result, 0), ZeroIdx);
+    }
+
+    TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                          SDValue(Result, 0),
+                          DAG.getConstant(MaskPopDwords, DL, MVT::i32));
+  }
 
-  SDValue N = SDValue(Result, 0);
-  SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
+  if (DataDwordVT.isVector())
+    Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
+                          NumDataDwords - MaskPopDwords);
 
-  // Iterate over the result
-  SmallVector<SDValue, 4> BVElts;
+  if (IsD16)
+    Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
 
-  if (CastVT.isVector()) {
-    DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
-  } else {
-    BVElts.push_back(CastRes);
-  }
-  int ExtraElts = ReqRetNumElts - DMaskPop;
-  while(ExtraElts--)
-    BVElts.push_back(DAG.getUNDEF(AdjEltVT));
+  if (!ReqRetVT.isVector())
+    Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
 
-  SDValue PreTFCRes;
-  if (ReqRetNumElts > 1) {
-    SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
-    if (IsD16 && Unpacked)
-      PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
-    else
-      PreTFCRes = NewVec;
-  } else {
-    PreTFCRes = BVElts[0];
-  }
+  Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);
 
-  if (V6F16Special)
-    PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
+  if (TexFail)
+    return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
 
-  if (!IsTexFail) {
-    if (Result->getNumValues() > 1)
-      return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
-    else
-      return PreTFCRes;
-  }
+  if (Result->getNumValues() == 1)
+    return Data;
 
-  // Extract the TexFail result and insert into aggregate return
-  SmallVector<SDValue, 1> TFCElt;
-  DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
-  SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
-  return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
+  return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
 }
 
 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
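
In the rewritten constructRetValue above, the raw MIMG result is handled as dwords throughout: the first MaskPopDwords are popped off as data, the TexFail status dword is read at index MaskPopDwords, the data is padded out to NumDataDwords, and only then repacked/bitcast to the requested type. A minimal standalone model of that order of operations (plain C++ over a dword array, not SelectionDAG; the values are illustrative):

// Minimal model (not SelectionDAG) of the new constructRetValue order of
// operations for a packed <4 x half> tfe load: pop the data dwords, read the
// trailing TexFail status, then reinterpret the data as the requested type.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // Raw MIMG result for dmask:0xf tfe d16 on a packed subtarget:
  // two data dwords (four packed halves) followed by the status dword.
  const uint32_t Raw[3] = {0x3c003c00u, 0x40004000u, 1u};
  const int MaskPopDwords = 2;           // (4 halves + 1) / 2

  uint32_t Data[2];                      // EXTRACT_SUBVECTOR equivalent
  memcpy(Data, Raw, sizeof(Data));
  uint32_t TexFail = Raw[MaskPopDwords]; // EXTRACT_VECTOR_ELT at index 2

  // On packed subtargets the data dwords already hold <4 x half>, so the
  // final step is just a bitcast. (Unpacked subtargets repack first.)
  uint16_t Halves[4];
  memcpy(Halves, Data, sizeof(Halves));
  printf("halves: %04x %04x %04x %04x, texfail: %u\n",
         Halves[0], Halves[1], Halves[2], Halves[3], TexFail);
  return 0;
}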
@@ -5545,8 +5543,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   }
 
   EVT NewVT = NumVDataDwords > 1 ?
-                  EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
-                : MVT::f32;
+                  EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
+                : MVT::i32;
 
   ResultTypes[0] = NewVT;
   if (ResultTypes.size() == 3) {


@@ -0,0 +1,410 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s
define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-LABEL: load_1d_f16_tfe_dmask0:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b32 s11, s9
; GFX9-NEXT: s_mov_b32 s10, s8
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s7, s5
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v1, off
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_f16_tfe_dmask0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s11, s9
; GFX10-NEXT: s_mov_b32 s10, s8
; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_mov_b32 s8, s6
; GFX10-NEXT: s_mov_b32 s7, s5
; GFX10-NEXT: s_mov_b32 s6, s4
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: v_mov_b32_e32 v2, v1
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v[0:1], v1, off
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0:
; GFX8-UNPACKED: ; %bb.0:
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { half, i32 } %v, 0
%v.err = extractvalue { half, i32 } %v, 1
store volatile half %v.data, half addrspace(1)* undef
store volatile i32 %v.err, i32 addrspace(1)* undef
ret void
}
define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-LABEL: load_1d_f16_tfe_dmask1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b32 s11, s9
; GFX9-NEXT: s_mov_b32 s10, s8
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s7, s5
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v1, off
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_f16_tfe_dmask1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s11, s9
; GFX10-NEXT: s_mov_b32 s10, s8
; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_mov_b32 s8, s6
; GFX10-NEXT: s_mov_b32 s7, s5
; GFX10-NEXT: s_mov_b32 s6, s4
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: v_mov_b32_e32 v2, v1
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v[0:1], v1, off
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1:
; GFX8-UNPACKED: ; %bb.0:
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { half, i32 } %v, 0
%v.err = extractvalue { half, i32 } %v, 1
store volatile half %v.data, half addrspace(1)* undef
store volatile i32 %v.err, i32 addrspace(1)* undef
ret void
}
define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-LABEL: load_1d_v2f16_tfe_dmask0:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b32 s11, s9
; GFX9-NEXT: s_mov_b32 s10, s8
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s7, s5
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_v2f16_tfe_dmask0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s11, s9
; GFX10-NEXT: s_mov_b32 s10, s8
; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_mov_b32 s8, s6
; GFX10-NEXT: s_mov_b32 s7, s5
; GFX10-NEXT: s_mov_b32 s6, s4
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: v_mov_b32_e32 v2, v1
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0:
; GFX8-UNPACKED: ; %bb.0:
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { <2 x half>, i32 } %v, 0
%v.err = extractvalue { <2 x half>, i32 } %v, 1
store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
store volatile i32 %v.err, i32 addrspace(1)* undef
ret void
}
define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-LABEL: load_1d_v2f16_tfe_dmask1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b32 s11, s9
; GFX9-NEXT: s_mov_b32 s10, s8
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s7, s5
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_v2f16_tfe_dmask1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s11, s9
; GFX10-NEXT: s_mov_b32 s10, s8
; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_mov_b32 s8, s6
; GFX10-NEXT: s_mov_b32 s7, s5
; GFX10-NEXT: s_mov_b32 s6, s4
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: v_mov_b32_e32 v2, v1
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1:
; GFX8-UNPACKED: ; %bb.0:
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { <2 x half>, i32 } %v, 0
%v.err = extractvalue { <2 x half>, i32 } %v, 1
store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
store volatile i32 %v.err, i32 addrspace(1)* undef
ret void
}
define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-LABEL: load_1d_v2f16_tfe_dmask3:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b32 s11, s9
; GFX9-NEXT: s_mov_b32 s10, s8
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s7, s5
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_v2f16_tfe_dmask3:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s11, s9
; GFX10-NEXT: s_mov_b32 s10, s8
; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_mov_b32 s8, s6
; GFX10-NEXT: s_mov_b32 s7, s5
; GFX10-NEXT: s_mov_b32 s6, s4
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: v_mov_b32_e32 v2, v1
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3:
; GFX8-UNPACKED: ; %bb.0:
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1
; GFX8-UNPACKED-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x3 unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { <2 x half>, i32 } %v, 0
%v.err = extractvalue { <2 x half>, i32 } %v, 1
store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
store volatile i32 %v.err, i32 addrspace(1)* undef
ret void
}
; define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
; %v.data = extractvalue { <3 x half>, i32 } %v, 0
; %v.err = extractvalue { <3 x half>, i32 } %v, 1
; store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef
; store volatile i32 %v.err, i32 addrspace(1)* undef
; ret void
; }
define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-LABEL: load_1d_v4f16_tfe_dmask15:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b32 s11, s9
; GFX9-NEXT: s_mov_b32 s10, s8
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s7, s5
; GFX9-NEXT: s_mov_b32 s6, s4
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_v4f16_tfe_dmask15:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_mov_b32 s11, s9
; GFX10-NEXT: s_mov_b32 s10, s8
; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_mov_b32 s8, s6
; GFX10-NEXT: s_mov_b32 s7, s5
; GFX10-NEXT: s_mov_b32 s6, s4
; GFX10-NEXT: s_mov_b32 s5, s3
; GFX10-NEXT: s_mov_b32 s4, s2
; GFX10-NEXT: v_mov_b32_e32 v2, v1
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
; GFX10-NEXT: global_store_dword v[0:1], v3, off
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15:
; GFX8-UNPACKED: ; %bb.0:
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v5, v1
; GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2]
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { <4 x half>, i32 } %v, 0
%v.err = extractvalue { <4 x half>, i32 } %v, 1
store volatile <4 x half> %v.data, <4 x half> addrspace(1)* undef
store volatile i32 %v.err, i32 addrspace(1)* undef
ret void
}
declare { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
attributes #0 = { nounwind readonly }