forked from OSchip/llvm-project
AMDGPU: Fix interaction of tfe and d16
This was using the wrong result register and dropping the result entirely for v2f16. It would also fail to select in the scalar case. I believe it was also mishandling packed/unpacked subtargets.
This commit is contained in:
parent
4481eefbe8
commit
9c928649a0
|
@ -5215,6 +5215,24 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
|
|||
return Value == 0;
|
||||
}
|
||||
|
||||
/// Widen \p Src to \p CastVT by appending undef elements.
///
/// The lanes of \p Src are extracted (a scalar \p Src contributes itself as
/// the single lane), \p ExtraElts undef values of the source scalar type are
/// appended, and the list is assembled into a build-vector of type \p CastVT.
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
                              SDValue Src, int ExtraElts) {
  const EVT InVT = Src.getValueType();
  SmallVector<SDValue, 8> Parts;

  // Seed the list with the existing lanes; a scalar is treated as one lane.
  if (!InVT.isVector())
    Parts.push_back(Src);
  else
    DAG.ExtractVectorElements(Src, Parts);

  // Pad the tail with undef lanes of the source scalar type.
  const SDValue Filler = DAG.getUNDEF(InVT.getScalarType());
  for (; ExtraElts != 0; --ExtraElts)
    Parts.push_back(Filler);

  return DAG.getBuildVector(CastVT, DL, Parts);
}
|
||||
|
||||
// Re-construct the required return value for an image load intrinsic.
|
||||
// This is more complicated due to the optional use of TexFailCtrl, which means the required
|
||||
// return type is an aggregate
|
||||
|
@ -5226,76 +5244,56 @@ static SDValue constructRetValue(SelectionDAG &DAG,
|
|||
const SDLoc &DL, LLVMContext &Context) {
|
||||
// Determine the required return type. This is the same regardless of IsTexFail flag
|
||||
EVT ReqRetVT = ResultTypes[0];
|
||||
EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
|
||||
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
|
||||
EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
|
||||
EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
|
||||
: AdjEltVT
|
||||
: ReqRetVT;
|
||||
int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
|
||||
ReqRetNumElts : (ReqRetNumElts + 1) / 2;
|
||||
|
||||
// Extract data part of the result
|
||||
// Bitcast the result to the same type as the required return type
|
||||
int NumElts;
|
||||
if (IsD16 && !Unpacked)
|
||||
NumElts = NumVDataDwords << 1;
|
||||
else
|
||||
NumElts = NumVDataDwords;
|
||||
int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
|
||||
DMaskPop : (DMaskPop + 1) / 2;
|
||||
|
||||
EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
|
||||
: AdjEltVT;
|
||||
MVT DataDwordVT = NumDataDwords == 1 ?
|
||||
MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
|
||||
|
||||
// Special case for v6f16. Rather than add support for this, use v3i32 to
|
||||
// extract the data elements
|
||||
bool V6F16Special = false;
|
||||
if (NumElts == 6) {
|
||||
CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
|
||||
DMaskPop >>= 1;
|
||||
ReqRetNumElts >>= 1;
|
||||
V6F16Special = true;
|
||||
AdjVT = MVT::v2i32;
|
||||
}
|
||||
MVT MaskPopVT = MaskPopDwords == 1 ?
|
||||
MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
|
||||
|
||||
SDValue N = SDValue(Result, 0);
|
||||
SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
|
||||
SDValue Data(Result, 0);
|
||||
SDValue TexFail;
|
||||
|
||||
// Iterate over the result
|
||||
SmallVector<SDValue, 4> BVElts;
|
||||
|
||||
if (CastVT.isVector()) {
|
||||
DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
|
||||
if (IsTexFail) {
|
||||
SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
|
||||
if (MaskPopVT.isVector()) {
|
||||
Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
|
||||
SDValue(Result, 0), ZeroIdx);
|
||||
} else {
|
||||
BVElts.push_back(CastRes);
|
||||
}
|
||||
int ExtraElts = ReqRetNumElts - DMaskPop;
|
||||
while(ExtraElts--)
|
||||
BVElts.push_back(DAG.getUNDEF(AdjEltVT));
|
||||
|
||||
SDValue PreTFCRes;
|
||||
if (ReqRetNumElts > 1) {
|
||||
SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
|
||||
if (IsD16 && Unpacked)
|
||||
PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
|
||||
else
|
||||
PreTFCRes = NewVec;
|
||||
} else {
|
||||
PreTFCRes = BVElts[0];
|
||||
Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
|
||||
SDValue(Result, 0), ZeroIdx);
|
||||
}
|
||||
|
||||
if (V6F16Special)
|
||||
PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
|
||||
|
||||
if (!IsTexFail) {
|
||||
if (Result->getNumValues() > 1)
|
||||
return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
|
||||
else
|
||||
return PreTFCRes;
|
||||
TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
|
||||
SDValue(Result, 0),
|
||||
DAG.getConstant(MaskPopDwords, DL, MVT::i32));
|
||||
}
|
||||
|
||||
// Extract the TexFail result and insert into aggregate return
|
||||
SmallVector<SDValue, 1> TFCElt;
|
||||
DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
|
||||
SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
|
||||
return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
|
||||
if (DataDwordVT.isVector())
|
||||
Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
|
||||
NumDataDwords - MaskPopDwords);
|
||||
|
||||
if (IsD16)
|
||||
Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
|
||||
|
||||
if (!ReqRetVT.isVector())
|
||||
Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
|
||||
|
||||
Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);
|
||||
|
||||
if (TexFail)
|
||||
return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
|
||||
|
||||
if (Result->getNumValues() == 1)
|
||||
return Data;
|
||||
|
||||
return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
|
||||
}
|
||||
|
||||
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
|
||||
|
@ -5545,8 +5543,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
|
|||
}
|
||||
|
||||
EVT NewVT = NumVDataDwords > 1 ?
|
||||
EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
|
||||
: MVT::f32;
|
||||
EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
|
||||
: MVT::i32;
|
||||
|
||||
ResultTypes[0] = NewVT;
|
||||
if (ResultTypes.size() == 3) {
|
||||
|
|
|
@ -0,0 +1,410 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-UNPACKED %s
|
||||
|
||||
define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-LABEL: load_1d_f16_tfe_dmask0:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b32 s11, s9
|
||||
; GFX9-NEXT: s_mov_b32 s10, s8
|
||||
; GFX9-NEXT: s_mov_b32 s9, s7
|
||||
; GFX9-NEXT: s_mov_b32 s8, s6
|
||||
; GFX9-NEXT: s_mov_b32 s7, s5
|
||||
; GFX9-NEXT: s_mov_b32 s6, s4
|
||||
; GFX9-NEXT: s_mov_b32 s5, s3
|
||||
; GFX9-NEXT: s_mov_b32 s4, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_f16_tfe_dmask0:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_mov_b32 s11, s9
|
||||
; GFX10-NEXT: s_mov_b32 s10, s8
|
||||
; GFX10-NEXT: s_mov_b32 s9, s7
|
||||
; GFX10-NEXT: s_mov_b32 s8, s6
|
||||
; GFX10-NEXT: s_mov_b32 s7, s5
|
||||
; GFX10-NEXT: s_mov_b32 s6, s4
|
||||
; GFX10-NEXT: s_mov_b32 s5, s3
|
||||
; GFX10-NEXT: s_mov_b32 s4, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0:
|
||||
; GFX8-UNPACKED: ; %bb.0:
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { half, i32 } %v, 0
|
||||
%v.err = extractvalue { half, i32 } %v, 1
|
||||
store volatile half %v.data, half addrspace(1)* undef
|
||||
store volatile i32 %v.err, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-LABEL: load_1d_f16_tfe_dmask1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b32 s11, s9
|
||||
; GFX9-NEXT: s_mov_b32 s10, s8
|
||||
; GFX9-NEXT: s_mov_b32 s9, s7
|
||||
; GFX9-NEXT: s_mov_b32 s8, s6
|
||||
; GFX9-NEXT: s_mov_b32 s7, s5
|
||||
; GFX9-NEXT: s_mov_b32 s6, s4
|
||||
; GFX9-NEXT: s_mov_b32 s5, s3
|
||||
; GFX9-NEXT: s_mov_b32 s4, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_f16_tfe_dmask1:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_mov_b32 s11, s9
|
||||
; GFX10-NEXT: s_mov_b32 s10, s8
|
||||
; GFX10-NEXT: s_mov_b32 s9, s7
|
||||
; GFX10-NEXT: s_mov_b32 s8, s6
|
||||
; GFX10-NEXT: s_mov_b32 s7, s5
|
||||
; GFX10-NEXT: s_mov_b32 s6, s4
|
||||
; GFX10-NEXT: s_mov_b32 s5, s3
|
||||
; GFX10-NEXT: s_mov_b32 s4, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1:
|
||||
; GFX8-UNPACKED: ; %bb.0:
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { half, i32 } %v, 0
|
||||
%v.err = extractvalue { half, i32 } %v, 1
|
||||
store volatile half %v.data, half addrspace(1)* undef
|
||||
store volatile i32 %v.err, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-LABEL: load_1d_v2f16_tfe_dmask0:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b32 s11, s9
|
||||
; GFX9-NEXT: s_mov_b32 s10, s8
|
||||
; GFX9-NEXT: s_mov_b32 s9, s7
|
||||
; GFX9-NEXT: s_mov_b32 s8, s6
|
||||
; GFX9-NEXT: s_mov_b32 s7, s5
|
||||
; GFX9-NEXT: s_mov_b32 s6, s4
|
||||
; GFX9-NEXT: s_mov_b32 s5, s3
|
||||
; GFX9-NEXT: s_mov_b32 s4, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v2f16_tfe_dmask0:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_mov_b32 s11, s9
|
||||
; GFX10-NEXT: s_mov_b32 s10, s8
|
||||
; GFX10-NEXT: s_mov_b32 s9, s7
|
||||
; GFX10-NEXT: s_mov_b32 s8, s6
|
||||
; GFX10-NEXT: s_mov_b32 s7, s5
|
||||
; GFX10-NEXT: s_mov_b32 s6, s4
|
||||
; GFX10-NEXT: s_mov_b32 s5, s3
|
||||
; GFX10-NEXT: s_mov_b32 s4, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0:
|
||||
; GFX8-UNPACKED: ; %bb.0:
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <2 x half>, i32 } %v, 0
|
||||
%v.err = extractvalue { <2 x half>, i32 } %v, 1
|
||||
store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
|
||||
store volatile i32 %v.err, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-LABEL: load_1d_v2f16_tfe_dmask1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b32 s11, s9
|
||||
; GFX9-NEXT: s_mov_b32 s10, s8
|
||||
; GFX9-NEXT: s_mov_b32 s9, s7
|
||||
; GFX9-NEXT: s_mov_b32 s8, s6
|
||||
; GFX9-NEXT: s_mov_b32 s7, s5
|
||||
; GFX9-NEXT: s_mov_b32 s6, s4
|
||||
; GFX9-NEXT: s_mov_b32 s5, s3
|
||||
; GFX9-NEXT: s_mov_b32 s4, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v2f16_tfe_dmask1:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_mov_b32 s11, s9
|
||||
; GFX10-NEXT: s_mov_b32 s10, s8
|
||||
; GFX10-NEXT: s_mov_b32 s9, s7
|
||||
; GFX10-NEXT: s_mov_b32 s8, s6
|
||||
; GFX10-NEXT: s_mov_b32 s7, s5
|
||||
; GFX10-NEXT: s_mov_b32 s6, s4
|
||||
; GFX10-NEXT: s_mov_b32 s5, s3
|
||||
; GFX10-NEXT: s_mov_b32 s4, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1:
|
||||
; GFX8-UNPACKED: ; %bb.0:
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <2 x half>, i32 } %v, 0
|
||||
%v.err = extractvalue { <2 x half>, i32 } %v, 1
|
||||
store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
|
||||
store volatile i32 %v.err, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-LABEL: load_1d_v2f16_tfe_dmask3:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b32 s11, s9
|
||||
; GFX9-NEXT: s_mov_b32 s10, s8
|
||||
; GFX9-NEXT: s_mov_b32 s9, s7
|
||||
; GFX9-NEXT: s_mov_b32 s8, s6
|
||||
; GFX9-NEXT: s_mov_b32 s7, s5
|
||||
; GFX9-NEXT: s_mov_b32 s6, s4
|
||||
; GFX9-NEXT: s_mov_b32 s5, s3
|
||||
; GFX9-NEXT: s_mov_b32 s4, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v2f16_tfe_dmask3:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_mov_b32 s11, s9
|
||||
; GFX10-NEXT: s_mov_b32 s10, s8
|
||||
; GFX10-NEXT: s_mov_b32 s9, s7
|
||||
; GFX10-NEXT: s_mov_b32 s8, s6
|
||||
; GFX10-NEXT: s_mov_b32 s7, s5
|
||||
; GFX10-NEXT: s_mov_b32 s6, s4
|
||||
; GFX10-NEXT: s_mov_b32 s5, s3
|
||||
; GFX10-NEXT: s_mov_b32 s4, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3:
|
||||
; GFX8-UNPACKED: ; %bb.0:
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x3 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <2 x half>, i32 } %v, 0
|
||||
%v.err = extractvalue { <2 x half>, i32 } %v, 1
|
||||
store volatile <2 x half> %v.data, <2 x half> addrspace(1)* undef
|
||||
store volatile i32 %v.err, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
; %v.data = extractvalue { <3 x half>, i32 } %v, 0
|
||||
; %v.err = extractvalue { <3 x half>, i32 } %v, 1
|
||||
; store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef
|
||||
; store volatile i32 %v.err, i32 addrspace(1)* undef
|
||||
; ret void
|
||||
; }
|
||||
|
||||
define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-LABEL: load_1d_v4f16_tfe_dmask15:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_mov_b32 s11, s9
|
||||
; GFX9-NEXT: s_mov_b32 s10, s8
|
||||
; GFX9-NEXT: s_mov_b32 s9, s7
|
||||
; GFX9-NEXT: s_mov_b32 s8, s6
|
||||
; GFX9-NEXT: s_mov_b32 s7, s5
|
||||
; GFX9-NEXT: s_mov_b32 s6, s4
|
||||
; GFX9-NEXT: s_mov_b32 s5, s3
|
||||
; GFX9-NEXT: s_mov_b32 s4, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v4f16_tfe_dmask15:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_mov_b32 s11, s9
|
||||
; GFX10-NEXT: s_mov_b32 s10, s8
|
||||
; GFX10-NEXT: s_mov_b32 s9, s7
|
||||
; GFX10-NEXT: s_mov_b32 s8, s6
|
||||
; GFX10-NEXT: s_mov_b32 s7, s5
|
||||
; GFX10-NEXT: s_mov_b32 s6, s4
|
||||
; GFX10-NEXT: s_mov_b32 s5, s3
|
||||
; GFX10-NEXT: s_mov_b32 s4, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15:
|
||||
; GFX8-UNPACKED: ; %bb.0:
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
|
||||
; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4
|
||||
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2]
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <4 x half>, i32 } %v, 0
|
||||
%v.err = extractvalue { <4 x half>, i32 } %v, 1
|
||||
store volatile <4 x half> %v.data, <4 x half> addrspace(1)* undef
|
||||
store volatile i32 %v.err, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
declare { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
|
||||
declare { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
|
||||
declare { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
|
||||
declare { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0
|
||||
|
||||
attributes #0 = { nounwind readonly }
|
Loading…
Reference in New Issue