forked from OSchip/llvm-project
AMDGPU: Allow f32 types for llvm.amdgcn.s.buffer.load
llvm-svn: 348625
This commit is contained in:
parent
99c139f4dc
commit
ce2e053134
|
@ -803,7 +803,7 @@ def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
|
|||
def int_amdgcn_buffer_load : AMDGPUBufferLoad;
|
||||
|
||||
def int_amdgcn_s_buffer_load : Intrinsic <
|
||||
[llvm_anyint_ty],
|
||||
[llvm_any_ty],
|
||||
[llvm_v4i32_ty, // rsrc(SGPR)
|
||||
llvm_i32_ty, // byte offset(SGPR/VGPR/imm)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc)
|
||||
|
@ -835,7 +835,7 @@ class AMDGPURawBufferLoad : Intrinsic <
|
|||
[llvm_v4i32_ty, // rsrc(SGPR)
|
||||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
[IntrReadMem], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<0>;
|
||||
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad;
|
||||
|
@ -847,7 +847,7 @@ class AMDGPUStructBufferLoad : Intrinsic <
|
|||
llvm_i32_ty, // vindex(VGPR)
|
||||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
[IntrReadMem], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<0>;
|
||||
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
|
||||
|
@ -859,7 +859,7 @@ class AMDGPURawBufferStore : Intrinsic <
|
|||
llvm_v4i32_ty, // rsrc(SGPR)
|
||||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
[IntrWriteMem], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<1>;
|
||||
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore;
|
||||
|
@ -872,7 +872,7 @@ class AMDGPUStructBufferStore : Intrinsic <
|
|||
llvm_i32_ty, // vindex(VGPR)
|
||||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
[IntrWriteMem], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<1>;
|
||||
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
|
||||
|
@ -884,7 +884,7 @@ class AMDGPURawBufferAtomic : Intrinsic <
|
|||
llvm_v4i32_ty, // rsrc(SGPR)
|
||||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
|
||||
[], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<1, 0>;
|
||||
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
|
||||
|
@ -904,7 +904,7 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
|
|||
llvm_v4i32_ty, // rsrc(SGPR)
|
||||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
|
||||
[], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<2, 0>;
|
||||
|
||||
|
@ -915,7 +915,7 @@ class AMDGPUStructBufferAtomic : Intrinsic <
|
|||
llvm_i32_ty, // vindex(VGPR)
|
||||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
|
||||
[], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<1, 0>;
|
||||
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
|
||||
|
@ -936,7 +936,7 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
|
|||
llvm_i32_ty, // vindex(VGPR)
|
||||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 1 = slc)
|
||||
[], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<2, 0>;
|
||||
|
||||
|
@ -980,7 +980,7 @@ def int_amdgcn_raw_tbuffer_load : Intrinsic <
|
|||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
[IntrReadMem], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<0>;
|
||||
|
||||
|
@ -991,7 +991,7 @@ def int_amdgcn_raw_tbuffer_store : Intrinsic <
|
|||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
[IntrWriteMem], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<1>;
|
||||
|
||||
|
@ -1002,7 +1002,7 @@ def int_amdgcn_struct_tbuffer_load : Intrinsic <
|
|||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
[IntrReadMem], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<0>;
|
||||
|
||||
|
@ -1014,7 +1014,7 @@ def int_amdgcn_struct_tbuffer_store : Intrinsic <
|
|||
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
|
||||
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
|
||||
llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
|
||||
[IntrWriteMem], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<1>;
|
||||
|
||||
|
|
|
@ -4873,12 +4873,13 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
|
|||
SmallVector<SDValue, 4> Loads;
|
||||
unsigned NumLoads = 1;
|
||||
MVT LoadVT = VT.getSimpleVT();
|
||||
MVT EltVT = LoadVT.isVector() ? LoadVT.getVectorElementType() : LoadVT;
|
||||
unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
|
||||
assert((EltVT == MVT::i32 || EltVT == MVT::f32) &&
|
||||
isPowerOf2_32(NumElts));
|
||||
|
||||
assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
|
||||
LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
|
||||
|
||||
if (VT == MVT::v8i32 || VT == MVT::v16i32) {
|
||||
NumLoads = VT == MVT::v16i32 ? 4 : 2;
|
||||
if (NumElts == 8 || NumElts == 16) {
|
||||
NumLoads = NumElts == 16 ? 4 : 2;
|
||||
LoadVT = MVT::v4i32;
|
||||
}
|
||||
|
||||
|
|
|
@ -751,6 +751,12 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;
|
|||
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>;
|
||||
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>;
|
||||
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>;
|
||||
|
||||
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", f32>;
|
||||
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>;
|
||||
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>;
|
||||
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
|
||||
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
|
||||
} // End let AddedComplexity = 100
|
||||
|
||||
let OtherPredicates = [isSICI] in {
|
||||
|
|
|
@ -625,7 +625,6 @@ exit:
|
|||
ret float %sum.next
|
||||
}
|
||||
|
||||
|
||||
; This test checks that the load after some control flow with an offset based
|
||||
; on a divergent shader input is correctly recognized as divergent. This was
|
||||
; reduced from an actual regression. Yes, the %unused argument matters, as
|
||||
|
@ -649,6 +648,45 @@ endif1: ; preds = %if1, %main_body
|
|||
ret float %tmp97
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_buffer_load_f32:
|
||||
; GCN: s_buffer_load_dword s0, s[0:3], s4
|
||||
define amdgpu_ps void @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
|
||||
%sgpr = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
|
||||
call void asm sideeffect "; use $0", "s"(float %sgpr)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_buffer_load_v2f32:
|
||||
; GCN: s_buffer_load_dwordx2 s[0:1], s[0:3], s4
|
||||
define amdgpu_ps void @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
|
||||
%sgpr = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %offset, i32 0)
|
||||
call void asm sideeffect "; use $0", "s"(<2 x float> %sgpr)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_buffer_load_v4f32:
|
||||
; GCN: s_buffer_load_dwordx4 s[0:3], s[0:3], s4
|
||||
define amdgpu_ps void @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
|
||||
%sgpr = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %offset, i32 0)
|
||||
call void asm sideeffect "; use $0", "s"(<4 x float> %sgpr)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_buffer_load_v8f32:
|
||||
; GCN: s_buffer_load_dwordx8 s[0:7], s[0:3], s4
|
||||
define amdgpu_ps void @s_buffer_load_v8f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
|
||||
%sgpr = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %offset, i32 0)
|
||||
call void asm sideeffect "; use $0", "s"(<8 x float> %sgpr)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_buffer_load_v16f32:
|
||||
; GCN: s_buffer_load_dwordx16 s[0:15], s[0:3], s4
|
||||
define amdgpu_ps void @s_buffer_load_v16f32(<4 x i32> inreg %rsrc, i32 inreg %offset) {
|
||||
%sgpr = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %offset, i32 0)
|
||||
call void asm sideeffect "; use $0", "s"(<16 x float> %sgpr)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
|
||||
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
|
||||
|
@ -660,6 +698,12 @@ declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
|
|||
declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)
|
||||
declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32)
|
||||
|
||||
declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32)
|
||||
declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32)
|
||||
declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32)
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { nounwind readnone speculatable }
|
||||
|
|
Loading…
Reference in New Issue