From ce2e053134ba5d2e890a37886228fd9f6528d2c6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 7 Dec 2018 18:41:39 +0000 Subject: [PATCH] AMDGPU: Allow f32 types for llvm.amdgcn.s.buffer.load llvm-svn: 348625 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 26 ++++++------- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +++--- llvm/lib/Target/AMDGPU/SMInstructions.td | 6 +++ llvm/test/CodeGen/AMDGPU/smrd.ll | 46 ++++++++++++++++++++++- 4 files changed, 70 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 67e7da7797a4..3ea364c78457 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -803,7 +803,7 @@ def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; def int_amdgcn_buffer_load : AMDGPUBufferLoad; def int_amdgcn_s_buffer_load : Intrinsic < - [llvm_anyint_ty], + [llvm_any_ty], [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // byte offset(SGPR/VGPR/imm) llvm_i32_ty], // cachepolicy(imm; bit 0 = glc) @@ -835,7 +835,7 @@ class AMDGPURawBufferLoad : Intrinsic < [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrReadMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; @@ -847,7 +847,7 @@ class AMDGPUStructBufferLoad : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrReadMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; @@ -859,7 +859,7 @@ class AMDGPURawBufferStore : Intrinsic < llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrWriteMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore; @@ -872,7 +872,7 @@ class AMDGPUStructBufferStore : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrWriteMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; @@ -884,7 +884,7 @@ class AMDGPURawBufferAtomic : Intrinsic < llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) [], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic; @@ -904,7 +904,7 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) [], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; @@ -915,7 +915,7 @@ class AMDGPUStructBufferAtomic : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) [], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1, 0>; def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic; @@ -936,7 +936,7 @@ def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) [], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<2, 0>; @@ -980,7 +980,7 @@ def int_amdgcn_raw_tbuffer_load : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrReadMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -991,7 +991,7 @@ def int_amdgcn_raw_tbuffer_store : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrWriteMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1002,7 +1002,7 @@ def int_amdgcn_struct_tbuffer_load : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrReadMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1014,7 +1014,7 @@ def int_amdgcn_struct_tbuffer_store : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc) [IntrWriteMem], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a98183b28bb6..ff63c1f91b63 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4873,12 +4873,13 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SmallVector Loads; unsigned NumLoads = 1; MVT LoadVT = VT.getSimpleVT(); + MVT EltVT = LoadVT.isVector() ? LoadVT.getVectorElementType() : LoadVT; + unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; + assert((EltVT == MVT::i32 || EltVT == MVT::f32) && + isPowerOf2_32(NumElts)); - assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 || - LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32); - - if (VT == MVT::v8i32 || VT == MVT::v16i32) { - NumLoads = VT == MVT::v16i32 ? 4 : 2; + if (NumElts == 8 || NumElts == 16) { + NumLoads = NumElts == 16 ? 4 : 2; LoadVT = MVT::v4i32; } diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 8bd7de7269b7..8a063e1a4867 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -751,6 +751,12 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>; + +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>; } // End let AddedComplexity = 100 let OtherPredicates = [isSICI] in { diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index d70801381cef..22ee62ef4276 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -625,7 +625,6 @@ exit: ret float %sum.next } - ; This test checks that the load after some control flow with an offset based ; on a divergent shader input is correctly recognized as divergent. This was ; reduced from an actual regression. Yes, the %unused argument matters, as @@ -649,6 +648,45 @@ endif1: ; preds = %if1, %main_body ret float %tmp97 } +; GCN-LABEL: {{^}}s_buffer_load_f32: +; GCN: s_buffer_load_dword s0, s[0:3], s4 +define amdgpu_ps void @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { + %sgpr = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0) + call void asm sideeffect "; use $0", "s"(float %sgpr) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_v2f32: +; GCN: s_buffer_load_dwordx2 s[0:1], s[0:3], s4 +define amdgpu_ps void @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { + %sgpr = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %offset, i32 0) + call void asm sideeffect "; use $0", "s"(<2 x float> %sgpr) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_v4f32: +; GCN: s_buffer_load_dwordx4 s[0:3], s[0:3], s4 +define amdgpu_ps void @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { + %sgpr = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %offset, i32 0) + call void asm sideeffect "; use $0", "s"(<4 x float> %sgpr) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_v8f32: +; GCN: s_buffer_load_dwordx8 s[0:7], s[0:3], s4 +define amdgpu_ps void @s_buffer_load_v8f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { + %sgpr = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %offset, i32 0) + call void asm sideeffect "; use $0", "s"(<8 x float> %sgpr) + ret void +} + +; GCN-LABEL: {{^}}s_buffer_load_v16f32: +; GCN: s_buffer_load_dwordx16 s[0:15], s[0:3], s4 +define amdgpu_ps void @s_buffer_load_v16f32(<4 x i32> inreg %rsrc, i32 inreg %offset) { + %sgpr = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %offset, i32 0) + call void asm sideeffect "; use $0", "s"(<16 x float> %sgpr) + ret void +} declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 @@ -660,6 +698,12 @@ declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32) declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32) +declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) +declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32) +declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) +declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32) +declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32) + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind readnone speculatable }