AMDGPU: Overload return type of llvm.amdgcn.buffer.load.format

Summary:
Allow the selection of BUFFER_LOAD_FORMAT_x and _XY. Do this now before
the frontend patches land in Mesa. Eventually, we may want to automatically
reduce the size of loads at the LLVM IR level, which requires such overloads,
and in some cases Mesa can generate them directly.

Reviewers: tstellarAMD, arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18255

llvm-svn: 263792
This commit is contained in:
Nicolai Haehnle 2016-03-18 16:24:40 +00:00
parent ad63638f6d
commit 95e8ffd398
4 changed files with 81 additions and 54 deletions

View File

@ -206,7 +206,7 @@ def int_amdgcn_image_atomic_cmpswap : Intrinsic <
[]>;
def int_amdgcn_buffer_load_format : Intrinsic <
[llvm_v4f32_ty],
[llvm_anyfloat_ty],
[llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(SGPR/VGPR/imm)

View File

@ -2108,45 +2108,52 @@ def : Pat <
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
def : Pat<
(int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
(MUBUFIntrinsicOffset i32:$soffset,
i16:$offset),
imm:$glc, imm:$slc),
(BUFFER_LOAD_FORMAT_XYZW_OFFSET $rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
def : Pat<
(int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicOffset i32:$soffset,
i16:$offset),
imm:$glc, imm:$slc),
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $vindex, $rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
multiclass MUBUF_LoadIntrinsicPat<ValueType vt, string opcode> {
def : Pat<
(vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
(MUBUFIntrinsicOffset i32:$soffset,
i16:$offset),
imm:$glc, imm:$slc)),
(!cast<MUBUF>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
def : Pat<
(int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
(MUBUFIntrinsicVOffset i32:$soffset,
i16:$offset,
i32:$voffset),
imm:$glc, imm:$slc),
(BUFFER_LOAD_FORMAT_XYZW_OFFEN $voffset, $rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
def : Pat<
(vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicOffset i32:$soffset,
i16:$offset),
imm:$glc, imm:$slc)),
(!cast<MUBUF>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
def : Pat<
(int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicVOffset i32:$soffset,
i16:$offset,
i32:$voffset),
imm:$glc, imm:$slc),
(BUFFER_LOAD_FORMAT_XYZW_BOTHEN
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
def : Pat<
(vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
(MUBUFIntrinsicVOffset i32:$soffset,
i16:$offset,
i32:$voffset),
imm:$glc, imm:$slc)),
(!cast<MUBUF>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
def : Pat<
(vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicVOffset i32:$soffset,
i16:$offset,
i32:$voffset),
imm:$glc, imm:$slc)),
(!cast<MUBUF>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
}
defm : MUBUF_LoadIntrinsicPat<f32, "BUFFER_LOAD_FORMAT_X">;
defm : MUBUF_LoadIntrinsicPat<v2f32, "BUFFER_LOAD_FORMAT_XY">;
defm : MUBUF_LoadIntrinsicPat<v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
def : Pat<
(int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0,

View File

@ -8,9 +8,9 @@
;CHECK: s_waitcnt
define {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 {
main_body:
%data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
%data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
%data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
%data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
%data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
%r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
%r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
%r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
@ -22,7 +22,7 @@ main_body:
;CHECK: s_waitcnt
define <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 {
main_body:
%data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
ret <4 x float> %data
}
@ -35,9 +35,9 @@ main_body:
;CHECK: s_waitcnt
define <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 {
main_body:
%d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0)
%d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0)
%d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0)
%d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0)
%d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0)
%d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0)
%d.3 = fadd <4 x float> %d.0, %d.1
%data = fadd <4 x float> %d.2, %d.3
ret <4 x float> %data
@ -51,8 +51,8 @@ main_body:
;CHECK: s_waitcnt
define <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) #0 {
main_body:
%d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0)
%d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0)
%d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0)
%d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0)
%data = fadd <4 x float> %d.0, %d.1
ret <4 x float> %data
}
@ -62,7 +62,7 @@ main_body:
;CHECK: s_waitcnt
define <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 {
main_body:
%data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
ret <4 x float> %data
}
@ -71,7 +71,7 @@ main_body:
;CHECK: s_waitcnt
define <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 {
main_body:
%data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
ret <4 x float> %data
}
@ -81,7 +81,7 @@ main_body:
define <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 {
main_body:
%ofs = add i32 %1, 58
%data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
ret <4 x float> %data
}
@ -90,7 +90,7 @@ main_body:
;CHECK: s_waitcnt
define <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 {
main_body:
%data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
ret <4 x float> %data
}
@ -100,11 +100,31 @@ main_body:
;CHECK: s_waitcnt
define <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 {
main_body:
%data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
ret <4 x float> %data
}
declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i1, i1) #1
;CHECK-LABEL: {{^}}buffer_load_x:
;CHECK: buffer_load_format_x v0, s[0:3], 0
;CHECK: s_waitcnt
define float @buffer_load_x(<4 x i32> inreg %rsrc) #0 {
main_body:
%data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
ret float %data
}
;CHECK-LABEL: {{^}}buffer_load_xy:
;CHECK: buffer_load_format_xy v[0:1], s[0:3], 0
;CHECK: s_waitcnt
define <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) #0 {
main_body:
%data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
ret <2 x float> %data
}
declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1
declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1
declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
attributes #0 = { "ShaderType"="0" }
attributes #1 = { nounwind readonly }

View File

@ -65,13 +65,13 @@ main_body:
define void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 {
main_body:
call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
%data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
ret void
}
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i1, i1) #2
declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2
attributes #0 = { "ShaderType"="0" }
attributes #1 = { nounwind }