llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.s...

;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s

;CHECK-LABEL: {{^}}buffer_store:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc
;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_immoffs:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42
define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_idx:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_ofs:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_both:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_both_reversed:
;CHECK: v_mov_b32_e32 v6, v4
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0)
  ret void
}

; Ideally, the register allocator would avoid the wait here
;
;CHECK-LABEL: {{^}}buffer_store_wait:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
;VERDE: s_waitcnt expcnt(0)
;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_x1:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_x2:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}buffer_store_int:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc
;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc
define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) {
main_body:
  call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
  call void @llvm.amdgcn.struct.buffer.store.i32(i32 %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2)
  ret void
}

;CHECK-LABEL: {{^}}struct_buffer_store_byte:
;CHECK-NEXT: %bb.
;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
;CHECK-NEXT: s_endpgm
define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
main_body:
  %v2 = fptoui float %v1 to i32
  %v3 = trunc i32 %v2 to i8
  call void @llvm.amdgcn.struct.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
  ret void
}

;CHECK-LABEL: {{^}}struct_buffer_store_short:
;CHECK-NEXT: %bb.
;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
;CHECK-NEXT: s_endpgm
define amdgpu_ps void @struct_buffer_store_short(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
main_body:
  %v2 = fptoui float %v1 to i32
  %v3 = trunc i32 %v2 to i16
  call void @llvm.amdgcn.struct.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
  ret void
}

declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1
declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.i16(i16, <4 x i32>, i32, i32, i32, i32) #0

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
[AMDGPU] New buffer intrinsics Summary: This commit adds new intrinsics llvm.amdgcn.raw.buffer.load llvm.amdgcn.raw.buffer.load.format llvm.amdgcn.raw.buffer.load.format.d16 llvm.amdgcn.struct.buffer.load llvm.amdgcn.struct.buffer.load.format llvm.amdgcn.struct.buffer.load.format.d16 llvm.amdgcn.raw.buffer.store llvm.amdgcn.raw.buffer.store.format llvm.amdgcn.raw.buffer.store.format.d16 llvm.amdgcn.struct.buffer.store llvm.amdgcn.struct.buffer.store.format llvm.amdgcn.struct.buffer.store.format.d16 llvm.amdgcn.raw.buffer.atomic.* llvm.amdgcn.struct.buffer.atomic.* with the following changes from the llvm.amdgcn.buffer.* intrinsics: * there are separate raw and struct versions: raw does not have an index arg and sets idxen=0 in the instruction, and struct always sets idxen=1 in the instruction even if the index is 0, to allow for the fact that gfx9 does bounds checking differently depending on whether idxen is set; * there is a combined cachepolicy arg (glc+slc) * there are now only two offset args: one for the offset that is included in bounds checking and swizzling, to be split between the instruction's voffset and immoffset fields, and one for the offset that is excluded from bounds checking and swizzling, to go into the instruction's soffset field. The AMDISD::BUFFER_* SD nodes always have an index operand, all three offset operands, combined cachepolicy operand, and an extra idxen operand. The obsolescent llvm.amdgcn.buffer.* intrinsics continue to work. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, jfb, llvm-commits Differential Revision: https://reviews.llvm.org/D50306 Change-Id: If897ea7dc34fcbf4d5496e98cc99a934f62fc205 llvm-svn: 340269 2018-08-21 19:07:10 +08:00			`;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs \| FileCheck -check-prefix=VERDE %s`
			`;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck %s`

			`;CHECK-LABEL: {{^}}buffer_store:`
			`;CHECK-NOT: s_waitcnt`
			`;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen`
			`;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc`
			`;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc`
			`define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {`
			`main_body:`
			`call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)`
			`call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)`
			`call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2)`
			`ret void`
			`}`

			`;CHECK-LABEL: {{^}}buffer_store_immoffs:`
			`;CHECK-NOT: s_waitcnt`
			`;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42`
			`define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {`
			`main_body:`
			`call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0)`
			`ret void`
			`}`

			`;CHECK-LABEL: {{^}}buffer_store_idx:`
			`;CHECK-NOT: s_waitcnt`
			`;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen`
			`define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {`
			`main_body:`
			`call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)`
			`ret void`
			`}`

			`;CHECK-LABEL: {{^}}buffer_store_ofs:`
			`;CHECK-NOT: s_waitcnt`
			`;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen`
			`define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {`
			`main_body:`
			`call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0)`
			`ret void`
			`}`

			`;CHECK-LABEL: {{^}}buffer_store_both:`
			`;CHECK-NOT: s_waitcnt`
			`;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen`
			`define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {`
			`main_body:`
			`call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0)`
			`ret void`
			`}`

			`;CHECK-LABEL: {{^}}buffer_store_both_reversed:`
			`;CHECK: v_mov_b32_e32 v6, v4`
			`;CHECK-NOT: s_waitcnt`
			`;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen`
			`define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {`
			`main_body:`
			`call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0)`
			`ret void`
			`}`

			`; Ideally, the register allocator would avoid the wait here`
			`;`
			`;CHECK-LABEL: {{^}}buffer_store_wait:`
			`;CHECK-NOT: s_waitcnt`
			`;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen`
			`;VERDE: s_waitcnt expcnt(0)`
			`;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen`
			`;CHECK: s_waitcnt vmcnt(0)`
			`;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen`
			`define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {`
			`main_body:`
			`call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)`
			`%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0)`
			`call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0, i32 0)`
			`ret void`
			`}`

			`;CHECK-LABEL: {{^}}buffer_store_x1:`
			`;CHECK-NOT: s_waitcnt`
			`;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen`
			`define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {`
			`main_body:`
			`call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)`
			`ret void`
			`}`

			`;CHECK-LABEL: {{^}}buffer_store_x2:`
			`;CHECK-NOT: s_waitcnt`
			`;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen`
			`define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {`
			`main_body:`
			`call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)`
			`ret void`
			`}`

[AMDGPU] Allow int types for MUBUF vdata Summary: Previously the new llvm.amdgcn.raw/struct.buffer.load/store intrinsics only allowed float types for the data to be loaded or stored, which sometimes meant the frontend needed to generate a bitcast. In this, the new intrinsics copied the old buffer intrinsics. This commit extends the new intrinsics to allow int types as well. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D50315 Change-Id: I8202af2d036455553681dcbb3d7d32ae273f8f85 llvm-svn: 340270 2018-08-21 19:08:12 +08:00			`;CHECK-LABEL: {{^}}buffer_store_int:`
			`;CHECK-NOT: s_waitcnt`
			`;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen`
			`;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc`
			`;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc`
			`define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) {`
			`main_body:`
			`call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)`
			`call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)`
			`call void @llvm.amdgcn.struct.buffer.store.i32(i32 %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2)`
			`ret void`
			`}`

[AMDGPU] Add buffer/load 8/16 bit overloaded intrinsics Summary: Add buffer store/load 8/16 overloaded intrinsics for buffer, raw_buffer and struct_buffer Change-Id: I166a29f071b2ff4e4683fb0392564b1f223ac61d Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59265 llvm-svn: 356465 2019-03-20 00:07:00 +08:00			`;CHECK-LABEL: {{^}}struct_buffer_store_byte:`
			`;CHECK-NEXT: %bb.`
			`;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}`
			`;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen`
			`;CHECK-NEXT: s_endpgm`
			`define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) {`
			`main_body:`
			`%v2 = fptoui float %v1 to i32`
			`%v3 = trunc i32 %v2 to i8`
			`call void @llvm.amdgcn.struct.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)`
			`ret void`
			`}`

			`;CHECK-LABEL: {{^}}struct_buffer_store_short:`
			`;CHECK-NEXT: %bb.`
			`;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}`
			`;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen`
			`;CHECK-NEXT: s_endpgm`
			`define amdgpu_ps void @struct_buffer_store_short(<4 x i32> inreg %rsrc, float %v1, i32 %index) {`
			`main_body:`
			`%v2 = fptoui float %v1 to i32`
			`%v3 = trunc i32 %v2 to i16`
			`call void @llvm.amdgcn.struct.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)`
			`ret void`
			`}`

[AMDGPU] New buffer intrinsics Summary: This commit adds new intrinsics llvm.amdgcn.raw.buffer.load llvm.amdgcn.raw.buffer.load.format llvm.amdgcn.raw.buffer.load.format.d16 llvm.amdgcn.struct.buffer.load llvm.amdgcn.struct.buffer.load.format llvm.amdgcn.struct.buffer.load.format.d16 llvm.amdgcn.raw.buffer.store llvm.amdgcn.raw.buffer.store.format llvm.amdgcn.raw.buffer.store.format.d16 llvm.amdgcn.struct.buffer.store llvm.amdgcn.struct.buffer.store.format llvm.amdgcn.struct.buffer.store.format.d16 llvm.amdgcn.raw.buffer.atomic.* llvm.amdgcn.struct.buffer.atomic.* with the following changes from the llvm.amdgcn.buffer.* intrinsics: * there are separate raw and struct versions: raw does not have an index arg and sets idxen=0 in the instruction, and struct always sets idxen=1 in the instruction even if the index is 0, to allow for the fact that gfx9 does bounds checking differently depending on whether idxen is set; * there is a combined cachepolicy arg (glc+slc) * there are now only two offset args: one for the offset that is included in bounds checking and swizzling, to be split between the instruction's voffset and immoffset fields, and one for the offset that is excluded from bounds checking and swizzling, to go into the instruction's soffset field. The AMDISD::BUFFER_* SD nodes always have an index operand, all three offset operands, combined cachepolicy operand, and an extra idxen operand. The obsolescent llvm.amdgcn.buffer.* intrinsics continue to work. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, jfb, llvm-commits Differential Revision: https://reviews.llvm.org/D50306 Change-Id: If897ea7dc34fcbf4d5496e98cc99a934f62fc205 llvm-svn: 340269 2018-08-21 19:07:10 +08:00			`declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0`
			`declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0`
			`declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0`
[AMDGPU] Allow int types for MUBUF vdata Summary: Previously the new llvm.amdgcn.raw/struct.buffer.load/store intrinsics only allowed float types for the data to be loaded or stored, which sometimes meant the frontend needed to generate a bitcast. In this, the new intrinsics copied the old buffer intrinsics. This commit extends the new intrinsics to allow int types as well. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D50315 Change-Id: I8202af2d036455553681dcbb3d7d32ae273f8f85 llvm-svn: 340270 2018-08-21 19:08:12 +08:00			`declare void @llvm.amdgcn.struct.buffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) #0`
			`declare void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0`
			`declare void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0`
[AMDGPU] New buffer intrinsics Summary: This commit adds new intrinsics llvm.amdgcn.raw.buffer.load llvm.amdgcn.raw.buffer.load.format llvm.amdgcn.raw.buffer.load.format.d16 llvm.amdgcn.struct.buffer.load llvm.amdgcn.struct.buffer.load.format llvm.amdgcn.struct.buffer.load.format.d16 llvm.amdgcn.raw.buffer.store llvm.amdgcn.raw.buffer.store.format llvm.amdgcn.raw.buffer.store.format.d16 llvm.amdgcn.struct.buffer.store llvm.amdgcn.struct.buffer.store.format llvm.amdgcn.struct.buffer.store.format.d16 llvm.amdgcn.raw.buffer.atomic.* llvm.amdgcn.struct.buffer.atomic.* with the following changes from the llvm.amdgcn.buffer.* intrinsics: * there are separate raw and struct versions: raw does not have an index arg and sets idxen=0 in the instruction, and struct always sets idxen=1 in the instruction even if the index is 0, to allow for the fact that gfx9 does bounds checking differently depending on whether idxen is set; * there is a combined cachepolicy arg (glc+slc) * there are now only two offset args: one for the offset that is included in bounds checking and swizzling, to be split between the instruction's voffset and immoffset fields, and one for the offset that is excluded from bounds checking and swizzling, to go into the instruction's soffset field. The AMDISD::BUFFER_* SD nodes always have an index operand, all three offset operands, combined cachepolicy operand, and an extra idxen operand. The obsolescent llvm.amdgcn.buffer.* intrinsics continue to work. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, jfb, llvm-commits Differential Revision: https://reviews.llvm.org/D50306 Change-Id: If897ea7dc34fcbf4d5496e98cc99a934f62fc205 llvm-svn: 340269 2018-08-21 19:07:10 +08:00			`declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1`
[AMDGPU] Add buffer/load 8/16 bit overloaded intrinsics Summary: Add buffer store/load 8/16 overloaded intrinsics for buffer, raw_buffer and struct_buffer Change-Id: I166a29f071b2ff4e4683fb0392564b1f223ac61d Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59265 llvm-svn: 356465 2019-03-20 00:07:00 +08:00			`declare void @llvm.amdgcn.struct.buffer.store.i8(i8, <4 x i32>, i32, i32, i32, i32) #0`
			`declare void @llvm.amdgcn.struct.buffer.store.i16(i16, <4 x i32>, i32, i32, i32, i32) #0`
[AMDGPU] New buffer intrinsics Summary: This commit adds new intrinsics llvm.amdgcn.raw.buffer.load llvm.amdgcn.raw.buffer.load.format llvm.amdgcn.raw.buffer.load.format.d16 llvm.amdgcn.struct.buffer.load llvm.amdgcn.struct.buffer.load.format llvm.amdgcn.struct.buffer.load.format.d16 llvm.amdgcn.raw.buffer.store llvm.amdgcn.raw.buffer.store.format llvm.amdgcn.raw.buffer.store.format.d16 llvm.amdgcn.struct.buffer.store llvm.amdgcn.struct.buffer.store.format llvm.amdgcn.struct.buffer.store.format.d16 llvm.amdgcn.raw.buffer.atomic.* llvm.amdgcn.struct.buffer.atomic.* with the following changes from the llvm.amdgcn.buffer.* intrinsics: * there are separate raw and struct versions: raw does not have an index arg and sets idxen=0 in the instruction, and struct always sets idxen=1 in the instruction even if the index is 0, to allow for the fact that gfx9 does bounds checking differently depending on whether idxen is set; * there is a combined cachepolicy arg (glc+slc) * there are now only two offset args: one for the offset that is included in bounds checking and swizzling, to be split between the instruction's voffset and immoffset fields, and one for the offset that is excluded from bounds checking and swizzling, to go into the instruction's soffset field. The AMDISD::BUFFER_* SD nodes always have an index operand, all three offset operands, combined cachepolicy operand, and an extra idxen operand. The obsolescent llvm.amdgcn.buffer.* intrinsics continue to work. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, jfb, llvm-commits Differential Revision: https://reviews.llvm.org/D50306 Change-Id: If897ea7dc34fcbf4d5496e98cc99a934f62fc205 llvm-svn: 340269 2018-08-21 19:07:10 +08:00
			`attributes #0 = { nounwind }`
			`attributes #1 = { nounwind readonly }`