; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; Test using saddr addressing mode of global_* flat atomic instructions.
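;
; All of the tests share one IR shape: a uniform base pointer passed in SGPRs
; plus a divergent 32-bit offset that is zero-extended and added with a GEP.
; A minimal sketch of that shape (illustrative only, not a checked test case):
;   %zext.offset = zext i32 %voffset to i64
;   %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
;   %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
;   %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
; The checks below currently materialize the full 64-bit address into a VGPR
; pair (v_add_co_u32 / v_addc_co_u32) before the atomic, rather than keeping
; the base in SGPR operands.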

define amdgpu_ps void @global_xchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_xchg_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap v[2:3], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_xchg_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap v[2:3], v1, off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  ret void
}
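
; The waitcnt/invalidate scaffolding visible in the checks above is the
; seq_cst lowering: waits are emitted before the atomic, and the caches are
; invalidated after it (buffer_wbinvl1 on gfx9, buffer_gl0_inv plus
; buffer_gl1_inv on gfx10). The same pattern repeats in every test below.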

; Maximum positive offset on gfx10
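; (2047 is the largest immediate that still folds on gfx10, consistent with a
; signed 12-bit offset field covering -2048..2047; compare
; @global_xchg_saddr_i32_rtn_2048 below, where gfx10 must materialize the
; offset instead.)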
define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap v[2:3], v1, off offset:2047
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap v[2:3], v1, off offset:2047
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  ret void
}

; Maximum negative offset on gfx10
define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap v[2:3], v1, off offset:-2048
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap v[2:3], v1, off offset:-2048
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  ret void
}

define amdgpu_ps float @global_xchg_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_xchg_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_xchg_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}
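
; On gfx9 an immediate offset of 2048 still folds, but it is one past the
; gfx10 maximum, so the gfx10 checks below add 0x800 (= 2048) into the
; address with an extra 64-bit add instead of using the offset field.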
define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_xchg_saddr_i32_rtn_2048:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:2048 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_xchg_saddr_i32_rtn_2048:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_xchg_saddr_i32_rtn_neg2048:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:-2048 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_xchg_saddr_i32_rtn_neg2048:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; --------------------------------------------------------------------------------
; Uniformity edge cases
; --------------------------------------------------------------------------------

@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef
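
; The pointer loaded from @ptr.in.lds is uniform in value, but the ds_read
; leaves it in VGPRs, so the SGPR-base form is not directly usable without
; first copying the pointer into SGPRs (e.g. with v_readfirstlane_b32). The
; checks below keep the address in a VGPR pair.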

; Base pointer is uniform, but also in VGPRs
define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) {
; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b64 v[2:3], v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: ds_read_b64 v[2:3], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; Base pointer is uniform, but also in VGPRs, with imm offset
define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %voffset, i32 %data) {
; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b64 v[2:3], v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:42 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: ds_read_b64 v[2:3], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:42 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; Base pointer is uniform, but also in VGPRs
define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, i32 %data) {
; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b64 v[2:3], v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap v[2:3], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ds_read_b64 v[2:3], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap v[2:3], v1, off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  ret void
}

; Base pointer is uniform, but also in VGPRs, with imm offset
define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 %voffset, i32 %data) {
; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_read_b64 v[2:3], v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap v[2:3], v1, off offset:42
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ds_read_b64 v[2:3], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap v[2:3], v1, off offset:42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; All atomicrmw ops
; --------------------------------------------------------------------------------

; --------------------------------------------------------------------------------
; atomicrmw xchg
; --------------------------------------------------------------------------------
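
; Each operation is covered by an rtn and a nortn variant: the rtn tests
; consume the returned value, so the atomic carries glc in the checks, while
; the nortn tests discard it. Each variant is also repeated with an immediate
; offset, for both the 32-bit and the 64-bit (_x2) instruction forms.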

define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_xchg_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_xchg_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %rtn = atomicrmw xchg i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_xchg_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_xchg_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %rtn = atomicrmw xchg i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps void @global_xchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_xchg_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_xchg_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %unused = atomicrmw xchg i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  ret void
}

define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_xchg_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off offset:-128
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_xchg_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off offset:-128
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %unused = atomicrmw xchg i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; atomicrmw add
; --------------------------------------------------------------------------------
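
; The add, sub, and and sections below repeat the same matrix as xchg above:
; rtn and nortn, with and without a negative immediate offset, for both i32
; and i64 (_x2) forms.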

define amdgpu_ps float @global_add_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_add_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_add v0, v[2:3], v1, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_add_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_add v0, v[2:3], v1, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %rtn = atomicrmw add i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_add_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_add v0, v[2:3], v1, off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_add_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_add v0, v[2:3], v1, off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %rtn = atomicrmw add i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps void @global_add_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_add_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_add v[2:3], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_add_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_add v[2:3], v1, off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %unused = atomicrmw add i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  ret void
}

define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_add_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_add v[2:3], v1, off offset:-128
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_add_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_add v[2:3], v1, off offset:-128
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = atomicrmw add i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  ret void
}

define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_add_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_add_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %rtn = atomicrmw add i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_add_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_add_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %rtn = atomicrmw add i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps void @global_add_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_add_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_add_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %unused = atomicrmw add i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  ret void
}

define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_add_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off offset:-128
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_add_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off offset:-128
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %unused = atomicrmw add i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; atomicrmw sub
; --------------------------------------------------------------------------------

define amdgpu_ps float @global_sub_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_sub_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_sub v0, v[2:3], v1, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_sub_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_sub v0, v[2:3], v1, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %rtn = atomicrmw sub i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_sub_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_sub v0, v[2:3], v1, off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_sub_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_sub v0, v[2:3], v1, off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %rtn = atomicrmw sub i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps void @global_sub_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_sub_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_sub v[2:3], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_sub_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_sub v[2:3], v1, off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %unused = atomicrmw sub i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  ret void
}

define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_sub_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_sub v[2:3], v1, off offset:-128
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_sub_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_sub v[2:3], v1, off offset:-128
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = atomicrmw sub i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  ret void
}

define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_sub_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_sub_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %rtn = atomicrmw sub i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_sub_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_sub_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %rtn = atomicrmw sub i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps void @global_sub_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_sub_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_sub_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %unused = atomicrmw sub i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  ret void
}

define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_sub_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off offset:-128
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_sub_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off offset:-128
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %unused = atomicrmw sub i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; atomicrmw and
; --------------------------------------------------------------------------------

define amdgpu_ps float @global_and_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_and_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_and v0, v[2:3], v1, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_and_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_and v0, v[2:3], v1, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %rtn = atomicrmw and i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_and_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_and v0, v[2:3], v1, off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_and_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_and v0, v[2:3], v1, off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %rtn = atomicrmw and i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps void @global_and_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_and_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_and v[2:3], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_and_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_and v[2:3], v1, off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %unused = atomicrmw and i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  ret void
}

define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_and_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_and v[2:3], v1, off offset:-128
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_and_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_and v[2:3], v1, off offset:-128
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = atomicrmw and i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  ret void
}

define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_and_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_and_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %rtn = atomicrmw and i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_and_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_and_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %rtn = atomicrmw and i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps void @global_and_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_and_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_and_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
|
||
|
%unused = atomicrmw and i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_and_saddr_i64_nortn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off offset:-128
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_and_saddr_i64_nortn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off offset:-128
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
|
||
|
%unused = atomicrmw and i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; --------------------------------------------------------------------------------
|
||
|
; atomicrmw or
|
||
|
; --------------------------------------------------------------------------------
|
||
|
|
||
|
define amdgpu_ps float @global_or_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_or_saddr_i32_rtn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_or v0, v[2:3], v1, off glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_or_saddr_i32_rtn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_or v0, v[2:3], v1, off glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
|
||
|
%rtn = atomicrmw or i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
|
||
|
%cast.rtn = bitcast i32 %rtn to float
|
||
|
ret float %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_or_saddr_i32_rtn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_or v0, v[2:3], v1, off offset:-128 glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_or_saddr_i32_rtn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_or v0, v[2:3], v1, off offset:-128 glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
|
||
|
%rtn = atomicrmw or i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
|
||
|
%cast.rtn = bitcast i32 %rtn to float
|
||
|
ret float %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_or_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_or_saddr_i32_nortn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_or v[2:3], v1, off
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_or_saddr_i32_nortn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_or v[2:3], v1, off
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
|
||
|
%unused = atomicrmw or i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_or_saddr_i32_nortn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_or v[2:3], v1, off offset:-128
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_or_saddr_i32_nortn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_or v[2:3], v1, off offset:-128
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
|
||
|
%unused = atomicrmw or i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_or_saddr_i64_rtn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_or_saddr_i64_rtn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
|
||
|
%rtn = atomicrmw or i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
|
||
|
%cast.rtn = bitcast i64 %rtn to <2 x float>
|
||
|
ret <2 x float> %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_or_saddr_i64_rtn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_or_saddr_i64_rtn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
|
||
|
%rtn = atomicrmw or i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
|
||
|
%cast.rtn = bitcast i64 %rtn to <2 x float>
|
||
|
ret <2 x float> %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_or_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_or_saddr_i64_nortn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_or_saddr_i64_nortn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
|
||
|
%unused = atomicrmw or i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_or_saddr_i64_nortn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off offset:-128
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_or_saddr_i64_nortn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off offset:-128
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
|
||
|
%unused = atomicrmw or i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; --------------------------------------------------------------------------------
|
||
|
; atomicrmw xor
|
||
|
; --------------------------------------------------------------------------------
|
||
|
|
||
|
define amdgpu_ps float @global_xor_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_xor_saddr_i32_rtn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_xor v0, v[2:3], v1, off glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_xor_saddr_i32_rtn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_xor v0, v[2:3], v1, off glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
|
||
|
%rtn = atomicrmw xor i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
|
||
|
%cast.rtn = bitcast i32 %rtn to float
|
||
|
ret float %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_xor_saddr_i32_rtn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_xor v0, v[2:3], v1, off offset:-128 glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_xor_saddr_i32_rtn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_xor v0, v[2:3], v1, off offset:-128 glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
|
||
|
%rtn = atomicrmw xor i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
|
||
|
%cast.rtn = bitcast i32 %rtn to float
|
||
|
ret float %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_xor_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_xor_saddr_i32_nortn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_xor v[2:3], v1, off
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_xor_saddr_i32_nortn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_xor v[2:3], v1, off
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
|
||
|
%unused = atomicrmw xor i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_xor_saddr_i32_nortn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_xor v[2:3], v1, off offset:-128
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_xor_saddr_i32_nortn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_xor v[2:3], v1, off offset:-128
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
|
||
|
%unused = atomicrmw xor i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_xor_saddr_i64_rtn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_xor_saddr_i64_rtn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
|
||
|
%rtn = atomicrmw xor i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
|
||
|
%cast.rtn = bitcast i64 %rtn to <2 x float>
|
||
|
ret <2 x float> %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_xor_saddr_i64_rtn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_xor_saddr_i64_rtn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
|
||
|
%rtn = atomicrmw xor i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
|
||
|
%cast.rtn = bitcast i64 %rtn to <2 x float>
|
||
|
ret <2 x float> %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_xor_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_xor_saddr_i64_nortn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_xor_saddr_i64_nortn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
|
||
|
%unused = atomicrmw xor i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_xor_saddr_i64_nortn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off offset:-128
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_xor_saddr_i64_nortn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off offset:-128
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
|
||
|
%unused = atomicrmw xor i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; --------------------------------------------------------------------------------
|
||
|
; atomicrmw max
|
||
|
; --------------------------------------------------------------------------------
|
||
|
|
||
|
define amdgpu_ps float @global_max_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_max_saddr_i32_rtn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smax v0, v[2:3], v1, off glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_max_saddr_i32_rtn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smax v0, v[2:3], v1, off glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
|
||
|
%rtn = atomicrmw max i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
|
||
|
%cast.rtn = bitcast i32 %rtn to float
|
||
|
ret float %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smax v0, v[2:3], v1, off offset:-128 glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smax v0, v[2:3], v1, off offset:-128 glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
|
||
|
%rtn = atomicrmw max i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
|
||
|
%cast.rtn = bitcast i32 %rtn to float
|
||
|
ret float %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_max_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_max_saddr_i32_nortn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smax v[2:3], v1, off
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_max_saddr_i32_nortn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smax v[2:3], v1, off
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
|
||
|
%unused = atomicrmw max i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smax v[2:3], v1, off offset:-128
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smax v[2:3], v1, off offset:-128
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
|
||
|
%unused = atomicrmw max i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_max_saddr_i64_rtn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_max_saddr_i64_rtn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
|
||
|
%rtn = atomicrmw max i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
|
||
|
%cast.rtn = bitcast i64 %rtn to <2 x float>
|
||
|
ret <2 x float> %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
|
||
|
%rtn = atomicrmw max i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
|
||
|
%cast.rtn = bitcast i64 %rtn to <2 x float>
|
||
|
ret <2 x float> %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_max_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_max_saddr_i64_nortn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_max_saddr_i64_nortn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
|
||
|
%unused = atomicrmw max i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off offset:-128
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off offset:-128
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
|
||
|
%unused = atomicrmw max i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; --------------------------------------------------------------------------------
|
||
|
; atomicrmw min
|
||
|
; --------------------------------------------------------------------------------
|
||
|
|
||
|
define amdgpu_ps float @global_min_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_min_saddr_i32_rtn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smin v0, v[2:3], v1, off glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_min_saddr_i32_rtn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smin v0, v[2:3], v1, off glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
|
||
|
%rtn = atomicrmw min i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
|
||
|
%cast.rtn = bitcast i32 %rtn to float
|
||
|
ret float %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smin v0, v[2:3], v1, off offset:-128 glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smin v0, v[2:3], v1, off offset:-128 glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
|
||
|
%rtn = atomicrmw min i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
|
||
|
%cast.rtn = bitcast i32 %rtn to float
|
||
|
ret float %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_min_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_min_saddr_i32_nortn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smin v[2:3], v1, off
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_min_saddr_i32_nortn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smin v[2:3], v1, off
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
|
||
|
%unused = atomicrmw min i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
|
||
|
; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smin v[2:3], v1, off offset:-128
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smin v[2:3], v1, off offset:-128
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
|
||
|
%unused = atomicrmw min i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_min_saddr_i64_rtn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_min_saddr_i64_rtn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
|
||
|
%rtn = atomicrmw min i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
|
||
|
%cast.rtn = bitcast i64 %rtn to <2 x float>
|
||
|
ret <2 x float> %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: ; return to shader part epilog
|
||
|
;
|
||
|
; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: ; return to shader part epilog
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
|
||
|
%rtn = atomicrmw min i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
|
||
|
%cast.rtn = bitcast i64 %rtn to <2 x float>
|
||
|
ret <2 x float> %cast.rtn
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_min_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_min_saddr_i64_nortn:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_min_saddr_i64_nortn:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
|
||
|
%unused = atomicrmw min i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
|
||
|
; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
|
||
|
; GFX9: ; %bb.0:
|
||
|
; GFX9-NEXT: v_mov_b32_e32 v4, s3
|
||
|
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
|
||
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off offset:-128
|
||
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX9-NEXT: buffer_wbinvl1
|
||
|
; GFX9-NEXT: s_endpgm
|
||
|
;
|
||
|
; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
|
||
|
; GFX10: ; %bb.0:
|
||
|
; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
|
||
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
|
||
|
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off offset:-128
|
||
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||
|
; GFX10-NEXT: buffer_gl0_inv
|
||
|
; GFX10-NEXT: buffer_gl1_inv
|
||
|
; GFX10-NEXT: s_endpgm
|
||
|
%zext.offset = zext i32 %voffset to i64
|
||
|
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
|
||
|
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
|
||
|
%cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
|
||
|
%unused = atomicrmw min i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
|
||
|
ret void
|
||
|
}
|
||
|
|
||
|
; --------------------------------------------------------------------------------
|
||
|
; atomicrmw umax
|
||
|
; --------------------------------------------------------------------------------
|
||
|
|
||
|
define amdgpu_ps float @global_umax_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umax v0, v[2:3], v1, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umax v0, v[2:3], v1, off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %rtn = atomicrmw umax i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umax v0, v[2:3], v1, off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umax v0, v[2:3], v1, off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %rtn = atomicrmw umax i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

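; The no-return variants drop the destination VGPR and the glc modifier since the result is unused.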
define amdgpu_ps void @global_umax_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umax v[2:3], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umax v[2:3], v1, off
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %unused = atomicrmw umax i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  ret void
}

define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umax v[2:3], v1, off offset:-128
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umax v[2:3], v1, off offset:-128
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = atomicrmw umax i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  ret void
}

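; The i64 cases select the _x2 variants, taking the data operand in a VGPR pair.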
define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %rtn = atomicrmw umax i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %rtn = atomicrmw umax i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps void @global_umax_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umax_x2 v[3:4], v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umax_x2 v[3:4], v[1:2], off
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %unused = atomicrmw umax i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  ret void
}

define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umax_x2 v[3:4], v[1:2], off offset:-128
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umax_x2 v[3:4], v[1:2], off offset:-128
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %unused = atomicrmw umax i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; atomicrmw umin
; --------------------------------------------------------------------------------

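; The umin tests mirror the umax selection patterns above.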
define amdgpu_ps float @global_umin_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umin v0, v[2:3], v1, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umin v0, v[2:3], v1, off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %rtn = atomicrmw umin i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umin v0, v[2:3], v1, off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umin v0, v[2:3], v1, off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %rtn = atomicrmw umin i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps void @global_umin_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umin v[2:3], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umin v[2:3], v1, off
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %unused = atomicrmw umin i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
  ret void
}

define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umin v[2:3], v1, off offset:-128
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umin v[2:3], v1, off offset:-128
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = atomicrmw umin i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
  ret void
}

define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %rtn = atomicrmw umin i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %rtn = atomicrmw umin i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps void @global_umin_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umin_x2 v[3:4], v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umin_x2 v[3:4], v[1:2], off
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %unused = atomicrmw umin i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
  ret void
}

define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_umin_x2 v[3:4], v[1:2], off offset:-128
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_umin_x2 v[3:4], v[1:2], off offset:-128
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %unused = atomicrmw umin i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; cmpxchg
; --------------------------------------------------------------------------------

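; For cmpxchg the new value and the compare value are packed into consecutive VGPRs (v[2:3] here),
; and only the value half of the { i32, i1 } result is used.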
define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[0:1], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_cmpxchg_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[0:1], v[2:3], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %cmpxchg = cmpxchg i32 addrspace(1)* %cast.gep0, i32 %cmp, i32 %data seq_cst seq_cst
  %rtn = extractvalue { i32, i1 } %cmpxchg, 0
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_cmpxchg_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %cmpxchg = cmpxchg i32 addrspace(1)* %cast.gep1, i32 %cmp, i32 %data seq_cst seq_cst
  %rtn = extractvalue { i32, i1 } %cmpxchg, 0
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v[0:1], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_cmpxchg_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v[0:1], v[2:3], off
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %unused = cmpxchg i32 addrspace(1)* %cast.gep0, i32 %cmp, i32 %data seq_cst seq_cst
  ret void
}

define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v[0:1], v[2:3], off offset:-128
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_cmpxchg_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v[0:1], v[2:3], off offset:-128
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = cmpxchg i32 addrspace(1)* %cast.gep1, i32 %cmp, i32 %data seq_cst seq_cst
  ret void
}

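; For the i64 compare-and-swap the compare value is copied into v[5:6] so that the new value
; (v[3:4]) and the compare value occupy the consecutive source range v[3:6].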
define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_cmpxchg_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %cmpxchg = cmpxchg i64 addrspace(1)* %cast.gep0, i64 %cmp, i64 %data seq_cst seq_cst
  %rtn = extractvalue { i64, i1 } %cmpxchg, 0
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_cmpxchg_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %cmpxchg = cmpxchg i64 addrspace(1)* %cast.gep1, i64 %cmp, i64 %data seq_cst seq_cst
  %rtn = extractvalue { i64, i1 } %cmpxchg, 0
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v[3:6], off
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_cmpxchg_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[0:1], v[3:6], off
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %unused = cmpxchg i64 addrspace(1)* %cast.gep0, i64 %cmp, i64 %data seq_cst seq_cst
  ret void
}

define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v[3:6], off offset:-128
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_cmpxchg_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[0:1], v[3:6], off offset:-128
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %unused = cmpxchg i64 addrspace(1)* %cast.gep1, i64 %cmp, i64 %data seq_cst seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; amdgcn atomic inc
; --------------------------------------------------------------------------------

declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0

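; The inc/dec intrinsics carry no atomic ordering, so none of the seq_cst waitcnt/cache
; invalidation sequence is emitted; only the wait for the returned value remains.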
define amdgpu_ps float @global_inc_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_inc_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_atomic_inc v0, v[2:3], v1, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_inc_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_inc v0, v[2:3], v1, off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_inc_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_atomic_inc v0, v[2:3], v1, off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_inc_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_inc v0, v[2:3], v1, off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps void @global_inc_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_inc_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_atomic_inc v[2:3], v1, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inc_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_inc v[2:3], v1, off
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
  ret void
}

define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_inc_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_atomic_inc v[2:3], v1, off offset:-128
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inc_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_inc v[2:3], v1, off offset:-128
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
  ret void
}

define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_inc_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_inc_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_inc_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_inc_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps void @global_inc_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_inc_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    global_atomic_inc_x2 v[3:4], v[1:2], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inc_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_inc_x2 v[3:4], v[1:2], off
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
  ret void
}

define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_inc_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    global_atomic_inc_x2 v[3:4], v[1:2], off offset:-128
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inc_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_inc_x2 v[3:4], v[1:2], off offset:-128
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
  ret void
}

; --------------------------------------------------------------------------------
; amdgcn atomic dec
; --------------------------------------------------------------------------------

declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0

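; The dec tests follow the same pattern as the inc tests above.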
define amdgpu_ps float @global_dec_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_dec_saddr_i32_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_atomic_dec v0, v[2:3], v1, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_dec_saddr_i32_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_dec v0, v[2:3], v1, off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_dec_saddr_i32_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_atomic_dec v0, v[2:3], v1, off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_dec_saddr_i32_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_dec v0, v[2:3], v1, off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

define amdgpu_ps void @global_dec_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_dec_saddr_i32_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_atomic_dec v[2:3], v1, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_dec_saddr_i32_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_dec v[2:3], v1, off
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
  ret void
}

define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_dec_saddr_i32_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_atomic_dec v[2:3], v1, off offset:-128
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_dec_saddr_i32_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v2, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_dec v[2:3], v1, off offset:-128
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
  ret void
}

define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_dec_saddr_i64_rtn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_dec_saddr_i64_rtn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_dec_saddr_i64_rtn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_dec_saddr_i64_rtn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

define amdgpu_ps void @global_dec_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_dec_saddr_i64_nortn:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    global_atomic_dec_x2 v[3:4], v[1:2], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_dec_saddr_i64_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_dec_x2 v[3:4], v[1:2], off
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
  ret void
}

define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_dec_saddr_i64_nortn_neg128:
; GFX9: ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT:    global_atomic_dec_x2 v[3:4], v[1:2], off offset:-128
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_dec_saddr_i64_nortn_neg128:
; GFX10: ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v3, s0, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT:    global_atomic_dec_x2 v[3:4], v[1:2], off offset:-128
; GFX10-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
  ret void
}

attributes #0 = { argmemonly nounwind willreturn }