llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s

; TODO: Merge with DAG test

; Checks llvm.amdgcn.is.shared on a pointer that is loaded per workitem: the
; load address is indexed by the workitem id, and in the checked output the
; loaded pointer lives in vector registers (v[0:1]).
;
; What the assertions below show for the intrinsic itself:
;  - hawaii: a dword is loaded from s[4:5] at offset 0x10 and compared for
;    equality against v1, the upper register of the loaded 64-bit pointer.
;  - gfx900: a 16-bit field is read from HW_REG_SH_MEM_BASES with
;    s_getreg_b32, shifted left by 16, and compared against v1 the same way.
; In both cases the compare result is turned into 0/1 with v_cndmask_b32 and
; then stored.
define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
; CI-LABEL: is_local_vgpr:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v3, s1
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT:    s_load_dword s0, s[4:5], 0x10
; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v1
; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; CI-NEXT:    flat_store_dword v[0:1], v0
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: is_local_vgpr:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v1
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT:    global_store_dword v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
  ; Index the array of pointers by the workitem id, so each lane reads its own
  ; element.
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i8*, i8* addrspace(1)* %ptr.ptr, i32 %id
  ; Volatile load of the pointer that is passed to the intrinsic.
  %ptr = load volatile i8*, i8* addrspace(1)* %gep
  %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
  ; Widen the i1 result to i32 so it can be stored.
  %ext = zext i1 %val to i32
  ; Store the result; the destination pointer is undef.
  store i32 %ext, i32 addrspace(1)* undef
  ret void
}

; Checks llvm.amdgcn.is.shared on a pointer passed directly as a kernel
; argument, with the i1 result used as a branch condition.
;
; What the assertions below show:
;  - The pointer is loaded into s[0:1] and the comparison is done with scalar
;    instructions (s_cmp_eq_u32 s1, s0), where s0 has been overwritten with
;    the value to compare against: a dword loaded from s[4:5] at offset 0x10
;    on hawaii, or a 16-bit field of HW_REG_SH_MEM_BASES shifted left by 16 on
;    gfx900.
;  - The compare result is materialized as 0/1 with s_cselect_b32, masked with
;    s_and_b32 ..., 1, and compared against 0 again before s_cbranch_scc0.
define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
; CI-LABEL: is_local_sgpr:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s0, s[4:5], 0x10
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_cmp_eq_u32 s1, s0
; CI-NEXT:    s_cselect_b32 s0, 1, 0
; CI-NEXT:    s_and_b32 s0, s0, 1
; CI-NEXT:    s_cmp_lg_u32 s0, 0
; CI-NEXT:    s_cbranch_scc0 BB1_2
; CI-NEXT:  ; %bb.1: ; %bb0
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    flat_store_dword v[0:1], v0
; CI-NEXT:  BB1_2: ; %bb1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: is_local_sgpr:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
; GFX9-NEXT:    s_cmp_eq_u32 s1, s0
; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
; GFX9-NEXT:    s_and_b32 s0, s0, 1
; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
; GFX9-NEXT:    s_cbranch_scc0 BB1_2
; GFX9-NEXT:  ; %bb.1: ; %bb0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    global_store_dword v[0:1], v0, off
; GFX9-NEXT:  BB1_2: ; %bb1
; GFX9-NEXT:    s_endpgm
  ; Branch directly on the intrinsic result: bb0 when true, bb1 when false.
  %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
  br i1 %val, label %bb0, label %bb1

; Taken only when the intrinsic returned true; performs a volatile store of 0
; to an undef address and falls into the common exit.
bb0:
  store volatile i32 0, i32 addrspace(1)* undef
  br label %bb1

; Common exit for both paths.
bb1:
  ret void
}

; Intrinsics used by the tests above; both use attribute group #0.
; Returns an i32 workitem id; used above to index a per-lane pointer load.
declare i32 @llvm.amdgcn.workitem.id.x() #0
; Takes an i8* in the default address space (marked nocapture) and returns i1.
declare i1 @llvm.amdgcn.is.shared(i8* nocapture) #0

; Shared attribute group for the intrinsic declarations.
attributes #0 = { nounwind readnone speculatable }
AMDGPU: Add intrinsics for address space identification The library currently uses ptrtoint and directly checks the queue ptr for this, which counts as a pointer capture. llvm-svn: 371009 2019-09-05 10:20:39 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefix=CI %s`
			`; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefix=GFX9 %s`

			`; TODO: Merge with DAG test`

			`define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {`
			`; CI-LABEL: is_local_vgpr:`
			`; CI: ; %bb.0:`
			`; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0`
[GlobalISel] Add new combine to convert scalar G_MUL to G_SHL. For pow2 constants we should use G_SHL for pattern matching (and perf) purposes later. Vector support not yet implemented. Differential Revision: https://reviews.llvm.org/D73659 2020-01-30 01:42:26 +08:00			`; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0`
			`; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3`
AMDGPU: Add intrinsics for address space identification The library currently uses ptrtoint and directly checks the queue ptr for this, which counts as a pointer capture. llvm-svn: 371009 2019-09-05 10:20:39 +08:00			`; CI-NEXT: s_waitcnt lgkmcnt(0)`
AMDGPU/GlobalISel: Only map VOP operands to VGPRs This trivially avoids violating the constant bus restriction. Previously this was allowing one SGPR in the first source operand, which technically also avoided violating this for most operations (but not for special cases reading vcc). We do need to write some new, smarter operand folds to pick the optimal SGPR to use in some kind of post-isel fold, but that's purely an optimization. I was originally thinking we would pick which operands should be SGPRs in RegBankSelect, but I think this isn't really manageable. There would be additional complexity to handle every G_* instruction, and then any nontrivial instruction patterns would need to know when to avoid violating it, which is likely to be very error prone. I think having all inputs being canonically copies to VGPRs will simplify the operand folding logic. The current folding we do is backwards, and only considers one operand at a time, relative to operands it already has. It therefore poorly handles the case where there is already a constant bus operand user. If all operands are copies, it's somewhat simpler to consider all input operands at once to choose the optimal constant bus user. Since the failure mode for constant bus violations is now a verifier error and not an selection failure, this moves towards a place where we can turn on the fallback mode. The SGPR copy folding optimizations can be left for later. 2020-01-14 00:24:25 +08:00			`; CI-NEXT: v_mov_b32_e32 v3, s1`
			`; CI-NEXT: v_mov_b32_e32 v2, s0`
			`; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0`
			`; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc`
AMDGPU: Add intrinsics for address space identification The library currently uses ptrtoint and directly checks the queue ptr for this, which counts as a pointer capture. llvm-svn: 371009 2019-09-05 10:20:39 +08:00			`; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]`
			`; CI-NEXT: s_load_dword s0, s[4:5], 0x10`
			`; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)`
			`; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1`
			`; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc`
			`; CI-NEXT: flat_store_dword v[0:1], v0`
			`; CI-NEXT: s_endpgm`
			`;`
			`; GFX9-LABEL: is_local_vgpr:`
			`; GFX9: ; %bb.0:`
			`; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0`
[GlobalISel] Add new combine to convert scalar G_MUL to G_SHL. For pow2 constants we should use G_SHL for pattern matching (and perf) purposes later. Vector support not yet implemented. Differential Revision: https://reviews.llvm.org/D73659 2020-01-30 01:42:26 +08:00			`; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0`
			`; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]`
AMDGPU: Add intrinsics for address space identification The library currently uses ptrtoint and directly checks the queue ptr for this, which counts as a pointer capture. llvm-svn: 371009 2019-09-05 10:20:39 +08:00			`; GFX9-NEXT: s_waitcnt lgkmcnt(0)`
AMDGPU/GlobalISel: Only map VOP operands to VGPRs This trivially avoids violating the constant bus restriction. Previously this was allowing one SGPR in the first source operand, which technically also avoided violating this for most operations (but not for special cases reading vcc). We do need to write some new, smarter operand folds to pick the optimal SGPR to use in some kind of post-isel fold, but that's purely an optimization. I was originally thinking we would pick which operands should be SGPRs in RegBankSelect, but I think this isn't really manageable. There would be additional complexity to handle every G_* instruction, and then any nontrivial instruction patterns would need to know when to avoid violating it, which is likely to be very error prone. I think having all inputs being canonically copies to VGPRs will simplify the operand folding logic. The current folding we do is backwards, and only considers one operand at a time, relative to operands it already has. It therefore poorly handles the case where there is already a constant bus operand user. If all operands are copies, it's somewhat simpler to consider all input operands at once to choose the optimal constant bus user. Since the failure mode for constant bus violations is now a verifier error and not an selection failure, this moves towards a place where we can turn on the fallback mode. The SGPR copy folding optimizations can be left for later. 2020-01-14 00:24:25 +08:00			`; GFX9-NEXT: v_mov_b32_e32 v3, s1`
			`; GFX9-NEXT: v_mov_b32_e32 v2, s0`
			`; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0`
			`; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc`
AMDGPU: Add intrinsics for address space identification The library currently uses ptrtoint and directly checks the queue ptr for this, which counts as a pointer capture. llvm-svn: 371009 2019-09-05 10:20:39 +08:00			`; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off`
			`; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)`
			`; GFX9-NEXT: s_lshl_b32 s0, s0, 16`
			`; GFX9-NEXT: s_waitcnt vmcnt(0)`
			`; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1`
			`; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc`
			`; GFX9-NEXT: global_store_dword v[0:1], v0, off`
			`; GFX9-NEXT: s_endpgm`
			`%id = call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i8, i8 addrspace(1)* %ptr.ptr, i32 %id`
			`%ptr = load volatile i8, i8 addrspace(1)* %gep`
			`%val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)`
			`%ext = zext i1 %val to i32`
			`store i32 %ext, i32 addrspace(1)* undef`
			`ret void`
			`}`

			`define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {`
			`; CI-LABEL: is_local_sgpr:`
			`; CI: ; %bb.0:`
			`; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0`
			`; CI-NEXT: s_waitcnt lgkmcnt(0)`
			`; CI-NEXT: s_load_dword s0, s[4:5], 0x10`
			`; CI-NEXT: s_waitcnt lgkmcnt(0)`
			`; CI-NEXT: s_cmp_eq_u32 s1, s0`
AMDGPU/GlobalISel: Replace handling of boolean values This solves selection failures with generated selection patterns, which would fail due to inferring the SGPR reg bank for virtual registers with a set register class instead of VCC bank. Use instruction selection would constrain the virtual register to a specific class, so when the def was selected later the bank no longer was set to VCC. Remove the SCC reg bank. SCC isn't directly addressable, so it requires copying from SCC to an allocatable 32-bit register during selection, so these might as well be treated as 32-bit SGPR values. Now any scalar boolean value that will produce an outupt in SCC should be widened during RegBankSelect to s32. Any s1 value should be a vector boolean during selection. This makes the vcc register bank unambiguous with a normal SGPR during selection. Summary of how this should now work: - G_TRUNC is always a no-op, and never should use a vcc bank result. - SALU boolean operations should be promoted to s32 in RegBankSelect apply mapping - An s1 value means vcc bank at selection. The exception is for legalization artifacts that use s1, which are never VCC. All other contexts should infer the VCC register classes for s1 typed registers. The LLT for the register is now needed to infer the correct register class. Extensions with vcc sources should be legalized to a select of constants during RegBankSelect. - Copy from non-vcc to vcc ensures high bits of the input value are cleared during selection. - SALU boolean inputs should ensure the inputs are 0/1. This includes select, conditional branches, and carry-ins. There are a few somewhat dirty details. One is that G_TRUNC/G_*EXT selection ignores the usual register-bank from register class functions, and can't handle truncates with VCC result banks. I think this is OK, since the artifacts are specially treated anyway. This does require some care to avoid producing cases with vcc. 
There will also be no 100% reliable way to verify this rule is followed in selection in case of register classes, and violations manifests themselves as invalid copy instructions much later. Standard phi handling also only considers the bank of the result register, and doesn't insert copies to make the source banks match. This doesn't work for vcc, so we have to manually correct phi inputs in this case. We should add a verifier check to make sure there are no phis with mixed vcc and non-vcc register bank inputs. There's also some duplication with the LegalizerHelper, and some code which should live in the helper. I don't see a good way to share special knowledge about what types to use for intermediate operations depending on the bank for example. Using the helper to replace extensions with selects also seems somewhat awkward to me. Another issue is there are some contexts calling getRegBankFromRegClass that apparently don't have the LLT type for the register, but I haven't yet run into a real issue from this. This also introduces new unnecessary instructions in most cases, since we don't yet try to optimize out the zext when the source is known to come from a compare. 2019-11-03 00:30:59 +08:00			`; CI-NEXT: s_cselect_b32 s0, 1, 0`
			`; CI-NEXT: s_and_b32 s0, s0, 1`
			`; CI-NEXT: s_cmp_lg_u32 s0, 0`
AMDGPU: Add intrinsics for address space identification The library currently uses ptrtoint and directly checks the queue ptr for this, which counts as a pointer capture. llvm-svn: 371009 2019-09-05 10:20:39 +08:00			`; CI-NEXT: s_cbranch_scc0 BB1_2`
			`; CI-NEXT: ; %bb.1: ; %bb0`
			`; CI-NEXT: v_mov_b32_e32 v0, 0`
			`; CI-NEXT: flat_store_dword v[0:1], v0`
			`; CI-NEXT: BB1_2: ; %bb1`
			`; CI-NEXT: s_endpgm`
			`;`
			`; GFX9-LABEL: is_local_sgpr:`
			`; GFX9: ; %bb.0:`
			`; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0`
			`; GFX9-NEXT: s_waitcnt lgkmcnt(0)`
			`; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)`
			`; GFX9-NEXT: s_lshl_b32 s0, s0, 16`
			`; GFX9-NEXT: s_cmp_eq_u32 s1, s0`
AMDGPU/GlobalISel: Replace handling of boolean values This solves selection failures with generated selection patterns, which would fail due to inferring the SGPR reg bank for virtual registers with a set register class instead of VCC bank. Use instruction selection would constrain the virtual register to a specific class, so when the def was selected later the bank no longer was set to VCC. Remove the SCC reg bank. SCC isn't directly addressable, so it requires copying from SCC to an allocatable 32-bit register during selection, so these might as well be treated as 32-bit SGPR values. Now any scalar boolean value that will produce an outupt in SCC should be widened during RegBankSelect to s32. Any s1 value should be a vector boolean during selection. This makes the vcc register bank unambiguous with a normal SGPR during selection. Summary of how this should now work: - G_TRUNC is always a no-op, and never should use a vcc bank result. - SALU boolean operations should be promoted to s32 in RegBankSelect apply mapping - An s1 value means vcc bank at selection. The exception is for legalization artifacts that use s1, which are never VCC. All other contexts should infer the VCC register classes for s1 typed registers. The LLT for the register is now needed to infer the correct register class. Extensions with vcc sources should be legalized to a select of constants during RegBankSelect. - Copy from non-vcc to vcc ensures high bits of the input value are cleared during selection. - SALU boolean inputs should ensure the inputs are 0/1. This includes select, conditional branches, and carry-ins. There are a few somewhat dirty details. One is that G_TRUNC/G_*EXT selection ignores the usual register-bank from register class functions, and can't handle truncates with VCC result banks. I think this is OK, since the artifacts are specially treated anyway. This does require some care to avoid producing cases with vcc. 
There will also be no 100% reliable way to verify this rule is followed in selection in case of register classes, and violations manifests themselves as invalid copy instructions much later. Standard phi handling also only considers the bank of the result register, and doesn't insert copies to make the source banks match. This doesn't work for vcc, so we have to manually correct phi inputs in this case. We should add a verifier check to make sure there are no phis with mixed vcc and non-vcc register bank inputs. There's also some duplication with the LegalizerHelper, and some code which should live in the helper. I don't see a good way to share special knowledge about what types to use for intermediate operations depending on the bank for example. Using the helper to replace extensions with selects also seems somewhat awkward to me. Another issue is there are some contexts calling getRegBankFromRegClass that apparently don't have the LLT type for the register, but I haven't yet run into a real issue from this. This also introduces new unnecessary instructions in most cases, since we don't yet try to optimize out the zext when the source is known to come from a compare. 2019-11-03 00:30:59 +08:00			`; GFX9-NEXT: s_cselect_b32 s0, 1, 0`
			`; GFX9-NEXT: s_and_b32 s0, s0, 1`
			`; GFX9-NEXT: s_cmp_lg_u32 s0, 0`
AMDGPU: Add intrinsics for address space identification The library currently uses ptrtoint and directly checks the queue ptr for this, which counts as a pointer capture. llvm-svn: 371009 2019-09-05 10:20:39 +08:00			`; GFX9-NEXT: s_cbranch_scc0 BB1_2`
			`; GFX9-NEXT: ; %bb.1: ; %bb0`
			`; GFX9-NEXT: v_mov_b32_e32 v0, 0`
			`; GFX9-NEXT: global_store_dword v[0:1], v0, off`
			`; GFX9-NEXT: BB1_2: ; %bb1`
			`; GFX9-NEXT: s_endpgm`
			`%val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)`
			`br i1 %val, label %bb0, label %bb1`

			`bb0:`
			`store volatile i32 0, i32 addrspace(1)* undef`
			`br label %bb1`

			`bb1:`
			`ret void`
			`}`

			`declare i32 @llvm.amdgcn.workitem.id.x() #0`
			`declare i1 @llvm.amdgcn.is.shared(i8* nocapture) #0`

			`attributes #0 = { nounwind readnone speculatable }`