llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll

; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s

; Two-dword vector in local memory (addrspace(3)) with only 4-byte alignment.
; The kernel loads the <2 x i32> at %arg indexed by the x workitem id, swaps
; its two lanes and stores it back to the same address. The GCN prefix is
; shared by every llc invocation in this file, so all of them are expected to
; contain ds_read2_b32 and ds_write2_b32.
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
bb:
  ; Address of this workitem's dword, reinterpreted as a <2 x i32> pointer.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <2 x i32> addrspace(3)*
  %orig = load <2 x i32>, <2 x i32> addrspace(3)* %vec.addr, align 4
  ; Build the result with lanes 0 and 1 exchanged.
  %lane0 = extractelement <2 x i32> %orig, i32 0
  %lane1 = extractelement <2 x i32> %orig, i32 1
  %swap.part = insertelement <2 x i32> undef, i32 %lane1, i32 0
  %swapped = insertelement <2 x i32> %swap.part, i32 %lane0, i32 1
  store <2 x i32> %swapped, <2 x i32> addrspace(3)* %vec.addr, align 4
  ret void
}

; Four-dword vector in local memory (addrspace(3)) with only 4-byte alignment.
; The kernel loads the <4 x i32> at %arg indexed by the x workitem id,
; reverses its lanes and stores it back to the same address.
; Runs checked with the ALIGNED prefix expect two ds_read2_b32 and two
; ds_write2_b32; the run checked with the UNALIGNED prefix (the one passing
; +unaligned-access-mode) expects ds_read2_b64 and ds_write2_b64.
; GCN-LABEL: test_local_misaligned_v4:
; ALIGNED-DAG: ds_read2_b32
; ALIGNED-DAG: ds_read2_b32
; ALIGNED-DAG: ds_write2_b32
; ALIGNED-DAG: ds_write2_b32
; UNALIGNED-DAG: ds_read2_b64
; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
  ; Address of this workitem's dword, reinterpreted as a <4 x i32> pointer.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <4 x i32> addrspace(3)*
  %orig = load <4 x i32>, <4 x i32> addrspace(3)* %vec.addr, align 4
  ; Pull out all four lanes ...
  %lane0 = extractelement <4 x i32> %orig, i32 0
  %lane1 = extractelement <4 x i32> %orig, i32 1
  %lane2 = extractelement <4 x i32> %orig, i32 2
  %lane3 = extractelement <4 x i32> %orig, i32 3
  ; ... and reassemble them in reverse order (3, 2, 1, 0).
  %rev.a = insertelement <4 x i32> undef, i32 %lane3, i32 0
  %rev.b = insertelement <4 x i32> %rev.a, i32 %lane2, i32 1
  %rev.c = insertelement <4 x i32> %rev.b, i32 %lane1, i32 2
  %reversed = insertelement <4 x i32> %rev.c, i32 %lane0, i32 3
  store <4 x i32> %reversed, <4 x i32> addrspace(3)* %vec.addr, align 4
  ret void
}

; Three-dword vector in local memory (addrspace(3)) with only 4-byte
; alignment. The kernel loads the <3 x i32> at %arg indexed by the x workitem
; id, rotates its lanes and stores it back to the same address.
; Runs checked with the ALIGNED prefix expect the access to be split into a
; ds_read2_b32 plus ds_read_b32 and a ds_write2_b32 plus ds_write_b32; the run
; checked with the UNALIGNED prefix expects ds_read_b96 and ds_write_b96.
; GCN-LABEL: test_local_misaligned_v3:
; ALIGNED-DAG: ds_read2_b32
; ALIGNED-DAG: ds_read_b32
; ALIGNED-DAG: ds_write2_b32
; ALIGNED-DAG: ds_write_b32
; UNALIGNED-DAG: ds_read_b96
; UNALIGNED-DAG: ds_write_b96
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
bb:
  ; Address of this workitem's dword, reinterpreted as a <3 x i32> pointer.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <3 x i32> addrspace(3)*
  %orig = load <3 x i32>, <3 x i32> addrspace(3)* %vec.addr, align 4
  %lane0 = extractelement <3 x i32> %orig, i32 0
  %lane1 = extractelement <3 x i32> %orig, i32 1
  %lane2 = extractelement <3 x i32> %orig, i32 2
  ; Rotate: the result holds the original lanes in the order (2, 0, 1).
  %rot.a = insertelement <3 x i32> undef, i32 %lane2, i32 0
  %rot.b = insertelement <3 x i32> %rot.a, i32 %lane0, i32 1
  %rotated = insertelement <3 x i32> %rot.b, i32 %lane1, i32 2
  store <3 x i32> %rotated, <3 x i32> addrspace(3)* %vec.addr, align 4
  ret void
}

; Two-dword vector in local memory (addrspace(3)) accessed with 8-byte
; alignment. The kernel loads the <2 x i32> at %arg indexed by the x workitem
; id, swaps its two lanes and stores it back to the same address. The GCN
; prefix is shared by every llc invocation in this file, so all of them are
; expected to contain the single-instruction ds_read_b64 and ds_write_b64.
; GCN-LABEL: test_local_aligned_v2:
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_write_b64
define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
bb:
  ; Address of this workitem's dword, reinterpreted as a <2 x i32> pointer.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <2 x i32> addrspace(3)*
  %orig = load <2 x i32>, <2 x i32> addrspace(3)* %vec.addr, align 8
  ; Build the result with lanes 0 and 1 exchanged.
  %lane0 = extractelement <2 x i32> %orig, i32 0
  %lane1 = extractelement <2 x i32> %orig, i32 1
  %swap.part = insertelement <2 x i32> undef, i32 %lane1, i32 0
  %swapped = insertelement <2 x i32> %swap.part, i32 %lane0, i32 1
  store <2 x i32> %swapped, <2 x i32> addrspace(3)* %vec.addr, align 8
  ret void
}

; Three-dword vector in local memory (addrspace(3)) accessed with 16-byte
; alignment. The kernel loads the <3 x i32> at %arg indexed by the x workitem
; id, rotates its lanes and stores it back to the same address. The GCN
; prefix is shared by every llc invocation in this file, so all of them are
; expected to contain ds_read_b96 and ds_write_b96.
; GCN-LABEL: test_local_aligned_v3:
; GCN-DAG: ds_read_b96
; GCN-DAG: ds_write_b96
define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
bb:
  ; Address of this workitem's dword, reinterpreted as a <3 x i32> pointer.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <3 x i32> addrspace(3)*
  %orig = load <3 x i32>, <3 x i32> addrspace(3)* %vec.addr, align 16
  %lane0 = extractelement <3 x i32> %orig, i32 0
  %lane1 = extractelement <3 x i32> %orig, i32 1
  %lane2 = extractelement <3 x i32> %orig, i32 2
  ; Rotate: the result holds the original lanes in the order (2, 0, 1).
  %rot.a = insertelement <3 x i32> undef, i32 %lane2, i32 0
  %rot.b = insertelement <3 x i32> %rot.a, i32 %lane0, i32 1
  %rotated = insertelement <3 x i32> %rot.b, i32 %lane1, i32 2
  store <3 x i32> %rotated, <3 x i32> addrspace(3)* %vec.addr, align 16
  ret void
}

; Four-dword vector in local memory (addrspace(3)) accessed with 8-byte
; alignment. The kernel loads the <4 x i32> at %arg indexed by the x workitem
; id, reverses its lanes and stores it back to the same address.
; Both the ALIGNED and the UNALIGNED prefixes carry the same expectation
; here, namely ds_read2_b64 and ds_write2_b64.
; GCN-LABEL: test_local_v4_aligned8:
; ALIGNED-DAG: ds_read2_b64
; ALIGNED-DAG: ds_write2_b64
; UNALIGNED-DAG: ds_read2_b64
; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
  ; Address of this workitem's dword, reinterpreted as a <4 x i32> pointer.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <4 x i32> addrspace(3)*
  %orig = load <4 x i32>, <4 x i32> addrspace(3)* %vec.addr, align 8
  ; Pull out all four lanes ...
  %lane0 = extractelement <4 x i32> %orig, i32 0
  %lane1 = extractelement <4 x i32> %orig, i32 1
  %lane2 = extractelement <4 x i32> %orig, i32 2
  %lane3 = extractelement <4 x i32> %orig, i32 3
  ; ... and reassemble them in reverse order (3, 2, 1, 0).
  %rev.a = insertelement <4 x i32> undef, i32 %lane3, i32 0
  %rev.b = insertelement <4 x i32> %rev.a, i32 %lane2, i32 1
  %rev.c = insertelement <4 x i32> %rev.b, i32 %lane1, i32 2
  %reversed = insertelement <4 x i32> %rev.c, i32 %lane0, i32 3
  store <4 x i32> %reversed, <4 x i32> addrspace(3)* %vec.addr, align 8
  ret void
}

; AMDGPU intrinsic called at the top of every kernel in this file; its i32
; result is used as the element index applied to the kernel's pointer argument.
declare i32 @llvm.amdgcn.workitem.id.x()
[NFC] Removed unused prefixes in CodeGen/AMDGPU/GlobalISel Differential Revision: https://reviews.llvm.org/D94099 2021-01-06 01:46:19 +08:00			`; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,ALIGNED %s`
			`; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,ALIGNED %s`
			`; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,ALIGNED %s`
			`; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s \| FileCheck -check-prefixes=GCN,ALIGNED %s`
			`; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s \| FileCheck -check-prefixes=GCN,UNALIGNED %s`
[AMDGPU] Workaround for LDS Misalignment bug on GFX10 Add subtarget feature check to avoid using ds_read/write_b96/128 with too low alignment if a bug is present on that specific hardware. Add this "feature" to GFX 10.1.1 as it is also affected. Add global-isel test. 2020-09-09 17:28:36 +08:00
			`; GCN-LABEL: test_local_misaligned_v2:`
			`; GCN-DAG: ds_read2_b32`
			`; GCN-DAG: ds_write2_b32`
			`define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*`
			`%load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4`
			`%v1 = extractelement <2 x i32> %load, i32 0`
			`%v2 = extractelement <2 x i32> %load, i32 1`
			`%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0`
			`%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1`
			`store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4`
			`ret void`
			`}`

			`; GCN-LABEL: test_local_misaligned_v4:`
[AMDGPU] Resolve issues when picking between ds_read/write and ds_read2/write2 Both ds_read_b128 and ds_read2_b64 are valid for 128bit 16-byte aligned loads but the one that will be selected is determined either by the order in tablegen or by the AddedComplexity attribute. Currently ds_read_b128 has priority. While ds_read2_b64 has lower alignment requirements, we cannot always restrict ds_read_b128 to 16-byte alignment because of unaligned-access-mode option. This was causing ds_read_b128 to be selected for 8-byte aligned loads regardless of chosen access mode. To resolve this we use two patterns for selecting ds_read_b128. One requires alignment of 16-byte and the other requires unaligned-access-mode option. Same goes for ds_write2_b64 and ds_write_b128. Differential Revision: https://reviews.llvm.org/D92767 2020-12-10 19:40:49 +08:00			`; ALIGNED-DAG: ds_read2_b32`
			`; ALIGNED-DAG: ds_read2_b32`
			`; ALIGNED-DAG: ds_write2_b32`
			`; ALIGNED-DAG: ds_write2_b32`
[AMDGPU] Only use ds_read/write_b128 for alignment >= 16 PS: Submitting on behalf of Jay. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D100008 2021-04-08 10:41:42 +08:00			`; UNALIGNED-DAG: ds_read2_b64`
			`; UNALIGNED-DAG: ds_write2_b64`
[AMDGPU] Workaround for LDS Misalignment bug on GFX10 Add subtarget feature check to avoid using ds_read/write_b96/128 with too low alignment if a bug is present on that specific hardware. Add this "feature" to GFX 10.1.1 as it is also affected. Add global-isel test. 2020-09-09 17:28:36 +08:00			`define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*`
			`%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4`
			`%v1 = extractelement <4 x i32> %load, i32 0`
			`%v2 = extractelement <4 x i32> %load, i32 1`
			`%v3 = extractelement <4 x i32> %load, i32 2`
			`%v4 = extractelement <4 x i32> %load, i32 3`
			`%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0`
			`%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1`
			`%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2`
			`%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3`
			`store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4`
			`ret void`
			`}`

			`; GCN-LABEL: test_local_misaligned_v3:`
[AMDGPU] Resolve issues when picking between ds_read/write and ds_read2/write2 Both ds_read_b128 and ds_read2_b64 are valid for 128bit 16-byte aligned loads but the one that will be selected is determined either by the order in tablegen or by the AddedComplexity attribute. Currently ds_read_b128 has priority. While ds_read2_b64 has lower alignment requirements, we cannot always restrict ds_read_b128 to 16-byte alignment because of unaligned-access-mode option. This was causing ds_read_b128 to be selected for 8-byte aligned loads regardless of chosen access mode. To resolve this we use two patterns for selecting ds_read_b128. One requires alignment of 16-byte and the other requires unaligned-access-mode option. Same goes for ds_write2_b64 and ds_write_b128. Differential Revision: https://reviews.llvm.org/D92767 2020-12-10 19:40:49 +08:00			`; ALIGNED-DAG: ds_read2_b32`
			`; ALIGNED-DAG: ds_read_b32`
			`; ALIGNED-DAG: ds_write2_b32`
			`; ALIGNED-DAG: ds_write_b32`
[AMDGPU] Set DS alignment requirements to be more strict Alignment requirements for ds_read/write_b96/b128 for gfx9 and onward are now the same as for other GCN subtargets. This way we can avoid any unintentional use of these instructions on systems that do not support dword alignment and instead require natural alignment. This also makes 'SH_MEM_CONFIG.alignment_mode == STRICT' the default. Differential Revision: https://reviews.llvm.org/D87821 2020-09-18 21:19:54 +08:00			`; UNALIGNED-DAG: ds_read_b96`
			`; UNALIGNED-DAG: ds_write_b96`
[AMDGPU] Workaround for LDS Misalignment bug on GFX10 Add subtarget feature check to avoid using ds_read/write_b96/128 with too low alignment if a bug is present on that specific hardware. Add this "feature" to GFX 10.1.1 as it is also affected. Add global-isel test. 2020-09-09 17:28:36 +08:00			`define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*`
			`%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4`
			`%v1 = extractelement <3 x i32> %load, i32 0`
			`%v2 = extractelement <3 x i32> %load, i32 1`
			`%v3 = extractelement <3 x i32> %load, i32 2`
			`%v5 = insertelement <3 x i32> undef, i32 %v3, i32 0`
			`%v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1`
			`%v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2`
			`store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4`
			`ret void`
			`}`

			`; GCN-LABEL: test_local_aligned_v2:`
			`; GCN-DAG: ds_read_b64`
			`; GCN-DAG: ds_write_b64`
			`define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*`
			`%load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8`
			`%v1 = extractelement <2 x i32> %load, i32 0`
			`%v2 = extractelement <2 x i32> %load, i32 1`
			`%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0`
			`%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1`
			`store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8`
			`ret void`
			`}`

			`; GCN-LABEL: test_local_aligned_v3:`
			`; GCN-DAG: ds_read_b96`
			`; GCN-DAG: ds_write_b96`
			`define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*`
			`%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16`
			`%v1 = extractelement <3 x i32> %load, i32 0`
			`%v2 = extractelement <3 x i32> %load, i32 1`
			`%v3 = extractelement <3 x i32> %load, i32 2`
			`%v5 = insertelement <3 x i32> undef, i32 %v3, i32 0`
			`%v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1`
			`%v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2`
			`store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16`
			`ret void`
			`}`

			`; GCN-LABEL: test_local_v4_aligned8:`
[AMDGPU] Resolve issues when picking between ds_read/write and ds_read2/write2 Both ds_read_b128 and ds_read2_b64 are valid for 128bit 16-byte aligned loads but the one that will be selected is determined either by the order in tablegen or by the AddedComplexity attribute. Currently ds_read_b128 has priority. While ds_read2_b64 has lower alignment requirements, we cannot always restrict ds_read_b128 to 16-byte alignment because of unaligned-access-mode option. This was causing ds_read_b128 to be selected for 8-byte aligned loads regardless of chosen access mode. To resolve this we use two patterns for selecting ds_read_b128. One requires alignment of 16-byte and the other requires unaligned-access-mode option. Same goes for ds_write2_b64 and ds_write_b128. Differential Revision: https://reviews.llvm.org/D92767 2020-12-10 19:40:49 +08:00			`; ALIGNED-DAG: ds_read2_b64`
			`; ALIGNED-DAG: ds_write2_b64`
[AMDGPU] Only use ds_read/write_b128 for alignment >= 16 PS: Submitting on behalf of Jay. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D100008 2021-04-08 10:41:42 +08:00			`; UNALIGNED-DAG: ds_read2_b64`
			`; UNALIGNED-DAG: ds_write2_b64`
[AMDGPU] Workaround for LDS Misalignment bug on GFX10 Add subtarget feature check to avoid using ds_read/write_b96/128 with too low alignment if a bug is present on that specific hardware. Add this "feature" to GFX 10.1.1 as it is also affected. Add global-isel test. 2020-09-09 17:28:36 +08:00			`define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*`
			`%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8`
			`%v1 = extractelement <4 x i32> %load, i32 0`
			`%v2 = extractelement <4 x i32> %load, i32 1`
			`%v3 = extractelement <4 x i32> %load, i32 2`
			`%v4 = extractelement <4 x i32> %load, i32 3`
			`%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0`
			`%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1`
			`%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2`
			`%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3`
			`store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8`
			`ret void`
			`}`

			`declare i32 @llvm.amdgcn.workitem.id.x()`