; llvm-project/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll

; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED,VECT %s

; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
; Kernel under test: reads a <2 x i32> through an addrspace(3) pointer at
; %arg + workitem.id.x (in i32 units), swaps its two lanes and writes the
; result back to the same address. Load and store are both marked align 4.
define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <2 x i32> addrspace(3)*
  %vec = load <2 x i32>, <2 x i32> addrspace(3)* %vec.addr, align 4
  %lane0 = extractelement <2 x i32> %vec, i32 0
  %lane1 = extractelement <2 x i32> %vec, i32 1
  ; Rebuild the vector with the lanes exchanged.
  %swap.part = insertelement <2 x i32> undef, i32 %lane1, i32 0
  %swapped = insertelement <2 x i32> %swap.part, i32 %lane0, i32 1
  store <2 x i32> %swapped, <2 x i32> addrspace(3)* %vec.addr, align 4
  ret void
}

; GCN-LABEL: test_local_misaligned_v4:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write2_b32
; UNALIGNED-DAG: ds_read_b128
; UNALIGNED-DAG: ds_write_b128
; Kernel under test: reads a <4 x i32> through an addrspace(3) pointer at
; %arg + workitem.id.x (in i32 units), reverses the lane order and writes the
; result back to the same address. Load and store are both marked align 4.
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <4 x i32> addrspace(3)*
  %vec = load <4 x i32>, <4 x i32> addrspace(3)* %vec.addr, align 4
  %lane0 = extractelement <4 x i32> %vec, i32 0
  %lane1 = extractelement <4 x i32> %vec, i32 1
  %lane2 = extractelement <4 x i32> %vec, i32 2
  %lane3 = extractelement <4 x i32> %vec, i32 3
  ; Rebuild the vector as <lane3, lane2, lane1, lane0>.
  %rev.a = insertelement <4 x i32> undef, i32 %lane3, i32 0
  %rev.b = insertelement <4 x i32> %rev.a, i32 %lane2, i32 1
  %rev.c = insertelement <4 x i32> %rev.b, i32 %lane1, i32 2
  %reversed = insertelement <4 x i32> %rev.c, i32 %lane0, i32 3
  store <4 x i32> %reversed, <4 x i32> addrspace(3)* %vec.addr, align 4
  ret void
}

; GCN-LABEL: test_local_misaligned_v3:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write_b32
; UNALIGNED-DAG: ds_read_b96
; UNALIGNED-DAG: ds_write_b96
; Kernel under test: reads a <3 x i32> through an addrspace(3) pointer at
; %arg + workitem.id.x (in i32 units), rotates the lanes by one position and
; writes the result back to the same address. Load and store are both marked
; align 4.
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <3 x i32> addrspace(3)*
  %vec = load <3 x i32>, <3 x i32> addrspace(3)* %vec.addr, align 4
  %lane0 = extractelement <3 x i32> %vec, i32 0
  %lane1 = extractelement <3 x i32> %vec, i32 1
  %lane2 = extractelement <3 x i32> %vec, i32 2
  ; Rebuild the vector as <lane2, lane0, lane1>.
  %rot.a = insertelement <3 x i32> undef, i32 %lane2, i32 0
  %rot.b = insertelement <3 x i32> %rot.a, i32 %lane0, i32 1
  %rotated = insertelement <3 x i32> %rot.b, i32 %lane1, i32 2
  store <3 x i32> %rotated, <3 x i32> addrspace(3)* %vec.addr, align 4
  ret void
}

; GCN-LABEL: test_flat_misaligned_v2:
; VECT-DAG:  flat_load_dwordx2 v
; VECT-DAG:  flat_store_dwordx2 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; Kernel under test: reads a <2 x i32> through a default-address-space pointer
; at %arg + workitem.id.x (in i32 units), swaps its two lanes and writes the
; result back to the same address. Load and store are both marked align 4.
define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.addr = bitcast i32* %elt.addr to <2 x i32>*
  %vec = load <2 x i32>, <2 x i32>* %vec.addr, align 4
  %lane0 = extractelement <2 x i32> %vec, i32 0
  %lane1 = extractelement <2 x i32> %vec, i32 1
  ; Rebuild the vector with the lanes exchanged.
  %swap.part = insertelement <2 x i32> undef, i32 %lane1, i32 0
  %swapped = insertelement <2 x i32> %swap.part, i32 %lane0, i32 1
  store <2 x i32> %swapped, <2 x i32>* %vec.addr, align 4
  ret void
}

; GCN-LABEL: test_flat_misaligned_v4:
; VECT-DAG:  flat_load_dwordx4 v
; VECT-DAG:  flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; Kernel under test: reads a <4 x i32> through a default-address-space pointer
; at %arg + workitem.id.x (in i32 units), reverses the lane order and writes
; the result back to the same address. Load and store are both marked align 4.
define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.addr = bitcast i32* %elt.addr to <4 x i32>*
  %vec = load <4 x i32>, <4 x i32>* %vec.addr, align 4
  %lane0 = extractelement <4 x i32> %vec, i32 0
  %lane1 = extractelement <4 x i32> %vec, i32 1
  %lane2 = extractelement <4 x i32> %vec, i32 2
  %lane3 = extractelement <4 x i32> %vec, i32 3
  ; Rebuild the vector as <lane3, lane2, lane1, lane0>.
  %rev.a = insertelement <4 x i32> undef, i32 %lane3, i32 0
  %rev.b = insertelement <4 x i32> %rev.a, i32 %lane2, i32 1
  %rev.c = insertelement <4 x i32> %rev.b, i32 %lane1, i32 2
  %reversed = insertelement <4 x i32> %rev.c, i32 %lane0, i32 3
  store <4 x i32> %reversed, <4 x i32>* %vec.addr, align 4
  ret void
}

; TODO: Re-enable the x-prefixed (currently inactive) checks below once v3i32/v3f32 is reinstated.

; GCN-LABEL: test_flat_misaligned_v3:
; xVECT-DAG:  flat_load_dwordx3 v
; xVECT-DAG:  flat_store_dwordx3 v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
; Kernel under test: reads a <3 x i32> through a default-address-space pointer
; at %arg + workitem.id.x (in i32 units), rotates the lanes by one position
; and writes the result back to the same address. Load and store are both
; marked align 4.
define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.addr = bitcast i32* %elt.addr to <3 x i32>*
  %vec = load <3 x i32>, <3 x i32>* %vec.addr, align 4
  %lane0 = extractelement <3 x i32> %vec, i32 0
  %lane1 = extractelement <3 x i32> %vec, i32 1
  %lane2 = extractelement <3 x i32> %vec, i32 2
  ; Rebuild the vector as <lane2, lane0, lane1>.
  %rot.a = insertelement <3 x i32> undef, i32 %lane2, i32 0
  %rot.b = insertelement <3 x i32> %rot.a, i32 %lane0, i32 1
  %rotated = insertelement <3 x i32> %rot.b, i32 %lane1, i32 2
  store <3 x i32> %rotated, <3 x i32>* %vec.addr, align 4
  ret void
}

; GCN-LABEL: test_local_aligned_v2:
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_write_b64
; Kernel under test: reads a <2 x i32> through an addrspace(3) pointer at
; %arg + workitem.id.x (in i32 units), swaps its two lanes and writes the
; result back to the same address. Load and store are both marked align 8.
define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <2 x i32> addrspace(3)*
  %vec = load <2 x i32>, <2 x i32> addrspace(3)* %vec.addr, align 8
  %lane0 = extractelement <2 x i32> %vec, i32 0
  %lane1 = extractelement <2 x i32> %vec, i32 1
  ; Rebuild the vector with the lanes exchanged.
  %swap.part = insertelement <2 x i32> undef, i32 %lane1, i32 0
  %swapped = insertelement <2 x i32> %swap.part, i32 %lane0, i32 1
  store <2 x i32> %swapped, <2 x i32> addrspace(3)* %vec.addr, align 8
  ret void
}

; GCN-LABEL: test_local_aligned_v3:
; GCN-DAG: ds_read_b96
; GCN-DAG: ds_write_b96
; Kernel under test: reads a <3 x i32> through an addrspace(3) pointer at
; %arg + workitem.id.x (in i32 units), rotates the lanes by one position and
; writes the result back to the same address. Load and store are both marked
; align 16.
define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <3 x i32> addrspace(3)*
  %vec = load <3 x i32>, <3 x i32> addrspace(3)* %vec.addr, align 16
  %lane0 = extractelement <3 x i32> %vec, i32 0
  %lane1 = extractelement <3 x i32> %vec, i32 1
  %lane2 = extractelement <3 x i32> %vec, i32 2
  ; Rebuild the vector as <lane2, lane0, lane1>.
  %rot.a = insertelement <3 x i32> undef, i32 %lane2, i32 0
  %rot.b = insertelement <3 x i32> %rot.a, i32 %lane0, i32 1
  %rotated = insertelement <3 x i32> %rot.b, i32 %lane1, i32 2
  store <3 x i32> %rotated, <3 x i32> addrspace(3)* %vec.addr, align 16
  ret void
}

; GCN-LABEL: test_flat_aligned_v2:
; GCN-DAG: flat_load_dwordx2 v
; GCN-DAG: flat_store_dwordx2 v
; Kernel under test: reads a <2 x i32> through a default-address-space pointer
; at %arg + workitem.id.x (in i32 units), swaps its two lanes and writes the
; result back to the same address. Load and store are both marked align 8.
define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.addr = bitcast i32* %elt.addr to <2 x i32>*
  %vec = load <2 x i32>, <2 x i32>* %vec.addr, align 8
  %lane0 = extractelement <2 x i32> %vec, i32 0
  %lane1 = extractelement <2 x i32> %vec, i32 1
  ; Rebuild the vector with the lanes exchanged.
  %swap.part = insertelement <2 x i32> undef, i32 %lane1, i32 0
  %swapped = insertelement <2 x i32> %swap.part, i32 %lane0, i32 1
  store <2 x i32> %swapped, <2 x i32>* %vec.addr, align 8
  ret void
}

; GCN-LABEL: test_flat_aligned_v4:
; GCN-DAG: flat_load_dwordx4 v
; GCN-DAG: flat_store_dwordx4 v
; Kernel under test: reads a <4 x i32> through a default-address-space pointer
; at %arg + workitem.id.x (in i32 units), reverses the lane order and writes
; the result back to the same address. Load and store are both marked
; align 16.
define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.addr = bitcast i32* %elt.addr to <4 x i32>*
  %vec = load <4 x i32>, <4 x i32>* %vec.addr, align 16
  %lane0 = extractelement <4 x i32> %vec, i32 0
  %lane1 = extractelement <4 x i32> %vec, i32 1
  %lane2 = extractelement <4 x i32> %vec, i32 2
  %lane3 = extractelement <4 x i32> %vec, i32 3
  ; Rebuild the vector as <lane3, lane2, lane1, lane0>.
  %rev.a = insertelement <4 x i32> undef, i32 %lane3, i32 0
  %rev.b = insertelement <4 x i32> %rev.a, i32 %lane2, i32 1
  %rev.c = insertelement <4 x i32> %rev.b, i32 %lane1, i32 2
  %reversed = insertelement <4 x i32> %rev.c, i32 %lane0, i32 3
  store <4 x i32> %reversed, <4 x i32>* %vec.addr, align 16
  ret void
}

; GCN-LABEL: test_local_v4_aligned8:
; GCN-DAG: ds_read_b128
; GCN-DAG: ds_write_b128
; Kernel under test: reads a <4 x i32> through an addrspace(3) pointer at
; %arg + workitem.id.x (in i32 units), reverses the lane order and writes the
; result back to the same address. Load and store are both marked align 8,
; i.e. half the size of the 16-byte vector.
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  %vec.addr = bitcast i32 addrspace(3)* %elt.addr to <4 x i32> addrspace(3)*
  %vec = load <4 x i32>, <4 x i32> addrspace(3)* %vec.addr, align 8
  %lane0 = extractelement <4 x i32> %vec, i32 0
  %lane1 = extractelement <4 x i32> %vec, i32 1
  %lane2 = extractelement <4 x i32> %vec, i32 2
  %lane3 = extractelement <4 x i32> %vec, i32 3
  ; Rebuild the vector as <lane3, lane2, lane1, lane0>.
  %rev.a = insertelement <4 x i32> undef, i32 %lane3, i32 0
  %rev.b = insertelement <4 x i32> %rev.a, i32 %lane2, i32 1
  %rev.c = insertelement <4 x i32> %rev.b, i32 %lane1, i32 2
  %reversed = insertelement <4 x i32> %rev.c, i32 %lane0, i32 3
  store <4 x i32> %reversed, <4 x i32> addrspace(3)* %vec.addr, align 8
  ret void
}

; GCN-LABEL: test_flat_v4_aligned8:
; VECT-DAG:  flat_load_dwordx4 v
; VECT-DAG:  flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
; Kernel under test: reads a <4 x i32> through a default-address-space pointer
; at %arg + workitem.id.x (in i32 units), reverses the lane order and writes
; the result back to the same address. Load and store are both marked align 8,
; i.e. half the size of the 16-byte vector.
define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
entry:
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %elt.addr = getelementptr inbounds i32, i32* %arg, i32 %tid
  %vec.addr = bitcast i32* %elt.addr to <4 x i32>*
  %vec = load <4 x i32>, <4 x i32>* %vec.addr, align 8
  %lane0 = extractelement <4 x i32> %vec, i32 0
  %lane1 = extractelement <4 x i32> %vec, i32 1
  %lane2 = extractelement <4 x i32> %vec, i32 2
  %lane3 = extractelement <4 x i32> %vec, i32 3
  ; Rebuild the vector as <lane3, lane2, lane1, lane0>.
  %rev.a = insertelement <4 x i32> undef, i32 %lane3, i32 0
  %rev.b = insertelement <4 x i32> %rev.a, i32 %lane2, i32 1
  %rev.c = insertelement <4 x i32> %rev.b, i32 %lane1, i32 2
  %reversed = insertelement <4 x i32> %rev.c, i32 %lane0, i32 3
  store <4 x i32> %reversed, <4 x i32>* %vec.addr, align 8
  ret void
}

; Intrinsic called by every kernel in this file; its i32 result is used as the
; element index applied to the kernel's pointer argument.
declare i32 @llvm.amdgcn.workitem.id.x()
[AMDGPU] gfx1010 DS implementation Differential Revision: https://reviews.llvm.org/D61332 llvm-svn: 359696 2019-05-02 00:11:11 +08:00			`; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,SPLIT %s`
[AMDGPU] Workaround for LDS Misalignment bug on GFX10 Add subtarget feature check to avoid using ds_read/write_b96/128 with too low alignment if a bug is present on that specific hardware. Add this "feature" to GFX 10.1.1 as it is also affected. Add global-isel test. 2020-09-09 17:28:36 +08:00			`; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,SPLIT %s`
[AMDGPU] gfx1011/gfx1012 targets Differential Revision: https://reviews.llvm.org/D63307 llvm-svn: 363344 2019-06-14 08:33:31 +08:00			`; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,SPLIT %s`
[AMDGPU] gfx1010 DS implementation Differential Revision: https://reviews.llvm.org/D61332 llvm-svn: 359696 2019-05-02 00:11:11 +08:00			`; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s \| FileCheck -check-prefixes=GCN,VECT %s`
[AMDGPU] Set DS alignment requirements to be more strict Alignment requirements for ds_read/write_b96/b128 for gfx9 and onward are now the same as for other GCN subtargets. This way we can avoid any unintentional use of these instructions on systems that do not support dword alignment and instead require natural alignment. This also makes 'SH_MEM_CONFIG.alignment_mode == STRICT' the default. Differential Revision: https://reviews.llvm.org/D87821 2020-09-18 21:19:54 +08:00			`; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s \| FileCheck -check-prefixes=UNALIGNED,VECT %s`
[AMDGPU] gfx1010 DS implementation Differential Revision: https://reviews.llvm.org/D61332 llvm-svn: 359696 2019-05-02 00:11:11 +08:00
			`; GCN-LABEL: test_local_misaligned_v2:`
			`; GCN-DAG: ds_read2_b32`
			`; GCN-DAG: ds_write2_b32`
			`define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*`
			`%load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4`
			`%v1 = extractelement <2 x i32> %load, i32 0`
			`%v2 = extractelement <2 x i32> %load, i32 1`
			`%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0`
			`%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1`
			`store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4`
			`ret void`
			`}`

			`; GCN-LABEL: test_local_misaligned_v4:`
[AMDGPU] Set DS alignment requirements to be more strict Alignment requirements for ds_read/write_b96/b128 for gfx9 and onward are now the same as for other GCN subtargets. This way we can avoid any unintentional use of these instructions on systems that do not support dword alignment and instead require natural alignment. This also makes 'SH_MEM_CONFIG.alignment_mode == STRICT' the default. Differential Revision: https://reviews.llvm.org/D87821 2020-09-18 21:19:54 +08:00			`; GCN-DAG: ds_read2_b32`
			`; GCN-DAG: ds_read2_b32`
			`; GCN-DAG: ds_write2_b32`
			`; GCN-DAG: ds_write2_b32`
			`; UNALIGNED-DAG: ds_read_b128`
			`; UNALIGNED-DAG: ds_write_b128`
[AMDGPU] gfx1010 DS implementation Differential Revision: https://reviews.llvm.org/D61332 llvm-svn: 359696 2019-05-02 00:11:11 +08:00			`define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*`
			`%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4`
			`%v1 = extractelement <4 x i32> %load, i32 0`
			`%v2 = extractelement <4 x i32> %load, i32 1`
			`%v3 = extractelement <4 x i32> %load, i32 2`
			`%v4 = extractelement <4 x i32> %load, i32 3`
			`%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0`
			`%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1`
			`%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2`
			`%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3`
			`store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4`
			`ret void`
			`}`

			`; GCN-LABEL: test_local_misaligned_v3:`
[AMDGPU] Set DS alignment requirements to be more strict Alignment requirements for ds_read/write_b96/b128 for gfx9 and onward are now the same as for other GCN subtargets. This way we can avoid any unintentional use of these instructions on systems that do not support dword alignment and instead require natural alignment. This also makes 'SH_MEM_CONFIG.alignment_mode == STRICT' the default. Differential Revision: https://reviews.llvm.org/D87821 2020-09-18 21:19:54 +08:00			`; GCN-DAG: ds_read2_b32`
			`; GCN-DAG: ds_read_b32`
			`; GCN-DAG: ds_write2_b32`
			`; GCN-DAG: ds_write_b32`
			`; UNALIGNED-DAG: ds_read_b96`
			`; UNALIGNED-DAG: ds_write_b96`
[AMDGPU] gfx1010 DS implementation Differential Revision: https://reviews.llvm.org/D61332 llvm-svn: 359696 2019-05-02 00:11:11 +08:00			`define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*`
			`%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4`
			`%v1 = extractelement <3 x i32> %load, i32 0`
			`%v2 = extractelement <3 x i32> %load, i32 1`
			`%v3 = extractelement <3 x i32> %load, i32 2`
			`%v5 = insertelement <3 x i32> undef, i32 %v3, i32 0`
			`%v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1`
			`%v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2`
			`store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4`
			`ret void`
			`}`

			`; GCN-LABEL: test_flat_misaligned_v2:`
			`; VECT-DAG: flat_load_dwordx2 v`
			`; VECT-DAG: flat_store_dwordx2 v`
			`; SPLIT-DAG: flat_load_dword v`
			`; SPLIT-DAG: flat_load_dword v`
			`; SPLIT-DAG: flat_store_dword v`
			`; SPLIT-DAG: flat_store_dword v`
			`define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32* %arg, i32 %lid`
			`%ptr = bitcast i32* %gep to <2 x i32>*`
			`%load = load <2 x i32>, <2 x i32>* %ptr, align 4`
			`%v1 = extractelement <2 x i32> %load, i32 0`
			`%v2 = extractelement <2 x i32> %load, i32 1`
			`%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0`
			`%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1`
			`store <2 x i32> %v4, <2 x i32>* %ptr, align 4`
			`ret void`
			`}`

			`; GCN-LABEL: test_flat_misaligned_v4:`
			`; VECT-DAG: flat_load_dwordx4 v`
			`; VECT-DAG: flat_store_dwordx4 v`
			`; SPLIT-DAG: flat_load_dword v`
			`; SPLIT-DAG: flat_load_dword v`
			`; SPLIT-DAG: flat_load_dword v`
			`; SPLIT-DAG: flat_load_dword v`
			`; SPLIT-DAG: flat_store_dword v`
			`; SPLIT-DAG: flat_store_dword v`
			`; SPLIT-DAG: flat_store_dword v`
			`; SPLIT-DAG: flat_store_dword v`
			`define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32* %arg, i32 %lid`
			`%ptr = bitcast i32* %gep to <4 x i32>*`
			`%load = load <4 x i32>, <4 x i32>* %ptr, align 4`
			`%v1 = extractelement <4 x i32> %load, i32 0`
			`%v2 = extractelement <4 x i32> %load, i32 1`
			`%v3 = extractelement <4 x i32> %load, i32 2`
			`%v4 = extractelement <4 x i32> %load, i32 3`
			`%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0`
			`%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1`
			`%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2`
			`%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3`
			`store <4 x i32> %v8, <4 x i32>* %ptr, align 4`
			`ret void`
			`}`

[AMDGPU] gfx1011/gfx1012 targets Differential Revision: https://reviews.llvm.org/D63307 llvm-svn: 363344 2019-06-14 08:33:31 +08:00			`; TODO: Reinstate the test below once v3i32/v3f32 is reinstated.`

[AMDGPU] gfx1010 DS implementation Differential Revision: https://reviews.llvm.org/D61332 llvm-svn: 359696 2019-05-02 00:11:11 +08:00			`; GCN-LABEL: test_flat_misaligned_v3:`
[AMDGPU] gfx1011/gfx1012 targets Differential Revision: https://reviews.llvm.org/D63307 llvm-svn: 363344 2019-06-14 08:33:31 +08:00			`; xVECT-DAG: flat_load_dwordx3 v`
			`; xVECT-DAG: flat_store_dwordx3 v`
			`; xSPLIT-DAG: flat_load_dword v`
			`; xSPLIT-DAG: flat_load_dword v`
			`; xSPLIT-DAG: flat_load_dword v`
			`; xSPLIT-DAG: flat_store_dword v`
			`; xSPLIT-DAG: flat_store_dword v`
			`; xSPLIT-DAG: flat_store_dword v`
[AMDGPU] gfx1010 DS implementation Differential Revision: https://reviews.llvm.org/D61332 llvm-svn: 359696 2019-05-02 00:11:11 +08:00			`define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32* %arg, i32 %lid`
			`%ptr = bitcast i32* %gep to <3 x i32>*`
			`%load = load <3 x i32>, <3 x i32>* %ptr, align 4`
			`%v1 = extractelement <3 x i32> %load, i32 0`
			`%v2 = extractelement <3 x i32> %load, i32 1`
			`%v3 = extractelement <3 x i32> %load, i32 2`
			`%v5 = insertelement <3 x i32> undef, i32 %v3, i32 0`
			`%v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1`
			`%v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2`
			`store <3 x i32> %v7, <3 x i32>* %ptr, align 4`
			`ret void`
			`}`

			`; GCN-LABEL: test_local_aligned_v2:`
			`; GCN-DAG: ds_read_b64`
			`; GCN-DAG: ds_write_b64`
			`define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*`
			`%load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8`
			`%v1 = extractelement <2 x i32> %load, i32 0`
			`%v2 = extractelement <2 x i32> %load, i32 1`
			`%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0`
			`%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1`
			`store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8`
			`ret void`
			`}`

			`; GCN-LABEL: test_local_aligned_v3:`
[AMDGPU] Use ds_read/write_b96/b128 when possible for SDag Do not break down local loads and stores so ds_read/write_b96/b128 in ISelLowering can be selected on subtargets that support them and if align requirements allow them. Differential Revision: https://reviews.llvm.org/D84403 2020-08-21 17:51:06 +08:00			`; GCN-DAG: ds_read_b96`
			`; GCN-DAG: ds_write_b96`
[AMDGPU] gfx1010 DS implementation Differential Revision: https://reviews.llvm.org/D61332 llvm-svn: 359696 2019-05-02 00:11:11 +08:00			`define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*`
			`%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16`
			`%v1 = extractelement <3 x i32> %load, i32 0`
			`%v2 = extractelement <3 x i32> %load, i32 1`
			`%v3 = extractelement <3 x i32> %load, i32 2`
			`%v5 = insertelement <3 x i32> undef, i32 %v3, i32 0`
			`%v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1`
			`%v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2`
			`store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16`
			`ret void`
			`}`

			`; GCN-LABEL: test_flat_aligned_v2:`
			`; GCN-DAG: flat_load_dwordx2 v`
			`; GCN-DAG: flat_store_dwordx2 v`
			`define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32* %arg, i32 %lid`
			`%ptr = bitcast i32* %gep to <2 x i32>*`
			`%load = load <2 x i32>, <2 x i32>* %ptr, align 8`
			`%v1 = extractelement <2 x i32> %load, i32 0`
			`%v2 = extractelement <2 x i32> %load, i32 1`
			`%v3 = insertelement <2 x i32> undef, i32 %v2, i32 0`
			`%v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1`
			`store <2 x i32> %v4, <2 x i32>* %ptr, align 8`
			`ret void`
			`}`

			`; GCN-LABEL: test_flat_aligned_v4:`
			`; GCN-DAG: flat_load_dwordx4 v`
			`; GCN-DAG: flat_store_dwordx4 v`
			`define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32* %arg, i32 %lid`
			`%ptr = bitcast i32* %gep to <4 x i32>*`
			`%load = load <4 x i32>, <4 x i32>* %ptr, align 16`
			`%v1 = extractelement <4 x i32> %load, i32 0`
			`%v2 = extractelement <4 x i32> %load, i32 1`
			`%v3 = extractelement <4 x i32> %load, i32 2`
			`%v4 = extractelement <4 x i32> %load, i32 3`
			`%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0`
			`%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1`
			`%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2`
			`%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3`
			`store <4 x i32> %v8, <4 x i32>* %ptr, align 16`
			`ret void`
			`}`

			`; GCN-LABEL: test_local_v4_aligned8:`
[AMDGPU] Use ds_read/write_b96/b128 when possible for SDag Do not break down local loads and stores so ds_read/write_b96/b128 in ISelLowering can be selected on subtargets that support them and if align requirements allow them. Differential Revision: https://reviews.llvm.org/D84403 2020-08-21 17:51:06 +08:00			`; GCN-DAG: ds_read_b128`
			`; GCN-DAG: ds_write_b128`
[AMDGPU] gfx1010 DS implementation Differential Revision: https://reviews.llvm.org/D61332 llvm-svn: 359696 2019-05-02 00:11:11 +08:00			`define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid`
			`%ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*`
			`%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8`
			`%v1 = extractelement <4 x i32> %load, i32 0`
			`%v2 = extractelement <4 x i32> %load, i32 1`
			`%v3 = extractelement <4 x i32> %load, i32 2`
			`%v4 = extractelement <4 x i32> %load, i32 3`
			`%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0`
			`%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1`
			`%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2`
			`%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3`
			`store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8`
			`ret void`
			`}`

			`; GCN-LABEL: test_flat_v4_aligned8:`
			`; VECT-DAG: flat_load_dwordx4 v`
			`; VECT-DAG: flat_store_dwordx4 v`
			`; SPLIT-DAG: flat_load_dwordx2 v`
			`; SPLIT-DAG: flat_load_dwordx2 v`
			`; SPLIT-DAG: flat_store_dwordx2 v`
			`; SPLIT-DAG: flat_store_dwordx2 v`
			`define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {`
			`bb:`
			`%lid = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%gep = getelementptr inbounds i32, i32* %arg, i32 %lid`
			`%ptr = bitcast i32* %gep to <4 x i32>*`
			`%load = load <4 x i32>, <4 x i32>* %ptr, align 8`
			`%v1 = extractelement <4 x i32> %load, i32 0`
			`%v2 = extractelement <4 x i32> %load, i32 1`
			`%v3 = extractelement <4 x i32> %load, i32 2`
			`%v4 = extractelement <4 x i32> %load, i32 3`
			`%v5 = insertelement <4 x i32> undef, i32 %v4, i32 0`
			`%v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1`
			`%v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2`
			`%v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3`
			`store <4 x i32> %v8, <4 x i32>* %ptr, align 8`
			`ret void`
			`}`

			`declare i32 @llvm.amdgcn.workitem.id.x()`