llvm-project/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll

; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR %s
; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s

; ALL-LABEL: {{^}}spill_sgpr_x2:
; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Spill
; SMEM: s_cbranch_scc1

; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Reload

; SMEM: s_dcache_wb
; SMEM: s_endpgm

; FIXME: Should only need 4 bytes
; SMEM: ScratchSize: 12


; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
; VGPR: s_cbranch_scc1

; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1

; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; Two-element case. A <2 x i32> value is produced by side-effecting inline asm
; through an "=s" output constraint and is consumed only on the path taken when
; %in equals 0, so it stays live across the conditional branch. The %out
; argument is not referenced by the body.
define amdgpu_kernel void @spill_sgpr_x2(i32 addrspace(1)* %out, i32 %in) #0 {
  %pair = call <2 x i32> asm sideeffect "; def $0", "=s"() #0
  %in.is.zero = icmp eq i32 %in, 0
  br i1 %in.is.zero, label %consume, label %done

consume:
  ; The only use of the wide value sits behind the branch.
  call void asm sideeffect "; use $0", "s"(<2 x i32> %pair) #0
  br label %done

done:
  ret void
}

; ALL-LABEL: {{^}}spill_sgpr_x4:
; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill
; SMEM: s_cbranch_scc1

; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload
; SMEM: s_dcache_wb
; SMEM: s_endpgm

; FIXME: Should only need 4 bytes
; SMEM: ScratchSize: 20

; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
; VGPR: s_cbranch_scc1

; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3


; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; Four-element case. A <4 x i32> value is produced by side-effecting inline asm
; through an "=s" output constraint and is consumed only on the path taken when
; %in equals 0, so it stays live across the conditional branch. The %out
; argument is not referenced by the body.
define amdgpu_kernel void @spill_sgpr_x4(i32 addrspace(1)* %out, i32 %in) #0 {
  %quad = call <4 x i32> asm sideeffect "; def $0", "=s"() #0
  %in.is.zero = icmp eq i32 %in, 0
  br i1 %in.is.zero, label %consume, label %done

consume:
  ; The only use of the wide value sits behind the branch.
  call void asm sideeffect "; use $0", "s"(<4 x i32> %quad) #0
  br label %done

done:
  ret void
}

; ALL-LABEL: {{^}}spill_sgpr_x8:

; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill
; SMEM: s_add_u32 m0, s3, 0x110{{$}}
; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Spill
; SMEM: s_cbranch_scc1

; SMEM: s_add_u32 m0, s3, 0x100{{$}}
; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload
; SMEM: s_add_u32 m0, s3, 0x110{{$}}
; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload

; SMEM: s_dcache_wb
; SMEM: s_endpgm

; SMEM: ScratchSize: 36

; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7
; VGPR: s_cbranch_scc1

; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 5
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7

; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: s_cbranch_scc1

; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; VMEM: buffer_load_dword
; Eight-element case. A <8 x i32> value is produced by side-effecting inline
; asm through an "=s" output constraint and is consumed only on the path taken
; when %in equals 0, so it stays live across the conditional branch. The %out
; argument is not referenced by the body.
define amdgpu_kernel void @spill_sgpr_x8(i32 addrspace(1)* %out, i32 %in) #0 {
  %octet = call <8 x i32> asm sideeffect "; def $0", "=s"() #0
  %in.is.zero = icmp eq i32 %in, 0
  br i1 %in.is.zero, label %consume, label %done

consume:
  ; The only use of the wide value sits behind the branch.
  call void asm sideeffect "; use $0", "s"(<8 x i32> %octet) #0
  br label %done

done:
  ret void
}

; FIXME: x16 inlineasm seems broken
; define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
;   %wide.sgpr = call <16 x i32>  asm sideeffect "; def $0", "=s" () #0
;   %cmp = icmp eq i32 %in, 0
;   br i1 %cmp, label %bb0, label %ret

; bb0:
;   call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr) #0
;   br label %ret

; ret:
;   ret void
; }

attributes #0 = { nounwind }
AMDGPU: Use wider scalar spills for SGPR spilling Since the spill is for the whole wave, these don't have the swizzling problems that vector stores do and a single 4-byte allocation is enough to spill a 64 element register. This should reduce the number of spill instructions and put all the spills for a register in the same cacheline. This should save allocated private size, but for now it doesn't. The extra slots are allocated for each component, but never used because the frame layout is essentially finalized before frame indices are replaced. For always using the scalar store path, this should probably be moved into processFunctionBeforeFrameFinalized. llvm-svn: 288445 2016-12-02 08:54:45 +08:00			`; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s \| FileCheck -check-prefix=ALL -check-prefix=VGPR %s`
			`; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s \| FileCheck -check-prefix=ALL -check-prefix=SMEM %s`
			`; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s \| FileCheck -check-prefix=ALL -check-prefix=VMEM %s`

			`; ALL-LABEL: {{^}}spill_sgpr_x2:`
AMDGPU: Always allocate emergency stack slot at offset 0 This allows us to ensure that 0 is never a valid pointer to a user object, and ensures that the offset is always legal without needing a register to access it. This comes at the cost of usable offsets and wasted stack space. llvm-svn: 295877 2017-02-23 05:05:25 +08:00			`; SMEM: s_add_u32 m0, s3, 0x100{{$}}`
AMDGPU: Use wider scalar spills for SGPR spilling Since the spill is for the whole wave, these don't have the swizzling problems that vector stores do and a single 4-byte allocation is enough to spill a 64 element register. This should reduce the number of spill instructions and put all the spills for a register in the same cacheline. This should save allocated private size, but for now it doesn't. The extra slots are allocated for each component, but never used because the frame layout is essentially finalized before frame indices are replaced. For always using the scalar store path, this should probably be moved into processFunctionBeforeFrameFinalized. llvm-svn: 288445 2016-12-02 08:54:45 +08:00			`; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Spill`
			`; SMEM: s_cbranch_scc1`

AMDGPU: Always allocate emergency stack slot at offset 0 This allows us to ensure that 0 is never a valid pointer to a user object, and ensures that the offset is always legal without needing a register to access it. This comes at the cost of usable offsets and wasted stack space. llvm-svn: 295877 2017-02-23 05:05:25 +08:00			`; SMEM: s_add_u32 m0, s3, 0x100{{$}}`
AMDGPU: Use wider scalar spills for SGPR spilling Since the spill is for the whole wave, these don't have the swizzling problems that vector stores do and a single 4-byte allocation is enough to spill a 64 element register. This should reduce the number of spill instructions and put all the spills for a register in the same cacheline. This should save allocated private size, but for now it doesn't. The extra slots are allocated for each component, but never used because the frame layout is essentially finalized before frame indices are replaced. For always using the scalar store path, this should probably be moved into processFunctionBeforeFrameFinalized. llvm-svn: 288445 2016-12-02 08:54:45 +08:00			`; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Reload`

			`; SMEM: s_dcache_wb`
			`; SMEM: s_endpgm`

			`; FIXME: Should only need 4 bytes`
			`; SMEM: ScratchSize: 12`


			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1`
			`; VGPR: s_cbranch_scc1`

			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1`

			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: s_cbranch_scc1`

			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`define amdgpu_kernel void @spill_sgpr_x2(i32 addrspace(1)* %out, i32 %in) #0 {`
			`%wide.sgpr = call <2 x i32> asm sideeffect "; def $0", "=s" () #0`
			`%cmp = icmp eq i32 %in, 0`
			`br i1 %cmp, label %bb0, label %ret`

			`bb0:`
			`call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr) #0`
			`br label %ret`

			`ret:`
			`ret void`
			`}`

			`; ALL-LABEL: {{^}}spill_sgpr_x4:`
AMDGPU: Always allocate emergency stack slot at offset 0 This allows us to ensure that 0 is never a valid pointer to a user object, and ensures that the offset is always legal without needing a register to access it. This comes at the cost of usable offsets and wasted stack space. llvm-svn: 295877 2017-02-23 05:05:25 +08:00			`; SMEM: s_add_u32 m0, s3, 0x100{{$}}`
Relax fast register allocator related test cases; NFC - Relax hard coded registers and stack frame sizes - Some test cleanups - Change phi-dbg.ll to match on mir output after phi elimination instead of going through the whole codegen pipeline. This is in preparation for https://reviews.llvm.org/D52010 I'm committing all the test changes upfront that work before and after independently. llvm-svn: 345532 2018-10-30 04:10:42 +08:00			`; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill`
AMDGPU: Use wider scalar spills for SGPR spilling Since the spill is for the whole wave, these don't have the swizzling problems that vector stores do and a single 4-byte allocation is enough to spill a 64 element register. This should reduce the number of spill instructions and put all the spills for a register in the same cacheline. This should save allocated private size, but for now it doesn't. The extra slots are allocated for each component, but never used because the frame layout is essentially finalized before frame indices are replaced. For always using the scalar store path, this should probably be moved into processFunctionBeforeFrameFinalized. llvm-svn: 288445 2016-12-02 08:54:45 +08:00			`; SMEM: s_cbranch_scc1`

AMDGPU: Always allocate emergency stack slot at offset 0 This allows us to ensure that 0 is never a valid pointer to a user object, and ensures that the offset is always legal without needing a register to access it. This comes at the cost of usable offsets and wasted stack space. llvm-svn: 295877 2017-02-23 05:05:25 +08:00			`; SMEM: s_add_u32 m0, s3, 0x100{{$}}`
Relax fast register allocator related test cases; NFC - Relax hard coded registers and stack frame sizes - Some test cleanups - Change phi-dbg.ll to match on mir output after phi elimination instead of going through the whole codegen pipeline. This is in preparation for https://reviews.llvm.org/D52010 I'm committing all the test changes upfront that work before and after independently. llvm-svn: 345532 2018-10-30 04:10:42 +08:00			`; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload`
AMDGPU: Use wider scalar spills for SGPR spilling Since the spill is for the whole wave, these don't have the swizzling problems that vector stores do and a single 4-byte allocation is enough to spill a 64 element register. This should reduce the number of spill instructions and put all the spills for a register in the same cacheline. This should save allocated private size, but for now it doesn't. The extra slots are allocated for each component, but never used because the frame layout is essentially finalized before frame indices are replaced. For always using the scalar store path, this should probably be moved into processFunctionBeforeFrameFinalized. llvm-svn: 288445 2016-12-02 08:54:45 +08:00			`; SMEM: s_dcache_wb`
			`; SMEM: s_endpgm`

			`; FIXME: Should only need 4 bytes`
			`; SMEM: ScratchSize: 20`

			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3`
			`; VGPR: s_cbranch_scc1`

			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3`


			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: s_cbranch_scc1`

			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`define amdgpu_kernel void @spill_sgpr_x4(i32 addrspace(1)* %out, i32 %in) #0 {`
			`%wide.sgpr = call <4 x i32> asm sideeffect "; def $0", "=s" () #0`
			`%cmp = icmp eq i32 %in, 0`
			`br i1 %cmp, label %bb0, label %ret`

			`bb0:`
			`call void asm sideeffect "; use $0", "s"(<4 x i32> %wide.sgpr) #0`
			`br label %ret`

			`ret:`
			`ret void`
			`}`

			`; ALL-LABEL: {{^}}spill_sgpr_x8:`

AMDGPU: Always allocate emergency stack slot at offset 0 This allows us to ensure that 0 is never a valid pointer to a user object, and ensures that the offset is always legal without needing a register to access it. This comes at the cost of usable offsets and wasted stack space. llvm-svn: 295877 2017-02-23 05:05:25 +08:00			`; SMEM: s_add_u32 m0, s3, 0x100{{$}}`
Relax fast register allocator related test cases; NFC - Relax hard coded registers and stack frame sizes - Some test cleanups - Change phi-dbg.ll to match on mir output after phi elimination instead of going through the whole codegen pipeline. This is in preparation for https://reviews.llvm.org/D52010 I'm committing all the test changes upfront that work before and after independently. llvm-svn: 345532 2018-10-30 04:10:42 +08:00			`; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill`
AMDGPU: Always allocate emergency stack slot at offset 0 This allows us to ensure that 0 is never a valid pointer to a user object, and ensures that the offset is always legal without needing a register to access it. This comes at the cost of usable offsets and wasted stack space. llvm-svn: 295877 2017-02-23 05:05:25 +08:00			`; SMEM: s_add_u32 m0, s3, 0x110{{$}}`
Relax fast register allocator related test cases; NFC - Relax hard coded registers and stack frame sizes - Some test cleanups - Change phi-dbg.ll to match on mir output after phi elimination instead of going through the whole codegen pipeline. This is in preparation for https://reviews.llvm.org/D52010 I'm committing all the test changes upfront that work before and after independently. llvm-svn: 345532 2018-10-30 04:10:42 +08:00			`; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Spill`
AMDGPU: Use wider scalar spills for SGPR spilling Since the spill is for the whole wave, these don't have the swizzling problems that vector stores do and a single 4-byte allocation is enough to spill a 64 element register. This should reduce the number of spill instructions and put all the spills for a register in the same cacheline. This should save allocated private size, but for now it doesn't. The extra slots are allocated for each component, but never used because the frame layout is essentially finalized before frame indices are replaced. For always using the scalar store path, this should probably be moved into processFunctionBeforeFrameFinalized. llvm-svn: 288445 2016-12-02 08:54:45 +08:00			`; SMEM: s_cbranch_scc1`

AMDGPU: Always allocate emergency stack slot at offset 0 This allows us to ensure that 0 is never a valid pointer to a user object, and ensures that the offset is always legal without needing a register to access it. This comes at the cost of usable offsets and wasted stack space. llvm-svn: 295877 2017-02-23 05:05:25 +08:00			`; SMEM: s_add_u32 m0, s3, 0x100{{$}}`
Relax fast register allocator related test cases; NFC - Relax hard coded registers and stack frame sizes - Some test cleanups - Change phi-dbg.ll to match on mir output after phi elimination instead of going through the whole codegen pipeline. This is in preparation for https://reviews.llvm.org/D52010 I'm committing all the test changes upfront that work before and after independently. llvm-svn: 345532 2018-10-30 04:10:42 +08:00			`; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload`
AMDGPU: Always allocate emergency stack slot at offset 0 This allows us to ensure that 0 is never a valid pointer to a user object, and ensures that the offset is always legal without needing a register to access it. This comes at the cost of usable offsets and wasted stack space. llvm-svn: 295877 2017-02-23 05:05:25 +08:00			`; SMEM: s_add_u32 m0, s3, 0x110{{$}}`
Relax fast register allocator related test cases; NFC - Relax hard coded registers and stack frame sizes - Some test cleanups - Change phi-dbg.ll to match on mir output after phi elimination instead of going through the whole codegen pipeline. This is in preparation for https://reviews.llvm.org/D52010 I'm committing all the test changes upfront that work before and after independently. llvm-svn: 345532 2018-10-30 04:10:42 +08:00			`; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload`
AMDGPU: Use wider scalar spills for SGPR spilling Since the spill is for the whole wave, these don't have the swizzling problems that vector stores do and a single 4-byte allocation is enough to spill a 64 element register. This should reduce the number of spill instructions and put all the spills for a register in the same cacheline. This should save allocated private size, but for now it doesn't. The extra slots are allocated for each component, but never used because the frame layout is essentially finalized before frame indices are replaced. For always using the scalar store path, this should probably be moved into processFunctionBeforeFrameFinalized. llvm-svn: 288445 2016-12-02 08:54:45 +08:00
			`; SMEM: s_dcache_wb`
			`; SMEM: s_endpgm`

			`; SMEM: ScratchSize: 36`

			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6`
			`; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7`
			`; VGPR: s_cbranch_scc1`

			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 5`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6`
			`; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7`

			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: buffer_store_dword`
			`; VMEM: s_cbranch_scc1`

			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`; VMEM: buffer_load_dword`
			`define amdgpu_kernel void @spill_sgpr_x8(i32 addrspace(1)* %out, i32 %in) #0 {`
			`%wide.sgpr = call <8 x i32> asm sideeffect "; def $0", "=s" () #0`
			`%cmp = icmp eq i32 %in, 0`
			`br i1 %cmp, label %bb0, label %ret`

			`bb0:`
			`call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr) #0`
			`br label %ret`

			`ret:`
			`ret void`
			`}`

			`; FIXME: x16 inlineasm seems broken`
			`; define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {`
			`; %wide.sgpr = call <16 x i32> asm sideeffect "; def $0", "=s" () #0`
			`; %cmp = icmp eq i32 %in, 0`
			`; br i1 %cmp, label %bb0, label %ret`

			`; bb0:`
			`; call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr) #0`
			`; br label %ret`

			`; ret:`
			`; ret void`
			`; }`

			`attributes #0 = { nounwind }`