llvm-project/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll

; RUN: llc -march=amdgcn -verify-machineinstrs -mtriple=amdgcn-- -o - %s | FileCheck %s

; Fused multiply-add intrinsic (a * b + c); called four times per loop
; iteration in @foobar.
declare float @llvm.fma.f32(float, float, float)

; This checks that rematerialization support of the coalescer does not
; unnecessarily widen the register class. Without those fixes, more than 20
; VGPRs are used here.
; Also check that some rematerialization of the 0 constant happened.
; CHECK-LABEL: foobar
; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
; It's probably OK if this is slightly higher:
; CHECK: ; NumVgprs: 8
; Kernel under test.
;   %out  - global (addrspace 1) pointer to a <4 x float>; written inside the
;           loop and again in %exit.
;   %in   - global (addrspace 1) pointer to a <4 x float>; loaded once per
;           loop iteration.
;   %flag - the loop is entered only when this equals 1; otherwise control
;           goes straight to %exit.
define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
entry:
  ; Skip the loop entirely unless %flag == 1.
  %cmpflag = icmp eq i32 %flag, 1
  br i1 %cmpflag, label %loop, label %exit

loop:
  ; Loop-carried state: the iteration counter %c (starts at 0) and four float
  ; accumulators %v0..%v3 (each starts at 0.0 and is replaced by the matching
  ; fma result from the previous iteration).
  %c = phi i32 [0, %entry], [%cnext, %loop]
  %v0 = phi float [0.0, %entry], [%fma.0, %loop]
  %v1 = phi float [0.0, %entry], [%fma.1, %loop]
  %v2 = phi float [0.0, %entry], [%fma.2, %loop]
  %v3 = phi float [0.0, %entry], [%fma.3, %loop]

  ; Try to get the 0 constant to get coalesced into a wide register
  ; (%v0 is placed in lane 0 of an otherwise undef <4 x float>, and that
  ; vector is stored to %out.)
  %blup = insertelement <4 x float> undef, float %v0, i32 0
  store <4 x float> %blup, <4 x float> addrspace(1)* %out

  ; Load a fresh <4 x float> from %in and split it into its four lanes.
  %load = load <4 x float>, <4 x float> addrspace(1)* %in
  %load.0 = extractelement <4 x float> %load, i32 0
  %load.1 = extractelement <4 x float> %load, i32 1
  %load.2 = extractelement <4 x float> %load, i32 2
  %load.3 = extractelement <4 x float> %load, i32 3
  ; Per-lane update: new accumulator = %vN * %load.N + %vN.
  %fma.0 = call float @llvm.fma.f32(float %v0, float %load.0, float %v0)
  %fma.1 = call float @llvm.fma.f32(float %v1, float %load.1, float %v1)
  %fma.2 = call float @llvm.fma.f32(float %v2, float %load.2, float %v2)
  %fma.3 = call float @llvm.fma.f32(float %v3, float %load.3, float %v3)

  ; Increment the counter; leave the loop once the incremented value is 42.
  %cnext = add nsw i32 %c, 1
  %cmp = icmp eq i32 %cnext, 42
  br i1 %cmp, label %exit, label %loop

exit:
  ; Each %evN is 0.0 when arriving from %entry (loop skipped), or the last
  ; fma result when arriving from %loop.
  %ev0 = phi float [0.0, %entry], [%fma.0, %loop]
  %ev1 = phi float [0.0, %entry], [%fma.1, %loop]
  %ev2 = phi float [0.0, %entry], [%fma.2, %loop]
  %ev3 = phi float [0.0, %entry], [%fma.3, %loop]
  ; Pack the four scalars into one <4 x float> and store it to %out.
  %dst.0 = insertelement <4 x float> undef,  float %ev0, i32 0
  %dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1
  %dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2
  %dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3
  store <4 x float> %dst.3, <4 x float> addrspace(1)* %out
  ret void
}
R600/SI: Fix tests with triples in them Only set the triple from the command line options. Some of these were still testing SI features and using the old r600-- triple. llvm-svn: 238958 2015-06-04 04:04:05 +08:00			`; RUN: llc -march=amdgcn -verify-machineinstrs -mtriple=amdgcn-- -o - %s \| FileCheck %s`
R600/RegisterCoalescer: Enable more rematerialization/add missing testcase This enables the rematerialization of some R600 MOV instructions in the RegisterCoalescer and adds a testcase for r235668. llvm-svn: 235675 2015-04-24 08:25:50 +08:00
			`declare float @llvm.fma.f32(float, float, float)`

			`; This checks that rematerialization support of the coalescer does not`
			`; unnecessarily widen the register class. Without those fixes > 20 VGprs`
			`; are used here`
			`; Also check that some rematerialization of the 0 constant happened.`
			`; CHECK-LABEL: foobar`
			`; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0`
			`; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0`
			`; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0`
			`; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0`
R600: Re-enable sub-reg liveness The bug in the R600 backend that this uncovered has been fixed. llvm-svn: 238999 2015-06-04 09:20:04 +08:00			`; It's probably OK if this is slightly higher:`
Revert r307026, "[AMDGPU] Switch scalarize global loads ON by default" It broke a testcase. Failing Tests (1): LLVM :: CodeGen/AMDGPU/alignbit-pat.ll llvm-svn: 307054 2017-07-04 10:14:18 +08:00			`; CHECK: ; NumVgprs: 8`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {`
R600/RegisterCoalescer: Enable more rematerialization/add missing testcase This enables the rematerialization of some R600 MOV instructions in the RegisterCoalescer and adds a testcase for r235668. llvm-svn: 235675 2015-04-24 08:25:50 +08:00			`entry:`
			`%cmpflag = icmp eq i32 %flag, 1`
			`br i1 %cmpflag, label %loop, label %exit`

			`loop:`
			`%c = phi i32 [0, %entry], [%cnext, %loop]`
			`%v0 = phi float [0.0, %entry], [%fma.0, %loop]`
			`%v1 = phi float [0.0, %entry], [%fma.1, %loop]`
			`%v2 = phi float [0.0, %entry], [%fma.2, %loop]`
			`%v3 = phi float [0.0, %entry], [%fma.3, %loop]`

			`; Try to get the 0 constant to get coalesced into a wide register`
			`%blup = insertelement <4 x float> undef, float %v0, i32 0`
			`store <4 x float> %blup, <4 x float> addrspace(1)* %out`

			`%load = load <4 x float>, <4 x float> addrspace(1)* %in`
			`%load.0 = extractelement <4 x float> %load, i32 0`
			`%load.1 = extractelement <4 x float> %load, i32 1`
			`%load.2 = extractelement <4 x float> %load, i32 2`
			`%load.3 = extractelement <4 x float> %load, i32 3`
			`%fma.0 = call float @llvm.fma.f32(float %v0, float %load.0, float %v0)`
			`%fma.1 = call float @llvm.fma.f32(float %v1, float %load.1, float %v1)`
			`%fma.2 = call float @llvm.fma.f32(float %v2, float %load.2, float %v2)`
			`%fma.3 = call float @llvm.fma.f32(float %v3, float %load.3, float %v3)`

			`%cnext = add nsw i32 %c, 1`
			`%cmp = icmp eq i32 %cnext, 42`
			`br i1 %cmp, label %exit, label %loop`

			`exit:`
			`%ev0 = phi float [0.0, %entry], [%fma.0, %loop]`
			`%ev1 = phi float [0.0, %entry], [%fma.1, %loop]`
			`%ev2 = phi float [0.0, %entry], [%fma.2, %loop]`
			`%ev3 = phi float [0.0, %entry], [%fma.3, %loop]`
			`%dst.0 = insertelement <4 x float> undef, float %ev0, i32 0`
			`%dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1`
			`%dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2`
			`%dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3`
			`store <4 x float> %dst.3, <4 x float> addrspace(1)* %out`
			`ret void`
			`}`