llvm-project/llvm/test/CodeGen/AMDGPU/mfma-loop.ll

; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:

; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
; 3 vgprs are needed to avoid wait states between writes.

; FIXME: We should not be using any temporary registers at all.
; At the moment we initialize an sgpr, then copy it via vgprs.

; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2:v[0-9]+]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3:v[0-9]+]]

; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1:v[0-9]+]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; Check that we do not copy agprs to vgprs and back inside the loop.

; GCN: [[LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; Final result should be read only once after the loop: the <32 x float>
; value stored in the exit block is expected to be read back with 32
; v_accvgpr_read_b32 instructions.

; GCN-COUNT-32: v_accvgpr_read_b32

; Kernel under test: a 16-iteration loop whose only loop-carried vector value
; is the MFMA accumulator, seeded with zeroinitializer on entry.
define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  ; Accumulator: zero when coming from %entry, the previous MFMA result on
  ; the back edge.
  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
  ; Trip counter, starts at 0.
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  ; The accumulator operand is fed straight from the phi and the result feeds
  ; the phi again, so the value never needs to leave the MFMA operand.
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  ; Single use of the final accumulator outside the loop.
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:

; The zero accumulator of the MFMA in the entry block is expected to be
; written as 32 direct writes of the constant 0, with no further accvgpr
; traffic before the loop.

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr

; No accvgpr reads or writes are expected inside the loop body.

; GCN: [[LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; The <32 x float> result stored in the exit block is expected to be read
; back with 32 v_accvgpr_read_b32 instructions.

; GCN-COUNT-32: v_accvgpr_read_b32

; Kernel under test: same 16-iteration loop shape, but the accumulator phi is
; seeded by the result of an MFMA executed before the loop instead of by a
; constant.
define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {
entry:
  ; NOTE(review): %tid and %init are not referenced anywhere in this
  ; function; they are kept as-is so the test input stays unchanged.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  ; Pre-loop MFMA with a zero accumulator; its result seeds the loop phi.
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)

  br label %for.cond.preheader

for.cond.preheader:
  ; Accumulator: %mai.0 when coming from %entry, the previous in-loop MFMA
  ; result on the back edge.
  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
  ; Trip counter, starts at 0.
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  ; Single use of the final accumulator outside the loop.
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; MFMA intrinsic used by both kernels above: two float operands, a
; <32 x float> accumulator and three i32 operands (all passed as 0 here).
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
; Work-item id intrinsic; only called from the entry block of
; test_mfma_loop_mfma_forward_init.
declare i32 @llvm.amdgcn.workitem.id.x()
[AMDGPU] move PHI nodes to AGPR class If all uses of a PHI are in AGPR register class we should avoid unneeded copies via VGPRs. Differential Revision: https://reviews.llvm.org/D69200 llvm-svn: 375297 2019-10-19 06:48:45 +08:00			`; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s`

			`; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:`
[AMDGPU] Select AGPR in PHI operand legalization If a PHI defines AGPR legalize its operands to AGPR. At the moment we can get an AGPR PHI with VGPR operands. I am not aware of any problems as it seems to be handled gracefully in RA, but this is not right anyway. It also slightly decreases VGPR pressure in some cases because we do not have to a copy via VGPR. Differential Revision: https://reviews.llvm.org/D69206 llvm-svn: 375446 2019-10-22 03:25:27 +08:00
			`; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.`
			`; 3 vgprs are needed to avoid wait states between writes.`

			`; FIXME: We should not be using and temporary registers at all.`
			`; At the moment we initialize an sgpr, then copy it via vgprs.`

			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2:v[0-9]+]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3:v[0-9]+]]`

			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1:v[0-9]+]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]`

			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]`

			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]`

			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]`

			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]`

			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]`

			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]`

			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]`

			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]`
			`; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]`

			`; Check that we do not copy agprs to vgprs and back inside the loop.`

[AMDGPU] move PHI nodes to AGPR class If all uses of a PHI are in AGPR register class we should avoid unneeded copies via VGPRs. Differential Revision: https://reviews.llvm.org/D69200 llvm-svn: 375297 2019-10-19 06:48:45 +08:00			`; GCN: [[LOOP:BB[0-9_]+]]:`
			`; GCN-NOT: v_accvgpr`
			`; GCN: v_mfma_f32_32x32x1f32`
			`; GCN-NOT: v_accvgpr`
			`; GCN: s_cbranch_scc1 [[LOOP]]`
[AMDGPU] Select AGPR in PHI operand legalization If a PHI defines AGPR legalize its operands to AGPR. At the moment we can get an AGPR PHI with VGPR operands. I am not aware of any problems as it seems to be handled gracefully in RA, but this is not right anyway. It also slightly decreases VGPR pressure in some cases because we do not have to a copy via VGPR. Differential Revision: https://reviews.llvm.org/D69206 llvm-svn: 375446 2019-10-22 03:25:27 +08:00
			`; Final result should be read only once after the loop.`

[AMDGPU] move PHI nodes to AGPR class If all uses of a PHI are in AGPR register class we should avoid unneeded copies via VGPRs. Differential Revision: https://reviews.llvm.org/D69200 llvm-svn: 375297 2019-10-19 06:48:45 +08:00			`; GCN-COUNT32: v_accvgpr_read_b32`
[AMDGPU] Select AGPR in PHI operand legalization If a PHI defines AGPR legalize its operands to AGPR. At the moment we can get an AGPR PHI with VGPR operands. I am not aware of any problems as it seems to be handled gracefully in RA, but this is not right anyway. It also slightly decreases VGPR pressure in some cases because we do not have to a copy via VGPR. Differential Revision: https://reviews.llvm.org/D69206 llvm-svn: 375446 2019-10-22 03:25:27 +08:00
[AMDGPU] move PHI nodes to AGPR class If all uses of a PHI are in AGPR register class we should avoid unneeded copies via VGPRs. Differential Revision: https://reviews.llvm.org/D69200 llvm-svn: 375297 2019-10-19 06:48:45 +08:00			`define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {`
			`entry:`
			`br label %for.cond.preheader`

			`for.cond.preheader:`
			`%phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]`
			`%c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]`
			`%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)`
			`%inc = add nuw nsw i32 %c, 1`
			`%cc = icmp eq i32 %inc, 16`
			`br i1 %cc, label %exit, label %for.cond.preheader`

			`exit:`
			`store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg`
			`ret void`
			`}`

[AMDGPU] Fix mfma scheduling crash An SUnit can be neither intruction not SDNode. It is all null if represents a nop. Fixed a crash on using SU->getInstr(). Differential Revision: https://reviews.llvm.org/D69395 2019-10-25 01:34:47 +08:00			`; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:`

			`; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, 0`
			`; GCN: v_mfma_f32_32x32x1f32`
			`; GCN-NOT: v_accvgpr`

			`; GCN: [[LOOP:BB[0-9_]+]]:`
			`; GCN-NOT: v_accvgpr`
			`; GCN: v_mfma_f32_32x32x1f32`
			`; GCN-NOT: v_accvgpr`
			`; GCN: s_cbranch_scc1 [[LOOP]]`

			`; GCN-COUNT32: v_accvgpr_read_b32`
			`define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {`
			`entry:`
			`%tid = call i32 @llvm.amdgcn.workitem.id.x()`
			`%init = bitcast i32 %tid to float`
			`%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)`

			`br label %for.cond.preheader`

			`for.cond.preheader:`
			`%phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]`
			`%c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]`
			`%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)`
			`%inc = add nuw nsw i32 %c, 1`
			`%cc = icmp eq i32 %inc, 16`
			`br i1 %cc, label %exit, label %for.cond.preheader`

			`exit:`
			`store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg`
			`ret void`
			`}`

[AMDGPU] move PHI nodes to AGPR class If all uses of a PHI are in AGPR register class we should avoid unneeded copies via VGPRs. Differential Revision: https://reviews.llvm.org/D69200 llvm-svn: 375297 2019-10-19 06:48:45 +08:00			`declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)`
			`declare i32 @llvm.amdgcn.workitem.id.x()`