2019-10-19 06:48:45 +08:00
|
|
|
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
|
|
|
|
|
|
|
; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
|
2019-10-22 03:25:27 +08:00
|
|
|
|
|
|
|
; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
|
|
|
|
; 3 vgprs are needed to avoid wait states between writes.
|
|
|
|
|
|
|
|
; FIXME: We should not be using any temporary registers at all.
|
|
|
|
; At the moment we initialize an sgpr, then copy it via vgprs.
|
|
|
|
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2:v[0-9]+]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3:v[0-9]+]]
|
|
|
|
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1:v[0-9]+]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
|
|
|
|
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
|
|
|
|
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
|
|
|
|
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
|
|
|
|
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
|
|
|
|
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
|
|
|
|
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
|
|
|
|
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
|
|
|
|
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
|
|
|
|
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
|
|
|
|
|
|
|
|
; Check that we do not copy agprs to vgprs and back inside the loop.
|
|
|
|
|
2019-10-19 06:48:45 +08:00
|
|
|
; GCN: [[LOOP:BB[0-9_]+]]:
|
|
|
|
; GCN-NOT: v_accvgpr
|
|
|
|
; GCN: v_mfma_f32_32x32x1f32
|
|
|
|
; GCN-NOT: v_accvgpr
|
|
|
|
; GCN: s_cbranch_scc1 [[LOOP]]
|
2019-10-22 03:25:27 +08:00
|
|
|
|
|
|
|
; Final result should be read only once after the loop.
|
|
|
|
|
2019-10-19 06:48:45 +08:00
|
|
|
; GCN-COUNT-32: v_accvgpr_read_b32
|
2019-10-22 03:25:27 +08:00
|
|
|
|
2019-10-19 06:48:45 +08:00
|
|
|
; Kernel under test: a single-block loop that calls the 32x32x1f32 MFMA
; intrinsic with its own previous result as the accumulator operand. The
; accumulator starts as zeroinitializer, and the final <32 x float> result is
; stored to %arg after the loop.
define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
|
|
|
|
entry:
|
|
|
|
br label %for.cond.preheader
|
|
|
|
|
|
|
|
; Loop block: it is both the loop header and the latch (it branches to itself).
for.cond.preheader:
|
|
|
|
; Loop-carried accumulator: all zeros on entry, the previous MFMA result afterwards.
%phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
|
|
|
|
; Trip counter, starting at 0.
%c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
|
|
|
|
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
|
|
|
|
%inc = add nuw nsw i32 %c, 1
|
|
|
|
; Leave the loop once the incremented counter reaches 16, i.e. after 16 MFMA calls.
%cc = icmp eq i32 %inc, 16
|
|
|
|
br i1 %cc, label %exit, label %for.cond.preheader
|
|
|
|
|
|
|
|
exit:
|
|
|
|
; Only the result of the last iteration is written out.
store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2019-10-25 01:34:47 +08:00
|
|
|
; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
|
|
|
|
|
|
|
|
; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
|
|
|
|
; GCN: v_mfma_f32_32x32x1f32
|
|
|
|
; GCN-NOT: v_accvgpr
|
|
|
|
|
|
|
|
; GCN: [[LOOP:BB[0-9_]+]]:
|
|
|
|
; GCN-NOT: v_accvgpr
|
|
|
|
; GCN: v_mfma_f32_32x32x1f32
|
|
|
|
; GCN-NOT: v_accvgpr
|
|
|
|
; GCN: s_cbranch_scc1 [[LOOP]]
|
|
|
|
|
|
|
|
; GCN-COUNT-32: v_accvgpr_read_b32
|
|
|
|
; Kernel under test: same loop shape as a plain MFMA accumulation loop, but the
; loop-carried accumulator is seeded with the result of an MFMA call made in
; the entry block (which itself accumulates onto zeroinitializer). The final
; <32 x float> result is stored to %arg after the loop.
define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {
|
|
|
|
entry:
|
|
|
|
; NOTE(review): %tid and %init are not referenced anywhere else in this
; function; confirm whether they were meant to feed the initial MFMA call.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
|
|
|
%init = bitcast i32 %tid to float
|
|
|
|
; MFMA executed once before the loop; its result is the loop's initial accumulator.
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
|
|
|
|
|
|
|
|
br label %for.cond.preheader
|
|
|
|
|
|
|
|
; Loop block: it is both the loop header and the latch (it branches to itself).
for.cond.preheader:
|
|
|
|
; Loop-carried accumulator: %mai.0 on entry, the previous in-loop MFMA result afterwards.
%phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
|
|
|
|
; Trip counter, starting at 0.
%c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
|
|
|
|
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
|
|
|
|
%inc = add nuw nsw i32 %c, 1
|
|
|
|
; Leave the loop once the incremented counter reaches 16, i.e. after 16 in-loop MFMA calls.
%cc = icmp eq i32 %inc, 16
|
|
|
|
br i1 %cc, label %exit, label %for.cond.preheader
|
|
|
|
|
|
|
|
exit:
|
|
|
|
; Only the result of the last iteration is written out.
store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2019-10-19 06:48:45 +08:00
|
|
|
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
|
|
|
|
declare i32 @llvm.amdgcn.workitem.id.x()
|