From e5340ed30ce6149ebffcd808e024da9daee2c98c Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Wed, 27 Oct 2021 11:40:33 -0700
Subject: [PATCH] [AMDGPU] Fix global isel for kernels using agprs on gfx90a

With GlobalISel, getReservedRegs() is called before the function is
regbank selected for the first time. Defer caching of usesAGPRs() in
this case.

Differential Revision: https://reviews.llvm.org/D112644
---
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |   3 +
 .../GlobalISel/llvm.amdgcn.mfma.gfx90a.ll     | 487 ++++++++++++++++++
 2 files changed, 490 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 1bfeb6c415b6..c4007f56f350 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -670,6 +670,9 @@ bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
     if (RC && SIRegisterInfo::isAGPRClass(RC)) {
       UsesAGPRs = true;
       return true;
+    } else if (!RC && !MRI.use_empty(Reg) && MRI.getType(Reg).isValid()) {
+      // Defer caching UsesAGPRs; the function might not have been regbank selected yet.
+      return true;
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
new file mode 100644
index 000000000000..cfb781709ea3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -0,0 +1,487 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN %s
+
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
+declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
+declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(<32 x float> addrspace(1)* %arg) {
+; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b64 s[16:17], 1
+; GCN-NEXT:    s_mov_b32 s18, 2
+; GCN-NEXT:    s_mov_b32 s19, s17
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[34:35], 0x0
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s7
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a16, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a17, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a18, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a19, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a20, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a21, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a22, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s23
+; GCN-NEXT:    v_accvgpr_write_b32 a23, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s24
+; GCN-NEXT:    v_accvgpr_write_b32 a24, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s25
+; GCN-NEXT:    v_accvgpr_write_b32 a25, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s26
+; GCN-NEXT:    v_accvgpr_write_b32 a26, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s27
+; GCN-NEXT:    v_accvgpr_write_b32 a27, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s28
+; GCN-NEXT:    v_accvgpr_write_b32 a28, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s29
+; GCN-NEXT:    v_accvgpr_write_b32 a29, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s30
+; GCN-NEXT:    v_accvgpr_write_b32 a30, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s31
+; GCN-NEXT:    v_accvgpr_write_b32 a31, v4
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[34:35]
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
+; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
+; GCN-NEXT:    global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
+; GCN-NEXT:    global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
+; GCN-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
+; GCN-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GCN-NEXT:    s_endpgm
+bb:
+  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
+  %a = bitcast i64 1 to <4 x i16>
+  %b = bitcast i64 2 to <4 x i16>
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) {
+; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b64 s[2:3], 1
+; GCN-NEXT:    s_mov_b32 s17, s3
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT:    s_mov_b32 s16, 2
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s7
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v4
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[18:19]
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[18:19] offset:16
+; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[18:19] offset:32
+; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[18:19] offset:48
+; GCN-NEXT:    s_endpgm
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %a = bitcast i64 1 to <4 x i16>
+  %b = bitcast i64 2 to <4 x i16>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) {
+; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b64 s[2:3], 1
+; GCN-NEXT:    s_mov_b32 s5, s3
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT:    s_mov_b32 s4, 2
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GCN-NEXT:    s_endpgm
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %a = bitcast i64 1 to <4 x i16>
+  %b = bitcast i64 2 to <4 x i16>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) {
+; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b64 s[2:3], 1
+; GCN-NEXT:    s_mov_b32 s17, s3
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT:    s_mov_b32 s16, 2
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s7
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s12
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s13
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s14
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s15
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v4
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[18:19]
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[18:19] offset:16
+; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[18:19] offset:32
+; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[18:19] offset:48
+; GCN-NEXT:    s_endpgm
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %a = bitcast i64 1 to <4 x i16>
+  %b = bitcast i64 2 to <4 x i16>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) {
+; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
+; GCN-NEXT:    s_mov_b64 s[2:3], 1
+; GCN-NEXT:    s_mov_b32 s5, s3
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT:    s_mov_b32 s4, 2
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GCN-NEXT:    s_endpgm
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %a = bitcast i64 1 to <4 x i16>
+  %b = bitcast i64 2 to <4 x i16>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f64_4x4x4f64(double addrspace(1)* %arg, double %a, double %b) {
+; GCN-LABEL: test_mfma_f64_4x4x4f64:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    global_store_dwordx2 v0, a[0:1], s[4:5]
+; GCN-NEXT:    s_endpgm
+bb:
+  %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
+  %mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
+  store double %mai.2, double addrspace(1)* %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64(<4 x double> addrspace(1)* %arg, double %a, double %b) {
+; GCN-LABEL: test_mfma_f64_16x16x4f64:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x34
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s1
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s7
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
+; GCN-NEXT:    s_endpgm
+bb:
+  %in.1 = load <4 x double>, <4 x double> addrspace(1)* %arg
+  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
+  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) {
+; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
+; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[4:5]
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[4:5] offset:16
+; GCN-NEXT:    s_endpgm
+bb:
+  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
+  store <4 x double> %mai.2, <4 x double> addrspace(1)* %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) {
+; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    s_mov_b64 s[10:11], 1.0
+; GCN-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s7
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1]
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[12:13]
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
+; GCN-NEXT:    s_endpgm
+bb:
+  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
+  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
+  ret void
+}
+
+define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(<4 x double> addrspace(1)* %arg, double %a, double %b) {
+; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s4, 0
+; GCN-NEXT:    s_mov_b32 s5, 0x405ec000
+; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GCN-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s6
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s7
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1]
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s10
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v4
+; GCN-NEXT:    v_mov_b32_e32 v4, s11
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v4
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[12:13]
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
+; GCN-NEXT:    s_endpgm
+bb:
+  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
+  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
+  ret void
+}
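
Editor's note: for readers who want the shape of the fix without tracing the hunk above, here is a minimal, self-contained C++ sketch of the deferred-caching pattern the patch introduces. It is not the LLVM source: VirtualReg, HasRegClass, and FunctionInfo are hypothetical stand-ins for the MachineRegisterInfo queries (getRegClassOrNull, use_empty, getType) used by the real usesAGPRs().

// Minimal sketch (not the LLVM sources) of the deferred-caching pattern above.
#include <optional>
#include <vector>

struct VirtualReg {
  bool HasRegClass;  // Register class assigned (regbank selection has run)?
  bool IsAGPRClass;  // Assigned class is an AGPR class?
  bool HasUses;      // Register has at least one use?
};

struct FunctionInfo {
  // Cached answer to "does this function use AGPRs?"; empty until definitive.
  mutable std::optional<bool> UsesAGPRs;

  bool usesAGPRs(const std::vector<VirtualReg> &Regs) const {
    if (UsesAGPRs)
      return *UsesAGPRs; // Definitive answer already cached.
    for (const VirtualReg &R : Regs) {
      if (R.HasRegClass && R.IsAGPRClass) {
        UsesAGPRs = true; // Definitive: an AGPR is used, safe to cache.
        return true;
      }
      if (!R.HasRegClass && R.HasUses) {
        // Generic register not yet regbank selected (GlobalISel calls
        // getReservedRegs() before RegBankSelect runs). Conservatively
        // report "uses AGPRs" but do NOT cache, so a later query can
        // still compute and cache the real answer.
        return true;
      }
    }
    UsesAGPRs = false; // Definitive: no AGPR use is possible, safe to cache.
    return false;
  }
};

The key design point mirrors the hunk: UsesAGPRs is assigned only on the two definitive paths, so the conservative early return taken before regbank selection cannot poison the cache.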