From f5a2848376b488322e5573fef8a239befe110f3a Mon Sep 17 00:00:00 2001 From: Farhana Aleen Date: Tue, 18 Sep 2018 16:59:48 +0000 Subject: [PATCH] [AMDGPU] Match udot8 pattern Summary: D.u32 = S0.u4[0] * S1.u4[0] + S0.u4[1] * S1.u4[1] + S0.u4[2] * S1.u4[2] + S0.u4[3] * S1.u4[3] + S0.u4[4] * S1.u4[4] + S0.u4[5] * S1.u4[5] + S0.u4[6] * S1.u4[6] + S0.u4[7] * S1.u4[7] + S2.u32 Author: FarhanaAleen Reviewed By: arsenm, nhaehnle Differential Revision: https://reviews.llvm.org/D51947 llvm-svn: 342497 --- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 69 +- llvm/test/CodeGen/AMDGPU/idot8.ll | 2503 +++++++++++++++++++ 2 files changed, 2550 insertions(+), 22 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/idot8.ll diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 3154dcfdd459..83e95b553804 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -168,34 +168,53 @@ defm : MadFmaMixPats; class Srl : PatFrag<(ops node:$src), (srl node:$src, (i32 N))>; -foreach Bits = [8, 16, 24] in { - def srl#Bits : Srl; -} +foreach Bits = 1-7 in + def srl#!shl(Bits, 2) : Srl; -def and_255 : PatFrag< - (ops node:$src0), (and node:$src0, (i32 255)) ->; - -class Extract_U8 : PatFrag<( - ops node:$src), - !if (!eq (FromBitIndex, 24), // last element +class Extract_U : PatFrag< + (ops node:$src), + !if (!or (!and (!eq (BitMask, 255), !eq (FromBitIndex, 24)), + !and (!eq (BitMask, 15), !eq (FromBitIndex, 28))), // last element (!cast("srl"#FromBitIndex) node:$src), !if (!eq (FromBitIndex, 0), // first element - (and_255 node:$src), - (and_255 (!cast("srl"#FromBitIndex) node:$src))))>; + (and node:$src, (i32 BitMask)), + (and (!cast("srl"#FromBitIndex) node:$src), (i32 BitMask))))>; -// Defines patterns that extract each Index'ed 8bit from a 32bit scalar value; -foreach Index = [1, 2, 3, 4] in { - def UElt#Index : Extract_U8; -} +foreach Index = 0-3 in { + // Defines patterns that 
extract each Index'ed 8bit from an unsigned + // 32bit scalar value; + def U#Index#"_8bit" : Extract_U; -// Defines multiplication patterns where the multiplication is happening on each -// Index'ed 8bit of a 32bit scalar value. -foreach Index = [1, 2, 3, 4] in { + // Defines multiplication patterns where the multiplication is happening on each + // Index'ed 8bit of a 32bit scalar value. def MulU_Elt#Index : PatFrag< (ops node:$src0, node:$src1), - (AMDGPUmul_u24_oneuse (!cast("UElt"#Index) node:$src0), - (!cast("UElt"#Index) node:$src1))>; + (AMDGPUmul_u24_oneuse (!cast("U"#Index#"_8bit") node:$src0), + (!cast("U"#Index#"_8bit") node:$src1))>; +} + +// Different variants of dot8 patterns cause a huge increase in the compile time. +// Define non-associative/commutative add/mul to prevent permutation in the dot8 +// pattern. +def NonACAdd : SDNode<"ISD::ADD" , SDTIntBinOp>; +def NonACAdd_oneuse : HasOneUseBinOp; + +def NonACAMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24" , SDTIntBinOp>; +def NonACAMDGPUmul_u24_oneuse : HasOneUseBinOp; + +foreach Index = 0-7 in { + // Defines patterns that extract each Index'ed 4bit from an unsigned + // 32bit scalar value; + def U#Index#"_4bit" : Extract_U; + + // Defines multiplication patterns where the multiplication is happening on each + // Index'ed 4bit of a 32bit scalar value. 
+ def MulU#Index#"_4bit" : PatFrag< + (ops node:$src0, node:$src1), + (NonACAMDGPUmul_u24_oneuse (!cast("U"#Index#"_4bit") node:$src0), + (!cast("U"#Index#"_4bit") node:$src1))>; } class UDot2Pat : GCNPat < @@ -246,11 +265,17 @@ def : UDot2Pat; def : SDot2Pat; def : GCNPat < - !cast(!foldl((i32 i32:$src2), [1, 2, 3, 4], lhs, y, + !cast(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y, (add_oneuse lhs, (!cast("MulU_Elt"#y) i32:$src0, i32:$src1)))), (V_DOT4_U32_U8 (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0)) >; +def : GCNPat < + !cast(!foldl((add_oneuse i32:$src2, (MulU0_4bit i32:$src0, i32:$src1)), [1, 2, 3, 4, 5, 6, 7], lhs, y, + (NonACAdd_oneuse lhs, (!cast("MulU"#y#"_4bit") i32:$src0, i32:$src1)))), + (V_DOT8_U32_U4 (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0)) +>; + } // End SubtargetPredicate = HasDLInsts multiclass VOP3P_Real_vi op> { diff --git a/llvm/test/CodeGen/AMDGPU/idot8.ll b/llvm/test/CodeGen/AMDGPU/idot8.ll new file mode 100644 index 000000000000..94f93b002b7a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/idot8.ll @@ -0,0 +1,2503 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN-DL %s + +define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, +; GFX7-LABEL: udot8_acc32: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; 
GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s8, s0, 28 +; GFX7-NEXT: s_lshr_b32 s15, s1, 28 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s21, s1, 0x40004 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s16 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: udot8_acc32: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s0, s2, 28 +; GFX8-NEXT: s_lshr_b32 s11, s4, 28 +; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 +; 
GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX8-NEXT: s_and_b32 s4, s4, 15 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s2, s2, 15 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: udot8_acc32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s2, 28 +; GFX9-NEXT: s_lshr_b32 s11, s4, 28 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 +; 
GFX9-NEXT: s_and_b32 s4, s4, 15 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GCN-DL-LABEL: udot8_acc32: +; GCN-DL: ; %bb.0: ; %entry +; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s5 +; GCN-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3 +; GCN-DL-NEXT: global_store_dword v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %cv1e0 = zext i4 %v1e0 to i32 + %v2e0 = extractelement <8 x i4> %vec2, 
i64 0 + %cv2e0 = zext i4 %v2e0 to i32 + %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %cv1e1 = zext i4 %v1e1 to i32 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %cv2e1 = zext i4 %v2e1 to i32 + %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %cv1e2 = zext i4 %v1e2 to i32 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %cv2e2 = zext i4 %v2e2 to i32 + %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %cv1e3 = zext i4 %v1e3 to i32 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %cv2e3 = zext i4 %v2e3 to i32 + %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %cv1e4 = zext i4 %v1e4 to i32 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %cv2e4 = zext i4 %v2e4 to i32 + %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %cv1e5 = zext i4 %v1e5 to i32 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %cv2e5 = zext i4 %v2e5 to i32 + %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %cv1e6 = zext i4 %v1e6 to i32 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %cv2e6 = zext i4 %v2e6 to i32 + %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %cv1e7 = zext i4 %v1e7 to i32 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %cv2e7 = zext i4 %v2e7 to i32 + %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7 + + %acc = load i32, i32 addrspace(1)* %dst, align 4 + %add1 = add i32 %mul0, %acc + %add2 = add i32 %add1, %mul1 + %add3 = add i32 %add2, %mul2 + %add4 = add i32 %add3, %mul3 + %add5 = add i32 %add4, %mul4 + %add6 = add i32 %add5, %mul5 + %add7 = add i32 %add6, %mul6 + %add8 = add i32 %add7, %mul7 + + store i32 %add8, i32 addrspace(1)* %dst, align 4 + ret void +} + +; TODO: Remove the unnecessary instruction (the one zero-extending the +; 2nd MAD) so that the pattern recognizer can kick in. 
+define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, +; GFX7-LABEL: udot8_acc16: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX7-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s14, s1, 28 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s16 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[8:11], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: udot8_acc16: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s0, s2, 15 +; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX8-NEXT: s_lshr_b32 s14, s4, 28 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s12 +; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: udot8_acc16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_lshr_b32 s14, s4, 28 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GCN-DL-LABEL: udot8_acc16: +; GCN-DL: ; %bb.0: ; %entry +; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: global_load_ushort v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) 
+; GCN-DL-NEXT: s_and_b32 s0, s2, 15 +; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s5 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s14, s4, 28 +; GCN-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v5, s5 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GCN-DL-NEXT: v_mov_b32_e32 v6, s4 +; GCN-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GCN-DL-NEXT: v_mov_b32_e32 v7, s8 +; GCN-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GCN-DL-NEXT: v_mov_b32_e32 v8, s10 +; GCN-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GCN-DL-NEXT: v_mov_b32_e32 v9, s12 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s14 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GCN-DL-NEXT: global_store_short v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i16 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %cv1e0 = zext i4 %v1e0 to i16 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %cv2e0 = zext i4 %v2e0 to i16 + %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %cv1e1 = zext i4 %v1e1 to i16 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %cv2e1 
= zext i4 %v2e1 to i16 + %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %cv1e2 = zext i4 %v1e2 to i16 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %cv2e2 = zext i4 %v2e2 to i16 + %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %cv1e3 = zext i4 %v1e3 to i16 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %cv2e3 = zext i4 %v2e3 to i16 + %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %cv1e4 = zext i4 %v1e4 to i16 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %cv2e4 = zext i4 %v2e4 to i16 + %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %cv1e5 = zext i4 %v1e5 to i16 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %cv2e5 = zext i4 %v2e5 to i16 + %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %cv1e6 = zext i4 %v1e6 to i16 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %cv2e6 = zext i4 %v2e6 to i16 + %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %cv1e7 = zext i4 %v1e7 to i16 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %cv2e7 = zext i4 %v2e7 to i16 + %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7 + + %acc = load i16, i16 addrspace(1)* %dst, align 4 + %add1 = add i16 %mul0, %acc + %add2 = add i16 %add1, %mul1 + %add3 = add i16 %add2, %mul2 + %add4 = add i16 %add3, %mul3 + %add5 = add i16 %add4, %mul4 + %add6 = add i16 %add5, %mul5 + %add7 = add i16 %add6, %mul6 + %add8 = add i16 %add7, %mul7 + + store i16 %add8, i16 addrspace(1)* %dst, align 4 + ret void +} + +; TODO: Remove the unnecessary instruction (the one zero-extending the +; 2nd MAD) so that the pattern recognizer can kick in. 
+define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, +; GFX7-LABEL: udot8_acc8: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; GFX7-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s14, s1, 28 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s16 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: udot8_acc8: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s0, s2, 15 +; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX8-NEXT: s_lshr_b32 s14, s4, 28 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s12 +; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: udot8_acc8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_lshr_b32 s14, s4, 28 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GCN-DL-LABEL: udot8_acc8: +; GCN-DL: ; %bb.0: ; %entry +; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN-DL-NEXT: s_and_b32 s0, s2, 15 +; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s5 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s14, s4, 28 +; GCN-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v5, s5 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GCN-DL-NEXT: v_mov_b32_e32 v6, s4 +; GCN-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GCN-DL-NEXT: v_mov_b32_e32 v7, s8 +; GCN-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GCN-DL-NEXT: v_mov_b32_e32 v8, s10 +; GCN-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GCN-DL-NEXT: v_mov_b32_e32 v9, s12 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s14 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GCN-DL-NEXT: global_store_byte v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i8 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %cv1e0 = zext i4 %v1e0 to i8 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %cv2e0 = zext i4 %v2e0 to i8 + %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %cv1e1 = zext i4 %v1e1 to i8 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %cv2e1 = zext i4 
%v2e1 to i8 + %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %cv1e2 = zext i4 %v1e2 to i8 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %cv2e2 = zext i4 %v2e2 to i8 + %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %cv1e3 = zext i4 %v1e3 to i8 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %cv2e3 = zext i4 %v2e3 to i8 + %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %cv1e4 = zext i4 %v1e4 to i8 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %cv2e4 = zext i4 %v2e4 to i8 + %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %cv1e5 = zext i4 %v1e5 to i8 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %cv2e5 = zext i4 %v2e5 to i8 + %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %cv1e6 = zext i4 %v1e6 to i8 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %cv2e6 = zext i4 %v2e6 to i8 + %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %cv1e7 = zext i4 %v1e7 to i8 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %cv2e7 = zext i4 %v2e7 to i8 + %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7 + + %acc = load i8, i8 addrspace(1)* %dst, align 4 + %add1 = add i8 %mul0, %acc + %add2 = add i8 %add1, %mul1 + %add3 = add i8 %add2, %mul2 + %add4 = add i8 %add3, %mul3 + %add5 = add i8 %add4, %mul4 + %add6 = add i8 %add5, %mul5 + %add7 = add i8 %add6, %mul6 + %add8 = add i8 %add7, %mul7 + + store i8 %add8, i8 addrspace(1)* %dst, align 4 + ret void +} + +; TODO: Remove the two unnecessary instructions (and+add after the 2nd MAD) +; so that the pattern recognizer can kick in. 
+define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, +; GFX7-LABEL: udot8_acc4: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; GFX7-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s14, s1, 28 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s16 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: udot8_acc4: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s0, s2, 15 +; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX8-NEXT: v_mul_u32_u24_e32 v4, s7, v4 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x4000c +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v6, s5 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX8-NEXT: s_lshr_b32 s4, s4, 28 +; GFX8-NEXT: v_mov_b32_e32 v9, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: udot8_acc4: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX9-NEXT: v_mul_u32_u24_e32 v4, s7, v4 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x4000c +; GFX9-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s4, 28 +; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GCN-DL-LABEL: udot8_acc4: +; GCN-DL: ; %bb.0: ; %entry +; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: 
s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_and_b32 s0, s2, 15 +; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s6 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v5, s5 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GCN-DL-NEXT: v_mul_u32_u24_e32 v4, s7, v4 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x4000c +; GCN-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GCN-DL-NEXT: s_bfe_u32 s7, s4, 0x40010 +; GCN-DL-NEXT: v_mov_b32_e32 v6, s5 +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GCN-DL-NEXT: v_mov_b32_e32 v7, s7 +; GCN-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s9, s4, 0x40018 +; GCN-DL-NEXT: v_mov_b32_e32 v8, s8 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GCN-DL-NEXT: v_mov_b32_e32 v9, s9 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GCN-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v8, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s4 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GCN-DL-NEXT: global_store_byte v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i4 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* 
%src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %mul0 = mul nuw nsw i4 %v1e0, %v2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %mul1 = mul nuw nsw i4 %v1e1, %v2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %mul2 = mul nuw nsw i4 %v1e2, %v2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %mul3 = mul nuw nsw i4 %v1e3, %v2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %mul4 = mul nuw nsw i4 %v1e4, %v2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %mul5 = mul nuw nsw i4 %v1e5, %v2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %mul6 = mul nuw nsw i4 %v1e6, %v2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %mul7 = mul nuw nsw i4 %v1e7, %v2e7 + + %acc = load i4, i4 addrspace(1)* %dst, align 4 + %add1 = add i4 %mul0, %acc + %add2 = add i4 %add1, %mul1 + %add3 = add i4 %add2, %mul2 + %add4 = add i4 %add3, %mul3 + %add5 = add i4 %add4, %mul4 + %add6 = add i4 %add5, %mul5 + %add7 = add i4 %add6, %mul6 + %add8 = add i4 %add7, %mul7 + + store i4 %add8, i4 addrspace(1)* %dst, align 4 + ret void +} + +; TODO: Currently, permutation of udot8 is turned off due to a huge increase +; in the compile time. 
+define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1, +; GFX7-LABEL: udot8_CommutationInsideMAD: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; GFX7-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s14, s1, 28 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s16 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: udot8_CommutationInsideMAD: +; GFX8: ; 
%bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s0, s2, 15 +; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v8, s9 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX8-NEXT: s_lshr_b32 s4, s4, 28 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 +; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: udot8_CommutationInsideMAD: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s4, 28 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GCN-DL-LABEL: udot8_CommutationInsideMAD: +; GCN-DL: ; %bb.0: ; %entry +; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 
+; GCN-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_and_b32 s0, s2, 15 +; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s5 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GCN-DL-NEXT: v_mov_b32_e32 v5, s6 +; GCN-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GCN-DL-NEXT: v_mov_b32_e32 v6, s7 +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GCN-DL-NEXT: v_mov_b32_e32 v7, s8 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s10, s4, 0x40018 +; GCN-DL-NEXT: v_mov_b32_e32 v8, s9 +; GCN-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GCN-DL-NEXT: v_mov_b32_e32 v9, s10 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s5, v5, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s8, v8, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s4 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GCN-DL-NEXT: global_store_byte v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i4 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %mul0 = mul nuw nsw i4 %v1e0, %v2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %mul1 = mul nuw nsw i4 %v1e1, 
%v2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %mul2 = mul nuw nsw i4 %v1e2, %v2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %mul3 = mul nuw nsw i4 %v1e3, %v2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %mul4 = mul nuw nsw i4 %v1e4, %v2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %mul5 = mul nuw nsw i4 %v1e5, %v2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %mul6 = mul nuw nsw i4 %v1e6, %v2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %mul7 = mul nuw nsw i4 %v1e7, %v2e7 + + %acc = load i4, i4 addrspace(1)* %dst, align 4 + %add1 = add i4 %mul0, %acc + %add2 = add i4 %mul1, %add1 + %add3 = add i4 %mul2, %add2 + %add4 = add i4 %mul3, %add3 + %add5 = add i4 %mul4, %add4 + %add6 = add i4 %mul5, %add5 + %add7 = add i4 %mul6, %add6 + %add8 = add i4 %mul7, %add7 + + store i4 %add8, i4 addrspace(1)* %dst, align 4 + ret void +} + +define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, +; GFX7-LABEL: udot8_multiuses_mul1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s8, s0, 28 +; GFX7-NEXT: s_bfe_u32 s21, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s15, s1, 28 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40008 +; 
GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_u32_u24 v1, s0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, s14, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mad_u32_u24 v1, s13, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s17 +; GFX7-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s16 +; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: udot8_multiuses_mul1: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s0, s2, 28 +; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX8-NEXT: s_lshr_b32 s11, s4, 28 +; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX8-NEXT: s_and_b32 s4, s4, 15 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 
s6, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s2, s2, 15 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mad_u32_u24 v3, s2, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, s17 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 +; GFX8-NEXT: v_mad_u32_u24 v3, s10, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mad_u32_u24 v3, s9, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: v_mad_u32_u24 v3, s8, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 +; GFX8-NEXT: v_mad_u32_u24 v3, s7, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, s13 +; GFX8-NEXT: v_mad_u32_u24 v3, s6, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NEXT: v_mad_u32_u24 v3, s1, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, s11 +; GFX8-NEXT: v_mad_u32_u24 v3, s0, v4, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: udot8_multiuses_mul1: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s2, 28 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX9-NEXT: s_lshr_b32 s11, s4, 28 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-NEXT: s_and_b32 s4, s4, 15 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: 
s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mad_u32_u24 v3, s2, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 +; GFX9-NEXT: v_mad_u32_u24 v3, s10, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mad_u32_u24 v3, s9, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: v_mad_u32_u24 v3, s8, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-NEXT: v_mad_u32_u24 v3, s7, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, s13 +; GFX9-NEXT: v_mad_u32_u24 v3, s6, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-NEXT: v_mad_u32_u24 v3, s1, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-NEXT: v_mad_u32_u24 v3, s0, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GCN-DL-LABEL: udot8_multiuses_mul1: +; GCN-DL: ; %bb.0: ; %entry +; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_lshr_b32 s0, s2, 28 +; GCN-DL-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GCN-DL-NEXT: s_lshr_b32 s11, s4, 28 +; GCN-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GCN-DL-NEXT: s_bfe_u32 s13, s4, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s15, s4, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GCN-DL-NEXT: s_and_b32 s4, s4, 15 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s10, s2, 
0x40004 +; GCN-DL-NEXT: s_and_b32 s2, s2, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s5 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s2, v2, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s17 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v2, v3 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s10, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s16 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s9, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s15 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s8, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s14 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s7, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s13 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s6, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s12 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s1, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s11 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s0, v4, v3 +; GCN-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GCN-DL-NEXT: global_store_dword v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %cv1e0 = zext i4 %v1e0 to i32 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %cv2e0 = zext i4 %v2e0 to i32 + %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %cv1e1 = zext i4 %v1e1 to i32 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %cv2e1 = zext i4 %v2e1 to i32 + %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %cv1e2 = zext i4 %v1e2 to i32 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %cv2e2 = zext i4 %v2e2 to i32 + %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %cv1e3 = zext i4 %v1e3 to i32 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %cv2e3 = zext i4 %v2e3 to i32 + %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %cv1e4 = zext i4 %v1e4 to i32 + %v2e4 = 
extractelement <8 x i4> %vec2, i64 4 + %cv2e4 = zext i4 %v2e4 to i32 + %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %cv1e5 = zext i4 %v1e5 to i32 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %cv2e5 = zext i4 %v2e5 to i32 + %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %cv1e6 = zext i4 %v1e6 to i32 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %cv2e6 = zext i4 %v2e6 to i32 + %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %cv1e7 = zext i4 %v1e7 to i32 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %cv2e7 = zext i4 %v2e7 to i32 + %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7 + + %acc = load i32, i32 addrspace(1)* %dst, align 4 + %add1 = add i32 %mul0, %acc + %add = add i32 %mul0, %add1 + %add2 = add i32 %add1, %mul1 + %add3 = add i32 %add2, %mul2 + %add4 = add i32 %add3, %mul3 + %add5 = add i32 %add4, %mul4 + %add6 = add i32 %add5, %mul5 + %add7 = add i32 %add6, %mul6 + %add8 = add i32 %add7, %mul7 + + %res = add i32 %add, %add8 + store i32 %res, i32 addrspace(1)* %dst, align 4 + ret void +} + +define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, +; GFX7-LABEL: udot8_acc32_vecMul: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s8, s0, 28 +; GFX7-NEXT: s_lshr_b32 s15, s1, 28 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s21, s1, 0x40004 +; GFX7-NEXT: s_and_b32 s1, s1, 15 
+; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s16 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: udot8_acc32_vecMul: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s0, s2, 28 +; GFX8-NEXT: s_lshr_b32 s11, s4, 28 +; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX8-NEXT: s_and_b32 s4, s4, 15 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 
s9, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s2, s2, 15 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s12 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: udot8_acc32_vecMul: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s2, 28 +; GFX9-NEXT: s_lshr_b32 s11, s4, 28 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX9-NEXT: s_and_b32 s4, s4, 15 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, 
v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GCN-DL-LABEL: udot8_acc32_vecMul: +; GCN-DL: ; %bb.0: ; %entry +; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s5 +; GCN-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3 +; GCN-DL-NEXT: global_store_dword v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %cvec1 = zext <8 x i4> %vec1 to <8 x i32> + %cvec2 = zext <8 x i4> %vec2 to <8 x i32> + + %mul = mul <8 x i32> %cvec1, %cvec2 + %mul0 = extractelement <8 x i32> %mul, i64 0 + %mul1 = extractelement <8 x i32> %mul, i64 1 + %mul2 = extractelement <8 x i32> %mul, i64 2 + %mul3 = extractelement <8 x i32> %mul, i64 3 + %mul4 = extractelement <8 x i32> %mul, i64 4 + %mul5 = extractelement <8 x i32> %mul, i64 5 + %mul6 = extractelement <8 x i32> %mul, i64 6 + %mul7 = extractelement <8 x i32> %mul, i64 7 + + %acc = load i32, i32 
addrspace(1)* %dst, align 4 + %add1 = add i32 %mul0, %acc + %add2 = add i32 %add1, %mul1 + %add3 = add i32 %add2, %mul2 + %add4 = add i32 %add3, %mul3 + %add5 = add i32 %add4, %mul4 + %add6 = add i32 %add5, %mul5 + %add7 = add i32 %add6, %mul6 + %add8 = add i32 %add7, %mul7 + + store i32 %add8, i32 addrspace(1)* %dst, align 4 + ret void +} + +; TODO: Clean up the code (by default pk_mad_I16 should be generated), then +; support the pattern. +define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, +; GFX7-LABEL: udot8_acc16_vecMul: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40004 +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40004 +; GFX7-NEXT: s_bfe_u32 s20, s1, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: s_bfe_u32 s14, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40010 +; GFX7-NEXT: s_lshr_b32 s16, s1, 28 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40018 +; GFX7-NEXT: s_and_b32 s19, s1, 15 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x40008 +; GFX7-NEXT: v_mul_u32_u24_e32 v2, s13, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v4, s11, v4 +; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX7-NEXT: s_lshr_b32 s9, s0, 28 +; GFX7-NEXT: v_mov_b32_e32 v6, s16 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40018 +; GFX7-NEXT: s_and_b32 s12, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: v_mul_u32_u24_e32 v6, s9, v6 +; GFX7-NEXT: v_mul_u32_u24_e32 v1, s0, v1 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v3, s12, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX7-NEXT: v_mul_u32_u24_e32 v5, s10, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, s14 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX7-NEXT: v_alignbit_b32 v5, v1, v2, 16 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: v_mul_u32_u24_e32 v8, s2, v8 +; GFX7-NEXT: v_mul_u32_u24_e32 v7, s8, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: udot8_acc16_vecMul: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s0, s2, 15 +; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s4, 
0x40018 +; GFX8-NEXT: s_lshr_b32 s14, s4, 28 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s10 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s12 +; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: udot8_acc16_vecMul: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, s0, v3 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GFX9-NEXT: 
s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40014 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s1, v4 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s1, s4, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s4, 28 +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_bfe_u32 s0, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, s5, v5 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GCN-DL-LABEL: udot8_acc16_vecMul: +; GCN-DL: ; %bb.0: ; %entry +; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: global_load_ushort v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_and_b32 s0, s2, 15 
+; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x40004 +; GCN-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GCN-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GCN-DL-NEXT: v_pk_mul_lo_u16 v3, s0, v3 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GCN-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s5 +; GCN-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s7, s4, 0x40014 +; GCN-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GCN-DL-NEXT: v_pk_mul_lo_u16 v4, s1, v4 +; GCN-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GCN-DL-NEXT: v_mov_b32_e32 v5, s0 +; GCN-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GCN-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GCN-DL-NEXT: v_pk_mul_lo_u16 v5, s5, v5 +; GCN-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GCN-DL-NEXT: v_mov_b32_e32 v6, s1 +; GCN-DL-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GCN-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GCN-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GCN-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GCN-DL-NEXT: v_add_u32_e32 v2, v2, v5 +; GCN-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GCN-DL-NEXT: v_add_u32_e32 v2, v2, v6 +; GCN-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GCN-DL-NEXT: global_store_short v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i16 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %cvec1 = zext <8 x i4> %vec1 to <8 x i16> + %cvec2 = zext <8 x i4> %vec2 to <8 x i16> + + %mul = mul <8 x i16> %cvec1, %cvec2 + %mul0 = extractelement <8 x i16> %mul, i64 0 + %mul1 = extractelement <8 x i16> %mul, i64 1 + %mul2 = extractelement <8 x i16> %mul, i64 2 + %mul3 = extractelement <8 x i16> %mul, i64 3 + %mul4 = extractelement <8 x i16> %mul, i64 4 + %mul5 = extractelement <8 x i16> %mul, i64 5 + %mul6 = extractelement <8 x i16> %mul, i64 6 + %mul7 = extractelement <8 x i16> %mul, i64 7 + + %acc = load i16, i16 addrspace(1)* %dst, align 4 + %add1 = add i16 %mul0, %acc + %add2 = add i16 %add1, %mul1 + %add3 = add i16 %add2, %mul2 + %add4 = add i16 %add3, %mul3 + %add5 = add i16 %add4, %mul4 + %add6 = add i16 %add5, %mul5 + %add7 = add i16 %add6, %mul6 + %add8 = add i16 %add7, %mul7 + + store i16 %add8, i16 addrspace(1)* %dst, align 4 + ret void +} + +; TODO: Clean up the code to generate MAD; the pattern should be recognized then.
+define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, +; GFX7-LABEL: udot8_acc8_vecMul: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_bfe_u32 s2, s0, 0x4000c +; GFX7-NEXT: s_lshr_b32 s11, s0, 28 +; GFX7-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX7-NEXT: s_lshr_b32 s18, s1, 28 +; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: v_mov_b32_e32 v8, s14 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX7-NEXT: s_and_b32 s17, s1, 15 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mul_u32_u24_e32 v2, s13, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v4, s11, v4 +; GFX7-NEXT: v_mul_u32_u24_e32 v8, s2, v8 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v6, s16 +; GFX7-NEXT: s_and_b32 s10, s0, 15 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40018 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: v_mul_u32_u24_e32 v6, s9, v6 +; GFX7-NEXT: v_mul_u32_u24_e32 v1, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v3, s12, v3 +; GFX7-NEXT: v_mul_u32_u24_e32 v7, s8, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX7-NEXT: 
v_mul_u32_u24_e32 v5, s10, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 8 +; GFX7-NEXT: v_alignbit_b32 v4, v1, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: udot8_acc8_vecMul: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_bfe_u32 s0, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX8-NEXT: s_and_b32 s6, s4, 15 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NEXT: s_and_b32 s9, s2, 15 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s10, v3 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s9, v6 +; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_bfe_u32 s0, s2, 0x40014 +; GFX8-NEXT: s_lshr_b32 s1, s2, 28 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40010 +; GFX8-NEXT: s_lshr_b32 s7, s4, 28 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NEXT: v_mov_b32_e32 v8, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, s6 +; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_mov_b32_e32 v11, s0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_u32_u24_e32 v6, s2, v6 +; GFX8-NEXT: v_mul_u32_u24_e32 v8, s8, v9 +; GFX8-NEXT: v_mul_u32_u24_sdwa v9, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: udot8_acc8_vecMul: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: v_mul_lo_u16_e32 v3, s0, v3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s8, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v5, s9, v5 +; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s10, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_bfe_u32 s1, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX9-NEXT: s_lshr_b32 s4, s4, 28 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: v_mul_lo_u16_e32 v4, s6, v4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, s7, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v6, s8, v6 +; GFX9-NEXT: v_mul_lo_u16_sdwa v7, s2, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GCN-DL-LABEL: udot8_acc8_vecMul: +; GCN-DL: ; %bb.0: ; %entry +; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; 
GCN-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_and_b32 s0, s2, 15 +; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s5 +; GCN-DL-NEXT: s_bfe_u32 s8, s2, 0x40004 +; GCN-DL-NEXT: v_mov_b32_e32 v5, s6 +; GCN-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v6, s7 +; GCN-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GCN-DL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 +; GCN-DL-NEXT: v_mul_lo_u16_sdwa v4, s8, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-DL-NEXT: v_mul_lo_u16_e32 v5, s9, v5 +; GCN-DL-NEXT: v_mul_lo_u16_sdwa v6, s10, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-DL-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-DL-NEXT: s_bfe_u32 s1, s4, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40018 +; GCN-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GCN-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GCN-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s0 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40014 +; GCN-DL-NEXT: v_mov_b32_e32 v5, s1 +; GCN-DL-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GCN-DL-NEXT: v_mov_b32_e32 v6, s5 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: v_mov_b32_e32 v7, s4 +; GCN-DL-NEXT: v_mul_lo_u16_e32 v4, s6, v4 +; GCN-DL-NEXT: v_mul_lo_u16_sdwa v5, s7, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-DL-NEXT: v_mul_lo_u16_e32 v6, s8, v6 +; GCN-DL-NEXT: v_mul_lo_u16_sdwa v7, s2, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-DL-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-DL-NEXT: v_or_b32_sdwa v5, v6, v7 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GCN-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GCN-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GCN-DL-NEXT: v_add_u32_e32 v2, v2, v5 +; GCN-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GCN-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GCN-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GCN-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GCN-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GCN-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GCN-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GCN-DL-NEXT: global_store_byte v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i8 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %cvec1 = zext <8 x i4> %vec1 to <8 x i8> + %cvec2 = zext <8 x i4> %vec2 to <8 x i8> + + %mul = mul <8 x i8> %cvec1, %cvec2 + %mul0 = extractelement <8 x i8> %mul, i64 0 + %mul1 = extractelement <8 x i8> %mul, i64 1 + %mul2 = extractelement <8 x i8> %mul, i64 2 + %mul3 = extractelement <8 x i8> %mul, i64 3 + %mul4 = extractelement <8 x i8> %mul, i64 4 + %mul5 = extractelement <8 x i8> %mul, i64 5 + %mul6 = extractelement <8 x i8> %mul, i64 6 + %mul7 = extractelement <8 x i8> %mul, i64 7 + + %acc = load i8, i8 addrspace(1)* %dst, align 4 + %add1 = add i8 %mul0, %acc + %add2 = add i8 %add1, %mul1 + %add3 = add i8 %add2, %mul2 + %add4 = add i8 %add3, %mul3 + %add5 = add i8 %add4, %mul4 + %add6 = add i8 %add5, %mul5 + %add7 = add i8 %add6, %mul6 + %add8 = add i8 %add7, %mul7 + + store i8 %add8, i8 addrspace(1)* %dst, align 4 + 
ret void +} + +; TODO: Once the additional "and+add" are removed, the pattern will be recognized. +define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, +; GFX7-LABEL: udot8_acc4_vecMul: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; GFX7-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s14, s1, 28 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s16 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[8:11], 0 +; 
GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: udot8_acc4_vecMul: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s0, s2, 15 +; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX8-NEXT: v_mul_u32_u24_e32 v4, s7, v4 +; GFX8-NEXT: s_bfe_u32 s5, s4, 0x4000c +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v6, s5 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX8-NEXT: s_lshr_b32 s4, s4, 28 +; GFX8-NEXT: v_mov_b32_e32 v9, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: flat_store_byte v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: udot8_acc4_vecMul: +; 
GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GFX9-NEXT: v_mul_u32_u24_e32 v4, s7, v4 +; GFX9-NEXT: s_bfe_u32 s5, s4, 0x4000c +; GFX9-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s4, 28 +; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GCN-DL-LABEL: udot8_acc4_vecMul: +; GCN-DL: ; %bb.0: ; %entry +; GCN-DL-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_and_b32 s0, s2, 15 +; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s6 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v5, s5 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GCN-DL-NEXT: v_mul_u32_u24_e32 v4, s7, v4 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x4000c +; GCN-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GCN-DL-NEXT: s_bfe_u32 s7, s4, 0x40010 +; GCN-DL-NEXT: v_mov_b32_e32 v6, s5 +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GCN-DL-NEXT: v_mov_b32_e32 v7, s7 +; GCN-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s9, s4, 0x40018 +; GCN-DL-NEXT: v_mov_b32_e32 v8, s8 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GCN-DL-NEXT: v_mov_b32_e32 v9, s9 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GCN-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v8, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s4 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GCN-DL-NEXT: global_store_byte v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i4 
addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %mul = mul <8 x i4> %vec1, %vec2 + %mul0 = extractelement <8 x i4> %mul, i64 0 + %mul1 = extractelement <8 x i4> %mul, i64 1 + %mul2 = extractelement <8 x i4> %mul, i64 2 + %mul3 = extractelement <8 x i4> %mul, i64 3 + %mul4 = extractelement <8 x i4> %mul, i64 4 + %mul5 = extractelement <8 x i4> %mul, i64 5 + %mul6 = extractelement <8 x i4> %mul, i64 6 + %mul7 = extractelement <8 x i4> %mul, i64 7 + + %acc = load i4, i4 addrspace(1)* %dst, align 4 + %add1 = add i4 %mul0, %acc + %add2 = add i4 %add1, %mul1 + %add3 = add i4 %add2, %mul2 + %add4 = add i4 %add3, %mul3 + %add5 = add i4 %add4, %mul4 + %add6 = add i4 %add5, %mul5 + %add7 = add i4 %add6, %mul6 + %add8 = add i4 %add7, %mul7 + + store i4 %add8, i4 addrspace(1)* %dst, align 4 + ret void +}