From b8173c317812a51354e2874ee6dd5c3150d98ac8 Mon Sep 17 00:00:00 2001 From: David Stuttard Date: Thu, 13 May 2021 14:08:36 +0100 Subject: [PATCH] [AMDGPU] Stop mulhi from doing 24 bit mul for uniform values Add a subtarget query for whether the architecture supports s_mul_hi, and use it when deciding whether to form a VALU 24-bit mulhi: for uniform values the 24-bit form is now only used when no s_mul_hi is available (the mulhi would be lowered to a VALU op anyway, so it may as well use the 24-bit form). This is an extension of the work in D97063. Differential Revision: https://reviews.llvm.org/D103321 Change-Id: I80b1323de640a52623d69ac005a97d06a5d42a14 --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 18 ++++++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2 + llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 ++ llvm/test/CodeGen/AMDGPU/mul_int24.ll | 40 +++++++++---- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 58 +++++++++++++------ 5 files changed, 94 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index fecbf5d80e8e..d68488ccb342 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3461,6 +3461,15 @@ SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, if (!Subtarget->hasMulI24() || VT.isVector()) return SDValue(); + // Don't generate 24-bit multiplies on values that are in SGPRs, since + // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs + // unnecessarily). isDivergent() is used as an approximation of whether the + // value is in an SGPR. 
+ // This doesn't apply if no s_mul_hi is available (since we'll end up with a + // valu op anyway) + if (Subtarget->hasSMulHi() && !N->isDivergent()) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); @@ -3485,6 +3494,15 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) return SDValue(); + // Don't generate 24-bit multiplies on values that are in SGPRs, since + // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs + // unnecessarily). isDivergent() is used as an approximation of whether the + // value is in an SGPR. + // This doesn't apply if no s_mul_hi is available (since we'll end up with a + // valu op anyway) + if (Subtarget->hasSMulHi() && !N->isDivergent()) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index e67a76eeb4cb..7e5f0d0d5257 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -163,6 +163,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, WavefrontSizeLog2 = 5; HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9; TargetID.setTargetIDFromFeaturesString(FS); @@ -185,6 +186,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : HasVOP3PInsts(false), HasMulI24(true), HasMulU24(true), + HasSMulHi(false), HasInv2PiInlineImm(false), HasFminFmaxLegacy(true), EnablePromoteAlloca(false), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 08576356255f..b160cdf3a97a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -54,6 +54,7 @@ protected: bool HasVOP3PInsts; bool HasMulI24; bool HasMulU24; + bool HasSMulHi; bool HasInv2PiInlineImm; bool HasFminFmaxLegacy; bool 
EnablePromoteAlloca; @@ -161,6 +162,10 @@ public: return HasMulU24; } + bool hasSMulHi() const { + return HasSMulHi; + } + bool hasInv2PiInlineImm() const { return HasInv2PiInlineImm; } diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 2681af332caa..9e99eaa721a1 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC,SIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC,SIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC,GFX9 %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s @@ -22,10 +23,12 @@ entry: } ; FUNC-LABEL: {{^}}test_smulhi24_i64: -; GCN-NOT: bfe +; SIVI-NOT: bfe ; GCN-NOT: ashr -; GCN: v_mul_hi_i32_i24_e32 [[RESULT:v[0-9]+]], -; GCN-NEXT: buffer_store_dword [[RESULT]] +; SIVI: v_mul_hi_i32_i24_e32 [[RESULT:v[0-9]+]], +; GFX9: s_mul_hi_i32 [[RES1:s[0-9]+]], +; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[RES1]] +; GCN: buffer_store_dword [[RESULT]] ; EG: ASHR ; EG: ASHR @@ -62,8 +65,10 @@ entry: ; GCN-NOT: ashr -; GCN-DAG: v_mul_hi_i32_i24_e32 -; GCN-DAG: s_mul_i32 +; SIVI-DAG: v_mul_hi_i32_i24_e32 +; SIVI-DAG: s_mul_i32 +; GFX9-DAG: s_mul_hi_i32 +; GFX9-DAG: s_mul_i32 ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], 
i32 %b) #0 { @@ -80,8 +85,11 @@ define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i3 ; FUNC-LABEL: {{^}}test_smul24_i64_square: ; GCN: s_load_dword [[A:s[0-9]+]] -; GCN-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]] -; GCN-DAG: s_mul_i32 s{{[0-9]+}}, [[A]], [[A]] +; SIVI-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]] +; SIVI-DAG: s_mul_i32 s{{[0-9]+}}, [[A]], [[A]] +; GFX9: s_bfe_i32 [[B:s[0-9]+]], [[A]] +; GFX9-DAG: s_mul_hi_i32 s{{[0-9]+}}, [[B]], [[B]] +; GFX9-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]] ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl.i = shl i32 %a, 8 @@ -99,14 +107,19 @@ define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a ; GCN-NOT: and ; GCN-NOT: lshr -; GCN-DAG: s_mul_i32 -; GCN-DAG: v_mul_hi_i32_i24_e32 +; SIVI-DAG: s_mul_i32 +; SIVI-DAG: v_mul_hi_i32_i24_e32 ; SI: v_lshl_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31 ; SI: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31 ; VI: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}} ; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}} +; GFX9-DAG: s_mul_i32 +; GFX9-DAG: s_mul_hi_i32 +; GFX9: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 31 +; GFX9: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 31 + ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 { entry: @@ -129,6 +142,11 @@ entry: ; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]], ; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; SI-NEXT: buffer_store_dword v[[HI]] + +; GFX9: s_mul_hi_i32 s[[MUL_HI:[0-9]+]], +; GFX9-NEXT: s_and_b32 s[[HI:[0-9]+]], s[[MUL_HI]], 1 +; GFX9-NEXT: v_mov_b32_e32 v[[RES:[0-9]+]], s[[HI]] +; GFX9-NEXT: buffer_store_dword v[[RES]] define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = 
shl i33 %a, 9 diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 864039c0f930..e53a33fe0292 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone @@ -31,6 +32,7 @@ entry: ; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext: ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GFX9: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -60,6 +62,7 @@ entry: ; SI: v_mul_u32_u24_e32 ; SI: v_and_b32_e32 ; VI: v_mul_lo_u16 +; GFX9: v_mul_lo_u16 define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -76,6 +79,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addr ; FUNC-LABEL: 
{{^}}test_umul24_i8_vgpr: ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GFX9: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) { entry: @@ -92,8 +96,10 @@ entry: } ; FUNC-LABEL: {{^}}test_umulhi24_i32_i64: -; GCN-NOT: and -; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], +; SIVI-NOT: and +; SIVI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], +; GFX9: s_mul_hi_u32 [[SRESULT:s[0-9]+]], +; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]] ; GCN-NEXT: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: @@ -109,8 +115,10 @@ entry: } ; FUNC-LABEL: {{^}}test_umulhi24: -; GCN-NOT: and -; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], +; SIVI-NOT: and +; SIVI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], +; GFX9: s_mul_hi_u32 [[SRESULT:s[0-9]+]], +; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]] ; GCN-NEXT: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { entry: @@ -126,8 +134,10 @@ entry: ; Multiply with 24-bit inputs and 64-bit output. 
; FUNC-LABEL: {{^}}test_umul24_i64: ; GCN-NOT: lshr -; GCN-DAG: s_mul_i32 -; GCN-DAG: v_mul_hi_u32_u24_e32 +; SIVI-DAG: s_mul_i32 +; SIVI-DAG: v_mul_hi_u32_u24_e32 +; GFX9-DAG: s_mul_i32 +; GFX9-DAG: s_mul_hi_u32 ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { entry: @@ -143,8 +153,10 @@ entry: ; FUNC-LABEL: {{^}}test_umul24_i64_square: ; GCN: s_load_dword [[A:s[0-9]+]] ; GCN: s_and_b32 [[B:s[0-9]+]], [[A]], 0xffffff -; GCN-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]] -; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] +; SIVI-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]] +; SIVI-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] +; GFX9-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]] +; GFX9-DAG: s_mul_hi_u32 s{{[0-9]+}}, [[B]], [[B]] define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) { entry: %tmp0 = shl i64 %a, 40 @@ -158,7 +170,9 @@ entry: ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: s_mul_i32 [[MUL24:s[0-9]+]] -; GCN: s_lshr_b32 s{{[0-9]+}}, [[MUL24]], 16 +; SIVI: s_lshr_b32 s{{[0-9]+}}, [[MUL24]], 16 +; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[MUL24]] +; GFX9: global_store_short_d16_hi v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.16 = and i32 %a, 65535 @@ -174,10 +188,15 @@ entry: ; GCN: s_load_dword s ; GCN: s_load_dword s ; GCN-NOT: lshr -; GCN-DAG: s_mul_i32 s[[MUL_LO:[0-9]+]], -; GCN-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], -; GCN-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[MUL_LO]] +; SIVI-DAG: s_mul_i32 s[[MUL_LO:[0-9]+]], +; SIVI-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], +; SIVI-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] +; SIVI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[MUL_LO]] +; GFX9-DAG: s_mul_i32 s[[MUL_LO:[0-9]+]], +; GFX9-DAG: s_mul_hi_u32 s[[MUL_HI:[0-9]+]], +; GFX9-DAG: s_and_b32 s[[AND_HI:[0-9]+]], s[[MUL_HI]], 
1 +; GFX9-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[MUL_LO]] +; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[AND_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) { entry: @@ -194,10 +213,13 @@ entry: ; FUNC-LABEL: {{^}}test_umulhi24_i33: ; GCN: s_load_dword s ; GCN: s_load_dword s -; GCN-NOT: and +; SIVI-NOT: and ; GCN-NOT: lshr -; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], -; GCN: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] +; SIVI: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], +; SIVI: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] +; GFX9: s_mul_hi_u32 s[[MUL_HI:[0-9]+]], +; GFX9: s_and_b32 s[[AND_HI:[0-9]+]], s[[MUL_HI]], 1 +; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]], s[[AND_HI]] ; GCN-NEXT: buffer_store_dword v[[HI]] define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { entry: