[AMDGPU] Add 24-bit mulhi intrinsics in INTRINSIC_WO_CHAIN combine.

The operands of the 24-bit mul intrinsics (amdgcn_mul_i24/amdgcn_mul_u24) are already simplified by AMDGPUTargetLowering::performIntrinsicWOChainCombine(), via simplifyMul24(). This change adds the 24-bit mulhi intrinsics (amdgcn_mulhi_i24/amdgcn_mulhi_u24) to the same combine, since their operands can be simplified in the same way as those of the mul24 intrinsics. Differential Revision: https://reviews.llvm.org/D112702
2021-10-28 16:08:59 +05:30 · 2021-10-28 16:08:59 +05:30 · 2da6ef3664
parent 9f8e779b42
commit 2da6ef3664
2 changed files with 31 additions and 50 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@ -2897,8 +2897,22 @@ static SDValue simplifyMul24(SDNode *Node24,
  unsigned NewOpcode = Node24->getOpcode();
  if (IsIntrin) {
    unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
-    NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
-      AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+    switch (IID) {
+    case Intrinsic::amdgcn_mul_i24:
+      NewOpcode = AMDGPUISD::MUL_I24;
+      break;
+    case Intrinsic::amdgcn_mul_u24:
+      NewOpcode = AMDGPUISD::MUL_U24;
+      break;
+    case Intrinsic::amdgcn_mulhi_i24:
+      NewOpcode = AMDGPUISD::MULHI_I24;
+      break;
+    case Intrinsic::amdgcn_mulhi_u24:
+      NewOpcode = AMDGPUISD::MULHI_U24;
+      break;
+    default:
+      llvm_unreachable("Expected 24-bit mul intrinsic");
+    }
  }

  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
@ -3107,6 +3121,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
  switch (IID) {
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
+  case Intrinsic::amdgcn_mulhi_i24:
+  case Intrinsic::amdgcn_mulhi_u24:
    return simplifyMul24(N, DCI);
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_rsq:
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@ -575,11 +575,9 @@ define i64 @test_umul48_i64(i64 %lhs, i64 %rhs) {
 ; GCN-LABEL: test_umul48_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, 0xffffff
-; GCN-NEXT:    v_and_b32_e32 v1, s4, v0
-; GCN-NEXT:    v_and_b32_e32 v3, s4, v2
-; GCN-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
-; GCN-NEXT:    v_mul_hi_u32_u24_e32 v1, v1, v3
+; GCN-NEXT:    v_mul_u32_u24_e32 v3, v0, v2
+; GCN-NEXT:    v_mul_hi_u32_u24_e32 v1, v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lhs24 = and i64 %lhs, 16777215
  %rhs24 = and i64 %rhs, 16777215
@ -588,49 +586,16 @@ define i64 @test_umul48_i64(i64 %lhs, i64 %rhs) {
 }

 define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; SI-LABEL: test_umul48_v2i64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, 0xffffff
-; SI-NEXT:    v_mul_u32_u24_e32 v5, v0, v4
-; SI-NEXT:    v_mul_u32_u24_e32 v7, v2, v6
-; SI-NEXT:    v_and_b32_e32 v2, s4, v2
-; SI-NEXT:    v_and_b32_e32 v0, s4, v0
-; SI-NEXT:    v_and_b32_e32 v3, s4, v6
-; SI-NEXT:    v_and_b32_e32 v1, s4, v4
-; SI-NEXT:    v_mul_hi_u32_u24_e32 v1, v0, v1
-; SI-NEXT:    v_mul_hi_u32_u24_e32 v3, v2, v3
-; SI-NEXT:    v_mov_b32_e32 v0, v5
-; SI-NEXT:    v_mov_b32_e32 v2, v7
-; SI-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: test_umul48_v2i64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, 0xffffff
-; VI-NEXT:    v_and_b32_e32 v3, s4, v2
-; VI-NEXT:    v_and_b32_e32 v1, s4, v0
-; VI-NEXT:    v_and_b32_e32 v5, s4, v6
-; VI-NEXT:    v_and_b32_e32 v7, s4, v4
-; VI-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
-; VI-NEXT:    v_mul_hi_u32_u24_e32 v1, v1, v7
-; VI-NEXT:    v_mul_u32_u24_e32 v2, v2, v6
-; VI-NEXT:    v_mul_hi_u32_u24_e32 v3, v3, v5
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: test_umul48_v2i64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
-; GFX9-NEXT:    v_and_b32_e32 v3, s4, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v0
-; GFX9-NEXT:    v_and_b32_e32 v5, s4, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, s4, v4
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, v1, v7
-; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v2, v6
-; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v3, v3, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_umul48_v2i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_u32_u24_e32 v5, v0, v4
+; GCN-NEXT:    v_mul_hi_u32_u24_e32 v1, v0, v4
+; GCN-NEXT:    v_mul_u32_u24_e32 v4, v2, v6
+; GCN-NEXT:    v_mul_hi_u32_u24_e32 v3, v2, v6
+; GCN-NEXT:    v_mov_b32_e32 v0, v5
+; GCN-NEXT:    v_mov_b32_e32 v2, v4
+; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lhs24 = and <2 x i64> %lhs, <i64 16777215, i64 16777215>
  %rhs24 = and <2 x i64> %rhs, <i64 16777215, i64 16777215>
  %mul = mul <2 x i64> %lhs24, %rhs24