From 781dd39b7b6d6a3afecf7097c4fa51e194cac057 Mon Sep 17 00:00:00 2001
From: Abinav Puthan Purayil <abinav.puthanpurayil@amd.com>
Date: Sat, 23 Oct 2021 21:05:32 +0530
Subject: [PATCH] [AMDGPU] Enable 48-bit mul in AMDGPUCodeGenPrepare.

AMDGPUCodeGenPrepare used to bail out of creating 24-bit muls when the
result was wider than 32 bits. This change teaches it to build the full
48-bit product in that case by combining the 24-bit mul (low 32 bits)
with the 24-bit mulhi intrinsic (high bits).

Differential Revision: https://reviews.llvm.org/D112395
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |  68 ++++---
 .../AMDGPU/amdgpu-codegenprepare-mul24.ll     | 177 ++++++++++++++++--
 2 files changed, 207 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index ba0afbc72b24..c8bea9b1a240 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -477,6 +477,34 @@ static Value *insertValues(IRBuilder<> &Builder,
   return NewVal;
 }
 
+// Emits the product of `LHS` and `RHS` (i32 operands) with the 24-bit mul
+// intrinsics. `Size` is the bit width of the original mul and `NumBits` the
+// caller's bound on the product's significant bits. Yields the i32 low half
+// when that is exact, otherwise the 48-bit product assembled in an i64.
+static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+                       unsigned Size, unsigned NumBits, bool IsSigned) {
+  Intrinsic::ID LoID =
+      IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
+  Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
+
+  // The low 32 bits suffice when the destination is at most 32 bits wide or
+  // the product fits; signed products must leave room for the sign bit.
+  if (Size <= 32 || NumBits <= (IsSigned ? 30u : 32u))
+    return Lo;
+
+  assert(NumBits <= (IsSigned ? 46u : 48u) &&
+         "mul24 operands cannot yield more than 48 bits");
+
+  Intrinsic::ID HiID =
+      IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
+  Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});
+
+  IntegerType *I64Ty = Builder.getInt64Ty();
+  Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
+  Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);
+  return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
+}
+
 bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
   if (I.getOpcode() != Instruction::Mul)
     return false;
@@ -495,29 +523,18 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
   IRBuilder<> Builder(&I);
   Builder.SetCurrentDebugLocation(I.getDebugLoc());
 
-  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
-
   unsigned LHSBits = 0, RHSBits = 0;
+  bool IsSigned = false;
 
   if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS, Size)) <= 24 &&
       (RHSBits = numBitsUnsigned(RHS, Size)) <= 24) {
-    // The mul24 instruction yields the low-order 32 bits. If the original
-    // result and the destination is wider than 32 bits, the mul24 would
-    // truncate the result.
-    if (Size > 32 && LHSBits + RHSBits > 32)
-      return false;
+    IsSigned = false;
 
-    IntrID = Intrinsic::amdgcn_mul_u24;
   } else if (ST->hasMulI24() &&
              (LHSBits = numBitsSigned(LHS, Size)) < 24 &&
              (RHSBits = numBitsSigned(RHS, Size)) < 24) {
-    // The original result is positive if its destination is wider than 32 bits
-    // and its highest set bit is at bit 31. Generating mul24 and sign-extending
-    // it would yield a negative value.
-    if (Size > 32 && LHSBits + RHSBits > 30)
-      return false;
+    IsSigned = true;
 
-    IntrID = Intrinsic::amdgcn_mul_i24;
   } else
     return false;
 
@@ -527,27 +544,26 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
   extractValues(Builder, LHSVals, LHS);
   extractValues(Builder, RHSVals, RHS);
 
-
   IntegerType *I32Ty = Builder.getInt32Ty();
-  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
   for (int I = 0, E = LHSVals.size(); I != E; ++I) {
     Value *LHS, *RHS;
-    if (IntrID == Intrinsic::amdgcn_mul_u24) {
-      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
-      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
-    } else {
+    if (IsSigned) {
       LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
       RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
+    } else {
+      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
     }
 
-    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
+    Value *Result =
+        getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);
 
-    if (IntrID == Intrinsic::amdgcn_mul_u24) {
-      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
-                                                     LHSVals[I]->getType()));
+    if (IsSigned) {
+      ResultVals.push_back(
+          Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
     } else {
-      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
-                                                     LHSVals[I]->getType()));
+      ResultVals.push_back(
+          Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
index fa14e968948a..d2fe8f433f7c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
@@ -173,7 +173,14 @@ define i64 @smul24_i64(i64 %lhs, i64 %rhs) {
 ; SI-NEXT:    [[LHS24:%.*]] = ashr i64 [[SHL_LHS]], 40
 ; SI-NEXT:    [[LSHR_RHS:%.*]] = shl i64 [[RHS:%.*]], 40
 ; SI-NEXT:    [[RHS24:%.*]] = ashr i64 [[LHS]], 40
-; SI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i64(
@@ -181,7 +188,14 @@ define i64 @smul24_i64(i64 %lhs, i64 %rhs) {
 ; VI-NEXT:    [[LHS24:%.*]] = ashr i64 [[SHL_LHS]], 40
 ; VI-NEXT:    [[LSHR_RHS:%.*]] = shl i64 [[RHS:%.*]], 40
 ; VI-NEXT:    [[RHS24:%.*]] = ashr i64 [[LHS]], 40
-; VI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i64(
@@ -245,7 +259,14 @@ define i64 @smul24_i64_3(i64 %lhs, i64 %rhs) {
 ; SI-NEXT:    [[LHS24:%.*]] = sext i16 [[LHS_TRUNC]] to i64
 ; SI-NEXT:    [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i17
 ; SI-NEXT:    [[RHS24:%.*]] = sext i17 [[RHS_TRUNC]] to i64
-; SI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i64_3(
@@ -253,7 +274,14 @@ define i64 @smul24_i64_3(i64 %lhs, i64 %rhs) {
 ; VI-NEXT:    [[LHS24:%.*]] = sext i16 [[LHS_TRUNC]] to i64
 ; VI-NEXT:    [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i17
 ; VI-NEXT:    [[RHS24:%.*]] = sext i17 [[RHS_TRUNC]] to i64
-; VI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i64_3(
@@ -272,17 +300,64 @@ define i64 @smul24_i64_3(i64 %lhs, i64 %rhs) {
   ret i64 %mul
 }
 
+define i64 @smul24_i64_4(i64 %lhs, i64 %rhs) {
+; SI-LABEL: @smul24_i64_4(
+; SI-NEXT:    [[LHS_TRUNC:%.*]] = trunc i64 [[LHS:%.*]] to i24
+; SI-NEXT:    [[LHS24:%.*]] = sext i24 [[LHS_TRUNC]] to i64
+; SI-NEXT:    [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i25
+; SI-NEXT:    [[RHS24:%.*]] = sext i25 [[RHS_TRUNC]] to i64
+; SI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; SI-NEXT:    ret i64 [[MUL]]
+;
+; VI-LABEL: @smul24_i64_4(
+; VI-NEXT:    [[LHS_TRUNC:%.*]] = trunc i64 [[LHS:%.*]] to i24
+; VI-NEXT:    [[LHS24:%.*]] = sext i24 [[LHS_TRUNC]] to i64
+; VI-NEXT:    [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i25
+; VI-NEXT:    [[RHS24:%.*]] = sext i25 [[RHS_TRUNC]] to i64
+; VI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; VI-NEXT:    ret i64 [[MUL]]
+;
+; DISABLED-LABEL: @smul24_i64_4(
+; DISABLED-NEXT:    [[LHS_TRUNC:%.*]] = trunc i64 [[LHS:%.*]] to i24
+; DISABLED-NEXT:    [[LHS24:%.*]] = sext i24 [[LHS_TRUNC]] to i64
+; DISABLED-NEXT:    [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i25
+; DISABLED-NEXT:    [[RHS24:%.*]] = sext i25 [[RHS_TRUNC]] to i64
+; DISABLED-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; DISABLED-NEXT:    ret i64 [[MUL]]
+; Negative test: mul stays as-is (i25 RHS presumably too wide for mul.i24).
+  %lhs.trunc = trunc i64 %lhs to i24
+  %lhs24 = sext i24 %lhs.trunc to i64
+  %rhs.trunc = trunc i64 %rhs to i25
+  %rhs24 = sext i25 %rhs.trunc to i64
+  %mul = mul i64 %lhs24, %rhs24
+  ret i64 %mul
+}
+
 define i64 @umul24_i64(i64 %lhs, i64 %rhs) {
 ; SI-LABEL: @umul24_i64(
 ; SI-NEXT:    [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215
 ; SI-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
-; SI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @umul24_i64(
 ; VI-NEXT:    [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215
 ; VI-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
-; VI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_i64(
@@ -514,7 +589,15 @@ define i33 @smul24_i33(i33 %lhs, i33 %rhs) {
 ; SI-NEXT:    [[LHS24:%.*]] = ashr i33 [[SHL_LHS]], 9
 ; SI-NEXT:    [[LSHR_RHS:%.*]] = shl i33 [[RHS:%.*]], 9
 ; SI-NEXT:    [[RHS24:%.*]] = ashr i33 [[LHS]], 9
-; SI-NEXT:    [[MUL:%.*]] = mul i33 [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
 ; SI-NEXT:    ret i33 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i33(
@@ -522,7 +605,15 @@ define i33 @smul24_i33(i33 %lhs, i33 %rhs) {
 ; VI-NEXT:    [[LHS24:%.*]] = ashr i33 [[SHL_LHS]], 9
 ; VI-NEXT:    [[LSHR_RHS:%.*]] = shl i33 [[RHS:%.*]], 9
 ; VI-NEXT:    [[RHS24:%.*]] = ashr i33 [[LHS]], 9
-; VI-NEXT:    [[MUL:%.*]] = mul i33 [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
 ; VI-NEXT:    ret i33 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i33(
@@ -545,13 +636,29 @@ define i33 @umul24_i33(i33 %lhs, i33 %rhs) {
 ; SI-LABEL: @umul24_i33(
 ; SI-NEXT:    [[LHS24:%.*]] = and i33 [[LHS:%.*]], 16777215
 ; SI-NEXT:    [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
-; SI-NEXT:    [[MUL:%.*]] = mul i33 [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
 ; SI-NEXT:    ret i33 [[MUL]]
 ;
 ; VI-LABEL: @umul24_i33(
 ; VI-NEXT:    [[LHS24:%.*]] = and i33 [[LHS:%.*]], 16777215
 ; VI-NEXT:    [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
-; VI-NEXT:    [[MUL:%.*]] = mul i33 [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
 ; VI-NEXT:    ret i33 [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_i33(
@@ -630,7 +737,30 @@ define <2 x i33> @smul24_v2i33(<2 x i33> %lhs, <2 x i33> %rhs) {
 ; SI-NEXT:    [[LHS24:%.*]] = ashr <2 x i33> [[SHL_LHS]], <i33 9, i33 9>
 ; SI-NEXT:    [[LSHR_RHS:%.*]] = shl <2 x i33> [[RHS:%.*]], <i33 9, i33 9>
 ; SI-NEXT:    [[RHS24:%.*]] = ashr <2 x i33> [[LHS]], <i33 9, i33 9>
-; SI-NEXT:    [[MUL:%.*]] = mul <2 x i33> [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i33> [[LHS24]], i64 0
+; SI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i33> [[LHS24]], i64 1
+; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i33> [[RHS24]], i64 0
+; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
+; SI-NEXT:    [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
+; SI-NEXT:    [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
+; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; SI-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
+; SI-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 32
+; SI-NEXT:    [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]]
+; SI-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i33
+; SI-NEXT:    [[TMP14:%.*]] = trunc i33 [[TMP2]] to i32
+; SI-NEXT:    [[TMP15:%.*]] = trunc i33 [[TMP4]] to i32
+; SI-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP14]], i32 [[TMP15]])
+; SI-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP14]], i32 [[TMP15]])
+; SI-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; SI-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; SI-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 32
+; SI-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP20]]
+; SI-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i33
+; SI-NEXT:    [[TMP23:%.*]] = insertelement <2 x i33> undef, i33 [[TMP13]], i64 0
+; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i33> [[TMP23]], i33 [[TMP22]], i64 1
 ; SI-NEXT:    ret <2 x i33> [[MUL]]
 ;
 ; VI-LABEL: @smul24_v2i33(
@@ -638,7 +768,30 @@ define <2 x i33> @smul24_v2i33(<2 x i33> %lhs, <2 x i33> %rhs) {
 ; VI-NEXT:    [[LHS24:%.*]] = ashr <2 x i33> [[SHL_LHS]], <i33 9, i33 9>
 ; VI-NEXT:    [[LSHR_RHS:%.*]] = shl <2 x i33> [[RHS:%.*]], <i33 9, i33 9>
 ; VI-NEXT:    [[RHS24:%.*]] = ashr <2 x i33> [[LHS]], <i33 9, i33 9>
-; VI-NEXT:    [[MUL:%.*]] = mul <2 x i33> [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i33> [[LHS24]], i64 0
+; VI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i33> [[LHS24]], i64 1
+; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i33> [[RHS24]], i64 0
+; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
+; VI-NEXT:    [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
+; VI-NEXT:    [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
+; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; VI-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
+; VI-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 32
+; VI-NEXT:    [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]]
+; VI-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i33
+; VI-NEXT:    [[TMP14:%.*]] = trunc i33 [[TMP2]] to i32
+; VI-NEXT:    [[TMP15:%.*]] = trunc i33 [[TMP4]] to i32
+; VI-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP14]], i32 [[TMP15]])
+; VI-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP14]], i32 [[TMP15]])
+; VI-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; VI-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; VI-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 32
+; VI-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP20]]
+; VI-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i33
+; VI-NEXT:    [[TMP23:%.*]] = insertelement <2 x i33> undef, i33 [[TMP13]], i64 0
+; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i33> [[TMP23]], i33 [[TMP22]], i64 1
 ; VI-NEXT:    ret <2 x i33> [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_v2i33(