forked from OSchip/llvm-project
[AMDGPU] Stop mulhi from doing 24 bit mul for uniform values
Added support for checking whether the architecture supports s_mul_hi, which is used as part of the decision whether or not to use a VALU 24-bit mul (if the mulhi gets transformed to a VALU op anyway, then we may as well use it). This is an extension of the work in D97063. Differential Revision: https://reviews.llvm.org/D103321 (Change-Id: I80b1323de640a52623d69ac005a97d06a5d42a14)
This commit is contained in:
parent
3697f26836
commit
b8173c3178
|
@ -3461,6 +3461,15 @@ SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
|
||||||
if (!Subtarget->hasMulI24() || VT.isVector())
|
if (!Subtarget->hasMulI24() || VT.isVector())
|
||||||
return SDValue();
|
return SDValue();
|
||||||
|
|
||||||
|
// Don't generate 24-bit multiplies on values that are in SGPRs, since
|
||||||
|
// we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
|
||||||
|
// unnecessarily). isDivergent() is used as an approximation of whether the
|
||||||
|
// value is in an SGPR.
|
||||||
|
// This doesn't apply if no s_mul_hi is available (since we'll end up with a
|
||||||
|
// valu op anyway)
|
||||||
|
if (Subtarget->hasSMulHi() && !N->isDivergent())
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
SelectionDAG &DAG = DCI.DAG;
|
SelectionDAG &DAG = DCI.DAG;
|
||||||
SDLoc DL(N);
|
SDLoc DL(N);
|
||||||
|
|
||||||
|
@ -3485,6 +3494,15 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
|
||||||
if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
|
if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
|
||||||
return SDValue();
|
return SDValue();
|
||||||
|
|
||||||
|
// Don't generate 24-bit multiplies on values that are in SGPRs, since
|
||||||
|
// we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
|
||||||
|
// unnecessarily). isDivergent() is used as an approximation of whether the
|
||||||
|
// value is in an SGPR.
|
||||||
|
// This doesn't apply if no s_mul_hi is available (since we'll end up with a
|
||||||
|
// valu op anyway)
|
||||||
|
if (Subtarget->hasSMulHi() && !N->isDivergent())
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
SelectionDAG &DAG = DCI.DAG;
|
SelectionDAG &DAG = DCI.DAG;
|
||||||
SDLoc DL(N);
|
SDLoc DL(N);
|
||||||
|
|
||||||
|
|
|
@ -163,6 +163,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
|
||||||
WavefrontSizeLog2 = 5;
|
WavefrontSizeLog2 = 5;
|
||||||
|
|
||||||
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
|
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
|
||||||
|
HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
|
||||||
|
|
||||||
TargetID.setTargetIDFromFeaturesString(FS);
|
TargetID.setTargetIDFromFeaturesString(FS);
|
||||||
|
|
||||||
|
@ -185,6 +186,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
|
||||||
HasVOP3PInsts(false),
|
HasVOP3PInsts(false),
|
||||||
HasMulI24(true),
|
HasMulI24(true),
|
||||||
HasMulU24(true),
|
HasMulU24(true),
|
||||||
|
HasSMulHi(false),
|
||||||
HasInv2PiInlineImm(false),
|
HasInv2PiInlineImm(false),
|
||||||
HasFminFmaxLegacy(true),
|
HasFminFmaxLegacy(true),
|
||||||
EnablePromoteAlloca(false),
|
EnablePromoteAlloca(false),
|
||||||
|
|
|
@ -54,6 +54,7 @@ protected:
|
||||||
bool HasVOP3PInsts;
|
bool HasVOP3PInsts;
|
||||||
bool HasMulI24;
|
bool HasMulI24;
|
||||||
bool HasMulU24;
|
bool HasMulU24;
|
||||||
|
bool HasSMulHi;
|
||||||
bool HasInv2PiInlineImm;
|
bool HasInv2PiInlineImm;
|
||||||
bool HasFminFmaxLegacy;
|
bool HasFminFmaxLegacy;
|
||||||
bool EnablePromoteAlloca;
|
bool EnablePromoteAlloca;
|
||||||
|
@ -161,6 +162,10 @@ public:
|
||||||
return HasMulU24;
|
return HasMulU24;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool hasSMulHi() const {
|
||||||
|
return HasSMulHi;
|
||||||
|
}
|
||||||
|
|
||||||
bool hasInv2PiInlineImm() const {
|
bool hasInv2PiInlineImm() const {
|
||||||
return HasInv2PiInlineImm;
|
return HasInv2PiInlineImm;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
|
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC,SIVI %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC,SIVI %s
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC,GFX9 %s
|
||||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||||
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
|
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
|
||||||
|
|
||||||
|
@ -22,10 +23,12 @@ entry:
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}test_smulhi24_i64:
|
; FUNC-LABEL: {{^}}test_smulhi24_i64:
|
||||||
; GCN-NOT: bfe
|
; SIVI-NOT: bfe
|
||||||
; GCN-NOT: ashr
|
; GCN-NOT: ashr
|
||||||
; GCN: v_mul_hi_i32_i24_e32 [[RESULT:v[0-9]+]],
|
; SIVI: v_mul_hi_i32_i24_e32 [[RESULT:v[0-9]+]],
|
||||||
; GCN-NEXT: buffer_store_dword [[RESULT]]
|
; GFX9: s_mul_hi_i32 [[RES1:s[0-9]+]],
|
||||||
|
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[RES1]]
|
||||||
|
; GCN: buffer_store_dword [[RESULT]]
|
||||||
|
|
||||||
; EG: ASHR
|
; EG: ASHR
|
||||||
; EG: ASHR
|
; EG: ASHR
|
||||||
|
@ -62,8 +65,10 @@ entry:
|
||||||
|
|
||||||
; GCN-NOT: ashr
|
; GCN-NOT: ashr
|
||||||
|
|
||||||
; GCN-DAG: v_mul_hi_i32_i24_e32
|
; SIVI-DAG: v_mul_hi_i32_i24_e32
|
||||||
; GCN-DAG: s_mul_i32
|
; SIVI-DAG: s_mul_i32
|
||||||
|
; GFX9-DAG: s_mul_hi_i32
|
||||||
|
; GFX9-DAG: s_mul_i32
|
||||||
|
|
||||||
; GCN: buffer_store_dwordx2
|
; GCN: buffer_store_dwordx2
|
||||||
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
|
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
|
||||||
|
@ -80,8 +85,11 @@ define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i3
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}test_smul24_i64_square:
|
; FUNC-LABEL: {{^}}test_smul24_i64_square:
|
||||||
; GCN: s_load_dword [[A:s[0-9]+]]
|
; GCN: s_load_dword [[A:s[0-9]+]]
|
||||||
; GCN-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]]
|
; SIVI-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]]
|
||||||
; GCN-DAG: s_mul_i32 s{{[0-9]+}}, [[A]], [[A]]
|
; SIVI-DAG: s_mul_i32 s{{[0-9]+}}, [[A]], [[A]]
|
||||||
|
; GFX9: s_bfe_i32 [[B:s[0-9]+]], [[A]]
|
||||||
|
; GFX9-DAG: s_mul_hi_i32 s{{[0-9]+}}, [[B]], [[B]]
|
||||||
|
; GFX9-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]]
|
||||||
; GCN: buffer_store_dwordx2
|
; GCN: buffer_store_dwordx2
|
||||||
define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
|
define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
|
||||||
%shl.i = shl i32 %a, 8
|
%shl.i = shl i32 %a, 8
|
||||||
|
@ -99,14 +107,19 @@ define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a
|
||||||
; GCN-NOT: and
|
; GCN-NOT: and
|
||||||
; GCN-NOT: lshr
|
; GCN-NOT: lshr
|
||||||
|
|
||||||
; GCN-DAG: s_mul_i32
|
; SIVI-DAG: s_mul_i32
|
||||||
; GCN-DAG: v_mul_hi_i32_i24_e32
|
; SIVI-DAG: v_mul_hi_i32_i24_e32
|
||||||
; SI: v_lshl_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31
|
; SI: v_lshl_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31
|
||||||
; SI: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31
|
; SI: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31
|
||||||
|
|
||||||
; VI: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
|
; VI: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
|
||||||
; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
|
; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
|
||||||
|
|
||||||
|
; GFX9-DAG: s_mul_i32
|
||||||
|
; GFX9-DAG: s_mul_hi_i32
|
||||||
|
; GFX9: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 31
|
||||||
|
; GFX9: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 31
|
||||||
|
|
||||||
; GCN: buffer_store_dwordx2
|
; GCN: buffer_store_dwordx2
|
||||||
define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
|
define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
|
||||||
entry:
|
entry:
|
||||||
|
@ -129,6 +142,11 @@ entry:
|
||||||
; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]],
|
; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]],
|
||||||
; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
|
; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
|
||||||
; SI-NEXT: buffer_store_dword v[[HI]]
|
; SI-NEXT: buffer_store_dword v[[HI]]
|
||||||
|
|
||||||
|
; GFX9: s_mul_hi_i32 s[[MUL_HI:[0-9]+]],
|
||||||
|
; GFX9-NEXT: s_and_b32 s[[HI:[0-9]+]], s[[MUL_HI]], 1
|
||||||
|
; GFX9-NEXT: v_mov_b32_e32 v[[RES:[0-9]+]], s[[HI]]
|
||||||
|
; GFX9-NEXT: buffer_store_dword v[[RES]]
|
||||||
define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
|
define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
|
||||||
entry:
|
entry:
|
||||||
%tmp0 = shl i33 %a, 9
|
%tmp0 = shl i33 %a, 9
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,FUNC %s
|
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI,FUNC %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,FUNC %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,FUNC %s
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
|
||||||
|
|
||||||
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||||
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
|
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
|
||||||
|
@ -31,6 +32,7 @@ entry:
|
||||||
; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext:
|
; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext:
|
||||||
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||||
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||||
|
; GFX9: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||||
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
|
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
|
||||||
define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
|
define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
|
||||||
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
|
@ -60,6 +62,7 @@ entry:
|
||||||
; SI: v_mul_u32_u24_e32
|
; SI: v_mul_u32_u24_e32
|
||||||
; SI: v_and_b32_e32
|
; SI: v_and_b32_e32
|
||||||
; VI: v_mul_lo_u16
|
; VI: v_mul_lo_u16
|
||||||
|
; GFX9: v_mul_lo_u16
|
||||||
define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
|
define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
|
||||||
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
|
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
|
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
|
||||||
|
@ -76,6 +79,7 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addr
|
||||||
; FUNC-LABEL: {{^}}test_umul24_i8_vgpr:
|
; FUNC-LABEL: {{^}}test_umul24_i8_vgpr:
|
||||||
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||||
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||||
|
; GFX9: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||||
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
|
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
|
||||||
define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
|
define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
|
@ -92,8 +96,10 @@ entry:
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
|
; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
|
||||||
; GCN-NOT: and
|
; SIVI-NOT: and
|
||||||
; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
|
; SIVI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
|
||||||
|
; GFX9: s_mul_hi_u32 [[SRESULT:s[0-9]+]],
|
||||||
|
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
|
||||||
; GCN-NEXT: buffer_store_dword [[RESULT]]
|
; GCN-NEXT: buffer_store_dword [[RESULT]]
|
||||||
define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||||
entry:
|
entry:
|
||||||
|
@ -109,8 +115,10 @@ entry:
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}test_umulhi24:
|
; FUNC-LABEL: {{^}}test_umulhi24:
|
||||||
; GCN-NOT: and
|
; SIVI-NOT: and
|
||||||
; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
|
; SIVI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
|
||||||
|
; GFX9: s_mul_hi_u32 [[SRESULT:s[0-9]+]],
|
||||||
|
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
|
||||||
; GCN-NEXT: buffer_store_dword [[RESULT]]
|
; GCN-NEXT: buffer_store_dword [[RESULT]]
|
||||||
define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
|
define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
|
||||||
entry:
|
entry:
|
||||||
|
@ -126,8 +134,10 @@ entry:
|
||||||
; Multiply with 24-bit inputs and 64-bit output.
|
; Multiply with 24-bit inputs and 64-bit output.
|
||||||
; FUNC-LABEL: {{^}}test_umul24_i64:
|
; FUNC-LABEL: {{^}}test_umul24_i64:
|
||||||
; GCN-NOT: lshr
|
; GCN-NOT: lshr
|
||||||
; GCN-DAG: s_mul_i32
|
; SIVI-DAG: s_mul_i32
|
||||||
; GCN-DAG: v_mul_hi_u32_u24_e32
|
; SIVI-DAG: v_mul_hi_u32_u24_e32
|
||||||
|
; GFX9-DAG: s_mul_i32
|
||||||
|
; GFX9-DAG: s_mul_hi_u32
|
||||||
; GCN: buffer_store_dwordx2
|
; GCN: buffer_store_dwordx2
|
||||||
define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
|
define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
|
||||||
entry:
|
entry:
|
||||||
|
@ -143,8 +153,10 @@ entry:
|
||||||
; FUNC-LABEL: {{^}}test_umul24_i64_square:
|
; FUNC-LABEL: {{^}}test_umul24_i64_square:
|
||||||
; GCN: s_load_dword [[A:s[0-9]+]]
|
; GCN: s_load_dword [[A:s[0-9]+]]
|
||||||
; GCN: s_and_b32 [[B:s[0-9]+]], [[A]], 0xffffff
|
; GCN: s_and_b32 [[B:s[0-9]+]], [[A]], 0xffffff
|
||||||
; GCN-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]]
|
; SIVI-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]]
|
||||||
; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
|
; SIVI-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
|
||||||
|
; GFX9-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]]
|
||||||
|
; GFX9-DAG: s_mul_hi_u32 s{{[0-9]+}}, [[B]], [[B]]
|
||||||
define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
|
define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
|
||||||
entry:
|
entry:
|
||||||
%tmp0 = shl i64 %a, 40
|
%tmp0 = shl i64 %a, 40
|
||||||
|
@ -158,7 +170,9 @@ entry:
|
||||||
; GCN: s_and_b32
|
; GCN: s_and_b32
|
||||||
; GCN: s_and_b32
|
; GCN: s_and_b32
|
||||||
; GCN: s_mul_i32 [[MUL24:s[0-9]+]]
|
; GCN: s_mul_i32 [[MUL24:s[0-9]+]]
|
||||||
; GCN: s_lshr_b32 s{{[0-9]+}}, [[MUL24]], 16
|
; SIVI: s_lshr_b32 s{{[0-9]+}}, [[MUL24]], 16
|
||||||
|
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[MUL24]]
|
||||||
|
; GFX9: global_store_short_d16_hi v{{[0-9]+}}, [[RESULT]]
|
||||||
define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
|
define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.16 = and i32 %a, 65535
|
%a.16 = and i32 %a, 65535
|
||||||
|
@ -174,10 +188,15 @@ entry:
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
; GCN-NOT: lshr
|
; GCN-NOT: lshr
|
||||||
; GCN-DAG: s_mul_i32 s[[MUL_LO:[0-9]+]],
|
; SIVI-DAG: s_mul_i32 s[[MUL_LO:[0-9]+]],
|
||||||
; GCN-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
|
; SIVI-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
|
||||||
; GCN-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
|
; SIVI-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[MUL_LO]]
|
; SIVI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[MUL_LO]]
|
||||||
|
; GFX9-DAG: s_mul_i32 s[[MUL_LO:[0-9]+]],
|
||||||
|
; GFX9-DAG: s_mul_hi_u32 s[[MUL_HI:[0-9]+]],
|
||||||
|
; GFX9-DAG: s_and_b32 s[[AND_HI:[0-9]+]], s[[MUL_HI]], 1
|
||||||
|
; GFX9-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[MUL_LO]]
|
||||||
|
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[AND_HI]]
|
||||||
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
|
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
|
||||||
define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
|
define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
|
||||||
entry:
|
entry:
|
||||||
|
@ -194,10 +213,13 @@ entry:
|
||||||
; FUNC-LABEL: {{^}}test_umulhi24_i33:
|
; FUNC-LABEL: {{^}}test_umulhi24_i33:
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
; GCN-NOT: and
|
; SIVI-NOT: and
|
||||||
; GCN-NOT: lshr
|
; GCN-NOT: lshr
|
||||||
; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
|
; SIVI: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
|
||||||
; GCN: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
|
; SIVI: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
|
||||||
|
; GFX9: s_mul_hi_u32 s[[MUL_HI:[0-9]+]],
|
||||||
|
; GFX9: s_and_b32 s[[AND_HI:[0-9]+]], s[[MUL_HI]], 1
|
||||||
|
; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]], s[[AND_HI]]
|
||||||
; GCN-NEXT: buffer_store_dword v[[HI]]
|
; GCN-NEXT: buffer_store_dword v[[HI]]
|
||||||
define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
|
define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
|
||||||
entry:
|
entry:
|
||||||
|
|
Loading…
Reference in New Issue