forked from OSchip/llvm-project
AMDGPU: Check if users of fneg can fold mods
In multi-use cases this can save a few instructions. llvm-svn: 293962
This commit is contained in:
parent
21c89dc920
commit
a8fcfadf46
|
@ -486,6 +486,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
|
|||
// Target Information
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
LLVM_READNONE
|
||||
static bool fnegFoldsIntoOp(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
case ISD::FADD:
|
||||
|
@ -507,6 +508,59 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
|
|||
}
|
||||
}
|
||||
|
||||
/// \p returns true if the operation will definitely need to use a 64-bit
|
||||
/// encoding, and thus will use a VOP3 encoding regardless of the source
|
||||
/// modifiers.
|
||||
LLVM_READONLY
|
||||
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
|
||||
return N->getNumOperands() > 2 || VT == MVT::f64;
|
||||
}
|
||||
|
||||
// Most FP instructions support source modifiers, but this could be refined
|
||||
// slightly.
|
||||
LLVM_READONLY
|
||||
static bool hasSourceMods(const SDNode *N) {
|
||||
if (isa<MemSDNode>(N))
|
||||
return false;
|
||||
|
||||
switch (N->getOpcode()) {
|
||||
case ISD::CopyToReg:
|
||||
case ISD::SELECT:
|
||||
case ISD::FDIV:
|
||||
case ISD::FREM:
|
||||
case ISD::INLINEASM:
|
||||
case AMDGPUISD::INTERP_P1:
|
||||
case AMDGPUISD::INTERP_P2:
|
||||
case AMDGPUISD::DIV_SCALE:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
|
||||
// Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
|
||||
// it is truly free to use a source modifier in all cases. If there are
|
||||
// multiple users but for each one will necessitate using VOP3, there will be
|
||||
// a code size increase. Try to avoid increasing code size unless we know it
|
||||
// will save on the instruction count.
|
||||
unsigned NumMayIncreaseSize = 0;
|
||||
MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
|
||||
|
||||
// XXX - Should this limit number of uses to check?
|
||||
for (const SDNode *U : N->uses()) {
|
||||
if (!hasSourceMods(U))
|
||||
return false;
|
||||
|
||||
if (!opMustUseVOP3Encoding(U, VT)) {
|
||||
if (++NumMayIncreaseSize > CostThreshold)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns the type used for vector element indices: always MVT::i32; the
// DataLayout argument is unused.
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
|
||||
return MVT::i32;
|
||||
}
|
||||
|
@ -2854,10 +2908,16 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
|
|||
// the other uses cannot, give up. This both prevents unprofitable
|
||||
// transformations and infinite loops: we won't repeatedly try to fold around
|
||||
// a negate that has no 'good' form.
|
||||
//
|
||||
// TODO: Check users can fold
|
||||
if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
|
||||
return SDValue();
|
||||
if (N0.hasOneUse()) {
|
||||
// This may be able to fold into the source, but at a code size cost. Don't
|
||||
// fold if the fold into the user is free.
|
||||
if (allUsesHaveSourceMods(N, 0))
|
||||
return SDValue();
|
||||
} else {
|
||||
if (fnegFoldsIntoOp(Opc) &&
|
||||
(allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
SDLoc SL(N);
|
||||
switch (Opc) {
|
||||
|
|
|
@ -21,8 +21,8 @@ declare float @llvm.fabs.f32(float) #1
|
|||
; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
|
||||
; VI: v_cndmask_b32_e32
|
||||
; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
|
||||
; VI: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; VI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
|
||||
; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
|
||||
define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
|
||||
%a11 = fadd fast float %y, -1.0
|
||||
%a12 = call float @llvm.fabs.f32(float %a11)
|
||||
|
@ -116,9 +116,9 @@ define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
|
|||
; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
|
||||
; VI: v_cndmask_b32_e32
|
||||
; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
|
||||
; VI: v_mul_f16_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
|
||||
; VI-DENORM: v_fma_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
|
||||
; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
|
||||
; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
|
||||
define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
|
||||
%x = bitcast i16 %x.arg to half
|
||||
%y = bitcast i16 %y.arg to half
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
|
||||
|
||||
; --------------------------------------------------------------------------------
|
||||
; fadd tests
|
||||
|
@ -53,10 +53,14 @@ define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrsp
|
|||
; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
|
||||
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
|
||||
; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
|
||||
|
||||
; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
|
||||
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
|
||||
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
|
||||
|
||||
; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
|
||||
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[ADD]]
|
||||
; GCN: buffer_store_dword [[NEG_ADD]]
|
||||
; GCN-NEXT: buffer_store_dword [[MUL]]
|
||||
define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -253,11 +257,10 @@ define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrsp
|
|||
; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
|
||||
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
|
||||
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
|
||||
; GCN: buffer_store_dword [[MUL]]
|
||||
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
|
||||
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
|
||||
; GCN-NEXT: buffer_store_dword [[MUL0]]
|
||||
; GCN-NEXT: buffer_store_dword [[MUL1]]
|
||||
define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
|
@ -441,9 +444,14 @@ define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrsp
|
|||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
|
||||
; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
|
||||
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
|
||||
|
||||
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
|
||||
; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
|
||||
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
|
||||
|
||||
; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
|
||||
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]
|
||||
|
||||
; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
|
||||
; GCN-NEXT: buffer_store_dword [[MUL]]
|
||||
define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
|
@ -697,10 +705,15 @@ define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.pt
|
|||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
|
||||
; GCN-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_C:v[0-9]+]], 0x80000000, [[C]]
|
||||
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
|
||||
; GCN-NEXT: buffer_store_dword [[NEG_C]]
|
||||
|
||||
; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
|
||||
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
|
||||
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
|
||||
|
||||
; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]]
|
||||
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]
|
||||
|
||||
; GCN: buffer_store_dword [[NEG_MAD]]
|
||||
; GCN-NEXT: buffer_store_dword [[MUL]]
|
||||
define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -1361,8 +1374,11 @@ define void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.p
|
|||
; GCN: v_trunc_f32_e32
|
||||
; GCN: v_subrev_f32_e32
|
||||
; GCN: v_cndmask_b32
|
||||
|
||||
; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]
|
||||
|
||||
; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, v{{[0-9]+}}
|
||||
; GCN: buffer_store_dword [[RESULT]]
|
||||
define void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -1416,6 +1432,391 @@ define void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)*
|
|||
ret void
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------------------
|
||||
; vintrp tests
|
||||
; --------------------------------------------------------------------------------
|
||||
|
||||
; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
|
||||
; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
|
||||
; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
|
||||
define void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%mul = fmul float %a, %b
|
||||
%fneg = fsub float -0.0, %mul
|
||||
%intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
|
||||
%intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
|
||||
store volatile float %intrp0, float addrspace(1)* %out.gep
|
||||
store volatile float %intrp1, float addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
|
||||
; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
|
||||
; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
|
||||
define void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%mul = fmul float %a, %b
|
||||
%fneg = fsub float -0.0, %mul
|
||||
%intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
|
||||
%intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
|
||||
store volatile float %intrp0, float addrspace(1)* %out.gep
|
||||
store volatile float %intrp1, float addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------------------
|
||||
; CopyToReg tests
|
||||
; --------------------------------------------------------------------------------
|
||||
|
||||
; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
|
||||
; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
|
||||
; GCN: s_cbranch_scc1
|
||||
|
||||
; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
|
||||
; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]]
|
||||
; GCN: buffer_store_dword [[MUL1]]
|
||||
|
||||
; GCN: buffer_store_dword [[MUL0]]
|
||||
define void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%c = load volatile float, float addrspace(1)* %c.gep
|
||||
%mul = fmul float %a, %b
|
||||
%fneg = fsub float -0.0, %mul
|
||||
%cmp0 = icmp eq i32 %d, 0
|
||||
br i1 %cmp0, label %if, label %endif
|
||||
|
||||
if:
|
||||
%mul1 = fmul float %fneg, %c
|
||||
store volatile float %mul1, float addrspace(1)* %out.gep
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
store volatile float %mul, float addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------------------
|
||||
; inlineasm tests
|
||||
; --------------------------------------------------------------------------------
|
||||
|
||||
; Can't fold into use, so should fold into source
|
||||
; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
|
||||
; GCN: ; use [[MUL]]
|
||||
; GCN: buffer_store_dword [[MUL]]
|
||||
define void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%c = load volatile float, float addrspace(1)* %c.gep
|
||||
%mul = fmul float %a, %b
|
||||
%fneg = fsub float -0.0, %mul
|
||||
call void asm sideeffect "; use $0", "v"(float %fneg) #0
|
||||
store volatile float %fneg, float addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------------------
|
||||
; inlineasm tests
|
||||
; --------------------------------------------------------------------------------
|
||||
|
||||
; Can't fold into use, and the source multiply has another use, so the fneg is not folded into the source either
|
||||
; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
|
||||
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
|
||||
; GCN: ; use [[NEG]]
|
||||
; GCN: buffer_store_dword [[MUL]]
|
||||
define void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%c = load volatile float, float addrspace(1)* %c.gep
|
||||
%mul = fmul float %a, %b
|
||||
%fneg = fsub float -0.0, %mul
|
||||
call void asm sideeffect "; use $0", "v"(float %fneg) #0
|
||||
store volatile float %mul, float addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------------------
|
||||
; code size regression tests
|
||||
; --------------------------------------------------------------------------------
|
||||
|
||||
; There are multiple users of the fneg that must use a VOP3
|
||||
; instruction, so there is no penalty
|
||||
; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
|
||||
|
||||
; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
|
||||
; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
|
||||
; GCN-NEXT: buffer_store_dword [[FMA0]]
|
||||
; GCN-NEXT: buffer_store_dword [[FMA1]]
|
||||
define void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%c = load volatile float, float addrspace(1)* %c.gep
|
||||
|
||||
%fneg.a = fsub float -0.0, %a
|
||||
%fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
|
||||
%fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)
|
||||
|
||||
store volatile float %fma0, float addrspace(1)* %out
|
||||
store volatile float %fma1, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; There are multiple users, but both require using a larger encoding
|
||||
; for the modifier.
|
||||
|
||||
; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
|
||||
|
||||
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
|
||||
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
|
||||
; GCN-NEXT: buffer_store_dword [[MUL0]]
|
||||
; GCN-NEXT: buffer_store_dword [[MUL1]]
|
||||
define void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%c = load volatile float, float addrspace(1)* %c.gep
|
||||
|
||||
%fneg.a = fsub float -0.0, %a
|
||||
%mul0 = fmul float %fneg.a, %b
|
||||
%mul1 = fmul float %fneg.a, %c
|
||||
|
||||
store volatile float %mul0, float addrspace(1)* %out
|
||||
store volatile float %mul1, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; One user is VOP3 so has no cost to folding the modifier, the other does.
|
||||
; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
|
||||
|
||||
; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
|
||||
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
|
||||
|
||||
; GCN: buffer_store_dword [[FMA0]]
|
||||
; GCN-NEXT: buffer_store_dword [[MUL1]]
|
||||
define void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%c = load volatile float, float addrspace(1)* %c.gep
|
||||
|
||||
%fneg.a = fsub float -0.0, %a
|
||||
%fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
|
||||
%mul1 = fmul float %fneg.a, %c
|
||||
|
||||
store volatile float %fma0, float addrspace(1)* %out
|
||||
store volatile float %mul1, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; The use of the fneg requires a code size increase, but folding into
|
||||
; the source does not
|
||||
|
||||
; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
|
||||
|
||||
; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
|
||||
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
|
||||
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
|
||||
|
||||
; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
|
||||
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]]
|
||||
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]]
|
||||
|
||||
; GCN: buffer_store_dword [[MUL1]]
|
||||
; GCN-NEXT: buffer_store_dword [[MUL2]]
|
||||
define void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%c = load volatile float, float addrspace(1)* %c.gep
|
||||
%d = load volatile float, float addrspace(1)* %d.gep
|
||||
|
||||
%fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
|
||||
%fneg.fma0 = fsub float -0.0, %fma0
|
||||
%mul1 = fmul float %fneg.fma0, %c
|
||||
%mul2 = fmul float %fneg.fma0, %d
|
||||
|
||||
store volatile float %mul1, float addrspace(1)* %out
|
||||
store volatile float %mul2, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
|
||||
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
|
||||
; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
|
||||
; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
|
||||
; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]
|
||||
|
||||
; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
|
||||
; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
|
||||
; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]
|
||||
|
||||
; GCN: buffer_store_dwordx2 [[MUL0]]
|
||||
; GCN: buffer_store_dwordx2 [[MUL1]]
|
||||
define void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile double, double addrspace(1)* %a.gep
|
||||
%b = load volatile double, double addrspace(1)* %b.gep
|
||||
%c = load volatile double, double addrspace(1)* %c.gep
|
||||
%d = load volatile double, double addrspace(1)* %d.gep
|
||||
|
||||
%fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
|
||||
%fneg.fma0 = fsub double -0.0, %fma0
|
||||
%mul1 = fmul double %fneg.fma0, %c
|
||||
%mul2 = fmul double %fneg.fma0, %d
|
||||
|
||||
store volatile double %mul1, double addrspace(1)* %out
|
||||
store volatile double %mul2, double addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; %trunc.a has one fneg use, but it requires a code size increase and
|
||||
; the fneg can instead be folded for free into the fma.
|
||||
|
||||
; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
|
||||
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
|
||||
; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
|
||||
; GCN: buffer_store_dword [[FMA0]]
|
||||
define void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%c = load volatile float, float addrspace(1)* %c.gep
|
||||
%d = load volatile float, float addrspace(1)* %d.gep
|
||||
|
||||
%trunc.a = call float @llvm.trunc.f32(float %a)
|
||||
%trunc.fneg.a = fsub float -0.0, %trunc.a
|
||||
%fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
|
||||
store volatile float %fma0, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
|
||||
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
|
||||
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
|
||||
; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]]
|
||||
; GCN: buffer_store_dword [[FMA0]]
|
||||
; GCN: buffer_store_dword [[MUL1]]
|
||||
define void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%c = load volatile float, float addrspace(1)* %c.gep
|
||||
%d = load volatile float, float addrspace(1)* %d.gep
|
||||
|
||||
%trunc.a = call float @llvm.trunc.f32(float %a)
|
||||
%trunc.fneg.a = fsub float -0.0, %trunc.a
|
||||
%fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
|
||||
%mul1 = fmul float %trunc.a, %d
|
||||
store volatile float %fma0, float addrspace(1)* %out
|
||||
store volatile float %mul1, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
declare float @llvm.fma.f32(float, float, float) #1
|
||||
declare float @llvm.fmuladd.f32(float, float, float) #1
|
||||
|
@ -1425,10 +1826,14 @@ declare float @llvm.round.f32(float) #1
|
|||
declare float @llvm.rint.f32(float) #1
|
||||
declare float @llvm.nearbyint.f32(float) #1
|
||||
|
||||
; f64 variant of the fma intrinsic.
declare double @llvm.fma.f64(double, double, double) #1
|
||||
|
||||
; AMDGPU-specific intrinsics. The sin/rcp/fmul.legacy declarations use
; attribute group #1 (nounwind readnone); the two interp declarations use
; group #0 (nounwind only).
declare float @llvm.amdgcn.sin.f32(float) #1
|
||||
declare float @llvm.amdgcn.rcp.f32(float) #1
|
||||
declare float @llvm.amdgcn.rcp.legacy(float) #1
|
||||
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
|
||||
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
|
||||
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
|
||||
|
||||
; Attribute groups referenced as #0 and #1 by the definitions and declarations.
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
|
|
|
@ -12,8 +12,8 @@
|
|||
; GCN: v_mul_f32_e32
|
||||
; GCN: v_div_fmas_f32
|
||||
; GCN: v_div_fixup_f32
|
||||
; GCN: v_trunc_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN: v_mac_f32_e32
|
||||
; GCN: v_trunc_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: s_endpgm
|
||||
define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
|
||||
float addrspace(1)* %in2) #0 {
|
||||
|
@ -28,12 +28,11 @@ define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
|
|||
; FUNC-LABEL: {{^}}unsafe_frem_f32:
|
||||
; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
|
||||
; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}}
|
||||
; GCN: v_rcp_f32_e64 [[INVY:v[0-9]+]], -[[Y]]
|
||||
; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
|
||||
; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
|
||||
; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
|
||||
; GCN: v_mac_f32_e32 [[X]], [[Y]], [[TRUNC]]
|
||||
; GCN: buffer_store_dword [[X]]
|
||||
; GCN: s_endpgm
|
||||
; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
|
||||
; GCN: buffer_store_dword [[RESULT]]
|
||||
define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
|
||||
float addrspace(1)* %in2) #1 {
|
||||
%gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
|
||||
|
|
|
@ -55,8 +55,9 @@ define void @multi_foldable_use_fneg_src() #0 {
|
|||
; GCN: buffer_load_dword [[B:v[0-9]+]]
|
||||
; GCN: buffer_load_dword [[C:v[0-9]+]]
|
||||
|
||||
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
|
||||
; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
|
||||
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
|
||||
; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 4.0, [[MUL]]
|
||||
; GCN-NOT: xor
|
||||
; GCN: buffer_store_dword [[MUL]]
|
||||
define void @multi_use_fneg() #0 {
|
||||
%a = load volatile float, float addrspace(1)* undef
|
||||
|
|
|
@ -65,9 +65,10 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_f16_neg_a:
|
||||
; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
@ -91,9 +92,11 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_f16_neg_b:
|
||||
; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: s_endpgm
|
||||
|
@ -117,9 +120,9 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}mac_f16_neg_c:
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
; SI-DAG: v_cvt_f32_f16_e32
|
||||
; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
|
||||
; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
|
@ -215,9 +218,10 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math:
|
||||
; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
@ -241,9 +245,10 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math:
|
||||
; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
|
||||
; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
@ -267,10 +272,10 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math:
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
; SI: v_cvt_f32_f16_e32
|
||||
; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
|
||||
; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
|
||||
; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]]
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
|
@ -373,11 +378,11 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_v2f16_neg_a:
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
|
||||
|
||||
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
|
||||
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
@ -402,10 +407,10 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_v2f16_neg_b:
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
|
||||
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
|
||||
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
|
@ -431,11 +436,15 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_v2f16_neg_c:
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
|
||||
|
||||
; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT2]]
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT5]]
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
|
@ -544,11 +553,15 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math:
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
|
||||
|
||||
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
|
||||
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
@ -573,11 +586,15 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math:
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
|
||||
|
||||
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
|
||||
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
@ -602,11 +619,15 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math:
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
|
||||
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
|
||||
|
||||
; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
|
||||
; VI-NOT: v_mac_f16
|
||||
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
|
|
Loading…
Reference in New Issue