forked from OSchip/llvm-project
AMDGPU: Pull fneg out of extract_vector_elt
This allows folding source modifiers in more f16 cases. Makes it easier to select per-component packed neg modifiers. llvm-svn: 302813
This commit is contained in:
parent
33a97ec4ed
commit
bf5482e4bb
|
@ -567,13 +567,19 @@ static bool hasSourceMods(const SDNode *N) {
|
|||
case AMDGPUISD::INTERP_P1:
|
||||
case AMDGPUISD::INTERP_P2:
|
||||
case AMDGPUISD::DIV_SCALE:
|
||||
|
||||
// TODO: Should really be looking at the users of the bitcast. These are
|
||||
// problematic because bitcasts are used to legalize all stores to integer
|
||||
// types.
|
||||
case ISD::BITCAST:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
|
||||
bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
|
||||
unsigned CostThreshold) {
|
||||
// Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
|
||||
// it is truly free to use a source modifier in all cases. If there are
|
||||
// multiple users but for each one will necessitate using VOP3, there will be
|
||||
|
|
|
@ -132,6 +132,8 @@ public:
|
|||
return false;
|
||||
}
|
||||
|
||||
static bool allUsesHaveSourceMods(const SDNode *N,
|
||||
unsigned CostThreshold = 4);
|
||||
bool isFAbsFree(EVT VT) const override;
|
||||
bool isFNegFree(EVT VT) const override;
|
||||
bool isTruncateFree(EVT Src, EVT Dest) const override;
|
||||
|
|
|
@ -488,6 +488,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
|||
setTargetDAGCombine(ISD::FCANONICALIZE);
|
||||
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
|
||||
setTargetDAGCombine(ISD::ZERO_EXTEND);
|
||||
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
|
||||
|
||||
// All memory operations. Some folding on the pointer operand is done to help
|
||||
// matching the constant offsets in the addressing modes.
|
||||
|
@ -4604,6 +4605,24 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
|
|||
return SDValue();
|
||||
}
|
||||
|
||||
// DAG combine for EXTRACT_VECTOR_ELT nodes (dispatched from PerformDAGCombine).
// When the vector operand is an FNEG and allUsesHaveSourceMods(N) holds, the
// node is rebuilt as FNEG(EXTRACT_VECTOR_ELT(x, idx)) instead of
// EXTRACT_VECTOR_ELT(FNEG(x), idx), i.e. the negation is moved onto the
// extracted scalar. Returns an empty SDValue when no rewrite is made.
SDValue SITargetLowering::performExtractVectorEltCombine(
|
||||
SDNode *N, DAGCombinerInfo &DCI) const {
|
||||
SDValue Vec = N->getOperand(0);
|
||||
|
||||
SelectionDAG &DAG= DCI.DAG;
|
||||
// Only rewrite when the allUsesHaveSourceMods check on the extract passes.
if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
|
||||
SDLoc SL(N);
|
||||
EVT EltVT = N->getValueType(0);
|
||||
SDValue Idx = N->getOperand(1);
|
||||
// Extract from the FNEG's source vector, using the same index operand.
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
|
||||
Vec.getOperand(0), Idx);
|
||||
// Re-apply the negation to the extracted element.
return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
|
||||
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
|
||||
const SDNode *N0,
|
||||
const SDNode *N1) const {
|
||||
|
@ -4891,6 +4910,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
|
|||
|
||||
break;
|
||||
}
|
||||
case ISD::EXTRACT_VECTOR_ELT:
|
||||
return performExtractVectorEltCombine(N, DCI);
|
||||
}
|
||||
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
|
||||
}
|
||||
|
|
|
@ -100,6 +100,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
|
|||
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
|
||||
unsigned getFusedOpcode(const SelectionDAG &DAG,
|
||||
const SDNode *N0, const SDNode *N1) const;
|
||||
|
|
|
@ -1471,11 +1471,10 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addr
|
|||
; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
|
||||
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
|
||||
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
|
||||
; GCN: buffer_store_dword [[MUL]]
|
||||
; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
|
||||
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0
|
||||
; GCN-NEXT: buffer_store_dword [[ADD]]
|
||||
; GCN-NEXT: buffer_store_dword [[MUL]]
|
||||
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
|
||||
|
||||
; FIXME: Should be able to do scalar op
|
||||
; GCN-LABEL: {{^}}s_fneg_f16:
|
||||
|
@ -129,6 +129,41 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16:
|
||||
; GCN: flat_load_dword [[VAL:v[0-9]+]]
|
||||
; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
|
||||
; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
|
||||
|
||||
; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
|
||||
; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
|
||||
; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
|
||||
; Negates a loaded <2 x half> (written as fsub from -0.0), extracts both
; elements, and feeds each one into an arithmetic op (fmul by 4.0, fadd of 2.0)
; before a volatile store of each result.
define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
|
||||
%val = load <2 x half>, <2 x half> addrspace(1)* %in
|
||||
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
|
||||
%elt0 = extractelement <2 x half> %fneg, i32 0
|
||||
%elt1 = extractelement <2 x half> %fneg, i32 1
|
||||
|
||||
%fmul0 = fmul half %elt0, 4.0
|
||||
%fadd1 = fadd half %elt1, 2.0
|
||||
store volatile half %fmul0, half addrspace(1)* undef
|
||||
store volatile half %fadd1, half addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16:
|
||||
; GCN: flat_load_dword [[VAL:v[0-9]+]]
|
||||
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]]
|
||||
; GCN: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]]
|
||||
; Same negated <2 x half> load and per-element extracts, but the extracted
; elements go straight to volatile stores with no arithmetic user in between.
define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
|
||||
%val = load <2 x half>, <2 x half> addrspace(1)* %in
|
||||
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
|
||||
%elt0 = extractelement <2 x half> %fneg, i32 0
|
||||
%elt1 = extractelement <2 x half> %fneg, i32 1
|
||||
store volatile half %elt0, half addrspace(1)* undef
|
||||
store volatile half %elt1, half addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
|
Loading…
Reference in New Issue