forked from OSchip/llvm-project
DAG: Stop trying to fold FP -(x-y) -> y-x in getNode with nsz
This was increasing the number of instructions when fsub was legalized on AMDGPU with no signed zeros enabled. This fold should be guarded by hasOneUse, and I don't think getNode should be doing that. The same fold is already done as a regular combine through isNegatibleForFree. This does require duplicating, even though isNegatibleForFree does this combine already (and properly checks hasOneUse) to avoid one PPC regression. In the regression, the outer fneg has nsz but the fsub operand does not. isNegatibleForFree only sees the operand, and doesn't see it's used from a nsz context. A nsz parameter needs to be added and threaded through isNegatibleForFree to avoid this.
This commit is contained in:
parent
e3e13db714
commit
4d7201e7b9
|
@ -13350,6 +13350,16 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
|
|||
if (TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize))
|
||||
return TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
|
||||
|
||||
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 FIXME: This is
|
||||
// duplicated in isNegatibleForFree, but isNegatibleForFree doesn't know it
|
||||
// was called from a context with a nsz flag if the input fsub does not.
|
||||
if (N0.getOpcode() == ISD::FSUB &&
|
||||
(DAG.getTarget().Options.NoSignedZerosFPMath ||
|
||||
N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) {
|
||||
return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
|
||||
N0.getOperand(0), N->getFlags());
|
||||
}
|
||||
|
||||
// Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
|
||||
// constant pool values.
|
||||
if (!TLI.isFNegFree(VT) &&
|
||||
|
|
|
@ -4688,11 +4688,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
|
|||
if (OpOpcode == ISD::UNDEF)
|
||||
return getUNDEF(VT);
|
||||
|
||||
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
|
||||
if ((getTarget().Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) &&
|
||||
OpOpcode == ISD::FSUB)
|
||||
return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
|
||||
Operand.getOperand(0), Flags);
|
||||
if (OpOpcode == ISD::FNEG) // --X -> X
|
||||
return Operand.getOperand(0);
|
||||
break;
|
||||
|
|
|
@ -38,9 +38,8 @@ define double @negation_propagation(double* %arg, double %arg1, double %arg2) {
|
|||
define { double, double } @testfn(double %x, double %y) #0 {
|
||||
; CHECK-LABEL: testfn:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: fsub d2, d0, d1
|
||||
; CHECK-NEXT: fsub d1, d1, d0
|
||||
; CHECK-NEXT: mov v0.16b, v2.16b
|
||||
; CHECK-NEXT: fsub d0, d0, d1
|
||||
; CHECK-NEXT: fneg d1, d0
|
||||
; CHECK-NEXT: ret
|
||||
%sub = fsub fast double %x, %y
|
||||
%neg = fneg fast double %sub
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-no-signed-zeros-fp-math=true < %s | FileCheck %s
|
||||
|
||||
; FIXME: This should be the same when the extra instructions are fixed
|
||||
; XUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-no-signed-zeros-fp-math=false < %s | FileCheck %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-no-signed-zeros-fp-math=false < %s | FileCheck %s
|
||||
|
||||
; no-signed-zeros-fp-math should not increase the number of
|
||||
; instructions emitted.
|
||||
|
@ -11,10 +9,9 @@ define { double, double } @testfn(double %arg, double %arg1, double %arg2) {
|
|||
; CHECK-LABEL: testfn:
|
||||
; CHECK: ; %bb.0: ; %bb
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: v_add_f64 v[6:7], v[4:5], -v[0:1]
|
||||
; CHECK-NEXT: v_add_f64 v[4:5], v[0:1], -v[4:5]
|
||||
; CHECK-NEXT: v_add_f64 v[0:1], v[6:7], -v[2:3]
|
||||
; CHECK-NEXT: v_add_f64 v[2:3], -v[2:3], v[4:5]
|
||||
; CHECK-NEXT: v_add_f64 v[4:5], v[4:5], -v[0:1]
|
||||
; CHECK-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3]
|
||||
; CHECK-NEXT: v_add_f64 v[2:3], -v[2:3], -v[4:5]
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
%tmp = fsub fast double 0.000000e+00, %arg1
|
||||
|
|
Loading…
Reference in New Issue