forked from OSchip/llvm-project
[X86][SSE] isHorizontalBinOp - reuse any existing horizontal ops.
If similar horizontal ops using the same args already exist, then match this binop as a horizontal op as well, even if we are on a target with slow horizontal ops.
This commit is contained in:
parent
6e75ee6b65
commit
770d1e0a88
|
@ -45628,8 +45628,9 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
|
|||
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
|
||||
/// A horizontal-op B, for some already available A and B, and if so then LHS is
|
||||
/// set to A, RHS to B, and the routine returns 'true'.
|
||||
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
|
||||
const X86Subtarget &Subtarget, bool IsCommutative,
|
||||
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
|
||||
SelectionDAG &DAG, const X86Subtarget &Subtarget,
|
||||
bool IsCommutative,
|
||||
SmallVectorImpl<int> &PostShuffleMask) {
|
||||
// If either operand is undef, bail out. The binop should be simplified.
|
||||
if (LHS.isUndef() || RHS.isUndef())
|
||||
|
@ -45790,9 +45791,20 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
|
|||
isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
|
||||
return false;
|
||||
|
||||
// If the source nodes are already used in HorizOps then always accept this.
|
||||
// Shuffle folding should merge these back together.
|
||||
bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
|
||||
return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
|
||||
});
|
||||
bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
|
||||
return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
|
||||
});
|
||||
bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
|
||||
|
||||
// Assume a SingleSource HOP if we only shuffle one input and don't need to
|
||||
// shuffle the result.
|
||||
if (!shouldUseHorizontalOp(NewLHS == NewRHS &&
|
||||
if (!ForceHorizOp &&
|
||||
!shouldUseHorizontalOp(NewLHS == NewRHS &&
|
||||
(NumShuffles < 2 || !IsIdentityPostShuffle),
|
||||
DAG, Subtarget))
|
||||
return false;
|
||||
|
@ -45816,7 +45828,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
|
|||
SmallVector<int, 8> PostShuffleMask;
|
||||
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
|
||||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
|
||||
isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
|
||||
isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsFadd,
|
||||
PostShuffleMask)) {
|
||||
SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
|
||||
if (!PostShuffleMask.empty())
|
||||
HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
|
||||
|
@ -48931,17 +48944,18 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
|
|||
SDValue Op0 = N->getOperand(0);
|
||||
SDValue Op1 = N->getOperand(1);
|
||||
bool IsAdd = N->getOpcode() == ISD::ADD;
|
||||
auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
|
||||
assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
|
||||
|
||||
SmallVector<int, 8> PostShuffleMask;
|
||||
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
|
||||
VT == MVT::v8i32) &&
|
||||
Subtarget.hasSSSE3() &&
|
||||
isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
|
||||
auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
|
||||
ArrayRef<SDValue> Ops) {
|
||||
return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
|
||||
Ops[0].getValueType(), Ops);
|
||||
isHorizontalBinOp(HorizOpcode, Op0, Op1, DAG, Subtarget, IsAdd,
|
||||
PostShuffleMask)) {
|
||||
auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
|
||||
ArrayRef<SDValue> Ops) {
|
||||
return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
|
||||
};
|
||||
SDValue HorizBinOp =
|
||||
SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
|
||||
|
|
|
@ -873,45 +873,15 @@ define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
|
|||
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
|
||||
|
||||
define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
|
||||
; SSSE3_SLOW-LABEL: PR34724_1:
|
||||
; SSSE3_SLOW: # %bb.0:
|
||||
; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0
|
||||
; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
|
||||
; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2
|
||||
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
|
||||
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
|
||||
; SSSE3_SLOW-NEXT: retq
|
||||
; SSSE3-LABEL: PR34724_1:
|
||||
; SSSE3: # %bb.0:
|
||||
; SSSE3-NEXT: haddps %xmm1, %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSSE3_FAST-LABEL: PR34724_1:
|
||||
; SSSE3_FAST: # %bb.0:
|
||||
; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0
|
||||
; SSSE3_FAST-NEXT: retq
|
||||
;
|
||||
; AVX1_SLOW-LABEL: PR34724_1:
|
||||
; AVX1_SLOW: # %bb.0:
|
||||
; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
|
||||
; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
|
||||
; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; AVX1_SLOW-NEXT: retq
|
||||
;
|
||||
; AVX1_FAST-LABEL: PR34724_1:
|
||||
; AVX1_FAST: # %bb.0:
|
||||
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX1_FAST-NEXT: retq
|
||||
;
|
||||
; AVX2_SLOW-LABEL: PR34724_1:
|
||||
; AVX2_SLOW: # %bb.0:
|
||||
; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
|
||||
; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
|
||||
; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; AVX2_SLOW-NEXT: retq
|
||||
;
|
||||
; AVX2_FAST-LABEL: PR34724_1:
|
||||
; AVX2_FAST: # %bb.0:
|
||||
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX2_FAST-NEXT: retq
|
||||
; AVX-LABEL: PR34724_1:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
|
||||
%t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>
|
||||
%t2 = fadd <2 x float> %t0, %t1
|
||||
|
|
|
@ -576,30 +576,17 @@ define <4 x float> @add_ps_008(<4 x float> %x) {
|
|||
}
|
||||
|
||||
define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) {
|
||||
; SSE-SLOW-LABEL: add_ps_016:
|
||||
; SSE-SLOW: # %bb.0:
|
||||
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
|
||||
; SSE-SLOW-NEXT: haddps %xmm0, %xmm1
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,0]
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
|
||||
; SSE-SLOW-NEXT: movaps %xmm2, %xmm0
|
||||
; SSE-SLOW-NEXT: retq
|
||||
;
|
||||
; SSE-FAST-LABEL: add_ps_016:
|
||||
; SSE-FAST: # %bb.0:
|
||||
; SSE-FAST-NEXT: haddps %xmm0, %xmm1
|
||||
; SSE-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3]
|
||||
; SSE-FAST-NEXT: movaps %xmm1, %xmm0
|
||||
; SSE-FAST-NEXT: retq
|
||||
; SSE-LABEL: add_ps_016:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: haddps %xmm0, %xmm1
|
||||
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3]
|
||||
; SSE-NEXT: movaps %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-SLOW-LABEL: add_ps_016:
|
||||
; AVX-SLOW: # %bb.0:
|
||||
; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm1, %xmm0
|
||||
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
|
||||
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
|
||||
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,3]
|
||||
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,3]
|
||||
; AVX-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX-FAST-LABEL: add_ps_016:
|
||||
|
@ -1006,32 +993,15 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
|
|||
}
|
||||
|
||||
define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
|
||||
; SSE-SLOW-LABEL: PR34724_add_v4f32_u123:
|
||||
; SSE-SLOW: # %bb.0:
|
||||
; SSE-SLOW-NEXT: haddps %xmm1, %xmm0
|
||||
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
|
||||
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
|
||||
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
|
||||
; SSE-SLOW-NEXT: retq
|
||||
; SSE-LABEL: PR34724_add_v4f32_u123:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: haddps %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; SSE-FAST-LABEL: PR34724_add_v4f32_u123:
|
||||
; SSE-FAST: # %bb.0:
|
||||
; SSE-FAST-NEXT: haddps %xmm1, %xmm0
|
||||
; SSE-FAST-NEXT: retq
|
||||
;
|
||||
; AVX-SLOW-LABEL: PR34724_add_v4f32_u123:
|
||||
; AVX-SLOW: # %bb.0:
|
||||
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
|
||||
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
|
||||
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; AVX-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX-FAST-LABEL: PR34724_add_v4f32_u123:
|
||||
; AVX-FAST: # %bb.0:
|
||||
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX-FAST-NEXT: retq
|
||||
; AVX-LABEL: PR34724_add_v4f32_u123:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4>
|
||||
%4 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 5>
|
||||
%5 = fadd <2 x float> %3, %4
|
||||
|
|
Loading…
Reference in New Issue