[X86] Add target combine rule to select ADDSUB instructions from a build_vector

This patch teaches the backend how to combine a build_vector that implements
an 'addsub' between packed float vectors into a sequence of vector add
and vector sub followed by a VSELECT.

The new VSELECT is expected to be lowered into a BLENDI.
At ISel stage, the sequence 'vector add + vector sub + BLENDI' is
pattern-matched against ISel patterns added at r211427 to select
'addsub' instructions.
Added three more ISel patterns for ADDSUB.

Added test sse3-avx-addsub-2.ll to verify that we correctly emit 'addsub'
instructions.

llvm-svn: 211679
This commit is contained in:
Andrea Di Biagio 2014-06-25 10:02:21 +00:00
parent d99cca2c7a
commit 6d9b9e125d
3 changed files with 456 additions and 1 deletions

View File

@ -6222,6 +6222,127 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
/// \brief Try to fold a build_vector that performs an 'addsub' into the
/// sequence of 'vadd + vsub + blendi'.
///
/// The build_vector is matched when every non-undef operand at an even index
/// is (fsub (extract_vector_elt A, i), (extract_vector_elt B, i)) and every
/// non-undef operand at an odd index is the corresponding fadd, with all
/// extracts reading from the same pair of vectors A and B at the lane index
/// of the operand itself.
///
/// \param BV        The build_vector node to inspect.
/// \param DAG       The DAG used to create the replacement nodes.
/// \param Subtarget Unused by this function.
/// \returns A VSELECT choosing between (fsub A, B) on even lanes and
///          (fadd A, B) on odd lanes, or an empty SDValue if the pattern
///          does not match.
static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
                           const X86Subtarget *Subtarget) {
  SDLoc DL(BV);
  EVT VT = BV->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v2f64) && "build_vector with an invalid type found!");

  // Don't try to emit a VSELECT that cannot be lowered into a blend.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
    return SDValue();

  // Odd-numbered elements in the input build vector are obtained from
  // adding two float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting two float elements.
  unsigned ExpectedOpcode = ISD::FSUB;
  unsigned NextExpectedOpcode = ISD::FADD;
  bool AddFound = false;
  bool SubFound = false;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values. The expected opcode still has to alternate so
    // that it stays in sync with the lane parity.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF) {
      std::swap(ExpectedOpcode, NextExpectedOpcode);
      continue;
    }

    // Early exit if we found an unexpected opcode.
    if (Opcode != ExpectedOpcode)
      return SDValue();

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return SDValue();

    // The extracted lane must be the lane being built.
    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (I0 != i)
      return SDValue();

    // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
      AddFound = true;
    else
      SubFound = true;

    // Update InVec0 and InVec1.
    // The source vectors must have the same type as the build_vector:
    // they become the operands of the vector FADD/FSUB of type VT created
    // below, so a wider or narrower source would produce a malformed node.
    if (InVec0.getOpcode() == ISD::UNDEF) {
      InVec0 = Op0.getOperand(0);
      if (InVec0.getValueType() != VT)
        return SDValue();
    }
    if (InVec1.getOpcode() == ISD::UNDEF) {
      InVec1 = Op1.getOperand(0);
      if (InVec1.getValueType() != VT)
        return SDValue();
    }

    // Make sure that operands in input to each add/sub node always
    // come from a same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      // FSUB is not commutable: a mismatch cannot be repaired.
      if (ExpectedOpcode == ISD::FSUB)
        return SDValue();

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return SDValue();
    }

    if (InVec1 != Op1.getOperand(0))
      return SDValue();

    // Update the pair of expected opcodes.
    std::swap(ExpectedOpcode, NextExpectedOpcode);
  }

  // Don't try to fold this build_vector into a VSELECT if it has
  // too many UNDEF operands.
  if (!AddFound || !SubFound || InVec0.getOpcode() == ISD::UNDEF ||
      InVec1.getOpcode() == ISD::UNDEF)
    return SDValue();

  // Emit a sequence of vector add and sub followed by a VSELECT.
  // The new VSELECT will be lowered into a BLENDI.
  // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
  // and emit a single ADDSUB instruction.
  // The opcodes are spelled out here instead of reusing the loop's
  // ExpectedOpcode/NextExpectedOpcode so that correctness does not depend
  // on how many times the pair was swapped.
  SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, InVec0, InVec1);
  SDValue Add = DAG.getNode(ISD::FADD, DL, VT, InVec0, InVec1);

  // Construct the VSELECT mask: all-ones on even lanes (take the FSUB
  // result), zero on odd lanes (take the FADD result).
  EVT MaskVT = VT.changeVectorElementTypeToInteger();
  EVT SVT = MaskVT.getVectorElementType();
  unsigned SVTBits = SVT.getSizeInBits();
  SmallVector<SDValue, 8> Ops;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
                          APInt::getAllOnesValue(SVTBits);
    SDValue Constant = DAG.getConstant(Value, SVT);
    Ops.push_back(Constant);
  }

  SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
  return DAG.getSelect(DL, VT, Mask, Sub, Add);
}
static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDLoc DL(N);
@ -6230,6 +6351,14 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
SDValue InVec0, InVec1;
// Try to match an ADDSUB.
if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
SDValue Value = matchAddSub(BV, DAG, Subtarget);
if (Value.getNode())
return Value;
}
// Try to match horizontal ADD/SUB.
unsigned NumUndefsLO = 0;
unsigned NumUndefsHI = 0;

View File

@ -5377,10 +5377,15 @@ let Predicates = [HasAVX] in {
def : Pat<(v4f64 (X86Shufp (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
(v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i8 10))),
(VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
(v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
(VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
(v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
(VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
(v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
(VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
(v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
(VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
@ -5396,6 +5401,9 @@ let Predicates = [UseSSE3] in {
(v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
(ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
(v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
(ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
(v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
(ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;

View File

@ -0,0 +1,318 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
; Verify that we correctly generate 'addsub' instructions from
; a sequence of vector extracts + float add/sub + vector inserts.
; test1 builds a fully-defined <4 x float> addsub: even lanes (0, 2) hold
; A[i] - B[i] and odd lanes (1, 3) hold A[i] + B[i]. The insertelement
; chain fills the odd lanes first, then the even lanes.
define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 0
%2 = extractelement <4 x float> %B, i32 0
%sub = fsub float %1, %2
%3 = extractelement <4 x float> %A, i32 2
%4 = extractelement <4 x float> %B, i32 2
%sub2 = fsub float %3, %4
%5 = extractelement <4 x float> %A, i32 1
%6 = extractelement <4 x float> %B, i32 1
%add = fadd float %5, %6
%7 = extractelement <4 x float> %A, i32 3
%8 = extractelement <4 x float> %B, i32 3
%add2 = fadd float %7, %8
%vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
%vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
%vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
%vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
ret <4 x float> %vecinsert4
}
; CHECK-LABEL: test1
; SSE: addsubps
; AVX: vaddsubps
; CHECK-NEXT: ret
; test2 defines only the upper half of the result: lane 2 is A[2] - B[2]
; and lane 3 is A[3] + B[3]. Lanes 0 and 1 are left undef.
define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 2
%2 = extractelement <4 x float> %B, i32 2
%sub2 = fsub float %1, %2
%3 = extractelement <4 x float> %A, i32 3
%4 = extractelement <4 x float> %B, i32 3
%add2 = fadd float %3, %4
%vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
%vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
ret <4 x float> %vecinsert2
}
; CHECK-LABEL: test2
; SSE: addsubps
; AVX: vaddsubps
; CHECK-NEXT: ret
; test3 defines lane 0 as A[0] - B[0] and lane 3 as an fadd whose operands
; are commuted (B[3] + A[3]). Lanes 1 and 2 are left undef.
define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 0
%2 = extractelement <4 x float> %B, i32 0
%sub = fsub float %1, %2
%3 = extractelement <4 x float> %A, i32 3
%4 = extractelement <4 x float> %B, i32 3
; Note the operand order: the B element comes first here.
%add = fadd float %4, %3
%vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
%vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
ret <4 x float> %vecinsert2
}
; CHECK-LABEL: test3
; SSE: addsubps
; AVX: vaddsubps
; CHECK-NEXT: ret
; test4 defines lane 2 as A[2] - B[2] and lane 1 as A[1] + B[1].
; Lanes 0 and 3 are left undef.
define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 2
%2 = extractelement <4 x float> %B, i32 2
%sub = fsub float %1, %2
%3 = extractelement <4 x float> %A, i32 1
%4 = extractelement <4 x float> %B, i32 1
%add = fadd float %3, %4
%vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
%vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
ret <4 x float> %vecinsert2
}
; CHECK-LABEL: test4
; SSE: addsubps
; AVX: vaddsubps
; CHECK-NEXT: ret
; test5 defines only the lower half of the result: lane 0 is A[0] - B[0]
; and lane 1 is A[1] + B[1]. Lanes 2 and 3 are left undef.
define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 0
%2 = extractelement <4 x float> %B, i32 0
%sub2 = fsub float %1, %2
%3 = extractelement <4 x float> %A, i32 1
%4 = extractelement <4 x float> %B, i32 1
%add2 = fadd float %3, %4
%vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
%vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
ret <4 x float> %vecinsert2
}
; CHECK-LABEL: test5
; SSE: addsubps
; AVX: vaddsubps
; CHECK-NEXT: ret
; test6 builds a fully-defined <4 x float> addsub: fsub on lanes 0 and 2,
; fadd on lanes 1 and 3. Its body is the same instruction sequence as test1.
define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 0
%2 = extractelement <4 x float> %B, i32 0
%sub = fsub float %1, %2
%3 = extractelement <4 x float> %A, i32 2
%4 = extractelement <4 x float> %B, i32 2
%sub2 = fsub float %3, %4
%5 = extractelement <4 x float> %A, i32 1
%6 = extractelement <4 x float> %B, i32 1
%add = fadd float %5, %6
%7 = extractelement <4 x float> %A, i32 3
%8 = extractelement <4 x float> %B, i32 3
%add2 = fadd float %7, %8
%vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
%vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
%vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
%vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
ret <4 x float> %vecinsert4
}
; CHECK-LABEL: test6
; SSE: addsubps
; AVX: vaddsubps
; CHECK-NEXT: ret
; test7 builds a fully-defined <4 x double> addsub: fsub on lanes 0 and 2,
; fadd on lanes 1 and 3. The checks after this function expect two addsubpd
; instructions in the corei7 run and a single vaddsubpd in the corei7-avx run.
define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
%1 = extractelement <4 x double> %A, i32 0
%2 = extractelement <4 x double> %B, i32 0
%sub = fsub double %1, %2
%3 = extractelement <4 x double> %A, i32 2
%4 = extractelement <4 x double> %B, i32 2
%sub2 = fsub double %3, %4
%5 = extractelement <4 x double> %A, i32 1
%6 = extractelement <4 x double> %B, i32 1
%add = fadd double %5, %6
%7 = extractelement <4 x double> %A, i32 3
%8 = extractelement <4 x double> %B, i32 3
%add2 = fadd double %7, %8
%vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
%vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
%vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
%vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
ret <4 x double> %vecinsert4
}
; CHECK-LABEL: test7
; SSE: addsubpd
; SSE-NEXT: addsubpd
; AVX: vaddsubpd
; AVX-NOT: vaddsubpd
; CHECK: ret
; test8 builds a fully-defined <2 x double> addsub: lane 0 is A[0] - B[0]
; and lane 1 is A[1] + B[1].
define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
%1 = extractelement <2 x double> %A, i32 0
%2 = extractelement <2 x double> %B, i32 0
%sub = fsub double %1, %2
%3 = extractelement <2 x double> %A, i32 1
%4 = extractelement <2 x double> %B, i32 1
%add = fadd double %3, %4
%vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
%vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
ret <2 x double> %vecinsert2
}
; CHECK-LABEL: test8
; SSE: addsubpd
; AVX: vaddsubpd
; CHECK: ret
; test9 builds a fully-defined <8 x float> addsub: fsub on the even lanes
; (0, 2, 4, 6) and fadd on the odd lanes (1, 3, 5, 7). The checks after this
; function expect two addsubps instructions in the corei7 run and a single
; vaddsubps in the corei7-avx run.
define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
%1 = extractelement <8 x float> %A, i32 0
%2 = extractelement <8 x float> %B, i32 0
%sub = fsub float %1, %2
%3 = extractelement <8 x float> %A, i32 2
%4 = extractelement <8 x float> %B, i32 2
%sub2 = fsub float %3, %4
%5 = extractelement <8 x float> %A, i32 1
%6 = extractelement <8 x float> %B, i32 1
%add = fadd float %5, %6
%7 = extractelement <8 x float> %A, i32 3
%8 = extractelement <8 x float> %B, i32 3
%add2 = fadd float %7, %8
%9 = extractelement <8 x float> %A, i32 4
%10 = extractelement <8 x float> %B, i32 4
%sub3 = fsub float %9, %10
%11 = extractelement <8 x float> %A, i32 6
%12 = extractelement <8 x float> %B, i32 6
%sub4 = fsub float %11, %12
%13 = extractelement <8 x float> %A, i32 5
%14 = extractelement <8 x float> %B, i32 5
%add3 = fadd float %13, %14
%15 = extractelement <8 x float> %A, i32 7
%16 = extractelement <8 x float> %B, i32 7
%add4 = fadd float %15, %16
%vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
%vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
%vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
%vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
%vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
%vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
%vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
%vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
ret <8 x float> %vecinsert8
}
; CHECK-LABEL: test9
; SSE: addsubps
; SSE-NEXT: addsubps
; AVX: vaddsubps
; AVX-NOT: vaddsubps
; CHECK: ret
; Verify that we don't generate addsub instruction for the following
; functions.
; test10 (negative test) defines a single lane: lane 0 is A[0] - B[0].
; There is no fadd lane, and the checks after this function expect no
; addsub instruction.
define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 0
%2 = extractelement <4 x float> %B, i32 0
%sub = fsub float %1, %2
%vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
ret <4 x float> %vecinsert1
}
; CHECK-LABEL: test10
; CHECK-NOT: addsubps
; CHECK: ret
; test11 (negative test) defines a single lane: lane 2 is A[2] - B[2].
; There is no fadd lane, and the checks after this function expect no
; addsub instruction.
define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 2
%2 = extractelement <4 x float> %B, i32 2
%sub = fsub float %1, %2
%vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
ret <4 x float> %vecinsert1
}
; CHECK-LABEL: test11
; CHECK-NOT: addsubps
; CHECK: ret
; test12 (negative test) defines a single lane: lane 1 is A[1] + B[1].
; There is no fsub lane, and the checks after this function expect no
; addsub instruction.
define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 1
%2 = extractelement <4 x float> %B, i32 1
%add = fadd float %1, %2
%vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
ret <4 x float> %vecinsert1
}
; CHECK-LABEL: test12
; CHECK-NOT: addsubps
; CHECK: ret
; test13 (negative test) defines a single lane: lane 3 is A[3] + B[3].
; There is no fsub lane, and the checks after this function expect no
; addsub instruction.
define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 3
%2 = extractelement <4 x float> %B, i32 3
%add = fadd float %1, %2
%vecinsert1 = insertelement <4 x float> undef, float %add, i32 3
ret <4 x float> %vecinsert1
}
; CHECK-LABEL: test13
; CHECK-NOT: addsubps
; CHECK: ret
; test14 (negative test) defines only the even lanes: lanes 0 and 2 are
; A[i] - B[i]. No lane holds an fadd, and the checks after this function
; expect no addsub instruction.
define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 0
%2 = extractelement <4 x float> %B, i32 0
%sub = fsub float %1, %2
%3 = extractelement <4 x float> %A, i32 2
%4 = extractelement <4 x float> %B, i32 2
%sub2 = fsub float %3, %4
%vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
%vecinsert2 = insertelement <4 x float> %vecinsert1, float %sub2, i32 2
ret <4 x float> %vecinsert2
}
; CHECK-LABEL: test14
; CHECK-NOT: addsubps
; CHECK: ret
; test15 (negative test) defines only the odd lanes: lanes 1 and 3 are
; A[i] + B[i]. No lane holds an fsub, and the checks after this function
; expect no addsub instruction.
define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 1
%2 = extractelement <4 x float> %B, i32 1
%add = fadd float %1, %2
%3 = extractelement <4 x float> %A, i32 3
%4 = extractelement <4 x float> %B, i32 3
%add2 = fadd float %3, %4
%vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
%vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
ret <4 x float> %vecinsert2
}
; CHECK-LABEL: test15
; CHECK-NOT: addsubps
; CHECK: ret
; test16 (negative test) fills all four lanes, but lanes 0 and 1 are computed
; with an undef right-hand operand instead of an element of %B. Lanes 2 and 3
; are the regular A[i] - B[i] and A[i] + B[i]. The checks after this function
; expect no addsub instruction.
define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
%1 = extractelement <4 x float> %A, i32 0
; %2 is extracted but not used below.
%2 = extractelement <4 x float> %B, i32 0
%sub = fsub float %1, undef
%3 = extractelement <4 x float> %A, i32 2
%4 = extractelement <4 x float> %B, i32 2
%sub2 = fsub float %3, %4
%5 = extractelement <4 x float> %A, i32 1
; %6 is extracted but not used below.
%6 = extractelement <4 x float> %B, i32 1
%add = fadd float %5, undef
%7 = extractelement <4 x float> %A, i32 3
%8 = extractelement <4 x float> %B, i32 3
%add2 = fadd float %7, %8
%vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
%vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
%vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
%vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
ret <4 x float> %vecinsert4
}
; CHECK-LABEL: test16
; CHECK-NOT: addsubps
; CHECK: ret