forked from OSchip/llvm-project
- Handle a special scalar_to_vector case: splats. Use a native 128-bit
shuffle before inserting into a 256-bit vector. - Add AVX versions of the movd/movq instructions. - Introduce a few COPY patterns to match insert_subvector instructions. This turns a trivial insert_subvector instruction into a register copy, coalescing the xmm into a ymm and avoiding emitting one more instruction. llvm-svn: 136002
This commit is contained in:
parent
276eb8debf
commit
123dff0f58
|
@ -3955,6 +3955,34 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
|
|||
return DAG.getNode(ISD::BITCAST, dl, VT, V);
|
||||
}
|
||||
|
||||
/// PromoteVectorToScalarSplat - Since there's no native support for
|
||||
/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector +
|
||||
/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the
|
||||
/// shuffle before the insertion, this yields less instructions in the end.
|
||||
static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV,
|
||||
SelectionDAG &DAG) {
|
||||
EVT SrcVT = SV->getValueType(0);
|
||||
SDValue V1 = SV->getOperand(0);
|
||||
DebugLoc dl = SV->getDebugLoc();
|
||||
int NumElems = SrcVT.getVectorNumElements();
|
||||
|
||||
assert(SrcVT.is256BitVector() && "unknown howto handle vector type");
|
||||
|
||||
SmallVector<int, 4> Mask;
|
||||
for (int i = 0; i < NumElems/2; ++i)
|
||||
Mask.push_back(SV->getMaskElt(i));
|
||||
|
||||
EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
|
||||
NumElems/2);
|
||||
SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1),
|
||||
DAG.getUNDEF(SVT), &Mask[0]);
|
||||
SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1,
|
||||
DAG.getConstant(0, MVT::i32), DAG, dl);
|
||||
|
||||
return Insert128BitVector(InsV, SV1,
|
||||
DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
|
||||
}
|
||||
|
||||
/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and
|
||||
/// v8i32, v16i16 or v32i8 to v8f32.
|
||||
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
|
||||
|
@ -5742,7 +5770,17 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
|
|||
if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
|
||||
return Op;
|
||||
|
||||
// Handle splats by matching through known masks
|
||||
// Since there's no native support for scalar_to_vector for 256-bit AVX, a
|
||||
// 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
|
||||
// idiom and do the shuffle before the insertion, this yields fewer
|
||||
// instructions in the end.
|
||||
if (VT.is256BitVector() &&
|
||||
V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
|
||||
V1.getOperand(0).getOpcode() == ISD::UNDEF &&
|
||||
V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
|
||||
return PromoteVectorToScalarSplat(SVOp, DAG);
|
||||
|
||||
// Handle splats by matching through known shuffle masks
|
||||
if ((VT.is128BitVector() && NumElem <= 4) ||
|
||||
(VT.is256BitVector() && NumElem <= 8))
|
||||
return SDValue();
|
||||
|
|
|
@ -501,6 +501,9 @@ class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
|
|||
class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
|
||||
list<dag> pattern>
|
||||
: PDI<o, F, outs, ins, asm, pattern>, REX_W;
|
||||
// VRPDI - Takes the same template arguments as RPDI, but forwards them to
// VPDI (instead of PDI) and mixes in VEX_W (instead of REX_W).
class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
|
||||
list<dag> pattern>
|
||||
: VPDI<o, F, outs, ins, asm, pattern>, VEX_W;
|
||||
|
||||
// MMX Instruction templates
|
||||
//
|
||||
|
|
|
@ -467,3 +467,4 @@ def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
|
|||
node:$index), [{
|
||||
return X86::isVINSERTF128Index(N);
|
||||
}], INSERT_get_vinsertf128_imm>;
|
||||
|
||||
|
|
|
@ -2858,6 +2858,14 @@ def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
|
|||
[(set VR128:$dst,
|
||||
(v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
|
||||
VEX;
|
||||
// VMOV64toPQIrr - VEX-marked register form (opcode 0x6E, MRMSrcReg) that
// takes a GR64 source and defines a VR128 destination; selected for a v2i64
// scalar_to_vector of a 64-bit general purpose register.
def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
|
||||
"mov{d|q}\t{$src, $dst|$dst, $src}",
|
||||
[(set VR128:$dst,
|
||||
(v2i64 (scalar_to_vector GR64:$src)))]>, VEX;
|
||||
// VMOV64toSDrr - VEX-marked register form (opcode 0x6E, MRMSrcReg) that
// takes a GR64 source and defines an FR64 destination; selected for a
// bitconvert of a 64-bit general purpose register.
def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
|
||||
"mov{d|q}\t{$src, $dst|$dst, $src}",
|
||||
[(set FR64:$dst, (bitconvert GR64:$src))]>, VEX;
|
||||
|
||||
def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
|
||||
"movd\t{$src, $dst|$dst, $src}",
|
||||
[(set VR128:$dst,
|
||||
|
@ -5358,6 +5366,20 @@ def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
|
|||
(VINSERTF128rr VR256:$src1, VR128:$src2,
|
||||
(INSERT_get_vinsertf128_imm VR256:$ins))>;
|
||||
|
||||
// Special COPY patterns
// An insert_subvector of a 128-bit vector into an undef vector at index 0 is
// matched to an INSERT_SUBREG of the source register into the sub_xmm
// subregister of an IMPLICIT_DEF of the 256-bit type. There is one pattern
// per 128-bit source type: v2i64, v2f64, v4i32, v4f32, v8i16 and v16i8.
|
||||
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
|
||||
(INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
|
||||
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
|
||||
(INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
|
||||
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
|
||||
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
|
||||
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
|
||||
(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
|
||||
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
|
||||
(INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
|
||||
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
|
||||
(INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// VEXTRACTF128 - Extract packed floating-point values
|
||||
//
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
; CHECK: vextractf128 $0
|
||||
; CHECK-NEXT: punpcklbw
|
||||
; CHECK-NEXT: punpckhbw
|
||||
; CHECK-NEXT: vinsertf128 $0
|
||||
; CHECK-NEXT: vinsertf128 $1
|
||||
; CHECK-NEXT: vpermilps $85
|
||||
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
|
||||
|
@ -16,7 +15,6 @@ entry:
|
|||
|
||||
; CHECK: vextractf128 $0
|
||||
; CHECK-NEXT: punpckhwd
|
||||
; CHECK-NEXT: vinsertf128 $0
|
||||
; CHECK-NEXT: vinsertf128 $1
|
||||
; CHECK-NEXT: vpermilps $85
|
||||
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
|
||||
|
@ -25,3 +23,25 @@ entry:
|
|||
ret <16 x i16> %shuffle
|
||||
}
|
||||
|
||||
; funcC - Builds a <4 x i64> whose four lanes all hold the i64 argument %q,
; using one insertelement per lane. The expected output is vmovd, then
; movlhps, then vinsertf128 $1 on consecutive lines.
; CHECK: vmovd
|
||||
; CHECK-NEXT: movlhps
|
||||
; CHECK-NEXT: vinsertf128 $1
|
||||
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
|
||||
entry:
|
||||
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
|
||||
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
|
||||
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
|
||||
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
|
||||
ret <4 x i64> %vecinit6.i
|
||||
}
|
||||
|
||||
; funcD - Builds a <4 x double> whose four lanes all hold the double argument
; %q, using one insertelement per lane. The expected output is vshufpd
; followed on the next line by vinsertf128 $1.
; CHECK: vshufpd
|
||||
; CHECK-NEXT: vinsertf128 $1
|
||||
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
|
||||
entry:
|
||||
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
|
||||
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
|
||||
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
|
||||
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
|
||||
ret <4 x double> %vecinit6.i
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7 -mattr=avx | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
|
||||
|
||||
@x = common global <8 x float> zeroinitializer, align 32
|
||||
@y = common global <4 x double> zeroinitializer, align 32
|
||||
|
@ -12,4 +12,3 @@ entry:
|
|||
store <4 x double> zeroinitializer, <4 x double>* @y, align 32
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue