forked from OSchip/llvm-project
Legalize: Improve legalization of long vector extends.
When an extend more than doubles the size of the elements (e.g., a zext from v16i8 to v16i32), the normal legalization method of splitting the vectors will run into problems as by the time the destination vector is legal, the source vector is illegal. The end result is the operation often becoming scalarized, with the typical horrible performance. For example, on x86_64, the simple input of: define void @bar(<16 x i8> %a, <16 x i32>* %p) nounwind { %tmp = zext <16 x i8> %a to <16 x i32> store <16 x i32> %tmp, <16 x i32>*%p ret void } Generates: .section __TEXT,__text,regular,pure_instructions .section __TEXT,__const .align 5 LCPI0_0: .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .section __TEXT,__text,regular,pure_instructions .globl _bar .align 4, 0x90 _bar: vpunpckhbw %xmm0, %xmm0, %xmm1 vpunpckhwd %xmm0, %xmm1, %xmm2 vpmovzxwd %xmm1, %xmm1 vinsertf128 $1, %xmm2, %ymm1, %ymm1 vmovaps LCPI0_0(%rip), %ymm2 vandps %ymm2, %ymm1, %ymm1 vpmovzxbw %xmm0, %xmm3 vpunpckhwd %xmm0, %xmm3, %xmm3 vpmovzxbd %xmm0, %xmm0 vinsertf128 $1, %xmm3, %ymm0, %ymm0 vandps %ymm2, %ymm0, %ymm0 vmovaps %ymm0, (%rdi) vmovaps %ymm1, 32(%rdi) vzeroupper ret So instead we can check if there are legal types that enable us to split more cleverly when the input vector is already legal such that we don't turn it into an illegal type. If the extend is such that it's more than doubling the size of the input we check if - the number of vector elements is even, - the source type is legal, - the type of a split source is illegal, - the type of an extended (by doubling element size) source is legal, and - the type of that extended source when split is legal. If the conditions are met, instead of just splitting both the destination and the source types, we create an extend that only goes up one "step" (doubling the element width), and then continue legalizing the rest of the operation normally. 
The result is that this operates as a new, more efficient, termination condition for the loop of "split the operation until the destination type is legal." With this change, the above example now compiles to: _bar: vpxor %xmm1, %xmm1, %xmm1 vpunpcklbw %xmm1, %xmm0, %xmm2 vpunpckhwd %xmm1, %xmm2, %xmm3 vpunpcklwd %xmm1, %xmm2, %xmm2 vinsertf128 $1, %xmm3, %ymm2, %ymm2 vpunpckhbw %xmm1, %xmm0, %xmm0 vpunpckhwd %xmm1, %xmm0, %xmm3 vpunpcklwd %xmm1, %xmm0, %xmm0 vinsertf128 $1, %xmm3, %ymm0, %ymm0 vmovaps %ymm0, 32(%rdi) vmovaps %ymm2, (%rdi) vzeroupper ret This generalizes a custom lowering that was added a while back to the ARM backend. That lowering is no longer necessary, and is removed. The testcases for it, however, provide excellent ARM tests for this change and so remain. rdar://14735100 llvm-svn: 193727
This commit is contained in:
parent
9d2ffea486
commit
7236678687
|
@ -561,6 +561,7 @@ private:
|
|||
void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
|
||||
void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
|
||||
|
|
|
@ -521,7 +521,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
|
|||
SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
|
||||
break;
|
||||
|
||||
case ISD::ANY_EXTEND:
|
||||
case ISD::CONVERT_RNDSAT:
|
||||
case ISD::CTLZ:
|
||||
case ISD::CTTZ:
|
||||
|
@ -548,14 +547,18 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
|
|||
case ISD::FSIN:
|
||||
case ISD::FSQRT:
|
||||
case ISD::FTRUNC:
|
||||
case ISD::SIGN_EXTEND:
|
||||
case ISD::SINT_TO_FP:
|
||||
case ISD::TRUNCATE:
|
||||
case ISD::UINT_TO_FP:
|
||||
case ISD::ZERO_EXTEND:
|
||||
SplitVecRes_UnaryOp(N, Lo, Hi);
|
||||
break;
|
||||
|
||||
case ISD::ANY_EXTEND:
|
||||
case ISD::SIGN_EXTEND:
|
||||
case ISD::ZERO_EXTEND:
|
||||
SplitVecRes_ExtendOp(N, Lo, Hi);
|
||||
break;
|
||||
|
||||
case ISD::ADD:
|
||||
case ISD::SUB:
|
||||
case ISD::MUL:
|
||||
|
@ -921,6 +924,62 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
|
|||
}
|
||||
}
|
||||
|
||||
void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
|
||||
SDValue &Hi) {
|
||||
SDLoc dl(N);
|
||||
EVT SrcVT = N->getOperand(0).getValueType();
|
||||
EVT DestVT = N->getValueType(0);
|
||||
EVT LoVT, HiVT;
|
||||
GetSplitDestVTs(DestVT, LoVT, HiVT);
|
||||
|
||||
// We can do better than a generic split operation if the extend is doing
|
||||
// more than just doubling the width of the elements and the following are
|
||||
// true:
|
||||
// - The number of vector elements is even,
|
||||
// - the source type is legal,
|
||||
// - the type of a split source is illegal,
|
||||
// - the type of an extended (by doubling element size) source is legal, and
|
||||
// - the type of that extended source when split is legal.
|
||||
//
|
||||
// This won't necessarily completely legalize the operation, but it will
|
||||
// more effectively move in the right direction and prevent falling down
|
||||
// to scalarization in many cases due to the input vector being split too
|
||||
// far.
|
||||
unsigned NumElements = SrcVT.getVectorNumElements();
|
||||
if ((NumElements & 1) == 0 &&
|
||||
SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
|
||||
LLVMContext &Ctx = *DAG.getContext();
|
||||
EVT NewSrcVT = EVT::getVectorVT(
|
||||
Ctx, EVT::getIntegerVT(
|
||||
Ctx, SrcVT.getVectorElementType().getSizeInBits() * 2),
|
||||
NumElements);
|
||||
EVT SplitSrcVT =
|
||||
EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2);
|
||||
EVT SplitLoVT, SplitHiVT;
|
||||
GetSplitDestVTs(NewSrcVT, SplitLoVT, SplitHiVT);
|
||||
if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
|
||||
TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
|
||||
DEBUG(dbgs() << "Split vector extend via incremental extend:";
|
||||
N->dump(&DAG); dbgs() << "\n");
|
||||
// Extend the source vector by one step.
|
||||
SDValue NewSrc =
|
||||
DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
|
||||
// Get the low and high halves of the new, extended one step, vector.
|
||||
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitLoVT, NewSrc,
|
||||
DAG.getConstant(0, TLI.getVectorIdxTy()));
|
||||
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitHiVT, NewSrc,
|
||||
DAG.getConstant(SplitLoVT.getVectorNumElements(),
|
||||
TLI.getVectorIdxTy()));
|
||||
// Extend those vector halves the rest of the way.
|
||||
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
|
||||
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Fall back to the generic unary operator splitting otherwise.
|
||||
SplitVecRes_UnaryOp(N, Lo, Hi);
|
||||
}
|
||||
|
||||
void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
|
||||
SDValue &Lo, SDValue &Hi) {
|
||||
// The low and high parts of the original input give four input vectors.
|
||||
|
|
|
@ -567,16 +567,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
|
|||
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
|
||||
setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
|
||||
|
||||
// Custom expand long extensions to vectors.
|
||||
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
|
||||
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
|
||||
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
|
||||
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
|
||||
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
|
||||
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
|
||||
|
||||
// NEON does not have single instruction CTPOP for vectors with element
|
||||
// types wider than 8-bits. However, custom lowering can leverage the
|
||||
// v8i8/v16i8 vcnt instruction.
|
||||
|
@ -3830,47 +3820,6 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
|
|||
return FrameAddr;
|
||||
}
|
||||
|
||||
/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec),
|
||||
/// and size(DestVec) > 128-bits.
|
||||
/// This is achieved by doing the one extension from the SrcVec, splitting the
|
||||
/// result, extending these parts, and then concatenating these into the
|
||||
/// destination.
|
||||
static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
|
||||
SDValue Op = N->getOperand(0);
|
||||
EVT SrcVT = Op.getValueType();
|
||||
EVT DestVT = N->getValueType(0);
|
||||
|
||||
assert(DestVT.getSizeInBits() > 128 &&
|
||||
"Custom sext/zext expansion needs >128-bit vector.");
|
||||
// If this is a normal length extension, use the default expansion.
|
||||
if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
|
||||
SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
|
||||
return SDValue();
|
||||
|
||||
SDLoc dl(N);
|
||||
unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
|
||||
unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
|
||||
unsigned NumElts = SrcVT.getVectorNumElements();
|
||||
LLVMContext &Ctx = *DAG.getContext();
|
||||
SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
|
||||
|
||||
EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
|
||||
NumElts);
|
||||
EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
|
||||
NumElts/2);
|
||||
EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
|
||||
NumElts/2);
|
||||
|
||||
Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
|
||||
SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
|
||||
DAG.getIntPtrConstant(0));
|
||||
SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
|
||||
DAG.getIntPtrConstant(NumElts/2));
|
||||
ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
|
||||
ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
|
||||
return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
|
||||
}
|
||||
|
||||
/// ExpandBITCAST - If the target supports VFP, this function is called to
|
||||
/// expand a bit convert where either the source or destination type is i64 to
|
||||
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
|
||||
|
@ -6149,10 +6098,6 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
|
|||
case ISD::BITCAST:
|
||||
Res = ExpandBITCAST(N, DAG);
|
||||
break;
|
||||
case ISD::SIGN_EXTEND:
|
||||
case ISD::ZERO_EXTEND:
|
||||
Res = ExpandVectorExtension(N, DAG);
|
||||
break;
|
||||
case ISD::SRL:
|
||||
case ISD::SRA:
|
||||
Res = Expand64BitShift(N, DAG, Subtarget);
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
; RUN: llc < %s -mcpu=core-avx-i -mtriple=x86_64-linux -asm-verbose=0| FileCheck %s
|
||||
define void @test_long_extend(<16 x i8> %a, <16 x i32>* %p) nounwind {
|
||||
; CHECK-LABEL: test_long_extend
|
||||
; CHECK: vpunpcklbw %xmm1, %xmm0, [[REG1:%xmm[0-9]+]]
|
||||
; CHECK: vpunpckhwd %xmm1, [[REG1]], [[REG2:%xmm[0-9]+]]
|
||||
; CHECK: vpunpcklwd %xmm1, [[REG1]], %x[[REG3:mm[0-9]+]]
|
||||
; CHECK: vinsertf128 $1, [[REG2]], %y[[REG3]], [[REG_result0:%ymm[0-9]+]]
|
||||
; CHECK: vpunpckhbw %xmm1, %xmm0, [[REG4:%xmm[0-9]+]]
|
||||
; CHECK: vpunpckhwd %xmm1, [[REG4]], [[REG5:%xmm[0-9]+]]
|
||||
; CHECK: vpunpcklwd %xmm1, [[REG4]], %x[[REG6:mm[0-9]+]]
|
||||
; CHECK: vinsertf128 $1, [[REG5]], %y[[REG6]], [[REG_result1:%ymm[0-9]+]]
|
||||
; CHECK: vmovaps [[REG_result1]], 32(%rdi)
|
||||
; CHECK: vmovaps [[REG_result0]], (%rdi)
|
||||
|
||||
%tmp = zext <16 x i8> %a to <16 x i32>
|
||||
store <16 x i32> %tmp, <16 x i32>*%p
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue