Legalize: Improve legalization of long vector extends.

When an extend more than doubles the size of the elements (e.g., a zext
from v16i8 to v16i32), the normal legalization method of splitting the
vectors runs into problems: by the time the destination vector type is
legal, the source vector type has been split down to an illegal one.
The end result is that the operation often gets scalarized, with the
typically horrible performance that implies. For example, on x86_64,
the simple input of:
define void @bar(<16 x i8> %a, <16 x i32>* %p) nounwind {
  %tmp = zext <16 x i8> %a to <16 x i32>
  store <16 x i32> %tmp, <16 x i32>* %p
  ret void
}

Generates:
  .section  __TEXT,__text,regular,pure_instructions
  .section  __TEXT,__const
  .align  5
LCPI0_0:
  .long 255                     ## 0xff
  .long 255                     ## 0xff
  .long 255                     ## 0xff
  .long 255                     ## 0xff
  .long 255                     ## 0xff
  .long 255                     ## 0xff
  .long 255                     ## 0xff
  .long 255                     ## 0xff
  .section  __TEXT,__text,regular,pure_instructions
  .globl  _bar
  .align  4, 0x90
_bar:
  vpunpckhbw  %xmm0, %xmm0, %xmm1
  vpunpckhwd  %xmm0, %xmm1, %xmm2
  vpmovzxwd %xmm1, %xmm1
  vinsertf128 $1, %xmm2, %ymm1, %ymm1
  vmovaps LCPI0_0(%rip), %ymm2
  vandps  %ymm2, %ymm1, %ymm1
  vpmovzxbw %xmm0, %xmm3
  vpunpckhwd  %xmm0, %xmm3, %xmm3
  vpmovzxbd %xmm0, %xmm0
  vinsertf128 $1, %xmm3, %ymm0, %ymm0
  vandps  %ymm2, %ymm0, %ymm0
  vmovaps %ymm0, (%rdi)
  vmovaps %ymm1, 32(%rdi)
  vzeroupper
  ret

So instead, when the input vector is already legal, we can check for
legal types that let us split more cleverly, so that we never turn the
source into an illegal type. If the extend more than doubles the size
of the input, we check that
  - the number of vector elements is even,
  - the source type is legal,
  - the type of a split source is illegal,
  - the type of an extended (by doubling element size) source is legal, and
  - the type of that extended source when split is legal.
If these conditions are met, instead of just splitting both the
destination and the source types, we create an extend that only goes up
one "step" (doubling the element width), and then continue legalizing
the rest of the operation normally. The result is that this acts as a
new, more efficient termination condition for the loop of "split the
operation until the destination type is legal."
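
As a concrete trace of the above (hand-worked here, not compiler
output; it assumes the AVX target from the example, where v16i16 and
v8i32 are legal types while v8i8 and v16i32 are not), the v16i8 zext
now legalizes as:

  zext <16 x i8> to <16 x i32>           ; v16i32 is illegal: split
    ; one step: double the element width only
    t  = zext <16 x i8> to <16 x i16>    ; v16i16 is legal
    ; split the one-step result into two legal halves
    lo, hi = split t into two <8 x i16>
    ; extend each half the rest of the way
    Lo = zext <8 x i16> lo to <8 x i32>  ; legal
    Hi = zext <8 x i16> hi to <8 x i32>  ; legal

Splitting the source directly would instead have produced the illegal
v8i8 and eventually scalarized.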

With this change, the above example now compiles to:
_bar:
  vpxor %xmm1, %xmm1, %xmm1
  vpunpcklbw  %xmm1, %xmm0, %xmm2
  vpunpckhwd  %xmm1, %xmm2, %xmm3
  vpunpcklwd  %xmm1, %xmm2, %xmm2
  vinsertf128 $1, %xmm3, %ymm2, %ymm2
  vpunpckhbw  %xmm1, %xmm0, %xmm0
  vpunpckhwd  %xmm1, %xmm0, %xmm3
  vpunpcklwd  %xmm1, %xmm0, %xmm0
  vinsertf128 $1, %xmm3, %ymm0, %ymm0
  vmovaps %ymm0, 32(%rdi)
  vmovaps %ymm2, (%rdi)
  vzeroupper
  ret

This generalizes a custom lowering that was added a while back to the
ARM backend. That lowering is no longer necessary, and is removed. The
testcases for it, however, provide excellent ARM tests for this change
and so remain.

rdar://14735100

llvm-svn: 193727
Author: Jim Grosbach
Date:   2013-10-31 00:20:48 +0000
commit 7236678687, parent 9d2ffea486

4 changed files with 81 additions and 58 deletions

@@ -561,6 +561,7 @@ private:
   void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);

@@ -521,7 +521,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
     SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
     break;
-  case ISD::ANY_EXTEND:
   case ISD::CONVERT_RNDSAT:
   case ISD::CTLZ:
   case ISD::CTTZ:
@@ -548,14 +547,18 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
-  case ISD::SIGN_EXTEND:
   case ISD::SINT_TO_FP:
   case ISD::TRUNCATE:
   case ISD::UINT_TO_FP:
-  case ISD::ZERO_EXTEND:
     SplitVecRes_UnaryOp(N, Lo, Hi);
     break;
+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    SplitVecRes_ExtendOp(N, Lo, Hi);
+    break;
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
@@ -921,6 +924,62 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
   }
 }
 
+void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
+                                            SDValue &Hi) {
+  SDLoc dl(N);
+  EVT SrcVT = N->getOperand(0).getValueType();
+  EVT DestVT = N->getValueType(0);
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(DestVT, LoVT, HiVT);
+
+  // We can do better than a generic split operation if the extend is doing
+  // more than just doubling the width of the elements and the following are
+  // true:
+  //   - The number of vector elements is even,
+  //   - the source type is legal,
+  //   - the type of a split source is illegal,
+  //   - the type of an extended (by doubling element size) source is legal, and
+  //   - the type of that extended source when split is legal.
+  //
+  // This won't necessarily completely legalize the operation, but it will
+  // more effectively move in the right direction and prevent falling down
+  // to scalarization in many cases due to the input vector being split too
+  // far.
+  unsigned NumElements = SrcVT.getVectorNumElements();
+  if ((NumElements & 1) == 0 &&
+      SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
+    LLVMContext &Ctx = *DAG.getContext();
+    EVT NewSrcVT = EVT::getVectorVT(
+        Ctx, EVT::getIntegerVT(
+                 Ctx, SrcVT.getVectorElementType().getSizeInBits() * 2),
+        NumElements);
+    EVT SplitSrcVT =
+        EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2);
+    EVT SplitLoVT, SplitHiVT;
+    GetSplitDestVTs(NewSrcVT, SplitLoVT, SplitHiVT);
+    if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
+        TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
+      DEBUG(dbgs() << "Split vector extend via incremental extend:";
+            N->dump(&DAG); dbgs() << "\n");
+      // Extend the source vector by one step.
+      SDValue NewSrc =
+          DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
+      // Get the low and high halves of the new, extended one step, vector.
+      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitLoVT, NewSrc,
+                       DAG.getConstant(0, TLI.getVectorIdxTy()));
+      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitHiVT, NewSrc,
+                       DAG.getConstant(SplitLoVT.getVectorNumElements(),
+                                       TLI.getVectorIdxTy()));
+      // Extend those vector halves the rest of the way.
+      Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
+      Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+      return;
+    }
+  }
+  // Fall back to the generic unary operator splitting otherwise.
+  SplitVecRes_UnaryOp(N, Lo, Hi);
+}
+
 void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
                                                   SDValue &Lo, SDValue &Hi) {
   // The low and high parts of the original input give four input vectors.

@@ -567,16 +567,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
     setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
 
-    // Custom expand long extensions to vectors.
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
-
     // NEON does not have single instruction CTPOP for vectors with element
     // types wider than 8-bits. However, custom lowering can leverage the
     // v8i8/v16i8 vcnt instruction.
@@ -3830,47 +3820,6 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   return FrameAddr;
 }
 
-/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec),
-/// and size(DestVec) > 128-bits.
-/// This is achieved by doing the one extension from the SrcVec, splitting the
-/// result, extending these parts, and then concatenating these into the
-/// destination.
-static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
-  SDValue Op = N->getOperand(0);
-  EVT SrcVT = Op.getValueType();
-  EVT DestVT = N->getValueType(0);
-
-  assert(DestVT.getSizeInBits() > 128 &&
-         "Custom sext/zext expansion needs >128-bit vector.");
-  // If this is a normal length extension, use the default expansion.
-  if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
-      SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
-    return SDValue();
-
-  SDLoc dl(N);
-  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
-  unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
-  unsigned NumElts = SrcVT.getVectorNumElements();
-  LLVMContext &Ctx = *DAG.getContext();
-  SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
-
-  EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
-                               NumElts);
-  EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
-                                 NumElts/2);
-  EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
-                               NumElts/2);
-
-  Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
-  SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
-                        DAG.getIntPtrConstant(0));
-  SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
-                        DAG.getIntPtrConstant(NumElts/2));
-  ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
-  ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
-  return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
-}
-
 /// ExpandBITCAST - If the target supports VFP, this function is called to
 /// expand a bit convert where either the source or destination type is i64 to
 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
@@ -6149,10 +6098,6 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::BITCAST:
     Res = ExpandBITCAST(N, DAG);
     break;
-  case ISD::SIGN_EXTEND:
-  case ISD::ZERO_EXTEND:
-    Res = ExpandVectorExtension(N, DAG);
-    break;
   case ISD::SRL:
   case ISD::SRA:
     Res = Expand64BitShift(N, DAG, Subtarget);

@@ -0,0 +1,18 @@
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=x86_64-linux -asm-verbose=0 | FileCheck %s
+
+define void @test_long_extend(<16 x i8> %a, <16 x i32>* %p) nounwind {
+; CHECK-LABEL: test_long_extend
+; CHECK: vpunpcklbw %xmm1, %xmm0, [[REG1:%xmm[0-9]+]]
+; CHECK: vpunpckhwd %xmm1, [[REG1]], [[REG2:%xmm[0-9]+]]
+; CHECK: vpunpcklwd %xmm1, [[REG1]], %x[[REG3:mm[0-9]+]]
+; CHECK: vinsertf128 $1, [[REG2]], %y[[REG3]], [[REG_result0:%ymm[0-9]+]]
+; CHECK: vpunpckhbw %xmm1, %xmm0, [[REG4:%xmm[0-9]+]]
+; CHECK: vpunpckhwd %xmm1, [[REG4]], [[REG5:%xmm[0-9]+]]
+; CHECK: vpunpcklwd %xmm1, [[REG4]], %x[[REG6:mm[0-9]+]]
+; CHECK: vinsertf128 $1, [[REG5]], %y[[REG6]], [[REG_result1:%ymm[0-9]+]]
+; CHECK: vmovaps [[REG_result1]], 32(%rdi)
+; CHECK: vmovaps [[REG_result0]], (%rdi)
+  %tmp = zext <16 x i8> %a to <16 x i32>
+  store <16 x i32> %tmp, <16 x i32>* %p
+  ret void
+}