Instead of always leaving the work to the generic legalizer when

there is no support for native 256-bit shuffles, be smarter in some
cases: for example, when specific 128-bit parts can be extracted, use
regular 128-bit shuffles on them. Example:

For this shuffle:
  shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32>
                <i32 1, i32 0, i32 7, i32 6>

This was expanded to:
  vextractf128  $1, %ymm1, %xmm2
  vpextrq $0, %xmm2, %rax
  vmovd %rax, %xmm1
  vpextrq $1, %xmm2, %rax
  vmovd %rax, %xmm2
  vpunpcklqdq %xmm1, %xmm2, %xmm1
  vpextrq $0, %xmm0, %rax
  vmovd %rax, %xmm2
  vpextrq $1, %xmm0, %rax
  vmovd %rax, %xmm0
  vpunpcklqdq %xmm2, %xmm0, %xmm0
  vinsertf128 $1, %xmm1, %ymm0, %ymm0
  ret

Now we get:
  vshufpd $1, %xmm0, %xmm0, %xmm0
  vextractf128  $1, %ymm1, %xmm1
  vshufpd $1, %xmm1, %xmm1, %xmm1
  vinsertf128 $1, %xmm1, %ymm0, %ymm0

llvm-svn: 137733
This commit is contained in:
Bruno Cardoso Lopes 2011-08-16 18:21:54 +00:00
parent cbc9eb45ab
commit 2e99f1b3aa
2 changed files with 139 additions and 0 deletions

View File

@ -3027,6 +3027,17 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val < 0) || (Val >= Low && Val < Hi);
}
/// isUndefOrInRange - Return true if every element in Mask, begining
/// from position Pos and ending in Pos+Size, falls within the specified
/// range (L, L+Pos]. or is undef.
static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
int Pos, int Size, int Low, int Hi) {
for (int i = Pos, e = Pos+Size; i != e; ++i)
if (!isUndefOrInRange(Mask[i], Low, Hi))
return false;
return true;
}
/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
@ -5666,10 +5677,95 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
OpVT, SrcOp)));
}
/// areShuffleHalvesWithinDisjointLanes - Return true when the low half and
/// the high half of the shuffle mask each take all of their defined elements
/// from one single half-width lane of the sources. Undef mask elements are
/// compatible with any lane.
static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) {
  EVT VT = SVOp->getValueType(0);
  int NumElems = VT.getVectorNumElements();
  int HalfSize = NumElems/2;

  SmallVector<int, 16> Mask;
  SVOp->getMask(Mask);

  // Candidate lanes start at every multiple of HalfSize below 2*NumElems.
  // Each half of the mask only needs to fit in one of them, so a half that
  // has already matched is not tested again.
  bool LoHalfFits = false, HiHalfFits = false;
  for (int LaneBegin = 0; LaneBegin < NumElems*2; LaneBegin += HalfSize) {
    int LaneEnd = LaneBegin + HalfSize;
    LoHalfFits = LoHalfFits ||
                 isUndefOrInRange(Mask, 0, HalfSize, LaneBegin, LaneEnd);
    HiHalfFits = HiHalfFits ||
                 isUndefOrInRange(Mask, HalfSize, HalfSize, LaneBegin, LaneEnd);
    if (LoHalfFits && HiHalfFits)
      return true;
  }

  return false;
}
/// LowerVECTOR_SHUFFLE_256 - Handle 256-bit wide vector shuffles which could
/// not be matched by any known target specific shuffle. Returns an empty
/// SDValue when no custom lowering is produced.
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  // Only shuffles whose halves each refer to a single lane of the sources
  // are handled here; anything else is left to the legalizer.
  if (!areShuffleHalvesWithinDisjointLanes(SVOp))
    return SDValue();

  // Each half of the result reads one 128-bit lane: extract the lanes that
  // are used, shuffle them with 128-bit shuffles, then concatenate.
  DebugLoc dl = SVOp->getDebugLoc();
  EVT VT = SVOp->getValueType(0);
  int NumElems = VT.getVectorNumElements();
  int HalfSize = NumElems/2;

  // The first defined mask element of the low half tells which operand and
  // which lane of it feeds that half. An all-undef half keeps the defaults
  // (operand 0, lane at index 0).
  int LoOpNum = 0, LoExtractIdx = 0;
  for (int i = 0; i < HalfSize; ++i) {
    int MaskElt = SVOp->getMaskElt(i);
    if (MaskElt >= 0) {
      LoOpNum = MaskElt/NumElems;
      LoExtractIdx = (MaskElt % NumElems >= HalfSize) ? HalfSize : 0;
      break;
    }
  }

  // Same lookup for the high half.
  int HiOpNum = 0, HiExtractIdx = 0;
  for (int i = HalfSize; i < NumElems; ++i) {
    int MaskElt = SVOp->getMaskElt(i);
    if (MaskElt >= 0) {
      HiOpNum = MaskElt/NumElems;
      HiExtractIdx = (MaskElt % NumElems >= HalfSize) ? HalfSize : 0;
      break;
    }
  }

  // Extract the subvectors
  SDValue LoVec = Extract128BitVector(SVOp->getOperand(LoOpNum),
                        DAG.getConstant(LoExtractIdx, MVT::i32), DAG, dl);
  SDValue HiVec = Extract128BitVector(SVOp->getOperand(HiOpNum),
                        DAG.getConstant(HiExtractIdx, MVT::i32), DAG, dl);

  // Build the 128-bit shuffle masks: undef elements are kept as they are,
  // defined ones are rebased to an index inside the extracted lane.
  SmallVector<int, 16> LoMask, HiMask;
  for (int i = 0; i < NumElems; ++i) {
    int MaskElt = SVOp->getMaskElt(i);
    int LaneElt = MaskElt < 0 ? MaskElt : MaskElt % HalfSize;
    if (i < HalfSize)
      LoMask.push_back(LaneElt);
    else
      HiMask.push_back(LaneElt);
  }

  // Generate 128-bit shuffles
  EVT NVT = LoVec.getValueType();
  LoVec = DAG.getVectorShuffle(NVT, dl, LoVec, DAG.getUNDEF(NVT), &LoMask[0]);
  HiVec = DAG.getVectorShuffle(NVT, dl, HiVec, DAG.getUNDEF(NVT), &HiMask[0]);

  // Concatenate the result back
  SDValue Result = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), LoVec,
                                      DAG.getConstant(0, MVT::i32), DAG, dl);
  return Insert128BitVector(Result, HiVec,
                            DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
}

View File

@ -50,3 +50,46 @@ entry:
%shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 4>
ret <4 x i64> %shuffle
}
;;;
;;; Check that some 256-bit vectors are xformed into 128 ops
; CHECK: _A
; CHECK: vshufpd $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vshufpd $1
; CHECK-NEXT: vinsertf128 $1
; Low half of the result takes elements 1,0 of %a; high half takes mask
; indices 7,6, i.e. elements 3,2 of %b. Each 128-bit half of the result
; therefore reads from a single 128-bit half of one source.
define <4 x i64> @A(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
ret <4 x i64> %shuffle
}
; CHECK: vpunpckhqdq
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: movlhps
; CHECK-NEXT: vinsertf128 $1
; Same shape as @A but with undef mask elements: only result element 0
; (element 1 of %a) and result element 3 (mask index 6, element 2 of %b)
; are defined.
define <4 x i64> @B(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 undef, i32 undef, i32 6>
ret <4 x i64> %shuffle
}
; CHECK: movlhps
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: movlhps
; CHECK-NEXT: vinsertf128 $1
; Each half starts with an undef mask element: result element 1 is element 0
; of %a and result element 3 is mask index 6 (element 2 of %b).
define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 undef, i32 0, i32 undef, i32 6>
ret <4 x i64> %shuffle
}
; CHECK: vpshufd $-96
; CHECK: vpshufd $-6
; CHECK: vinsertf128 $1
; 8 x i32 variant: the low half uses elements 0,0,2,2 of %a and the high half
; uses mask indices 10,10,11,11, i.e. elements 2,2,3,3 of %b. Both halves
; read from the low four elements of their source.
define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
entry:
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 10, i32 10, i32 11, i32 11>
ret <8 x i32> %shuffle
}