[DAG] Allow build-to-shuffle combine to combine builds from two wide vectors.

This allows us to, in some cases, create a vector_shuffle out of a
build_vector, when the inputs to the build are extract_elements from two
different vectors, at least one of which is wider than the output. (E.g. an
<8 x i16> being constructed out of elements from a <16 x i16> and an
<8 x i16>.)

Differential Revision: https://reviews.llvm.org/D24491

llvm-svn: 281402
parent fc19fa3721
commit 59f8305305
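To make the intent concrete: the combine replaces a lane-by-lane build with one shuffle whose mask records, for each output lane, which input lane it reads. The standalone C++ sketch below is purely illustrative (toy code, not LLVM API; the VectorMask/ExtIndex values are made up for the <16 x i16> plus <8 x i16> example, and Vec2Offset is the lane count of the first input):

    #include <cstdio>
    #include <vector>

    int main() {
      const unsigned NumElems = 8;    // lanes in the v8i16 result
      const unsigned Vec2Offset = 16; // lanes of the second input start after
                                      // the 16 lanes of the v16i16 first input
      // Hypothetical per-lane sources: 1 = first vector, 2 = second vector.
      const int VectorMask[NumElems] = {1, 1, 1, 1, 2, 2, 2, 2};
      const unsigned ExtIndex[NumElems] = {0, 2, 4, 6, 0, 1, 2, 3};

      std::vector<int> Mask(NumElems, -1); // -1 marks an undef lane
      for (unsigned i = 0; i != NumElems; ++i)
        Mask[i] = VectorMask[i] == 1 ? (int)ExtIndex[i]
                                     : (int)(Vec2Offset + ExtIndex[i]);

      for (int M : Mask)
        std::printf("%d ", M); // prints: 0 2 4 6 16 17 18 19
      std::printf("\n");
      return 0;
    }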
@@ -12974,9 +12974,15 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   EVT InVT1 = VecIn1.getValueType();
   EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
   unsigned Vec2Offset = InVT1.getVectorNumElements();
+  unsigned ShuffleNumElems = NumElems;
+
+  MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+  SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy);

   // We can't generate a shuffle node with mismatched input and output types.
   // Try to make the types match.
   // TODO: Should this fire if InVT1/InVT2 are not legal types, or should
   // we let legalization run its course first?
   if (InVT1 != VT || InVT2 != VT) {
     // Both inputs and the output must have the same base element type.
     EVT ElemType = VT.getVectorElementType();
@@ -12984,6 +12990,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
         ElemType != InVT2.getVectorElementType())
       return SDValue();

+    // TODO: Canonicalize this so that if the vectors have different lengths,
+    // VecIn1 is always longer.
+
     // The element types match, now figure out the lengths.
     if (InVT1.getSizeInBits() * 2 == VT.getSizeInBits() && InVT1 == InVT2) {
       // If both input vectors are exactly half the size of the output, concat
@@ -12997,26 +13006,36 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
       if (UsesZeroVector)
         Vec2Offset = NumElems;
     } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) {
-      // If we only have one input vector, and it's twice the size of the
-      // output, split it in two.
       if (!TLI.isExtractSubvectorCheap(VT, NumElems))
         return SDValue();

+      // TODO: Support the case where we have one input that's too wide, and
+      // another input which is wide/"correct"/narrow. We can do this by
+      // widening the narrow input, shuffling the wide vectors, and then
+      // extracting the low subvector.
-      if (UsesZeroVector || VecIn2.getNode())
+      if (UsesZeroVector)
         return SDValue();

-      MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
-      VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
-                           DAG.getConstant(NumElems, dl, IdxTy));
-      VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
-                           DAG.getConstant(0, dl, IdxTy));
-      // Since we now have shorter input vectors, adjust the offset of the
-      // second vector's start.
-      Vec2Offset = NumElems;
+      if (!VecIn2.getNode()) {
+        // If we only have one input vector, and it's twice the size of the
+        // output, split it in two.
+        VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+                             DAG.getConstant(NumElems, dl, IdxTy));
+        VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, ZeroIdx);
+        // Since we now have shorter input vectors, adjust the offset of the
+        // second vector's start.
+        Vec2Offset = NumElems;
+      } else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) {
+        // VecIn1 is wider than the output, and we have another, possibly
+        // smaller input. Pad the smaller input with undefs, shuffle at the
+        // input vector width, and extract the output.
+
+        // The shuffle type is different than VT, so check legality again.
+        if (LegalOperations &&
+            !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
+          return SDValue();
+
+        if (InVT1 != InVT2)
+          VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT1,
+                               DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
+        ShuffleNumElems = NumElems * 2;
+      }
     } else {
       // TODO: Support cases where the length mismatch isn't exactly by a
       // factor of 2.
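For reference, the if/else chain above distinguishes three size relationships between the inputs and the result. The sketch below restates them as a standalone function (a hypothetical helper, not part of the patch; widths are in bits, e.g. v8i16 = 128 and v16i16 = 256, and a missing second input is modeled here as width 0):

    #include <cstdio>

    // Hypothetical classifier mirroring the three branches; In2 == 0 means
    // there is no second input vector.
    const char *classify(unsigned In1, unsigned In2, unsigned Out) {
      if (In1 * 2 == Out && In1 == In2)
        return "concat the two half-width inputs";
      if (In1 == Out * 2 && In2 == 0)
        return "split the single double-width input in two";
      if (In1 == Out * 2 && In2 <= In1)
        return "pad the narrower second input, shuffle wide, extract low half";
      return "unsupported: mismatch is not an exact factor of 2";
    }

    int main() {
      std::printf("%s\n", classify(128, 128, 256)); // two v8i16 -> v16i16
      std::printf("%s\n", classify(256, 0, 128));   // one v16i16 -> v8i16
      std::printf("%s\n", classify(256, 128, 128)); // v16i16 + v8i16 -> v8i16
      std::printf("%s\n", classify(512, 128, 128)); // 4x too wide: rejected
      return 0;
    }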
@@ -13024,18 +13043,20 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
     }
   }

-  SmallVector<int, 8> Mask;
+  // Initialize mask to undef.
+  SmallVector<int, 8> Mask(ShuffleNumElems, -1);
+
+  // Only need to run up to the number of elements actually used, not the
+  // total number of elements in the shuffle - if we are shuffling a wider
+  // vector, the high lanes should be set to undef.
   for (unsigned i = 0; i != NumElems; ++i) {
-    if (VectorMask[i] == -1) {
-      Mask.push_back(-1);
+    if (VectorMask[i] == -1)
       continue;
-    }

     // If we are trying to blend with zero, we need to take a zero from the
     // correct position in the second input.
     if (VectorMask[i] == 0) {
-      Mask.push_back(Vec2Offset + i);
+      Mask[i] = Vec2Offset + i;
       continue;
     }
@@ -13044,12 +13065,12 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
         cast<ConstantSDNode>(Extract.getOperand(1))->getZExtValue();

     if (VectorMask[i] == 1) {
-      Mask.push_back(ExtIndex);
+      Mask[i] = ExtIndex;
       continue;
     }

     assert(VectorMask[i] == 2 && "Expected input to be from second vector");
-    Mask.push_back(Vec2Offset + ExtIndex);
+    Mask[i] = Vec2Offset + ExtIndex;
   }

   // Avoid introducing illegal shuffles with zero.
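The move from Mask.push_back(...) to indexed stores goes hand in hand with pre-sizing the mask: when ShuffleNumElems is twice NumElems, only the low lanes are written and the high lanes must remain -1 (undef). A minimal standalone illustration (toy numbers, not LLVM code):

    #include <cstdio>
    #include <vector>

    int main() {
      const unsigned NumElems = 4;                   // output lanes
      const unsigned ShuffleNumElems = 2 * NumElems; // wide-shuffle lanes
      std::vector<int> Mask(ShuffleNumElems, -1);    // every lane starts undef
      for (unsigned i = 0; i != NumElems; ++i)
        Mask[i] = (int)i; // stand-in for the ExtIndex / Vec2Offset logic
      for (int M : Mask)
        std::printf("%d ", M); // prints: 0 1 2 3 -1 -1 -1 -1
      std::printf("\n");
      return 0;
    }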
@@ -13059,18 +13080,23 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   if (UsesZeroVector && !TLI.isVectorClearMaskLegal(Mask, VT))
     return SDValue();

+  // The type of the input vectors may have changed above.
+  InVT1 = VecIn1.getValueType();
+
   // If we already have a VecIn2, it should have the same type as VecIn1.
   // If we don't, get an undef/zero vector of the appropriate type.
-  VecIn2 =
-      getRightHandValue(DAG, dl, VecIn2, VecIn1.getValueType(), UsesZeroVector);
-  assert(VecIn1.getValueType() == VecIn2.getValueType() &&
-         "Unexpected second input type.");
+  VecIn2 = getRightHandValue(DAG, dl, VecIn2, InVT1, UsesZeroVector);
+  assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type.");

   // Return the new VECTOR_SHUFFLE node.
   SDValue Ops[2];
   Ops[0] = VecIn1;
   Ops[1] = VecIn2;
-  return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], Mask);
+  SDValue Shuffle = DAG.getVectorShuffle(InVT1, dl, Ops[0], Ops[1], Mask);
+  if (ShuffleNumElems > NumElems)
+    Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Shuffle, ZeroIdx);
+
+  return Shuffle;
 }

 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
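Putting the pieces together for the example from the commit message: the narrow second input is padded to the width of the first, one wide shuffle is built, and the low half is extracted. The standalone toy below traces that data flow with made-up lane values (a real shuffle leaves undef lanes unspecified; the zeros here are just initialization):

    #include <array>
    #include <cstdio>

    int main() {
      std::array<short, 16> In1{};       // the wide v16i16 input
      std::array<short, 16> In2Padded{}; // the v8i16 input padded to 16 lanes
      for (int i = 0; i != 16; ++i) In1[i] = (short)i;
      for (int i = 0; i != 8; ++i) In2Padded[i] = (short)(100 + i);

      // A 16-lane mask: indices >= 16 select from the second input; the high
      // half stays -1 (undef) because only 8 lanes are extracted below.
      const std::array<int, 16> Mask{0, 2, 4, 6, 16, 17, 18, 19,
                                     -1, -1, -1, -1, -1, -1, -1, -1};

      std::array<short, 8> Out{}; // models EXTRACT_SUBVECTOR of the low lanes
      for (int i = 0; i != 8; ++i)
        Out[i] = Mask[i] < 16 ? In1[Mask[i]] : In2Padded[Mask[i] - 16];

      for (short V : Out)
        std::printf("%d ", V); // prints: 0 2 4 6 100 101 102 103
      std::printf("\n");
      return 0;
    }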
@@ -984,54 +984,21 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
 ; AVX1-LABEL: interleave_24i16_out:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm0
-; AVX1-NEXT: vmovdqu (%rdi), %ymm2
-; AVX1-NEXT: vpextrw $3, %xmm2, %eax
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrw $6, %xmm2, %eax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $4, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $7, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $5, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
-; AVX1-NEXT: vpextrw $4, %xmm2, %eax
-; AVX1-NEXT: vpextrw $1, %xmm2, %edi
-; AVX1-NEXT: vmovd %edi, %xmm4
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $7, %xmm2, %eax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $2, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $5, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $6, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
-; AVX1-NEXT: vpextrw $5, %xmm2, %eax
-; AVX1-NEXT: vpextrw $2, %xmm2, %edi
-; AVX1-NEXT: vmovd %edi, %xmm2
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm1, %eax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrw $4, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrw $7, %xmm0, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX1-NEXT: vmovups (%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,10,11,8,9,14,15]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
 ; AVX1-NEXT: vmovdqu %xmm3, (%rsi)
 ; AVX1-NEXT: vmovdqu %xmm4, (%rdx)
 ; AVX1-NEXT: vmovdqu %xmm0, (%rcx)
@@ -1040,57 +1007,22 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
 ;
 ; AVX2-LABEL: interleave_24i16_out:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqu 32(%rdi), %xmm0
-; AVX2-NEXT: vmovdqu (%rdi), %ymm2
-; AVX2-NEXT: vpextrw $3, %xmm2, %eax
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vpextrw $6, %xmm2, %eax
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm3
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm1, %eax
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $4, %xmm1, %eax
-; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $7, %xmm1, %eax
-; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
-; AVX2-NEXT: vpextrw $4, %xmm2, %eax
-; AVX2-NEXT: vpextrw $1, %xmm2, %edi
-; AVX2-NEXT: vmovd %edi, %xmm4
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
-; AVX2-NEXT: vpextrw $7, %xmm2, %eax
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT: vpextrw $2, %xmm1, %eax
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4
-; AVX2-NEXT: vpextrw $5, %xmm1, %eax
-; AVX2-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
-; AVX2-NEXT: vpextrw $5, %xmm2, %eax
-; AVX2-NEXT: vpextrw $2, %xmm2, %edi
-; AVX2-NEXT: vmovd %edi, %xmm2
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm1, %eax
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm1, %eax
-; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX2-NEXT: vmovdqu %xmm3, (%rsi)
-; AVX2-NEXT: vmovdqu %xmm4, (%rdx)
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vmovdqu 32(%rdi), %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-NEXT: vmovdqu %xmm2, (%rsi)
+; AVX2-NEXT: vmovdqu %xmm3, (%rdx)
 ; AVX2-NEXT: vmovdqu %xmm0, (%rcx)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq