[DAGCombiner] narrow shuffle of concatenated vectors

// shuffle (concat X, undef), (concat Y, undef), Mask -->
// concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)

The ARM changes with 'vtrn' and narrowed 'vuzp' are improvements.

The x86 changes look neutral or better. There's one test with an
extra instruction, but that could be reversed for a subtarget with
the right attributes. By default, though, we want to avoid the
256-bit op when possible (in my motivating benchmark, a handful of
ymm ops sprinkled into a sequence of xmm ops trigger frequency
throttling on Haswell, resulting in significantly worse perf).
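To make the mask split concrete, here is a standalone sketch (not part of the patch; the v8i16 example mask is hypothetical) of the same arithmetic the new DAGCombiner helper performs: wide mask elements that select from the second concat are shifted down by half the element count, because that concat's upper half is undef.

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical wide mask for: shuffle (concat X, undef), (concat Y, undef)
  // on v8i16, interleaving the low elements of X and Y.
  std::vector<int> Mask = {0, 8, 1, 9, 2, 10, 3, 11};
  unsigned NumElts = (unsigned)Mask.size();
  unsigned HalfNumElts = NumElts / 2;
  std::vector<int> Mask0(HalfNumElts, -1), Mask1(HalfNumElts, -1);
  for (unsigned i = 0; i != NumElts; ++i) {
    if (Mask[i] == -1)
      continue;
    // Elements >= NumElts index the second concat; shift them down so they
    // index Y directly in the narrow shuffle.
    int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
    if (i < HalfNumElts)
      Mask0[i] = M;
    else
      Mask1[i - HalfNumElts] = M;
  }
  // Prints "0 4 1 5" and "2 6 3 7": two v4i16 shuffles of X and Y whose
  // concatenation reproduces the original wide interleave.
  for (int M : Mask0)
    printf("%d ", M);
  printf("\n");
  for (int M : Mask1)
    printf("%d ", M);
  printf("\n");
  return 0;
}

In the patch itself this split only fires when the target reports both narrow masks as legal via isShuffleMaskLegal, so targets without cheap narrow shuffles are unaffected.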

Differential Revision: https://reviews.llvm.org/D60545

llvm-svn: 358291
Sanjay Patel 2019-04-12 16:31:56 +00:00
parent 7bd8c37b17
commit 5e4ad39af7
5 changed files with 85 additions and 49 deletions


@@ -17598,6 +17598,53 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
  return SDValue();
}

/// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles
/// followed by concatenation. Narrow vector ops may have better performance
/// than wide ops, and this can unlock further narrowing of other vector ops.
/// Targets can invert this transform later if it is not profitable.
static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf,
                                         SelectionDAG &DAG) {
  SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1);
  if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
      N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 ||
      !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef())
    return SDValue();

  // Split the wide shuffle mask into halves. Any mask element that is accessing
  // operand 1 is offset down to account for narrowing of the vectors.
  ArrayRef<int> Mask = Shuf->getMask();
  EVT VT = Shuf->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfNumElts = NumElts / 2;
  SmallVector<int, 16> Mask0(HalfNumElts, -1);
  SmallVector<int, 16> Mask1(HalfNumElts, -1);
  for (unsigned i = 0; i != NumElts; ++i) {
    if (Mask[i] == -1)
      continue;
    int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
    if (i < HalfNumElts)
      Mask0[i] = M;
    else
      Mask1[i - HalfNumElts] = M;
  }

  // Ask the target if this is a valid transform.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
                                HalfNumElts);
  if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) ||
      !TLI.isShuffleMaskLegal(Mask1, HalfVT))
    return SDValue();

  // shuffle (concat X, undef), (concat Y, undef), Mask -->
  // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)
  SDValue X = N0.getOperand(0), Y = N1.getOperand(0);
  SDLoc DL(Shuf);
  SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0);
  SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1);
}

// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
// or turn a shuffle of a single concat into simpler shuffle then concat.
static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
@@ -18379,6 +18426,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
    return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
  }

  if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
    return V;

  return SDValue();
}


@@ -270,11 +270,10 @@ define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_undef:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vorr q9, q8, q8
; CHECK-NEXT: vuzp.16 q8, q9
; CHECK-NEXT: vmov r0, r1, d18
; CHECK-NEXT: vmov r2, r3, d19
; CHECK-NEXT: vldr d18, [r0]
; CHECK-NEXT: vuzp.16 d18, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
entry:
%tmp1 = load <4 x i16>, <4 x i16>* %A
@@ -286,13 +285,13 @@ entry:
define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d18, [r0]
; CHECK-NEXT: vorr d19, d18, d18
; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 q9, d16[0]
; CHECK-NEXT: vuzp.32 q8, q9
; CHECK-NEXT: vext.32 q8, q9, q9, #2
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vtrn.32 d19, d17
; CHECK-NEXT: vdup.32 d16, d18[0]
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
entry:
%tmp1 = load <2 x i32>, <2 x i32>* %A
@@ -304,11 +303,10 @@ entry:
define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev64.32 q9, q8
; CHECK-NEXT: vuzp.32 q8, q9
; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vtrn.32 d17, d16
; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
; CHECK-NEXT: mov pc, lr
entry:
%tmp1 = load <2 x i32>, <2 x i32>* %A


@@ -270,8 +270,8 @@ define <8 x i16> @vzip_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
; CHECK-LABEL: vzip_lower_shufflemask_undef:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vzip.16 d16, d17
; CHECK-NEXT: vldr d18, [r0]
; CHECK-NEXT: vzip.16 d18, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr


@@ -229,8 +229,9 @@ define <4 x i64> @_mul4xi32toi64b(<4 x i32>, <4 x i32>) {
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm0[1]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%even0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
%even1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>


@@ -381,36 +381,23 @@ define void @v7i32(<4 x i32> %a, <4 x i32> %b, <7 x i32>* %p) nounwind {
; SSE42-NEXT: movdqa %xmm2, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: v7i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
; AVX1-NEXT: vmovss %xmm1, 24(%rdi)
; AVX1-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX1-NEXT: vmovaps %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: v7i32:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,6,3,6,1,7,4,u>
; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovss %xmm1, 24(%rdi)
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovlps %xmm1, 16(%rdi)
; AVX2-NEXT: vmovaps %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX-LABEL: v7i32:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-NEXT: vmovss %xmm1, 24(%rdi)
; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX-NEXT: vmovaps %xmm2, (%rdi)
; AVX-NEXT: retq
;
; XOP-LABEL: v7i32:
; XOP: # %bb.0:
; XOP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; XOP-NEXT: vmovss %xmm1, 24(%rdi)
; XOP-NEXT: vmovlps %xmm0, 16(%rdi)
; XOP-NEXT: vmovaps %xmm2, (%rdi)
@@ -487,12 +474,12 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movq %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm3, (%rdi)
; SSE2-NEXT: retq
@@ -502,7 +489,7 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3,4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]