[X86][AVX] Add lowerShuffleAsLanePermuteAndSHUFP lowering

Add initial support for lowering v4f64 shuffles to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). Eventually this could be used for v8f32 (and maybe v8f64/v16f32), but I'm being conservative for the initial implementation as only v4f64 can always succeed.

This is currently only called from lowerShuffleAsLanePermuteAndShuffle, so it only gets used for unary shuffles, and we limit it to cases that use upper elements, since otherwise concatenating 2 xmm shuffles is probably the better option.

Helps with poor shuffles mentioned in D66004.
Simon Pilgrim 2020-01-11 12:28:27 +00:00
parent 5d069f4314
commit ce35010d78
3 changed files with 78 additions and 52 deletions
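For reference, a minimal standalone sketch of the mask decomposition the new helper performs, using plain arrays instead of SelectionDAG types; the example mask and the printing are illustrative only:

```cpp
// Sketch of the lowerShuffleAsLanePermuteAndSHUFP mask decomposition for a
// unary v4f64 shuffle. Example mask <0,3,0,0> (the shuffle_v4f64_0300 test).
#include <cstdio>

int main() {
  const int Mask[4] = {0, 3, 0, 0};

  int LHSMask[4] = {-1, -1, -1, -1}; // lane shuffle feeding SHUFPD operand 0
  int RHSMask[4] = {-1, -1, -1, -1}; // lane shuffle feeding SHUFPD operand 1
  unsigned SHUFPMask = 0;            // SHUFPD immediate

  // SHUFPD takes one element per 128-bit lane from each operand, so it is
  // enough to move the required source lane into place on each side and let
  // the immediate pick the low/high element within that lane.
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                   // undef element
    int LaneBase = i & ~1;                        // first slot of i's lane
    auto &LaneMask = (i & 1) ? RHSMask : LHSMask; // odd slots read operand 1
    LaneMask[LaneBase + 0] = (M & ~1);            // whole source lane,
    LaneMask[LaneBase + 1] = (M & ~1) + 1;        // moved into place
    SHUFPMask |= (M & 1) << i;                    // low/high within the lane
  }

  std::printf("LHS: <%d,%d,%d,%d>\n", LHSMask[0], LHSMask[1], LHSMask[2], LHSMask[3]);
  std::printf("RHS: <%d,%d,%d,%d>\n", RHSMask[0], RHSMask[1], RHSMask[2], RHSMask[3]);
  std::printf("SHUFPD imm: 0x%x\n", SHUFPMask);
  return 0;
}
```

For <0,3,0,0> this yields LHS <0,1,0,1>, RHS <2,3,0,1> and immediate 0x2, which lines up with the new AVX1 codegen for shuffle_v4f64_0300 in the test diff below (vinsertf128 building <0,1,0,1>, vperm2f128 building <2,3,0,1>, then vshufpd).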

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -14887,6 +14887,37 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                              DAG);
}

// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Extend to support v8f32 (+ 512-bit shuffles).
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
                                                 SDValue V1, SDValue V2,
                                                 ArrayRef<int> Mask,
                                                 SelectionDAG &DAG) {
  assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");

  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;

  // As SHUFPD uses a single LHS/RHS element per lane, we can always
  // perform the shuffle once the lanes have been shuffled in place.
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1;
    auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
    LaneMask[LaneBase + 0] = (M & ~1);
    LaneMask[LaneBase + 1] = (M & ~1) + 1;
    SHUFPMask |= (M & 1) << i;
  }

  SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
  SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
                     DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
}

/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a lane permutation followed by a per-lane permutation.
///
@@ -14965,6 +14996,15 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
  // Only do this if the elements aren't all from the lower lane,
  // otherwise we're (probably) better off doing a split.
  if (VT == MVT::v4f64 &&
      !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
    if (SDValue V =
            lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
      return V;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
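As a quick aside on the guard above: the fold is only attempted for masks that reference at least one upper-lane element; everything-in-the-lower-lane cases are left to the split path, per the commit message. A standalone illustration of that predicate, with a hypothetical helper name and example masks:

```cpp
// Illustration of the caller-side guard: only try the SHUFPD fold when the
// v4f64 mask uses at least one upper-lane element. usesUpperLane and the
// example masks are illustrative only.
#include <algorithm>
#include <array>
#include <cstdio>

static bool usesUpperLane(const std::array<int, 4> &Mask) {
  const int LaneSize = 2; // v4f64: two elements per 128-bit lane
  return !std::all_of(Mask.begin(), Mask.end(),
                      [LaneSize](int M) { return M < LaneSize; });
}

int main() {
  std::printf("%d\n", usesUpperLane({0, 3, 0, 0})); // 1 -> try SHUFPD fold
  std::printf("%d\n", usesUpperLane({1, 0, 1, 1})); // 0 -> leave to split path
  return 0;
}
```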

llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll

@@ -48,10 +48,8 @@ define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0020:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0020:
@@ -71,8 +69,8 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0300:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0300:
@@ -194,8 +192,8 @@ define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3330:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_3330:
@@ -299,9 +297,9 @@ define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0213(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0213:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0213:
@@ -628,11 +626,9 @@ define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0456(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0456:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1],xmm2[0]
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0456:
@@ -786,10 +782,8 @@ define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0020:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0020:
@@ -808,9 +802,8 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0112:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0112:
@@ -830,8 +823,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0300:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0300:
@@ -891,8 +884,8 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3330:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_3330:
@@ -931,9 +924,9 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0213(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0213:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0213:
@@ -1006,10 +999,9 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0412:
; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0412:
@@ -1031,11 +1023,9 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_4012:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_4012:

llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll

@@ -1154,9 +1154,9 @@ define <8 x float> @shuffle_v8f32_084c195d(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_01452367(<8 x float> %a) {
; AVX1-LABEL: shuffle_v8f32_01452367:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_01452367:
@@ -1198,11 +1198,9 @@ define <8 x float> @shuffle_v8f32_089abcde(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_0189abcd(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_0189abcd:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1],xmm2[0]
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_0189abcd:
@@ -2504,11 +2502,9 @@ define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_0189abcd(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_0189abcd:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_0189abcd: