forked from OSchip/llvm-project
[X86][AVX] Add lowerShuffleAsLanePermuteAndSHUFP lowering
Add initial support for lowering v4f64 shuffles to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)), eventually this could be used for v8f32 (and maybe v8f64/v16f32) but I'm being conservative for the initial implementation as only v4f64 can always succeed. This currently is only called from lowerShuffleAsLanePermuteAndShuffle so only gets used for unary shuffles, and we limit this to cases where we use upper elements as otherwise concating 2 xmm shuffles is probably the better case. Helps with poor shuffles mentioned in D66004.
This commit is contained in:
parent
5d069f4314
commit
ce35010d78
|
@ -14887,6 +14887,37 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
|
|||
DAG);
|
||||
}
|
||||
|
||||
// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
|
||||
// TODO: Extend to support v8f32 (+ 512-bit shuffles).
|
||||
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
|
||||
SDValue V1, SDValue V2,
|
||||
ArrayRef<int> Mask,
|
||||
SelectionDAG &DAG) {
|
||||
assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
|
||||
|
||||
int LHSMask[4] = {-1, -1, -1, -1};
|
||||
int RHSMask[4] = {-1, -1, -1, -1};
|
||||
unsigned SHUFPMask = 0;
|
||||
|
||||
// As SHUFPD uses a single LHS/RHS element per lane, we can always
|
||||
// perform the shuffle once the lanes have been shuffled in place.
|
||||
for (int i = 0; i != 4; ++i) {
|
||||
int M = Mask[i];
|
||||
if (M < 0)
|
||||
continue;
|
||||
int LaneBase = i & ~1;
|
||||
auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
|
||||
LaneMask[LaneBase + 0] = (M & ~1);
|
||||
LaneMask[LaneBase + 1] = (M & ~1) + 1;
|
||||
SHUFPMask |= (M & 1) << i;
|
||||
}
|
||||
|
||||
SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
|
||||
SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
|
||||
return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
|
||||
DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
|
||||
}
|
||||
|
||||
/// Lower a vector shuffle crossing multiple 128-bit lanes as
|
||||
/// a lane permutation followed by a per-lane permutation.
|
||||
///
|
||||
|
@ -14965,6 +14996,15 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
|
|||
int Size = Mask.size();
|
||||
int LaneSize = Size / 2;
|
||||
|
||||
// Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
|
||||
// Only do this if the elements aren't all from the lower lane,
|
||||
// otherwise we're (probably) better off doing a split.
|
||||
if (VT == MVT::v4f64 &&
|
||||
!all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
|
||||
if (SDValue V =
|
||||
lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
|
||||
return V;
|
||||
|
||||
// If there are only inputs from one 128-bit lane, splitting will in fact be
|
||||
// less expensive. The flags track whether the given lane contains an element
|
||||
// that crosses to another lane.
|
||||
|
|
|
@ -48,10 +48,8 @@ define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
|
|||
define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1-LABEL: shuffle_v4f64_0020:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
|
||||
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
|
||||
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4f64_0020:
|
||||
|
@ -71,8 +69,8 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
|
|||
; AVX1-LABEL: shuffle_v4f64_0300:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4f64_0300:
|
||||
|
@ -194,8 +192,8 @@ define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
|
|||
; AVX1-LABEL: shuffle_v4f64_3330:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4f64_3330:
|
||||
|
@ -299,9 +297,9 @@ define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
|
|||
define <4 x double> @shuffle_v4f64_0213(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1-LABEL: shuffle_v4f64_0213:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,2,3]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4f64_0213:
|
||||
|
@ -628,11 +626,9 @@ define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) {
|
|||
define <4 x double> @shuffle_v4f64_0456(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1-LABEL: shuffle_v4f64_0456:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1],xmm2[0]
|
||||
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[2]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4f64_0456:
|
||||
|
@ -786,10 +782,8 @@ define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
|
|||
define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
|
||||
; AVX1-LABEL: shuffle_v4i64_0020:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
|
||||
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_0020:
|
||||
|
@ -808,9 +802,8 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
|
|||
define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
|
||||
; AVX1-LABEL: shuffle_v4i64_0112:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_0112:
|
||||
|
@ -830,8 +823,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
|
|||
; AVX1-LABEL: shuffle_v4i64_0300:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_0300:
|
||||
|
@ -891,8 +884,8 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
|
|||
; AVX1-LABEL: shuffle_v4i64_3330:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[2]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_3330:
|
||||
|
@ -931,9 +924,9 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
|
|||
define <4 x i64> @shuffle_v4i64_0213(<4 x i64> %a, <4 x i64> %b) {
|
||||
; AVX1-LABEL: shuffle_v4i64_0213:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,2,3]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_0213:
|
||||
|
@ -1006,10 +999,9 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
|
|||
; AVX1-LABEL: shuffle_v4i64_0412:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
||||
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_0412:
|
||||
|
@ -1031,11 +1023,9 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
|
|||
define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
|
||||
; AVX1-LABEL: shuffle_v4i64_4012:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
||||
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v4i64_4012:
|
||||
|
|
|
@ -1154,9 +1154,9 @@ define <8 x float> @shuffle_v8f32_084c195d(<8 x float> %a, <8 x float> %b) {
|
|||
define <8 x float> @shuffle_v8f32_01452367d(<8 x float> %a) {
|
||||
; AVX1-LABEL: shuffle_v8f32_01452367d:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,2,3]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2OR512VL-LABEL: shuffle_v8f32_01452367d:
|
||||
|
@ -1198,11 +1198,9 @@ define <8 x float> @shuffle_v8f32_089abcde(<8 x float> %a, <8 x float> %b) {
|
|||
define <8 x float> @shuffle_v8f32_0189abcd(<8 x float> %a, <8 x float> %b) {
|
||||
; AVX1-LABEL: shuffle_v8f32_0189abcd:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1],xmm2[0]
|
||||
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[2]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8f32_0189abcd:
|
||||
|
@ -2504,11 +2502,9 @@ define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) {
|
|||
define <8 x i32> @shuffle_v8i32_0189abcd(<8 x i32> %a, <8 x i32> %b) {
|
||||
; AVX1-LABEL: shuffle_v8i32_0189abcd:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
|
||||
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[2]
|
||||
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: shuffle_v8i32_0189abcd:
|
||||
|
|
Loading…
Reference in New Issue