[X86][AVX] Remove lowerShuffleByMerging128BitLanes 2-lane restriction

First step towards adding support for 64-bit unary "sublane" handling (a bit like lowerShuffleAsRepeatedMaskAndLanePermute). 

This allows us to add lowerV64I8Shuffle handling.

llvm-svn: 352389
Simon Pilgrim 2019-01-28 17:02:35 +00:00
parent 61db81d8d0
commit 2c17512456
2 changed files with 17 additions and 24 deletions
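In rough terms, lowerShuffleByMerging128BitLanes splits a shuffle into two stages: first permute whole 128-bit lanes of the inputs into place, then apply a single shuffle pattern that repeats in every lane. That decomposition only works if each destination lane reads from at most two source lanes and all lanes agree on the in-lane pattern; the change below stops assuming there are exactly two lanes, so 512-bit (4-lane) types can use the same path. Below is a minimal standalone sketch of that core check. It is not the LLVM implementation: canMergeLanes is a made-up name, and the real code does further refinement passes (e.g. for lanes that only need one source) before building the DAG nodes.

#include <array>
#include <cstdio>
#include <vector>

static bool canMergeLanes(const std::vector<int> &Mask, int LaneSize) {
  int NumElts = (int)Mask.size();
  int NumLanes = NumElts / LaneSize;          // no longer hard-coded to 2
  std::vector<int> RepeatMask(LaneSize, -1);  // in-lane pattern shared by all lanes
  std::vector<std::array<int, 2>> LaneSrcs(NumLanes, {-1, -1});

  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    for (int i = 0; i != LaneSize; ++i) {
      int M = Mask[Lane * LaneSize + i];
      if (M < 0)
        continue;                             // undef element matches anything
      int SrcLane = M / LaneSize;             // input lane this element lives in
      int Slot;                               // which of the two merged sources
      if (LaneSrcs[Lane][0] < 0 || LaneSrcs[Lane][0] == SrcLane) {
        LaneSrcs[Lane][0] = SrcLane;
        Slot = 0;
      } else if (LaneSrcs[Lane][1] < 0 || LaneSrcs[Lane][1] == SrcLane) {
        LaneSrcs[Lane][1] = SrcLane;
        Slot = 1;
      } else {
        return false;                         // this lane needs >2 source lanes
      }
      int Repeat = Slot * LaneSize + (M % LaneSize);
      if (RepeatMask[i] < 0)
        RepeatMask[i] = Repeat;
      else if (RepeatMask[i] != Repeat)
        return false;                         // lanes disagree on the pattern
    }
  }
  return true;
}

int main() {
  // A v8i64-style mask (LaneSize = 2) that reverses the four 128-bit lanes of
  // one 512-bit input: every lane uses a single source lane and the same
  // in-lane order, so the decomposition succeeds.
  std::vector<int> Mask = {6, 7, 4, 5, 2, 3, 0, 1};
  std::printf("%s\n", canMergeLanes(Mask, 2) ? "mergeable" : "not mergeable");
  return 0;
}

That lane-reversal pattern is exactly what shows up in the updated v64i8 tests below, which now lower to a single vshufi64x2 lane permute followed by one repeated in-lane vpshufb.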


@@ -14166,8 +14166,6 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
 /// or two of the lanes of the inputs. The lanes of the input vectors are
 /// shuffled in one or two independent shuffles to get the lanes into the
 /// position needed by the final shuffle.
-///
-/// FIXME: This should be generalized to 512-bit shuffles.
 static SDValue lowerShuffleByMerging128BitLanes(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
@@ -14177,12 +14175,10 @@ static SDValue lowerShuffleByMerging128BitLanes(
     return SDValue();
 
   int Size = Mask.size();
-  int NumLanes = VT.getSizeInBits() / 128;
   int LaneSize = 128 / VT.getScalarSizeInBits();
-  assert(NumLanes == 2 && "Only handles 256-bit shuffles.");
+  int NumLanes = Size / LaneSize;
   SmallVector<int, 16> RepeatMask(LaneSize, -1);
-  int LaneSrcs[2][2] = { { -1, -1 }, { -1 , -1 } };
+  SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {-1, -1});
 
   // First pass will try to fill in the RepeatMask from lanes that need two
   // sources.
@@ -14193,7 +14189,7 @@ static SDValue lowerShuffleByMerging128BitLanes(
       int M = Mask[(Lane * LaneSize) + i];
       if (M < 0)
         continue;
-      // Determine which of the 4 possible input lanes (2 from each source)
+      // Determine which of the possible input lanes (NumLanes from each source)
      // this element comes from. Assign that as one of the sources for this
       // lane. We can assign up to 2 sources for this lane. If we run out
       // sources we can't do anything.
@@ -15866,6 +15862,13 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                             Zeroable, Subtarget, DAG))
     return Blend;
 
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (!V2.isUndef())
+    if (SDValue Result = lowerShuffleByMerging128BitLanes(
+            DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+      return Result;
+
   // FIXME: Implement direct support for this type!
   return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
 }
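Conversely, once a lane-source assignment and a repeat mask have been found, the full shuffle factors into "permute whole lanes, then one repeated in-lane shuffle". A small sanity-check sketch of how the two pieces compose back into the original mask (again illustrative only; composeMergedMask is a hypothetical helper, and the real lowering instead builds the two lane-permuted operands and the repeated-lane shuffle as DAG nodes):

#include <array>
#include <cassert>
#include <cstdio>
#include <vector>

// Rebuild the full shuffle mask implied by a lane-source assignment plus a
// shared in-lane RepeatMask. Destination lane L, element i reads from source
// lane LaneSrcs[L][0] when RepeatMask[i] < LaneSize, otherwise from
// LaneSrcs[L][1], at in-lane offset RepeatMask[i] % LaneSize.
static std::vector<int>
composeMergedMask(const std::vector<std::array<int, 2>> &LaneSrcs,
                  const std::vector<int> &RepeatMask, int LaneSize) {
  int NumLanes = (int)LaneSrcs.size();
  std::vector<int> Mask(NumLanes * LaneSize, -1);
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    for (int i = 0; i != LaneSize; ++i) {
      int R = RepeatMask[i];
      if (R < 0)
        continue;                                   // undef stays undef
      int Src = LaneSrcs[Lane][R / LaneSize];
      assert(Src >= 0 && "repeat mask uses an unassigned source slot");
      Mask[Lane * LaneSize + i] = Src * LaneSize + (R % LaneSize);
    }
  }
  return Mask;
}

int main() {
  // The lane-reversal example from the sketch above: each destination lane
  // has one source lane (3, 2, 1, 0) and the repeat mask keeps in-lane order.
  std::vector<std::array<int, 2>> LaneSrcs = {{3, -1}, {2, -1}, {1, -1}, {0, -1}};
  std::vector<int> RepeatMask = {0, 1};
  for (int M : composeMergedMask(LaneSrcs, RepeatMask, 2))
    std::printf("%d ", M);                          // prints: 6 7 4 5 2 3 0 1
  std::printf("\n");
  return 0;
}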


@@ -444,13 +444,8 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
-; AVX512BW-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512BW-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zmm0[13],zero,zmm0[11],zero,zmm0[9],zero,zmm0[7],zero,zmm0[5],zero,zmm0[3],zero,zmm0[1],zero,zmm0[31],zero,zmm0[29],zero,zmm0[27],zero,zmm0[25],zero,zmm0[23],zero,zmm0[21],zero,zmm0[19],zero,zmm0[17],zero,zmm0[47],zero,zmm0[45],zero,zmm0[43],zero,zmm0[41],zero,zmm0[39],zero,zmm0[37],zero,zmm0[35],zero,zmm0[33],zero,zmm0[63],zero,zmm0[61],zero,zmm0[59],zero,zmm0[57],zero,zmm0[55],zero,zmm0[53],zero,zmm0[51],zero,zmm0[49],zero
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
@@ -490,13 +485,12 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
 ; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
 ; AVX512BW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -541,13 +535,9 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512BW-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512BW-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,6,7],zmm1[2,3,6,7]
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[0,1,4,5]
+; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: