[X86] Increase the depth threshold required to form VPERMI2W/VPERMI2B in shuffle combining

These instructions are implemented with two port 5 uops and one port 015 uop, so they are more complicated than most shuffles.

This patch increases the depth threshold at which we form them during shuffle combining, to try to limit the increase in uop count, especially on port 5.
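
As a rough illustration, here is a minimal standalone sketch of the new gate (the helper name and test values are hypothetical; the real check is the AllowBWIVPERMV3 flag added to combineX86ShuffleChain in the diff below):

#include <cassert>

// VPERMI2W/VPERMI2B cost roughly 3 uops (2x port 5 + 1x port 015) on
// Skylake and Icelake, so only form them once the shuffle chain being
// combined is deep enough to amortize that cost.
static bool allowBWIVPERMV3(int Depth, bool HasVariableMask) {
  return Depth >= 2 || HasVariableMask;
}

int main() {
  assert(!allowBWIVPERMV3(1, false)); // shallow chain: keep the cheaper lowering
  assert(allowBWIVPERMV3(2, false));  // deep chain: a single VPERMI2W can pay off
  assert(allowBWIVPERMV3(1, true));   // a variable mask is already in use
  return 0;
}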

Differential Revision: https://reviews.llvm.org/D88503
Craig Topper 2020-09-29 10:51:49 -07:00
parent 0a146a9d0b
commit 618a890b72
6 changed files with 71 additions and 188 deletions


@@ -35351,6 +35351,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Depth threshold above which we can efficiently use variable mask shuffles.
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
+ // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
+ // higher depth before combining them.
+ bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
bool MaskContainsZeros = isAnyZero(Mask);
@@ -35387,9 +35390,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasBWI() &&
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
(MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
- (Subtarget.hasVBMI() &&
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
(MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
for (unsigned i = 0; i != NumMaskElts; ++i)
@@ -35416,9 +35419,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasBWI() &&
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
(MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
- (Subtarget.hasVBMI() &&
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
(MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
@@ -35588,10 +35591,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
MaskVT == MVT::v16i32)) ||
- (Subtarget.hasBWI() && (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
- MaskVT == MVT::v32i16)) ||
- (Subtarget.hasVBMI() && (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
- MaskVT == MVT::v64i8)))) {
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
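
For context on why the tests below improve, here is a back-of-the-envelope uop budget for the first test change (the vpermi2w cost comes from the commit message; the single-uop costs for vpackusdw and vpermq are assumptions for illustration):

// Instructions shared by both lowerings (the two shift uops) cancel out.
constexpr int OldExtraUops = 1 /* vmovdqa mask load */ + 3 /* vpermi2w: 2x p5 + 1x p015 */;
constexpr int NewExtraUops = 1 /* vpackusdw (assumed 1 uop) */ + 1 /* vpermq (assumed 1 uop) */;
static_assert(OldExtraUops > NewExtraUops, "pack+permute uses fewer uops than vpermi2w");
int main() { return 0; }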


@@ -857,10 +857,10 @@ define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-wi
define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i32_zeroes:
; CHECK: # %bb.0:
- ; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm1
- ; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm2
- ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
- ; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm0
+ ; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm0
+ ; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm1
+ ; CHECK-NEXT: vpackusdw %ymm0, %ymm1, %ymm0
+ ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
%a = load <8 x i64>, <8 x i64>* %x
%b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
@@ -920,9 +920,10 @@ define <8 x i32> @trunc_v8i64_v8i32_sign(<8 x i64>* %x) nounwind "min-legal-vect
define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i16_sign:
; CHECK: # %bb.0:
- ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
- ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
- ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
+ ; CHECK-NEXT: vpsrad $16, 32(%rdi), %ymm0
+ ; CHECK-NEXT: vpsrad $16, (%rdi), %ymm1
+ ; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0
+ ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
%a = load <16 x i32>, <16 x i32>* %x
%b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -931,20 +932,13 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-
}
define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
- ; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign:
- ; CHECK-AVX512: # %bb.0:
- ; CHECK-AVX512-NEXT: vpsraw $8, 32(%rdi), %ymm0
- ; CHECK-AVX512-NEXT: vpsraw $8, (%rdi), %ymm1
- ; CHECK-AVX512-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
- ; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
- ; CHECK-AVX512-NEXT: retq
- ;
- ; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign:
- ; CHECK-VBMI: # %bb.0:
- ; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
- ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
- ; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
- ; CHECK-VBMI-NEXT: retq
+ ; CHECK-LABEL: trunc_v32i16_v32i8_sign:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vpsraw $8, 32(%rdi), %ymm0
+ ; CHECK-NEXT: vpsraw $8, (%rdi), %ymm1
+ ; CHECK-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
+ ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+ ; CHECK-NEXT: retq
%a = load <32 x i16>, <32 x i16>* %x
%b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%c = trunc <32 x i16> %b to <32 x i8>


@@ -304,24 +304,11 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: retq
;
- ; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
- ; AVX2: # %bb.0:
- ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
- ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
- ; AVX2-NEXT: retq
- ;
- ; AVX512VLBW-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
- ; AVX512VLBW: # %bb.0:
- ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
- ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
- ; AVX512VLBW-NEXT: retq
- ;
- ; AVX512VLVBMI-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
- ; AVX512VLVBMI: # %bb.0:
- ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,0,17,0,18,0,19,0,20,0,21,0,22,0,23]
- ; AVX512VLVBMI-NEXT: vpermi2b %xmm0, %xmm1, %xmm2
- ; AVX512VLVBMI-NEXT: vmovdqa %xmm2, %xmm0
- ; AVX512VLVBMI-NEXT: retq
+ ; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+ ; AVX2OR512VL: # %bb.0:
+ ; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+ ; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+ ; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; XOPAVX1: # %bb.0:
@@ -1335,23 +1322,11 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
- ; AVX1OR2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
- ; AVX1OR2: # %bb.0:
- ; AVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
- ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
- ; AVX1OR2-NEXT: retq
- ;
- ; AVX512VLBW-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
- ; AVX512VLBW: # %bb.0:
- ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
- ; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
- ; AVX512VLBW-NEXT: retq
- ;
- ; AVX512VLVBMI-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
- ; AVX512VLVBMI: # %bb.0:
- ; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16,1,17,4,20,5,21,2,18,3,19,6,22,7,23]
- ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
- ; AVX512VLVBMI-NEXT: retq
+ ; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
+ ; AVX: # %bb.0:
+ ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+ ; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %shuffle
}


@@ -1017,23 +1017,11 @@ define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
- ; AVX1OR2-LABEL: shuffle_v8i16_0c1d2e3f:
- ; AVX1OR2: # %bb.0:
- ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
- ; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- ; AVX1OR2-NEXT: retq
- ;
- ; AVX512VL-SLOW-LABEL: shuffle_v8i16_0c1d2e3f:
- ; AVX512VL-SLOW: # %bb.0:
- ; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
- ; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- ; AVX512VL-SLOW-NEXT: retq
- ;
- ; AVX512VL-FAST-LABEL: shuffle_v8i16_0c1d2e3f:
- ; AVX512VL-FAST: # %bb.0:
- ; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,12,1,13,2,14,3,15]
- ; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
- ; AVX512VL-FAST-NEXT: retq
+ ; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
+ ; AVX: # %bb.0:
+ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+ ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
ret <8 x i16> %shuffle
}
@@ -1059,23 +1047,11 @@ define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
- ; AVX1OR2-LABEL: shuffle_v8i16_48596a7b:
- ; AVX1OR2: # %bb.0:
- ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
- ; AVX1OR2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- ; AVX1OR2-NEXT: retq
- ;
- ; AVX512VL-SLOW-LABEL: shuffle_v8i16_48596a7b:
- ; AVX512VL-SLOW: # %bb.0:
- ; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
- ; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- ; AVX512VL-SLOW-NEXT: retq
- ;
- ; AVX512VL-FAST-LABEL: shuffle_v8i16_48596a7b:
- ; AVX512VL-FAST: # %bb.0:
- ; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,8,5,9,6,10,7,11]
- ; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
- ; AVX512VL-FAST-NEXT: retq
+ ; AVX-LABEL: shuffle_v8i16_48596a7b:
+ ; AVX: # %bb.0:
+ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+ ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
ret <8 x i16> %shuffle
}
@@ -1424,23 +1400,11 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
- ; AVX1OR2-LABEL: shuffle_v8i16_012dXXXX:
- ; AVX1OR2: # %bb.0:
- ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
- ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
- ; AVX1OR2-NEXT: retq
- ;
- ; AVX512VL-SLOW-LABEL: shuffle_v8i16_012dXXXX:
- ; AVX512VL-SLOW: # %bb.0:
- ; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
- ; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
- ; AVX512VL-SLOW-NEXT: retq
- ;
- ; AVX512VL-FAST-LABEL: shuffle_v8i16_012dXXXX:
- ; AVX512VL-FAST: # %bb.0:
- ; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,4,5,6,7]
- ; AVX512VL-FAST-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
- ; AVX512VL-FAST-NEXT: retq
+ ; AVX-LABEL: shuffle_v8i16_012dXXXX:
+ ; AVX: # %bb.0:
+ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+ ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+ ; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@@ -1475,24 +1439,11 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX1-NEXT: retq
;
- ; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
- ; AVX2: # %bb.0:
- ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
- ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
- ; AVX2-NEXT: retq
- ;
- ; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXXXcde3:
- ; AVX512VL-SLOW: # %bb.0:
- ; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
- ; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
- ; AVX512VL-SLOW-NEXT: retq
- ;
- ; AVX512VL-FAST-LABEL: shuffle_v8i16_XXXXcde3:
- ; AVX512VL-FAST: # %bb.0:
- ; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,11]
- ; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
- ; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
- ; AVX512VL-FAST-NEXT: retq
+ ; AVX2OR512VL-LABEL: shuffle_v8i16_XXXXcde3:
+ ; AVX2OR512VL: # %bb.0:
+ ; AVX2OR512VL-NEXT: vpbroadcastq %xmm0, %xmm0
+ ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
+ ; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v8i16_XXXXcde3:
; XOPAVX1: # %bb.0:
@@ -1533,24 +1484,11 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
- ; AVX1OR2-LABEL: shuffle_v8i16_cde3XXXX:
- ; AVX1OR2: # %bb.0:
- ; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
- ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
- ; AVX1OR2-NEXT: retq
- ;
- ; AVX512VL-SLOW-LABEL: shuffle_v8i16_cde3XXXX:
- ; AVX512VL-SLOW: # %bb.0:
- ; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
- ; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
- ; AVX512VL-SLOW-NEXT: retq
- ;
- ; AVX512VL-FAST-LABEL: shuffle_v8i16_cde3XXXX:
- ; AVX512VL-FAST: # %bb.0:
- ; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,11,4,5,6,7]
- ; AVX512VL-FAST-NEXT: vpermi2w %xmm0, %xmm1, %xmm2
- ; AVX512VL-FAST-NEXT: vmovdqa %xmm2, %xmm0
- ; AVX512VL-FAST-NEXT: retq
+ ; AVX-LABEL: shuffle_v8i16_cde3XXXX:
+ ; AVX: # %bb.0:
+ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+ ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
+ ; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}


@@ -4804,29 +4804,11 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
- ; AVX2-LABEL: PR28136:
- ; AVX2: # %bb.0:
- ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
- ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
- ; AVX2-NEXT: retq
- ;
- ; AVX512VLBW-LABEL: PR28136:
- ; AVX512VLBW: # %bb.0:
- ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
- ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
- ; AVX512VLBW-NEXT: retq
- ;
- ; AVX512VLVBMI-SLOW-LABEL: PR28136:
- ; AVX512VLVBMI-SLOW: # %bb.0:
- ; AVX512VLVBMI-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
- ; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
- ; AVX512VLVBMI-SLOW-NEXT: retq
- ;
- ; AVX512VLVBMI-FAST-LABEL: PR28136:
- ; AVX512VLVBMI-FAST: # %bb.0:
- ; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,16,48,17,49,18,50,19,51,4,36,5,37,6,38,7,39,20,52,21,53,22,54,23,55]
- ; AVX512VLVBMI-FAST-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
- ; AVX512VLVBMI-FAST-NEXT: retq
+ ; AVX2OR512VL-LABEL: PR28136:
+ ; AVX2OR512VL: # %bb.0:
+ ; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+ ; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+ ; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: PR28136:
; XOPAVX1: # %bb.0:


@@ -1902,20 +1902,11 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: retq
;
- ; AVX512F-LABEL: shuf_zext_8i16_to_4i64_offset2:
- ; AVX512F: # %bb.0: # %entry
- ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
- ; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
- ; AVX512F-NEXT: retq
- ;
- ; AVX512BW-LABEL: shuf_zext_8i16_to_4i64_offset2:
- ; AVX512BW: # %bb.0: # %entry
- ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
- ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,33,34,35,3,37,38,39,4,41,42,43,5,45,46,47]
- ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
- ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0
- ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
- ; AVX512BW-NEXT: retq
+ ; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2:
+ ; AVX512: # %bb.0: # %entry
+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+ ; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+ ; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
%Z = bitcast <16 x i16> %B to <4 x i64>