[X86] Allow cross-lane permutations for subtargets supporting AVX2.

Summary: Most AVX instructions work "in-lane": each source element is combined only with elements of the same 128-bit lane, so a cross-lane permutation is costly and needs more than one instruction. AVX2 adds instructions that perform an any-to-any permutation of words across a 256-bit register, as well as a vectorized table lookup. This should also fix PR34369.

Differential Revision: https://reviews.llvm.org/D37388

llvm-svn: 312608
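To make the in-lane vs. cross-lane distinction concrete, here is a minimal intrinsics sketch; it is illustrative only (not part of this patch) and assumes an AVX2 target. Reversing all eight 32-bit elements of a 256-bit vector crosses the 128-bit lane boundary, which AVX2's VPERMPS handles in a single instruction, while plain AVX would need a lane swap plus in-lane shuffles and blends.

// Illustrative sketch only, not from this patch. Build with -mavx2.
#include <immintrin.h>

// Reverse the eight 32-bit elements of a 256-bit vector. The index vector
// pulls elements from both 128-bit lanes, so this is a cross-lane permute.
static __m256 reverse8_f32(__m256 V) {
  const __m256i Idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  return _mm256_permutevar8x32_ps(V, Idx); // one VPERMPS on AVX2
}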
parent 6dbf0876c1
commit 2c139f77c7
@@ -12081,7 +12081,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int Size = Mask.size();
@@ -12090,12 +12091,21 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
if (!Subtarget.hasAVX2()) {
bool LaneCrossing[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
} else {
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneUsed[(Mask[i] / LaneSize)] = true;
if (!LaneUsed[0] || !LaneUsed[1])
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
}
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");
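Condensed, the heuristic added above amounts to the following standalone sketch (simplified signature, and shouldSplitInsteadOfLanePermute/HasAVX2 are made-up names; the real code lives in lowerVectorShuffleAsLanePermuteAndBlend): without AVX2, split unless both 128-bit lanes contain lane-crossing elements; with AVX2, split only when all inputs come from a single lane, since a whole-register cross-lane permute is now a single instruction.

// Standalone restatement of the heuristic above; illustrative only.
#include <vector>

bool shouldSplitInsteadOfLanePermute(const std::vector<int> &Mask,
                                     bool HasAVX2) {
  int Size = (int)Mask.size();
  int LaneSize = Size / 2; // two 128-bit lanes in a 256-bit vector
  if (!HasAVX2) {
    // Pre-AVX2: splitting is cheaper unless *both* lanes contain an element
    // that crosses into the other lane.
    bool LaneCrossing[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
    return !LaneCrossing[0] || !LaneCrossing[1];
  }
  // AVX2: cross-lane permutes (VPERMD/VPERMPS/VPERMQ/VPERMPD) are a single
  // instruction, so only split when all inputs sit in one 128-bit lane.
  bool LaneUsed[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneUsed[Mask[i] / LaneSize] = true;
  return !LaneUsed[0] || !LaneUsed[1];
}

This is also why the lowerV4F64/V8F32/V16I16/V32I8 call sites below now thread the subtarget through.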
@@ -12710,7 +12720,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
DAG);
DAG, Subtarget);
}
// Use dedicated unpack instructions for masks that match their pattern.
@@ -12913,7 +12923,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
DAG);
DAG, Subtarget);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -13114,7 +13124,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
Mask, DAG);
Mask, DAG, Subtarget);
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
@@ -13199,7 +13209,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
DAG);
DAG, Subtarget);
if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
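As a hypothetical illustration of what the AVX2 path buys at these call sites (again not from the patch): the fully cross-lane <3,2,1,0> shuffle of a v4f64 value becomes a single VPERMPD, where plain AVX needs VPERM2F128 plus an in-lane shuffle.

// Illustrative sketch only, not from this patch. Build with -mavx2.
#include <immintrin.h>

// <3,2,1,0> shuffle of four doubles: every destination element comes from
// the other 128-bit lane, so the mask is lane-crossing.
static __m256d reverse4_f64(__m256d V) {
  // _MM_SHUFFLE(0, 1, 2, 3): destination element 0 takes source element 3,
  // element 1 takes 2, element 2 takes 1, element 3 takes 0; one VPERMPD.
  return _mm256_permute4x64_pd(V, _MM_SHUFFLE(0, 1, 2, 3));
}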
File diff suppressed because it is too large
@@ -63,24 +63,18 @@ define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_
define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
; KNL: ## BB#0:
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
; KNL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; KNL-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1]
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7]
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7]
; KNL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15]
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm5
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7],ymm0[8],ymm5[9],ymm0[10],ymm5[11],ymm0[12],ymm5[13],ymm0[14],ymm5[15]
; KNL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7],ymm1[8,9,10,11],ymm2[12,13],ymm1[14],ymm2[15]
; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u]
; KNL-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15]
; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u]
; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3
; KNL-NEXT: vpbroadcastw %xmm3, %ymm3
; KNL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
; KNL-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
; KNL-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17]
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; KNL-NEXT: retq
;
@@ -883,22 +883,14 @@ define <32 x i8> @constant_fold_pshufb_256() {
define <32 x i8> @PR27320(<8 x i32> %a0) {
; X32-LABEL: PR27320:
; X32: # BB#0:
; X32-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[12,13,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: vextracti128 $1, %ymm0, %xmm2
; X32-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,0,1,2,3,3,4,5,6,6,7]
; X32-NEXT: vpor %xmm1, %xmm2, %xmm1
; X32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11]
; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; X32-NEXT: retl
;
; X64-LABEL: PR27320:
; X64: # BB#0:
; X64-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[12,13,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vextracti128 $1, %ymm0, %xmm2
; X64-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,0,1,2,3,3,4,5,6,6,7]
; X64-NEXT: vpor %xmm1, %xmm2, %xmm1
; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11]
; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; X64-NEXT: retq
%1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
%2 = bitcast <8 x i32> %1 to <32 x i8>
@@ -3,18 +3,17 @@
define <32 x i8> @foo(<48 x i8>* %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: foo:
; CHECK: # BB#0:
; CHECK-NEXT: vmovdqu (%rdi), %ymm0
; CHECK-NEXT: vmovdqu 32(%rdi), %xmm1
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,2,3,5,6]
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero
; CHECK-NEXT: vpor %xmm3, %xmm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,11,12,14,15,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: vmovdqu 32(%rdi), %xmm0
; CHECK-NEXT: vmovdqu (%rdi), %ymm1
; CHECK-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6]
; CHECK-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]
; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%1 = load <48 x i8>, <48 x i8>* %x0, align 1
%2 = shufflevector <48 x i8> %1, <48 x i8> undef, <32 x i32> <i32 0, i32 1, i32 3, i32 4, i32 6, i32 7, i32 9, i32 10, i32 12, i32 13, i32 15, i32 16, i32 18, i32 19, i32 21, i32 22, i32 24, i32 25, i32 27, i32 28, i32 30, i32 31, i32 33, i32 34, i32 36, i32 37, i32 39, i32 40, i32 42, i32 43, i32 45, i32 46>
@@ -738,88 +738,92 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
|
|||
;
|
||||
; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vmovdqa (%rdi), %ymm9
|
||||
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm11
|
||||
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm14
|
||||
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
||||
; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm5
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5
|
||||
; AVX2-NEXT: vpshufb %ymm4, %ymm14, %ymm7
|
||||
; AVX2-NEXT: vmovdqa (%rdi), %ymm11
|
||||
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
|
||||
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4
|
||||
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm6
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
|
||||
; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm3
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
|
||||
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm7
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
|
||||
; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm7
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
|
||||
; AVX2-NEXT: vpshufb %ymm4, %ymm11, %ymm7
|
||||
; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm7
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm7[0],xmm3[0]
|
||||
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm7
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
|
||||
; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm7
|
||||
; AVX2-NEXT: vpshufb %ymm4, %ymm9, %ymm4
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
|
||||
; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm10
|
||||
; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm7
|
||||
; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
|
||||
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm8
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm9
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm7, %xmm10, %xmm5
|
||||
; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm6
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm13
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm4
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm1
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm6[6,7]
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm6
|
||||
; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm4
|
||||
; AVX2-NEXT: vpshufb %xmm7, %xmm11, %xmm7
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm7
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm1
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm0
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
|
||||
; AVX2-NEXT: vpshufb %xmm7, %xmm9, %xmm3
|
||||
; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm5
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm10
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm10, %xmm3
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm0
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],xmm5[2,3]
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm13
|
||||
; AVX2-NEXT: vpshufb %xmm7, %xmm13, %xmm3
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3,0,1]
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm14
|
||||
; AVX2-NEXT: vpshufb %xmm7, %xmm14, %xmm7
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm0
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
|
||||
; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm8
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm1
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm4
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm4, %xmm13, %xmm5
|
||||
; AVX2-NEXT: vpshufb %xmm4, %xmm14, %xmm2
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm2
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm5
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm6
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm5
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm0
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm5
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm3
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm5
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm6
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm5
|
||||
; AVX2-NEXT: vpshufb %xmm2, %xmm14, %xmm2
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm2
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm0
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
||||
; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm2
|
||||
; AVX2-NEXT: vpshufb %xmm4, %xmm9, %xmm4
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm2
|
||||
; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm3
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm13, %xmm4
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm14, %xmm5
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
|
||||
; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm4
|
||||
; AVX2-NEXT: vpshufb %xmm1, %xmm11, %xmm1
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm4
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm3
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm5
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
|
||||
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
|
||||
|
@@ -831,74 +835,78 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
|
|||
; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
|
||||
; AVX512: # BB#0:
|
||||
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
|
||||
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
|
||||
; AVX512-NEXT: vpmovdw %zmm0, %ymm2
|
||||
; AVX512-NEXT: vpmovdw %zmm1, %ymm3
|
||||
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
|
||||
; AVX512-NEXT: vpmovwb %zmm2, %ymm8
|
||||
; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm14
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm9
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm7, %xmm9, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm7, %xmm14, %xmm5
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm10
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm6
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm4
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm5[6,7]
|
||||
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm5
|
||||
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm7
|
||||
; AVX512-NEXT: vpmovdw %zmm0, %ymm1
|
||||
; AVX512-NEXT: vpmovdw %zmm7, %ymm2
|
||||
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
|
||||
; AVX512-NEXT: vpmovwb %zmm1, %ymm8
|
||||
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm9
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm9, %xmm3
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm4
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm3[0,1],xmm4[2,3]
|
||||
; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm5
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm12
|
||||
; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm7
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm13
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm6
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
|
||||
; AVX512-NEXT: vpcmpeqb %ymm3, %ymm8, %ymm8
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm6
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm7
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm2
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm3
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm13
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm6
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm14
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm4
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3,0,1]
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm6
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
|
||||
; AVX512-NEXT: vpcmpeqb %ymm2, %ymm8, %ymm8
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm3
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm4
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm6
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm6
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm7
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm1
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm3
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX512-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
|
||||
; AVX512-NEXT: vpsllw $7, %ymm8, %ymm1
|
||||
|
@@ -1044,104 +1052,96 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
|
|||
;
|
||||
; AVX2-LABEL: interleaved_load_vf32_i8_stride3:
|
||||
; AVX2: # BB#0:
|
||||
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
|
||||
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
|
||||
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
|
||||
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0>
|
||||
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14],zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
|
||||
; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5
|
||||
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255>
|
||||
; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,16,19,22,25,28,31,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3,0,1]
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,4,7,10,13]
|
||||
; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]
|
||||
; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
|
||||
; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255>
|
||||
; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[0,3,6,9,12,15],zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14]
|
||||
; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
|
||||
; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3,0,1]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = <0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u>
|
||||
; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm2[2,5,8,11,14]
|
||||
; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
|
||||
; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u>
|
||||
; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
|
||||
; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[0,3,6,9,12,15]
|
||||
; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u>
|
||||
; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[0,3,6,9,12,15]
|
||||
; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
|
||||
; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vpaddb %ymm0, %ymm5, %ymm0
|
||||
; AVX2-NEXT: vpaddb %ymm0, %ymm4, %ymm0
|
||||
; AVX2-NEXT: vpaddb %ymm0, %ymm3, %ymm0
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: interleaved_load_vf32_i8_stride3:
|
||||
; AVX512: # BB#0:
|
||||
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
|
||||
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0>
|
||||
; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2
|
||||
; AVX512-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm4[2,5,8,11,14],zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
|
||||
; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,3,6,9,12,15,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[1,4,7,10,13]
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5
|
||||
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
|
||||
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0>
|
||||
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm3
|
||||
; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm2
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3,0,1]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255>
|
||||
; AVX512-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,16,19,22,25,28,31,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3,0,1]
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,2,5,8,11,14],zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[1,4,7,10,13]
|
||||
; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]
|
||||
; AVX512-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
|
||||
; AVX512-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255>
|
||||
; AVX512-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm5
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[0,3,6,9,12,15],zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,4,7,10,13],zero,zero,zero,zero,zero,zero,xmm5[2,5,8,11,14]
|
||||
; AVX512-NEXT: vpor %xmm7, %xmm5, %xmm5
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,7,10,13,u,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,5,8,11,14]
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6
|
||||
; AVX512-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm5
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3,0,1]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u>
|
||||
; AVX512-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,0,3,6,9,12,15],zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm1[2,5,8,11,14]
|
||||
; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
|
||||
; AVX512-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u,0,255,u>
|
||||
; AVX512-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm2[1,4,7,10,13],zero,zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,5,8,11,14],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
|
||||
; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14,u,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,xmm3[0,3,6,9,12,15]
|
||||
; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,u,u,255,u,u,255,u,u,255,u,u,255,u,u,255,u>
|
||||
; AVX512-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u]
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,1,4,7,10,13],zero,zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[0,3,6,9,12,15]
|
||||
; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
|
||||
; AVX512-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
|
||||
; AVX512-NEXT: vpaddb %ymm0, %ymm5, %ymm0
|
||||
; AVX512-NEXT: vpaddb %ymm0, %ymm4, %ymm0
|
||||
; AVX512-NEXT: vpaddb %ymm0, %ymm2, %ymm0
|
||||
; AVX512-NEXT: retq
|
||||
%wide.vec = load <96 x i8>, <96 x i8>* %ptr
|
||||
%v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>