forked from OSchip/llvm-project
[X86][SSE] Lower shuffles to permute(unpack(x,y)) (PR31151)
Attempt to lower a shuffle as an unpack of elements from two inputs followed by a single-input (wider) permutation. As long as the permutation is wider this is a win - there may be some circumstances where same size permutations would also be useful but I've left that for future work. Differential Revision: https://reviews.llvm.org/D52043 llvm-svn: 342257
This commit is contained in:
parent
ac356cac0c
commit
32857c54d2
|
@ -10030,6 +10030,72 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
|
|||
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
|
||||
}
|
||||
|
||||
/// Try to lower as an unpack of elements from two inputs followed by
|
||||
/// a single-input permutation.
|
||||
///
|
||||
/// This matches the pattern where we can unpack elements from two inputs and
|
||||
/// then reduce the shuffle to a single-input (wider) permutation.
|
||||
static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
|
||||
SDValue V1, SDValue V2,
|
||||
ArrayRef<int> Mask,
|
||||
SelectionDAG &DAG) {
|
||||
int NumElts = Mask.size();
|
||||
int NumLanes = VT.getSizeInBits() / 128;
|
||||
int NumLaneElts = NumElts / NumLanes;
|
||||
int NumHalfLaneElts = NumLaneElts / 2;
|
||||
|
||||
bool MatchLo = true, MatchHi = true;
|
||||
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
|
||||
|
||||
// Determine UNPCKL/UNPCKH type and operand order.
|
||||
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
|
||||
for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
|
||||
int M = Mask[Lane + Elt];
|
||||
if (M < 0)
|
||||
continue;
|
||||
|
||||
SDValue &Op = Ops[Elt & 1];
|
||||
if (M < NumElts && (Op.isUndef() || Op == V1))
|
||||
Op = V1;
|
||||
else if (NumElts <= M && (Op.isUndef() || Op == V2))
|
||||
Op = V2;
|
||||
else
|
||||
return SDValue();
|
||||
|
||||
int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
|
||||
MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
|
||||
isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
|
||||
MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
|
||||
isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
|
||||
if (!MatchLo && !MatchHi)
|
||||
return SDValue();
|
||||
}
|
||||
}
|
||||
assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
|
||||
|
||||
// Now check that each pair of elts come from the same unpack pair
|
||||
// and set the permute mask based on each pair.
|
||||
// TODO - Investigate cases where we permute individual elements.
|
||||
SmallVector<int, 32> PermuteMask(NumElts, -1);
|
||||
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
|
||||
for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
|
||||
int M0 = Mask[Lane + Elt + 0];
|
||||
int M1 = Mask[Lane + Elt + 1];
|
||||
if (0 <= M0 && 0 <= M1 &&
|
||||
(M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
|
||||
return SDValue();
|
||||
if (0 <= M0)
|
||||
PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
|
||||
if (0 <= M1)
|
||||
PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
|
||||
SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
|
||||
return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
|
||||
}
|
||||
|
||||
/// Generic routine to decompose a shuffle and blend into independent
|
||||
/// blends and permutes.
|
||||
///
|
||||
|
@ -10056,15 +10122,19 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
|
|||
BlendMask[i] = i + Size;
|
||||
}
|
||||
|
||||
// Try to lower with the simpler initial blend strategy unless one of the
|
||||
// input shuffles would be a no-op. We prefer to shuffle inputs as the
|
||||
// Try to lower with the simpler initial blend/unpack strategies unless one of
|
||||
// the input shuffles would be a no-op. We prefer to shuffle inputs as the
|
||||
// shuffle may be able to fold with a load or other benefit. However, when
|
||||
// we'll have to do 2x as many shuffles in order to achieve this, blending
|
||||
// first is a better strategy.
|
||||
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
|
||||
// we'll have to do 2x as many shuffles in order to achieve this,
|
||||
// blending/unpacking first is a better strategy.
|
||||
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
|
||||
if (SDValue BlendPerm =
|
||||
lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
|
||||
return BlendPerm;
|
||||
if (SDValue UnpackPerm =
|
||||
lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
|
||||
return UnpackPerm;
|
||||
}
|
||||
|
||||
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
|
||||
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
|
||||
|
|
|
@ -1,71 +1,71 @@
|
|||
; RUN: llc -O2 %s -o %t -filetype=obj
|
||||
; RUN: llvm-dwarfdump -debug-info %t | FileCheck %s
|
||||
|
||||
; Check that Machine CSE correctly preserves the debug location information
|
||||
; for variables during the transformation.
|
||||
|
||||
; Generated with clang -c -g -O2
|
||||
|
||||
; typedef float __attribute__((__vector_size__(16))) f4;
|
||||
; f4 get();
|
||||
; int main() {
|
||||
; float MyVar = get()[0];
|
||||
; if (MyVar)
|
||||
; return 1;
|
||||
; }
|
||||
|
||||
; ModuleID = 'test.cpp'
|
||||
source_filename = "test.cpp"
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-pc-linux-gnu"
|
||||
|
||||
; Test body: calls @_Z3getv (presumably the mangled get() from the C++ source
; quoted above), extracts element 0 of the returned <4 x float>, and binds that
; value to local variable !12 ('MyVar') via llvm.dbg.value. Returns the result
; of comparing the value 'une' against 0.0, zero-extended to i32.
define dso_local i32 @main() !dbg !7 {
|
||||
entry:
|
||||
%call = tail call <4 x float> @_Z3getv(), !dbg !14
|
||||
%vecext = extractelement <4 x float> %call, i32 0, !dbg !14
|
||||
call void @llvm.dbg.value(metadata float %vecext, metadata !12, metadata !DIExpression()), !dbg !15
|
||||
%tobool = fcmp une float %vecext, 0.000000e+00, !dbg !16
|
||||
%. = zext i1 %tobool to i32, !dbg !18
|
||||
ret i32 %., !dbg !19
|
||||
}
|
||||
|
||||
declare dso_local <4 x float> @_Z3getv()
|
||||
|
||||
declare void @llvm.dbg.value(metadata, metadata, metadata) #2
|
||||
|
||||
!llvm.dbg.cu = !{!0}
|
||||
!llvm.module.flags = !{!3, !4, !5}
|
||||
!llvm.ident = !{!6}
|
||||
|
||||
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 339665)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
|
||||
!1 = !DIFile(filename: "test.cpp", directory: ".")
|
||||
!2 = !{}
|
||||
!3 = !{i32 2, !"Dwarf Version", i32 4}
|
||||
!4 = !{i32 2, !"Debug Info Version", i32 3}
|
||||
!5 = !{i32 1, !"wchar_size", i32 4}
|
||||
!6 = !{!"clang version 8.0.0 (trunk 339665)"}
|
||||
!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
|
||||
!8 = !DISubroutineType(types: !9)
|
||||
!9 = !{!10}
|
||||
!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
|
||||
!11 = !{!12}
|
||||
!12 = !DILocalVariable(name: "MyVar", scope: !7, file: !1, line: 4, type: !13)
|
||||
!13 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
|
||||
!14 = !DILocation(line: 4, column: 18, scope: !7)
|
||||
!15 = !DILocation(line: 4, column: 9, scope: !7)
|
||||
!16 = !DILocation(line: 5, column: 7, scope: !17)
|
||||
!17 = distinct !DILexicalBlock(scope: !7, file: !1, line: 5, column: 7)
|
||||
!18 = !DILocation(line: 6, column: 5, scope: !17)
|
||||
!19 = !DILocation(line: 7, column: 1, scope: !7)
|
||||
|
||||
; Look at the debug location information for variable 'MyVar'.
|
||||
; Verify that we see a sequence of DI entries, that looks like:
|
||||
; DW_TAG_variable
|
||||
; DW_AT_location (0x00000000
|
||||
; [0x0000000000000009, 0x0000000000000012): DW_OP_reg17 XMM0)
|
||||
; DW_AT_name ("MyVar")
|
||||
|
||||
; CHECK-LABEL: DW_TAG_variable
|
||||
; CHECK-NEXT: DW_AT_location{{.*}}
|
||||
; CHECK-NEXT: {{.*}}DW_OP_reg17 XMM0
|
||||
; CHECK-NEXT: DW_AT_name{{.*}}("MyVar")
|
||||
; RUN: llc -O2 %s -o %t -filetype=obj
|
||||
; RUN: llvm-dwarfdump -debug-info %t | FileCheck %s
|
||||
|
||||
; Check that Machine CSE correctly preserves the debug location information
|
||||
; for variables during the transformation.
|
||||
|
||||
; Generated with clang -c -g -O2
|
||||
|
||||
; typedef float __attribute__((__vector_size__(16))) f4;
|
||||
; f4 get();
|
||||
; int main() {
|
||||
; float MyVar = get()[0];
|
||||
; if (MyVar)
|
||||
; return 1;
|
||||
; }
|
||||
|
||||
; ModuleID = 'test.cpp'
|
||||
source_filename = "test.cpp"
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-pc-linux-gnu"
|
||||
|
||||
; Test body: calls @_Z3getv (presumably the mangled get() from the C++ source
; quoted above), extracts element 0 of the returned <4 x float>, and binds that
; value to local variable !12 ('MyVar') via llvm.dbg.value. Returns the result
; of comparing the value 'une' against 0.0, zero-extended to i32.
define dso_local i32 @main() !dbg !7 {
|
||||
entry:
|
||||
%call = tail call <4 x float> @_Z3getv(), !dbg !14
|
||||
%vecext = extractelement <4 x float> %call, i32 0, !dbg !14
|
||||
call void @llvm.dbg.value(metadata float %vecext, metadata !12, metadata !DIExpression()), !dbg !15
|
||||
%tobool = fcmp une float %vecext, 0.000000e+00, !dbg !16
|
||||
%. = zext i1 %tobool to i32, !dbg !18
|
||||
ret i32 %., !dbg !19
|
||||
}
|
||||
|
||||
declare dso_local <4 x float> @_Z3getv()
|
||||
|
||||
declare void @llvm.dbg.value(metadata, metadata, metadata) #2
|
||||
|
||||
!llvm.dbg.cu = !{!0}
|
||||
!llvm.module.flags = !{!3, !4, !5}
|
||||
!llvm.ident = !{!6}
|
||||
|
||||
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 339665)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
|
||||
!1 = !DIFile(filename: "test.cpp", directory: ".")
|
||||
!2 = !{}
|
||||
!3 = !{i32 2, !"Dwarf Version", i32 4}
|
||||
!4 = !{i32 2, !"Debug Info Version", i32 3}
|
||||
!5 = !{i32 1, !"wchar_size", i32 4}
|
||||
!6 = !{!"clang version 8.0.0 (trunk 339665)"}
|
||||
!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
|
||||
!8 = !DISubroutineType(types: !9)
|
||||
!9 = !{!10}
|
||||
!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
|
||||
!11 = !{!12}
|
||||
!12 = !DILocalVariable(name: "MyVar", scope: !7, file: !1, line: 4, type: !13)
|
||||
!13 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
|
||||
!14 = !DILocation(line: 4, column: 18, scope: !7)
|
||||
!15 = !DILocation(line: 4, column: 9, scope: !7)
|
||||
!16 = !DILocation(line: 5, column: 7, scope: !17)
|
||||
!17 = distinct !DILexicalBlock(scope: !7, file: !1, line: 5, column: 7)
|
||||
!18 = !DILocation(line: 6, column: 5, scope: !17)
|
||||
!19 = !DILocation(line: 7, column: 1, scope: !7)
|
||||
|
||||
; Look at the debug location information for variable 'MyVar'.
|
||||
; Verify that we see a sequence of DI entries, that looks like:
|
||||
; DW_TAG_variable
|
||||
; DW_AT_location (0x00000000
|
||||
; [0x0000000000000009, 0x0000000000000012): DW_OP_reg17 XMM0)
|
||||
; DW_AT_name ("MyVar")
|
||||
|
||||
; CHECK-LABEL: DW_TAG_variable
|
||||
; CHECK-NEXT: DW_AT_location{{.*}}
|
||||
; CHECK-NEXT: {{.*}}DW_OP_reg17 XMM0
|
||||
; CHECK-NEXT: DW_AT_name{{.*}}("MyVar")
|
||||
|
|
|
@ -978,30 +978,11 @@ define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(
|
|||
|
||||
; PR31151
|
||||
define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(<16 x i8> %val1, <16 x i8> %val2) {
|
||||
; SSE2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,1,3]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
|
||||
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
|
||||
; SSE2-NEXT: pand %xmm1, %xmm0
|
||||
; SSE2-NEXT: pandn %xmm2, %xmm1
|
||||
; SSE2-NEXT: por %xmm0, %xmm1
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
|
||||
; SSSE3: # %bb.0:
|
||||
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
|
||||
; SSE41: # %bb.0:
|
||||
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
|
||||
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
|
||||
; SSE41-NEXT: retq
|
||||
; SSE-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
|
||||
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
|
||||
; AVX: # %bb.0:
|
||||
|
|
|
@ -1019,20 +1019,11 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2
|
|||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
|
||||
; AVX2-SLOW: # %bb.0:
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
|
||||
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
|
||||
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
|
||||
; AVX2-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX2-FAST-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
|
||||
; AVX2-FAST: # %bb.0:
|
||||
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
|
||||
; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
|
||||
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
|
||||
; AVX2-FAST-NEXT: retq
|
||||
; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
|
||||
; AVX512VL: # %bb.0:
|
||||
|
@ -4272,9 +4263,8 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_2
|
|||
;
|
||||
; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
|
||||
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
|
||||
|
@ -4308,10 +4298,8 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2
|
|||
;
|
||||
; AVX2-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
|
||||
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
|
||||
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
|
||||
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
|
||||
|
|
|
@ -1326,40 +1326,28 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
|
|||
;
|
||||
; AVX2-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
|
||||
; AVX2-SLOW: # %bb.0:
|
||||
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
|
||||
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
|
||||
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
|
||||
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
|
||||
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
|
||||
; AVX2-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX2-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
|
||||
; AVX2-FAST: # %bb.0:
|
||||
; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
|
||||
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
|
||||
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
|
||||
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
|
||||
; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
|
||||
; AVX2-FAST-NEXT: retq
|
||||
;
|
||||
; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
|
||||
; AVX512VLBW-SLOW: # %bb.0:
|
||||
; AVX512VLBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
|
||||
; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15]
|
||||
; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
|
||||
; AVX512VLBW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX512VLBW-SLOW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
|
||||
; AVX512VLBW-SLOW-NEXT: kmovd %eax, %k1
|
||||
; AVX512VLBW-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
|
||||
; AVX512VLBW-SLOW-NEXT: retq
|
||||
;
|
||||
; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
|
||||
; AVX512VLBW-FAST: # %bb.0:
|
||||
; AVX512VLBW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
|
||||
; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
|
||||
; AVX512VLBW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX512VLBW-FAST-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
|
||||
; AVX512VLBW-FAST-NEXT: kmovd %eax, %k1
|
||||
; AVX512VLBW-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
|
||||
; AVX512VLBW-FAST-NEXT: retq
|
||||
;
|
||||
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
|
||||
|
|
|
@ -1293,9 +1293,8 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
|
|||
;
|
||||
; AVX2OR512VL-LABEL: shuffle_v8i32_08084c4c:
|
||||
; AVX2OR512VL: # %bb.0:
|
||||
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,0,4,4,6,4]
|
||||
; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
|
||||
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
|
||||
; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
|
||||
; AVX2OR512VL-NEXT: retq
|
||||
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
|
||||
ret <8 x i32> %shuffle
|
||||
|
|
Loading…
Reference in New Issue