forked from OSchip/llvm-project
[X86][AVX] lowerShuffleWithPERMV - pad 128/256-bit shuffles on non-VLX targets
Allow non-VLX targets to use 512-bit VPERMV/VPERMV3 for 128/256-bit shuffles. TBH I'm not sure these targets actually exist in the wild, but we're testing for them and it's good test coverage for shuffle lowering/combines across different subvector widths.
This commit is contained in:
parent
011bf4fd96
commit
abd33bf5ef
|
@ -14969,17 +14969,35 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
|||
Mask, Subtarget, DAG);
|
||||
}
|
||||
|
||||
// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
|
||||
// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
|
||||
// the active subvector is extracted.
|
||||
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
|
||||
ArrayRef<int> Mask, SDValue V1,
|
||||
SDValue V2, SelectionDAG &DAG) {
|
||||
ArrayRef<int> Mask, SDValue V1, SDValue V2,
|
||||
const X86Subtarget &Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
|
||||
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
|
||||
|
||||
SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
|
||||
if (V2.isUndef())
|
||||
return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
|
||||
|
||||
return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
|
||||
MVT ShuffleVT = VT;
|
||||
if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
|
||||
V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
|
||||
V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
|
||||
MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
|
||||
ShuffleVT = V1.getSimpleValueType();
|
||||
}
|
||||
|
||||
SDValue Result;
|
||||
if (V2.isUndef())
|
||||
Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
|
||||
else
|
||||
Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
|
||||
|
||||
if (VT != ShuffleVT)
|
||||
Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
|
||||
|
||||
return Result;
|
||||
}
|
||||
|
||||
/// Generic lowering of v16i8 shuffles.
|
||||
|
@ -15208,9 +15226,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
|||
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
|
||||
return Unpack;
|
||||
|
||||
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
|
||||
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
|
||||
return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
|
||||
// AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
|
||||
if (Subtarget.hasVBMI())
|
||||
return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
|
||||
DAG);
|
||||
|
||||
// If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
|
||||
if (Subtarget.hasXOP()) {
|
||||
|
@ -16964,9 +16983,9 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
|||
Zeroable, Subtarget, DAG))
|
||||
return PSHUFB;
|
||||
|
||||
// AVX512BWVL can lower to VPERMW.
|
||||
if (Subtarget.hasBWI() && Subtarget.hasVLX())
|
||||
return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
|
||||
// AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
|
||||
if (Subtarget.hasBWI())
|
||||
return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
|
@ -17069,9 +17088,9 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
|||
Zeroable, Subtarget, DAG))
|
||||
return PSHUFB;
|
||||
|
||||
// AVX512VBMIVL can lower to VPERMB.
|
||||
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
|
||||
return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
|
||||
// AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
|
||||
if (Subtarget.hasVBMI())
|
||||
return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
|
||||
|
||||
// Try to simplify this by merging 128-bit lanes to enable a lane-based
|
||||
// shuffle.
|
||||
|
@ -17325,7 +17344,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
|||
Zeroable, Subtarget, DAG))
|
||||
return Blend;
|
||||
|
||||
return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
|
||||
return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
|
||||
}
|
||||
|
||||
/// Handle lowering of 16-lane 32-bit floating point shuffles.
|
||||
|
@ -17384,7 +17403,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
|||
V1, V2, DAG, Subtarget))
|
||||
return V;
|
||||
|
||||
return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
|
||||
return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
|
||||
}
|
||||
|
||||
/// Handle lowering of 8-lane 64-bit integer shuffles.
|
||||
|
@ -17447,7 +17466,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
|||
Zeroable, Subtarget, DAG))
|
||||
return Blend;
|
||||
|
||||
return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
|
||||
return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
|
||||
}
|
||||
|
||||
/// Handle lowering of 16-lane 32-bit integer shuffles.
|
||||
|
@ -17524,7 +17543,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
|||
Zeroable, Subtarget, DAG))
|
||||
return Blend;
|
||||
|
||||
return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
|
||||
return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
|
||||
}
|
||||
|
||||
/// Handle lowering of 32-lane 16-bit integer shuffles.
|
||||
|
@ -17587,7 +17606,7 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
|||
Zeroable, Subtarget, DAG))
|
||||
return PSHUFB;
|
||||
|
||||
return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
|
||||
return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
|
||||
}
|
||||
|
||||
/// Handle lowering of 64-lane 8-bit integer shuffles.
|
||||
|
@ -17643,7 +17662,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
|||
|
||||
// VBMI can use VPERMV/VPERMV3 byte shuffles.
|
||||
if (Subtarget.hasVBMI())
|
||||
return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
|
||||
return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
|
||||
|
||||
// Try to create an in-lane repeating shuffle mask and then shuffle the
|
||||
// results into the target lanes.
|
||||
|
|
|
@ -85,12 +85,10 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
|
|||
;
|
||||
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
|
||||
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
|
||||
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
|
||||
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
|
||||
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31]
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
|
||||
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,2,1,3]
|
||||
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
|
@ -260,20 +258,11 @@ define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
|
|||
;
|
||||
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
|
||||
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
|
||||
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
|
||||
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
|
||||
; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
|
||||
|
@ -327,20 +316,11 @@ define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
|
|||
;
|
||||
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
|
||||
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
|
||||
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
|
||||
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
|
||||
; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
|
||||
|
@ -394,20 +374,11 @@ define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
|
|||
;
|
||||
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
|
||||
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
|
||||
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
|
||||
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
|
||||
; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
|
||||
|
|
|
@ -328,8 +328,8 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
|
|||
;
|
||||
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
|
||||
; AVX512VBMI: # %bb.0:
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
|
||||
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512VBMI-NEXT: vpermt2b %zmm0, %zmm1, %zmm0
|
||||
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
|
||||
; AVX512VBMI-NEXT: vzeroupper
|
||||
; AVX512VBMI-NEXT: retq
|
||||
|
@ -413,8 +413,8 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
|
|||
;
|
||||
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
|
||||
; AVX512VBMI: # %bb.0:
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
|
||||
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512VBMI-NEXT: vpermt2b %zmm0, %zmm1, %zmm0
|
||||
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
|
||||
; AVX512VBMI-NEXT: vzeroupper
|
||||
; AVX512VBMI-NEXT: retq
|
||||
|
@ -457,13 +457,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
|
|||
;
|
||||
; AVX512BW-LABEL: PR34175:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1
|
||||
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
|
||||
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
|
||||
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
|
||||
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1
|
||||
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
|
||||
; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
||||
; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
|
@ -478,13 +475,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
|
|||
;
|
||||
; AVX512VBMI-LABEL: PR34175:
|
||||
; AVX512VBMI: # %bb.0:
|
||||
; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0
|
||||
; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1
|
||||
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
|
||||
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
|
||||
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
|
||||
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
|
||||
; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1
|
||||
; AVX512VBMI-NEXT: vpermt2w %zmm0, %zmm0, %zmm1
|
||||
; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
|
||||
; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
|
||||
; AVX512VBMI-NEXT: retq
|
||||
;
|
||||
|
|
Loading…
Reference in New Issue