[X86][SSE] Transform truncations between vectors of integers into X86ISD::PACKUS/PACKSS operations during DAG combine.

This patch transforms truncation between vectors of integers into
X86ISD::PACKUS/PACKSS operations during DAG combine. We don't do it in
lowering phase because after type legalization, the original truncation
will be turned into a BUILD_VECTOR with each element that is extracted
from a vector and then truncated, and from them it is difficult to do
this optimization. This greatly improves the performance of truncations
on some specific types.

Cost table is updated accordingly.


Differential revision: http://reviews.llvm.org/D14588

llvm-svn: 256194
This commit is contained in:
Cong Hou 2015-12-21 20:42:43 +00:00
parent 6c494cd0df
commit 8df93ce455
4 changed files with 353 additions and 163 deletions

View File

@ -25897,12 +25897,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return SDValue();
}
static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget,
SDLoc(N));
}
/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@ -26546,6 +26540,163 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
SmallVector<SDValue, 8> &Regs) {
assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
Regs[0].getValueType() == MVT::v2i64));
EVT OutVT = N->getValueType(0);
EVT OutSVT = OutVT.getVectorElementType();
EVT InVT = Regs[0].getValueType();
EVT InSVT = InVT.getVectorElementType();
SDLoc DL(N);
// First, use mask to unset all bits that won't appear in the result.
assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
"OutSVT can only be either i8 or i16.");
SDValue MaskVal =
DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT);
SDValue MaskVec = DAG.getNode(
ISD::BUILD_VECTOR, DL, InVT,
SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal));
for (auto &Reg : Regs)
Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg);
MVT UnpackedVT, PackedVT;
if (OutSVT == MVT::i8) {
UnpackedVT = MVT::v8i16;
PackedVT = MVT::v16i8;
} else {
UnpackedVT = MVT::v4i32;
PackedVT = MVT::v8i16;
}
// In each iteration, truncate the type by a half size.
auto RegNum = Regs.size();
for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
j < e; j *= 2, RegNum /= 2) {
for (unsigned i = 0; i < RegNum; i++)
Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
for (unsigned i = 0; i < RegNum / 2; i++)
Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
Regs[i * 2 + 1]);
}
// If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
// then extract a subvector as the result since v8i8 is not a legal type.
if (OutVT == MVT::v8i8) {
Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
DAG.getIntPtrConstant(0, DL));
return Regs[0];
} else if (RegNum > 1) {
Regs.resize(RegNum);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
} else
return Regs[0];
}
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue
combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
SmallVector<SDValue, 8> &Regs) {
assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
EVT OutVT = N->getValueType(0);
SDLoc DL(N);
// Shift left by 16 bits, then arithmetic-shift right by 16 bits.
SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
for (auto &Reg : Regs) {
Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
}
for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
Regs[i * 2 + 1]);
if (Regs.size() > 2) {
Regs.resize(Regs.size() / 2);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
} else
return Regs[0];
}
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// diffcult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
EVT OutVT = N->getValueType(0);
if (!OutVT.isVector())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = OutVT.getVectorNumElements();
// TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
// SSE2, and we need to take care of it specially.
// AVX512 provides vpmovdb.
if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
return SDValue();
EVT OutSVT = OutVT.getVectorElementType();
EVT InSVT = InVT.getVectorElementType();
if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
NumElems >= 8))
return SDValue();
// SSSE3's pshufb results in less instructions in the cases below.
if (Subtarget->hasSSSE3() && NumElems == 8 &&
((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
(InSVT == MVT::i32 && OutSVT == MVT::i16)))
return SDValue();
SDLoc DL(N);
// Split a long vector into vectors of legal type.
unsigned RegNum = InVT.getSizeInBits() / 128;
SmallVector<SDValue, 8> SubVec(RegNum);
if (InSVT == MVT::i32) {
for (unsigned i = 0; i < RegNum; i++)
SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(i * 4, DL));
} else {
for (unsigned i = 0; i < RegNum; i++)
SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(i * 2, DL));
}
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PAKCUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
else if (InSVT == MVT::i32)
return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
else
return SDValue();
}
static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
// Try to detect AVG pattern first.
SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
Subtarget, SDLoc(N));
if (Avg.getNode())
return Avg;
return combineVectorTruncation(N, DAG, Subtarget);
}
/// Do target-specific dag combines on floating point negations.
static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {

View File

@ -780,10 +780,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 14 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 7 },
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 31 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },

View File

@ -237,7 +237,7 @@ define void @sext_v4i8_to_v4i16(<4 x i8>* %a) {
define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) {
; SSE2: truncate_v16i32_to_v16i16
; SSE2: cost of 14 {{.*}} trunc
; SSE2: cost of 10 {{.*}} trunc
;
; SSE41: truncate_v16i32_to_v16i16
; SSE41: cost of 6 {{.*}} trunc
@ -250,7 +250,7 @@ define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) {
define void @truncate_v8i32_to_v8i16(<8 x i32>* %a) {
; SSE2: truncate_v8i32_to_v8i16
; SSE2: cost of 7 {{.*}} trunc
; SSE2: cost of 5 {{.*}} trunc
;
; SSE41: truncate_v8i32_to_v8i16
; SSE41: cost of 3 {{.*}} trunc
@ -276,7 +276,7 @@ define void @truncate_v4i32_to_v4i16(<4 x i32>* %a) {
define void @truncate_v16i32_to_v16i8(<16 x i32>* %a) {
; SSE2: truncate_v16i32_to_v16i8
; SSE2: cost of 31 {{.*}} trunc
; SSE2: cost of 7 {{.*}} trunc
;
; SSE41: truncate_v16i32_to_v16i8
; SSE41: cost of 30 {{.*}} trunc

View File

@ -4,6 +4,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i32:
@ -56,6 +57,11 @@ define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc8i64_8i32:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: retq
entry:
%0 = trunc <8 x i64> %a to <8 x i32>
ret <8 x i32> %0
@ -102,36 +108,28 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
;
; SSE41-LABEL: trunc8i64_8i16:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pextrw $4, %xmm0, %eax
; SSE41-NEXT: pinsrw $1, %eax, %xmm0
; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: pinsrw $2, %eax, %xmm0
; SSE41-NEXT: pextrw $4, %xmm1, %eax
; SSE41-NEXT: pinsrw $3, %eax, %xmm0
; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: pinsrw $4, %eax, %xmm0
; SSE41-NEXT: pextrw $4, %xmm2, %eax
; SSE41-NEXT: pinsrw $5, %eax, %xmm0
; SSE41-NEXT: movd %xmm3, %eax
; SSE41-NEXT: pinsrw $6, %eax, %xmm0
; SSE41-NEXT: pextrw $4, %xmm3, %eax
; SSE41-NEXT: pinsrw $7, %eax, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i16:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@ -145,21 +143,79 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc8i64_8i16:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
%0 = trunc <8 x i64> %a to <8 x i16>
ret <8 x i16> %0
}
define void @trunc8i64_8i8(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i8:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: movq %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i8:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc8i64_8i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovqb %zmm0, (%rax)
; AVX512BW-NEXT: retq
entry:
%0 = trunc <8 x i64> %a to <8 x i8>
store <8 x i8> %0, <8 x i8>* undef, align 4
ret void
}
define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i16:
@ -194,6 +250,11 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: retq
entry:
%0 = trunc <8 x i32> %a to <8 x i16>
ret <8 x i16> %0
@ -202,7 +263,7 @@ entry:
define void @trunc8i32_8i8(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
@ -247,6 +308,13 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: retq
entry:
%0 = trunc <8 x i32> %a to <8 x i8>
store <8 x i8> %0, <8 x i8>* undef, align 4
@ -254,128 +322,31 @@ entry:
}
define void @trunc16i32_16i8(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i8:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pextrb $4, %xmm0, %eax
; SSE41-NEXT: pextrb $8, %xmm0, %ecx
; SSE41-NEXT: pextrb $12, %xmm0, %edx
; SSE41-NEXT: pinsrb $1, %eax, %xmm0
; SSE41-NEXT: pinsrb $2, %ecx, %xmm0
; SSE41-NEXT: pinsrb $3, %edx, %xmm0
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: pinsrb $4, %eax, %xmm0
; SSE41-NEXT: pextrb $4, %xmm1, %eax
; SSE41-NEXT: pinsrb $5, %eax, %xmm0
; SSE41-NEXT: pextrb $8, %xmm1, %eax
; SSE41-NEXT: pinsrb $6, %eax, %xmm0
; SSE41-NEXT: pextrb $12, %xmm1, %eax
; SSE41-NEXT: pinsrb $7, %eax, %xmm0
; SSE41-NEXT: pextrb $0, %xmm2, %eax
; SSE41-NEXT: pinsrb $8, %eax, %xmm0
; SSE41-NEXT: pextrb $4, %xmm2, %eax
; SSE41-NEXT: pinsrb $9, %eax, %xmm0
; SSE41-NEXT: pextrb $8, %xmm2, %eax
; SSE41-NEXT: pinsrb $10, %eax, %xmm0
; SSE41-NEXT: pextrb $12, %xmm2, %eax
; SSE41-NEXT: pinsrb $11, %eax, %xmm0
; SSE41-NEXT: pextrb $0, %xmm3, %eax
; SSE41-NEXT: pinsrb $12, %eax, %xmm0
; SSE41-NEXT: pextrb $4, %xmm3, %eax
; SSE41-NEXT: pinsrb $13, %eax, %xmm0
; SSE41-NEXT: pextrb $8, %xmm3, %eax
; SSE41-NEXT: pinsrb $14, %eax, %xmm0
; SSE41-NEXT: pextrb $12, %xmm3, %eax
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
; SSE-LABEL: trunc16i32_16i8:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@ -394,6 +365,11 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc16i32_16i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovdb %zmm0, (%rax)
; AVX512BW-NEXT: retq
entry:
%0 = trunc <16 x i32> %a to <16 x i8>
store <16 x i8> %0, <16 x i8>* undef, align 4
@ -451,6 +427,13 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i32:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
%0 = trunc <4 x i64> %a to <4 x i32>
%1 = trunc <4 x i64> %b to <4 x i32>
@ -543,6 +526,16 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: retq
entry:
%0 = trunc <4 x i64> %a to <4 x i16>
%1 = trunc <4 x i64> %b to <4 x i16>
@ -585,6 +578,13 @@ define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc2x2i64_4i32:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: retq
entry:
%0 = trunc <2 x i64> %a to <2 x i32>
%1 = trunc <2 x i64> %b to <2 x i32>
@ -604,6 +604,12 @@ define i64 @trunc2i64_i64(<2 x i64> %inval) {
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512BW-LABEL: trunc2i64_i64:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
entry:
%0 = trunc <2 x i64> %inval to <2 x i32>
%1 = bitcast <2 x i32> %0 to i64
@ -645,6 +651,14 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i32_8i16:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: retq
entry:
%0 = trunc <4 x i32> %a to <4 x i16>
%1 = trunc <4 x i32> %b to <4 x i16>
@ -679,6 +693,12 @@ define i64 @trunc4i32_i64(<4 x i32> %inval) {
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512BW-LABEL: trunc4i32_i64:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
entry:
%0 = trunc <4 x i32> %inval to <4 x i16>
%1 = bitcast <4 x i16> %0 to i64
@ -717,6 +737,14 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
; AVX512BW-LABEL: trunc2x8i16_16i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: retq
entry:
%0 = trunc <8 x i16> %a to <8 x i8>
%1 = trunc <8 x i16> %b to <8 x i8>
@ -750,6 +778,12 @@ define i64 @trunc8i16_i64(<8 x i16> %inval) {
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512BW-LABEL: trunc8i16_i64:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
entry:
%0 = trunc <8 x i16> %inval to <8 x i8>
%1 = bitcast <8 x i8> %0 to i64
@ -766,6 +800,11 @@ define <16 x i8> @trunc16i64_16i8_const() {
; AVX: # BB#0: # %entry
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512BW-LABEL: trunc16i64_16i8_const:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
%0 = trunc <16 x i64> zeroinitializer to <16 x i8>