[X86][SSE] Transform truncations between vectors of integers into X86ISD::PACKUS/PACKSS operations during DAG combine.
This patch transforms truncations between vectors of integers into X86ISD::PACKUS/PACKSS operations during DAG combine. We don't do this in the lowering phase because, after type legalization, the original truncation has been turned into a BUILD_VECTOR whose elements are each extracted from a vector and then truncated, and it is difficult to perform the optimization on that form. This greatly improves the performance of truncations on some specific types. The cost table is updated accordingly.

Differential revision: http://reviews.llvm.org/D14588

llvm-svn: 256194
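Before the diff, here is the shape of the transform in concrete terms. On SSE2, a wide integer truncation becomes a short mask-and-pack sequence instead of per-element extracts and inserts. The following standalone sketch (illustration only; the helper name is made up, this is not code from the patch) mirrors what the combine emits for a <16 x i32> -> <16 x i8> truncation; compare the SSE CHECK lines of trunc16i32_16i8 in the test updates below.

#include <emmintrin.h> // SSE2

// Hypothetical helper: truncate 16 x i32, held in four XMM registers,
// to 16 x i8 using the mask + packuswb idiom the combine produces.
static __m128i Trunc16i32To16i8(__m128i A, __m128i B, __m128i C, __m128i D) {
  // Unset every bit that cannot appear in the result, so the saturating
  // packs below never clamp anything.
  const __m128i Mask = _mm_set1_epi32(0xFF);
  A = _mm_and_si128(A, Mask);
  B = _mm_and_si128(B, Mask);
  C = _mm_and_si128(C, Mask);
  D = _mm_and_si128(D, Mask);
  // Each packuswb halves the element width; two rounds take i32 lanes to i8.
  __m128i AB = _mm_packus_epi16(A, B); // bytes: a0,0,a1,0,...,b3,0
  __m128i CD = _mm_packus_epi16(C, D);
  return _mm_packus_epi16(AB, CD);     // bytes: a0..a3,b0..b3,c0..c3,d0..d3
}

Four pands plus three packuswbs replace sixteen extract/truncate/insert round trips, which is also where the cost-table update below comes from.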
@@ -25897,12 +25897,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
-                                      const X86Subtarget *Subtarget) {
-  return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget,
-                          SDLoc(N));
-}
-
 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -26546,6 +26540,163 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
+static SDValue
+combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
+                             Regs[0].getValueType() == MVT::v2i64));
+  EVT OutVT = N->getValueType(0);
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InVT = Regs[0].getValueType();
+  EVT InSVT = InVT.getVectorElementType();
+  SDLoc DL(N);
+
+  // First, use mask to unset all bits that won't appear in the result.
+  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
+         "OutSVT can only be either i8 or i16.");
+  SDValue MaskVal =
+      DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT);
+  SDValue MaskVec = DAG.getNode(
+      ISD::BUILD_VECTOR, DL, InVT,
+      SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal));
+  for (auto &Reg : Regs)
+    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg);
+
+  MVT UnpackedVT, PackedVT;
+  if (OutSVT == MVT::i8) {
+    UnpackedVT = MVT::v8i16;
+    PackedVT = MVT::v16i8;
+  } else {
+    UnpackedVT = MVT::v4i32;
+    PackedVT = MVT::v8i16;
+  }
+
+  // In each iteration, truncate the type by a half size.
+  auto RegNum = Regs.size();
+  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
+       j < e; j *= 2, RegNum /= 2) {
+    for (unsigned i = 0; i < RegNum; i++)
+      Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
+    for (unsigned i = 0; i < RegNum / 2; i++)
+      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
+                            Regs[i * 2 + 1]);
+  }
+
+  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
+  // and then extract a subvector as the result since v8i8 is not a legal type.
+  if (OutVT == MVT::v8i8) {
+    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
+    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
+                          DAG.getIntPtrConstant(0, DL));
+    return Regs[0];
+  } else if (RegNum > 1) {
+    Regs.resize(RegNum);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
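
As an annotation (not part of the patch; the helper name is hypothetical): the masking step above is what makes the unsigned saturating pack safe. On SSE4.1, where packusdw covers 2 x v4i32 -> v8i16 directly, the emitted pattern is equivalent to:

#include <smmintrin.h> // SSE4.1: _mm_packus_epi32 is packusdw

// Hypothetical helper: truncate two <4 x i32> registers to one <8 x i16>.
static __m128i Trunc8i32To8i16(__m128i Lo, __m128i Hi) {
  // Keep only the low 16 bits of each lane so the unsigned saturation
  // in packusdw never clamps a value.
  const __m128i Mask = _mm_set1_epi32(0xFFFF);
  Lo = _mm_and_si128(Lo, Mask);
  Hi = _mm_and_si128(Hi, Mask);
  return _mm_packus_epi32(Lo, Hi); // i32 lanes -> i16 lanes, concatenated
}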
+
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
+static SDValue
+combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
+  EVT OutVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
+  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+  for (auto &Reg : Regs) {
+    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+  }
+
+  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+                          Regs[i * 2 + 1]);
+
+  if (Regs.size() > 2) {
+    Regs.resize(Regs.size() / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
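
Annotation (not part of the patch; names are hypothetical): plain SSE2 has no packusdw, so the function above prepares each register for the signed pack instead. Shifting left then arithmetic-shifting right by 16 sign-extends the low halfword of every lane in place, and packssdw's signed saturation then passes the values through unchanged. This is exactly the pslld/psrad/packssdw sequence the updated SSE2 checks for trunc8i32_8i16 below expect:

#include <emmintrin.h> // SSE2

// Hypothetical helper: truncate two <4 x i32> registers to one <8 x i16>
// without SSE4.1.
static __m128i Trunc8i32To8i16_SSE2(__m128i Lo, __m128i Hi) {
  Lo = _mm_srai_epi32(_mm_slli_epi32(Lo, 16), 16); // sign-extend low halfword
  Hi = _mm_srai_epi32(_mm_slli_epi32(Hi, 16), 16);
  return _mm_packs_epi32(Lo, Hi); // packssdw: saturation is now a no-op
}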
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR with
+/// each element that is extracted from a vector and then truncated, and it is
+/// difficult to do this optimization based on them.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  unsigned NumElems = OutVT.getVectorNumElements();
+
+  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+  // SSE2, and we need to take care of it specially.
+  // AVX512 provides vpmovdb.
+  if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+    return SDValue();
+
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InSVT = InVT.getVectorElementType();
+  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+        NumElems >= 8))
+    return SDValue();
+
+  // SSSE3's pshufb results in fewer instructions in the cases below.
+  if (Subtarget->hasSSSE3() && NumElems == 8 &&
+      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Split a long vector into vectors of legal type.
+  unsigned RegNum = InVT.getSizeInBits() / 128;
+  SmallVector<SDValue, 8> SubVec(RegNum);
+  if (InSVT == MVT::i32) {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                              DAG.getIntPtrConstant(i * 4, DL));
+  } else {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                              DAG.getIntPtrConstant(i * 2, DL));
+  }
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides
+  // PACKUS for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS
+  // to truncate 2 x v4i32 to v8i16.
+  if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+  else if (InSVT == MVT::i32)
+    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+  else
+    return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  // Try to detect AVG pattern first.
+  SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
+                                 Subtarget, SDLoc(N));
+  if (Avg.getNode())
+    return Avg;
+
+  return combineVectorTruncation(N, DAG, Subtarget);
+}
+
 /// Do target-specific dag combines on floating point negations.
 static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
@@ -780,10 +780,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },
 
-    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 14 },
-    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  7 },
+    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 10 },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
     { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  3 },
-    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 31 },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
     { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  3 },
     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
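
(A plausible reading of the new SSE2 entries, not stated explicitly in the patch: they match a one-unit-per-instruction count of the emitted sequences. v16i32 -> v16i8 is four pands plus three packuswbs, hence 7; v8i32 -> v8i16 is two pslld/psrad pairs plus one packssdw, hence 5; v16i32 -> v16i16 is four pslld/psrad pairs plus two packssdws, hence 10.)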
@@ -237,7 +237,7 @@ define void @sext_v4i8_to_v4i16(<4 x i8>* %a) {
 
 define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) {
 ; SSE2: truncate_v16i32_to_v16i16
-; SSE2: cost of 14 {{.*}} trunc
+; SSE2: cost of 10 {{.*}} trunc
 ;
 ; SSE41: truncate_v16i32_to_v16i16
 ; SSE41: cost of 6 {{.*}} trunc
@@ -250,7 +250,7 @@ define void @truncate_v16i32_to_v16i16(<16 x i32>* %a) {
 
 define void @truncate_v8i32_to_v8i16(<8 x i32>* %a) {
 ; SSE2: truncate_v8i32_to_v8i16
-; SSE2: cost of 7 {{.*}} trunc
+; SSE2: cost of 5 {{.*}} trunc
 ;
 ; SSE41: truncate_v8i32_to_v8i16
 ; SSE41: cost of 3 {{.*}} trunc
@@ -276,7 +276,7 @@ define void @truncate_v4i32_to_v4i16(<4 x i32>* %a) {
 
 define void @truncate_v16i32_to_v16i8(<16 x i32>* %a) {
 ; SSE2: truncate_v16i32_to_v16i8
-; SSE2: cost of 31 {{.*}} trunc
+; SSE2: cost of 7 {{.*}} trunc
 ;
 ; SSE41: truncate_v16i32_to_v16i8
 ; SSE41: cost of 30 {{.*}} trunc
@@ -4,6 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
 
 define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
 ; SSE2-LABEL: trunc8i64_8i32:
@@ -56,6 +57,11 @@ define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
 ; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i64_8i32:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <8 x i64> %a to <8 x i32>
   ret <8 x i32> %0
@@ -102,36 +108,28 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
 ;
 ; SSE41-LABEL: trunc8i64_8i16:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pextrw $4, %xmm0, %eax
-; SSE41-NEXT: pinsrw $1, %eax, %xmm0
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: pinsrw $2, %eax, %xmm0
-; SSE41-NEXT: pextrw $4, %xmm1, %eax
-; SSE41-NEXT: pinsrw $3, %eax, %xmm0
-; SSE41-NEXT: movd %xmm2, %eax
-; SSE41-NEXT: pinsrw $4, %eax, %xmm0
-; SSE41-NEXT: pextrw $4, %xmm2, %eax
-; SSE41-NEXT: pinsrw $5, %eax, %xmm0
-; SSE41-NEXT: movd %xmm3, %eax
-; SSE41-NEXT: pinsrw $6, %eax, %xmm0
-; SSE41-NEXT: pextrw $4, %xmm3, %eax
-; SSE41-NEXT: pinsrw $7, %eax, %xmm0
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: trunc8i64_8i16:
 ; AVX1: # BB#0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -145,21 +143,79 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i64_8i16:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <8 x i64> %a to <8 x i16>
   ret <8 x i16> %0
 }
 
+define void @trunc8i64_8i8(<8 x i64> %a) {
+; SSE-LABEL: trunc8i64_8i8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: movq %xmm0, (%rax)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i8:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i64_8i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovqb %zmm0, (%rax)
+; AVX512BW-NEXT: retq
+entry:
+  %0 = trunc <8 x i64> %a to <8 x i8>
+  store <8 x i8> %0, <8 x i8>* undef, align 4
+  ret void
+}
+
 define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
 ; SSE2-LABEL: trunc8i32_8i16:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: trunc8i32_8i16:
@@ -194,6 +250,11 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i16:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <8 x i32> %a to <8 x i16>
   ret <8 x i16> %0
@@ -202,7 +263,7 @@ entry:
 define void @trunc8i32_8i8(<8 x i32> %a) {
 ; SSE2-LABEL: trunc8i32_8i8:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: packuswb %xmm1, %xmm0
@@ -247,6 +308,13 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
 ; AVX2-NEXT: vmovq %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, (%rax)
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <8 x i32> %a to <8 x i8>
   store <8 x i8> %0, <8 x i8>* undef, align 4
@@ -254,128 +322,31 @@ entry:
 }
 
 define void @trunc16i32_16i8(<16 x i32> %a) {
-; SSE2-LABEL: trunc16i32_16i8:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i32_16i8:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i32_16i8:
-; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pextrb $4, %xmm0, %eax
-; SSE41-NEXT: pextrb $8, %xmm0, %ecx
-; SSE41-NEXT: pextrb $12, %xmm0, %edx
-; SSE41-NEXT: pinsrb $1, %eax, %xmm0
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $3, %edx, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: pinsrb $4, %eax, %xmm0
-; SSE41-NEXT: pextrb $4, %xmm1, %eax
-; SSE41-NEXT: pinsrb $5, %eax, %xmm0
-; SSE41-NEXT: pextrb $8, %xmm1, %eax
-; SSE41-NEXT: pinsrb $6, %eax, %xmm0
-; SSE41-NEXT: pextrb $12, %xmm1, %eax
-; SSE41-NEXT: pinsrb $7, %eax, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm2, %eax
-; SSE41-NEXT: pinsrb $8, %eax, %xmm0
-; SSE41-NEXT: pextrb $4, %xmm2, %eax
-; SSE41-NEXT: pinsrb $9, %eax, %xmm0
-; SSE41-NEXT: pextrb $8, %xmm2, %eax
-; SSE41-NEXT: pinsrb $10, %eax, %xmm0
-; SSE41-NEXT: pextrb $12, %xmm2, %eax
-; SSE41-NEXT: pinsrb $11, %eax, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm3, %eax
-; SSE41-NEXT: pinsrb $12, %eax, %xmm0
-; SSE41-NEXT: pextrb $4, %xmm3, %eax
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: pextrb $8, %xmm3, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm0
-; SSE41-NEXT: pextrb $12, %xmm3, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm0
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc16i32_16i8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: movdqu %xmm0, (%rax)
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc16i32_16i8:
 ; AVX1: # BB#0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -394,6 +365,11 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: trunc16i32_16i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <16 x i32> %a to <16 x i8>
   store <16 x i8> %0, <16 x i8>* undef, align 4
@@ -451,6 +427,13 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
 ; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i64_8i32:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <4 x i64> %a to <4 x i32>
   %1 = trunc <4 x i64> %b to <4 x i32>
@@ -543,6 +526,16 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i64_8i16:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <4 x i64> %a to <4 x i16>
   %1 = trunc <4 x i64> %b to <4 x i16>
@@ -585,6 +578,13 @@ define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x2i64_4i32:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <2 x i64> %a to <2 x i32>
   %1 = trunc <2 x i64> %b to <2 x i32>
@@ -604,6 +604,12 @@ define i64 @trunc2i64_i64(<2 x i64> %inval) {
 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX-NEXT: vmovq %xmm0, %rax
 ; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2i64_i64:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <2 x i64> %inval to <2 x i32>
   %1 = bitcast <2 x i32> %0 to i64
@@ -645,6 +651,14 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i32_8i16:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <4 x i32> %a to <4 x i16>
   %1 = trunc <4 x i32> %b to <4 x i16>
@@ -679,6 +693,12 @@ define i64 @trunc4i32_i64(<4 x i32> %inval) {
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX-NEXT: vmovq %xmm0, %rax
 ; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: trunc4i32_i64:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <4 x i32> %inval to <4 x i16>
   %1 = bitcast <4 x i16> %0 to i64
@@ -717,6 +737,14 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x8i16_16i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <8 x i16> %a to <8 x i8>
   %1 = trunc <8 x i16> %b to <8 x i8>
@@ -750,6 +778,12 @@ define i64 @trunc8i16_i64(<8 x i16> %inval) {
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX-NEXT: vmovq %xmm0, %rax
 ; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i16_i64:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: retq
 entry:
   %0 = trunc <8 x i16> %inval to <8 x i8>
   %1 = bitcast <8 x i8> %0 to i64
@@ -766,6 +800,11 @@ define <16 x i8> @trunc16i64_16i8_const() {
 ; AVX: # BB#0: # %entry
 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: trunc16i64_16i8_const:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
 
 entry:
   %0 = trunc <16 x i64> zeroinitializer to <16 x i8>