forked from OSchip/llvm-project
[X86][SSE] Add support for (V)PMOVSX* constant folding
We already have (V)PMOVZX* combining support; this is the beginning of handling (V)PMOVSX* similarly - other combines in combineVSZext can be generalized in future patches.

This unearthed an interesting bug in that we were generating illegal build vectors on 32-bit targets - it was proving difficult to create a test for it from PMOVZX, but it fired immediately with PMOVSX.

I've created a more general form of the existing getConstVector to handle these cases - ideally this should be handled in non-target-specific code but I couldn't find an equivalent.

Differential Revision: https://reviews.llvm.org/D25874

llvm-svn: 285072
parent 426e6f71f8
commit 5c3c9707c3
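Before the diff, a minimal standalone sketch of what the new fold does, using plain C++ containers instead of SelectionDAG nodes (the names and types here are illustrative only, not the LLVM API): each constant source element is sign- or zero-extended to the destination element width, and undef elements are tracked separately so they stay undef in the folded result.

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// Illustrative sketch only - not the LLVM API. std::nullopt stands in for an
// undef lane; the real combine works on APInt values and SDNodes.
enum class ExtKind { Sign, Zero };

using Lane8  = std::optional<int8_t>;
using Lane64 = std::optional<int64_t>;

std::vector<Lane64> foldExtOfConstants(const std::vector<Lane8> &Src, ExtKind K) {
  std::vector<Lane64> Dst;
  Dst.reserve(Src.size());
  for (const Lane8 &Elt : Src) {
    if (!Elt) {                       // undef stays undef in the folded vector
      Dst.push_back(std::nullopt);
      continue;
    }
    Dst.push_back(K == ExtKind::Sign
                      ? static_cast<int64_t>(*Elt)                         // PMOVSX
                      : static_cast<int64_t>(static_cast<uint8_t>(*Elt))); // PMOVZX
  }
  return Dst;
}

// foldExtOfConstants({-1, 0, std::nullopt, 117}, ExtKind::Sign) -> {-1, 0, undef, 117}
// foldExtOfConstants({-1, 0, std::nullopt, 117}, ExtKind::Zero) -> {255, 0, undef, 117}
```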
@@ -4427,6 +4427,40 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
   return ConstsNode;
 }
 
+static SDValue getConstVector(ArrayRef<APInt> Values, SmallBitVector &Undefs,
+                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+  assert(Values.size() == Undefs.size() && "Unequal constant and undef arrays");
+  SmallVector<SDValue, 32> Ops;
+  bool Split = false;
+
+  MVT ConstVecVT = VT;
+  unsigned NumElts = VT.getVectorNumElements();
+  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
+    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+    Split = true;
+  }
+
+  MVT EltVT = ConstVecVT.getVectorElementType();
+  for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+    if (Undefs[i]) {
+      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
+      continue;
+    }
+    const APInt &V = Values[i];
+    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
+    if (Split) {
+      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
+      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
+    } else {
+      Ops.push_back(DAG.getConstant(V, dl, EltVT));
+    }
+  }
+
+  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
+  return DAG.getBitcast(VT, ConstsNode);
+}
+
 /// Returns a vector of specified type with all zero elements.
 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                              SelectionDAG &DAG, const SDLoc &dl) {
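The Split path above is what addresses the illegal-build-vector bug the commit message mentions: when i64 is not a legal type (32-bit targets), each 64-bit constant is emitted as two 32-bit lanes, low half first, and the whole vector is bitcast back to the i64 type. A standalone sketch of that lane split, using plain fixed-width integers instead of APInt (the function name is illustrative):

```cpp
#include <cstdint>
#include <vector>

// Mirrors the Split path of the new getConstVector: V.trunc(32) gives the low
// 32-bit lane and V.lshr(32).trunc(32) the high lane (little-endian lane order),
// so a v2i64 constant becomes a v4i32 constant that is bitcast back to v2i64.
std::vector<uint32_t> splitI64ConstVector(const std::vector<uint64_t> &Vals) {
  std::vector<uint32_t> Lanes;
  Lanes.reserve(Vals.size() * 2);
  for (uint64_t V : Vals) {
    Lanes.push_back(static_cast<uint32_t>(V));        // V.trunc(32)
    Lanes.push_back(static_cast<uint32_t>(V >> 32));  // V.lshr(32).trunc(32)
  }
  return Lanes;
}

// splitI64ConstVector({1, 0x100000000}) -> {1, 0, 0, 1}
```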
@@ -31817,10 +31851,11 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   return OptimizeConditionalInDecrement(N, DAG);
 }
 
-static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
-                            TargetLowering::DAGCombinerInfo &DCI,
-                            const X86Subtarget &Subtarget) {
+static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI,
+                             const X86Subtarget &Subtarget) {
   SDLoc DL(N);
+  unsigned Opcode = N->getOpcode();
   MVT VT = N->getSimpleValueType(0);
   MVT SVT = VT.getVectorElementType();
   SDValue Op = N->getOperand(0);
@@ -31829,25 +31864,28 @@ static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
 
   // Perform any constant folding.
   // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
-    SmallVector<SDValue, 4> Vals;
-    for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+    unsigned NumDstElts = VT.getVectorNumElements();
+    SmallBitVector Undefs(NumDstElts, false);
+    SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
+    for (unsigned i = 0; i != NumDstElts; ++i) {
       SDValue OpElt = Op.getOperand(i);
       if (OpElt.getOpcode() == ISD::UNDEF) {
-        Vals.push_back(DAG.getUNDEF(SVT));
+        Undefs[i] = true;
        continue;
       }
       APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
       assert(Cst.getBitWidth() == OpEltVT.getSizeInBits());
-      Cst = Cst.zextOrTrunc(SVT.getSizeInBits());
-      Vals.push_back(DAG.getConstant(Cst, DL, SVT));
+      Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
+                                        : Cst.sextOrTrunc(SVT.getSizeInBits());
     }
-    return DAG.getBuildVector(VT, DL, Vals);
+    return getConstVector(Vals, Undefs, VT, DAG, DL);
   }
 
   // (vzext (bitcast (vzext (x)) -> (vzext x)
+  // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
   SDValue V = peekThroughBitcasts(Op);
-  if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
+  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
     MVT InnerVT = V.getSimpleValueType();
     MVT InnerEltVT = InnerVT.getVectorElementType();
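The SmallBitVector of undefs replaces the old per-element DAG.getUNDEF: the fold only records which destination elements are undef, and the new getConstVector decides how many undef lanes each one becomes (one normally, two when an i64 element is split into i32 halves). A small sketch of that interaction, using a plain bool vector as a stand-in for SmallBitVector (illustrative only):

```cpp
#include <vector>

// Stand-in for how getConstVector consumes the Undefs mask: when the i64
// element type has to be split, every element - undef or not - expands to two
// 32-bit lanes (Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT)) above).
std::vector<bool> expandUndefMask(const std::vector<bool> &Undefs, bool Split) {
  std::vector<bool> LaneUndefs;
  for (bool U : Undefs) {
    LaneUndefs.push_back(U);
    if (Split)
      LaneUndefs.push_back(U);  // high half of a split element
  }
  return LaneUndefs;
}

// expandUndefMask({false, true}, /*Split=*/true) -> {false, false, true, true}
```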
@@ -31872,7 +31910,9 @@ static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
   // Check if we can bypass extracting and re-inserting an element of an input
   // vector. Essentially:
   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
-  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+  // TODO: Add X86ISD::VSEXT support
+  if (Opcode == X86ISD::VZEXT &&
+      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
     SDValue ExtractedV = V.getOperand(0);
@@ -31994,7 +32034,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
   case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
   case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
-  case X86ISD::VZEXT: return combineVZext(N, DAG, DCI, Subtarget);
+  case X86ISD::VSEXT:
+  case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP: // Handle all target specific shuffles
   case X86ISD::INSERTPS:
   case X86ISD::PALIGNR:

@@ -83,9 +83,8 @@ define <4 x i32> @test_sext_4i8_4i32_undef() {
 define <4 x i64> @test_sext_4i8_4i64() {
 ; X32-LABEL: test_sext_4i8_4i64:
 ; X32: # BB#0:
-; X32-NEXT: vpmovsxbq {{\.LCPI.*}}, %xmm0
-; X32-NEXT: vpmovsxbq {{\.LCPI.*}}, %xmm1
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,4294967295,4294967295]
+; X32-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_sext_4i8_4i64:
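The new X32 CHECK constant above follows directly from the split: the low 128 bits of the folded <4 x i64> result hold two sign-extended elements (which the [0,0,4294967295,4294967295] encoding implies were 0 and -1), and each i64 prints as two i32 lanes, low half first. A quick arithmetic check of that encoding (illustrative only, not part of the test):

```cpp
#include <cassert>
#include <cstdint>

// Sign-extend the i8 values 0 and -1 to i64, then view each i64 as two i32
// lanes (low lane first); the result is exactly [0,0,4294967295,4294967295].
int main() {
  int8_t Src[2] = {0, -1};
  uint32_t Lanes[4];
  for (int i = 0; i != 2; ++i) {
    uint64_t Ext = static_cast<uint64_t>(static_cast<int64_t>(Src[i]));  // sext
    Lanes[2 * i + 0] = static_cast<uint32_t>(Ext);        // low 32 bits
    Lanes[2 * i + 1] = static_cast<uint32_t>(Ext >> 32);  // high 32 bits
  }
  assert(Lanes[0] == 0 && Lanes[1] == 0);
  assert(Lanes[2] == 4294967295u && Lanes[3] == 4294967295u);
  return 0;
}
```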
@@ -104,9 +103,7 @@ define <4 x i64> @test_sext_4i8_4i64_undef() {
 ; X32-LABEL: test_sext_4i8_4i64_undef:
 ; X32: # BB#0:
 ; X32-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-NEXT: vpmovsxbq %xmm0, %xmm0
-; X32-NEXT: vpmovsxbq {{\.LCPI.*}}, %xmm1
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_sext_4i8_4i64_undef:

@@ -26,7 +26,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
 ; SSE41-LABEL: mul_v16i8c:
 ; SSE41: # BB#0: # %entry
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
-; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
 ; SSE41-NEXT: pmullw %xmm2, %xmm1
 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT: pand %xmm3, %xmm1
@@ -41,8 +41,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
 ; AVX2-LABEL: mul_v16i8c:
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -54,8 +53,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
 ; AVX512F-LABEL: mul_v16i8c:
 ; AVX512F: # BB#0: # %entry
 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
-; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT: retq
@@ -63,8 +61,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
 ; AVX512BW-LABEL: mul_v16i8c:
 ; AVX512BW: # BB#0: # %entry
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
-; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX512BW-NEXT: retq
@@ -418,7 +415,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
 ; SSE41-LABEL: mul_v32i8c:
 ; SSE41: # BB#0: # %entry
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
-; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
 ; SSE41-NEXT: pmullw %xmm4, %xmm2
 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT: pand %xmm5, %xmm2
@@ -443,7 +440,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -462,7 +459,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
 ; AVX512F-LABEL: mul_v32i8c:
 ; AVX512F: # BB#0: # %entry
 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
@@ -477,8 +474,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
 ; AVX512BW-LABEL: mul_v32i8c:
 ; AVX512BW: # BB#0: # %entry
 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT: vpmovsxbw {{.*}}(%rip), %zmm1
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
 entry:
@@ -833,7 +829,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
 ; SSE41-NEXT: movdqa %xmm1, %xmm4
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
-; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117]
 ; SSE41-NEXT: pmullw %xmm6, %xmm0
 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT: pand %xmm7, %xmm0
@@ -874,7 +870,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
 ; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -907,7 +903,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
 ; AVX512F-LABEL: mul_v64i8c:
 ; AVX512F: # BB#0: # %entry
 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2
-; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
@@ -932,7 +928,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
 ; AVX512BW-LABEL: mul_v64i8c:
 ; AVX512BW: # BB#0: # %entry
 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
-; AVX512BW-NEXT: vpmovsxbw {{.*}}(%rip), %zmm2
+; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0

@@ -544,7 +544,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT: paddb %xmm2, %xmm1
 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7]
 ; SSE41-NEXT: pmullw %xmm3, %xmm2
 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT: pand %xmm4, %xmm2
@@ -577,7 +577,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
-; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7]
 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
@@ -607,8 +607,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
-; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2

@@ -459,7 +459,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
 ; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpmovsxbw %xmm3, %xmm4
-; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7]
 ; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
@@ -524,7 +524,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>

@@ -1439,7 +1439,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT: vpsubb %ymm7, %ymm4, %ymm4
 ; AVX512F-NEXT: vpaddb %ymm8, %ymm4, %ymm8
 ; AVX512F-NEXT: vpmovsxbw %xmm8, %ymm9
-; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT: vpmullw %ymm4, %ymm9, %ymm9
 ; AVX512F-NEXT: vpmovsxwd %ymm9, %zmm9
 ; AVX512F-NEXT: vpmovdb %zmm9, %xmm9

@@ -520,7 +520,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; SSE41-NEXT: psrlw $2, %xmm2
 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT: pmovsxbw %xmm2, %xmm1
-; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7]
 ; SSE41-NEXT: pmullw %xmm3, %xmm1
 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
 ; SSE41-NEXT: pand %xmm4, %xmm1
@@ -550,7 +550,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
-; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7]
 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
@@ -577,8 +577,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm1
 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
-; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2

@@ -470,7 +470,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpmovsxbw %xmm3, %xmm6
-; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7]
 ; AVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
 ; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
@@ -530,7 +530,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>

@@ -1277,7 +1277,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7
 ; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm8
-; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT: vpmullw %ymm3, %ymm8, %ymm8
 ; AVX512F-NEXT: vpmovsxwd %ymm8, %zmm8
 ; AVX512F-NEXT: vpmovdb %zmm8, %xmm8