forked from OSchip/llvm-project
[X86] Replace support for vXi32 SMUL_LOHI/UMUL_LOHI with MULHS/MULHU support instead.
Summary: The only time vector SMUL_LOHI/UMUL_LOHI nodes are created is during division/remainder lowering. If it's created before op legalization, generic DAGCombine immediately turns that SMUL_LOHI/UMUL_LOHI into a MULHS/MULHU since only the upper half is used. That node will stick around through vector op legalization and will be turned back into UMUL_LOHI/SMUL_LOHI during op legalization. It will then be custom lowered by the X86 backend. Due to this two-step lowering, the vector shuffles created by the custom lowering get legalized after their inputs rather than before. This prevents the shuffles from being combined with any build_vector of constants. This patch changes vXi32 to use MULHS/MULHU instead. This is what the later DAG combine did anyway. But by skipping the change back to UMUL_LOHI/SMUL_LOHI we lower it before any constant BUILD_VECTORS. This allows the vector_shuffle creation to constant fold with the build_vectors. This accounts for the test changes here. Reviewers: RKSimon, spatel Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D51254 llvm-svn: 340690
This commit is contained in:
parent
a11a3b3818
commit
ebec2793d1
|
@ -782,8 +782,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
|
|||
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
|
||||
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
|
||||
setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
|
||||
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
|
||||
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
|
||||
|
@ -1087,9 +1087,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
|
|||
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
|
||||
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
|
||||
|
||||
setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
|
||||
|
||||
setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
|
||||
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
|
||||
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
|
||||
|
@ -1331,8 +1330,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
|
|||
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
|
||||
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
|
||||
|
||||
setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
|
||||
setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
|
||||
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
|
||||
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
|
||||
|
||||
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
|
||||
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
|
||||
|
@ -22901,6 +22900,75 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
|
|||
if (VT.is256BitVector() && !Subtarget.hasInt256())
|
||||
return Lower256IntArith(Op, DAG);
|
||||
|
||||
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
|
||||
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
|
||||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
|
||||
(VT == MVT::v16i32 && Subtarget.hasAVX512()));
|
||||
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
|
||||
|
||||
int NumElts = VT.getVectorNumElements();
|
||||
|
||||
// PMULxD operations multiply each even value (starting at 0) of LHS with
|
||||
// the related value of RHS and produce a widened result.
|
||||
// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
|
||||
// => <2 x i64> <ae|cg>
|
||||
//
|
||||
// In other words, to have all the results, we need to perform two PMULxD:
|
||||
// 1. one with the even values.
|
||||
// 2. one with the odd values.
|
||||
// To achieve #2, we need to place the odd values at an even position.
|
||||
//
|
||||
// Place the odd value at an even position (basically, shift all values 1
|
||||
// step to the left):
|
||||
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
|
||||
9, -1, 11, -1, 13, -1, 15, -1};
|
||||
// <a|b|c|d> => <b|undef|d|undef>
|
||||
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
|
||||
makeArrayRef(&Mask[0], NumElts));
|
||||
// <e|f|g|h> => <f|undef|h|undef>
|
||||
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
|
||||
makeArrayRef(&Mask[0], NumElts));
|
||||
|
||||
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
|
||||
// ints.
|
||||
MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
|
||||
bool IsSigned = Op->getOpcode() == ISD::MULHS;
|
||||
unsigned Opcode =
|
||||
(!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
|
||||
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
|
||||
// => <2 x i64> <ae|cg>
|
||||
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
|
||||
DAG.getBitcast(MulVT, Op0),
|
||||
DAG.getBitcast(MulVT, Op1)));
|
||||
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
|
||||
// => <2 x i64> <bf|dh>
|
||||
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
|
||||
DAG.getBitcast(MulVT, Odd0),
|
||||
DAG.getBitcast(MulVT, Odd1)));
|
||||
|
||||
// Shuffle it back into the right order.
|
||||
SmallVector<int, 16> ShufMask(NumElts);
|
||||
for (int i = 0; i != NumElts; ++i)
|
||||
ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
|
||||
|
||||
SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
|
||||
|
||||
// If we have a signed multiply but no PMULDQ fix up the result of an
|
||||
// unsigned multiply.
|
||||
if (IsSigned && !Subtarget.hasSSE41()) {
|
||||
SDValue ShAmt = DAG.getConstant(31, dl, VT);
|
||||
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
|
||||
DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
|
||||
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
|
||||
DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
|
||||
|
||||
SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
|
||||
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
|
||||
}
|
||||
|
||||
return Res;
|
||||
}
|
||||
|
||||
// Only i8 vectors should need custom lowering after this.
|
||||
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
|
||||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
|
||||
|
@ -23084,105 +23152,6 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
|
|||
return DAG.getBitcast(VT, CallInfo.first);
|
||||
}
|
||||
|
||||
static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
|
||||
MVT VT = Op0.getSimpleValueType();
|
||||
SDLoc dl(Op);
|
||||
|
||||
// Decompose 256-bit ops into smaller 128-bit ops.
|
||||
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
|
||||
unsigned Opcode = Op.getOpcode();
|
||||
unsigned NumElems = VT.getVectorNumElements();
|
||||
MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
|
||||
SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
|
||||
SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
|
||||
SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
|
||||
SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
|
||||
SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
|
||||
SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
|
||||
SDValue Ops[] = {
|
||||
DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
|
||||
DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
|
||||
};
|
||||
return DAG.getMergeValues(Ops, dl);
|
||||
}
|
||||
|
||||
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
|
||||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
|
||||
(VT == MVT::v16i32 && Subtarget.hasAVX512()));
|
||||
|
||||
int NumElts = VT.getVectorNumElements();
|
||||
|
||||
// PMULxD operations multiply each even value (starting at 0) of LHS with
|
||||
// the related value of RHS and produce a widened result.
|
||||
// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
|
||||
// => <2 x i64> <ae|cg>
|
||||
//
|
||||
// In other words, to have all the results, we need to perform two PMULxD:
|
||||
// 1. one with the even values.
|
||||
// 2. one with the odd values.
|
||||
// To achieve #2, we need to place the odd values at an even position.
|
||||
//
|
||||
// Place the odd value at an even position (basically, shift all values 1
|
||||
// step to the left):
|
||||
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
|
||||
// <a|b|c|d> => <b|undef|d|undef>
|
||||
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
|
||||
makeArrayRef(&Mask[0], NumElts));
|
||||
// <e|f|g|h> => <f|undef|h|undef>
|
||||
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
|
||||
makeArrayRef(&Mask[0], NumElts));
|
||||
|
||||
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
|
||||
// ints.
|
||||
MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
|
||||
bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
|
||||
unsigned Opcode =
|
||||
(!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
|
||||
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
|
||||
// => <2 x i64> <ae|cg>
|
||||
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
|
||||
DAG.getBitcast(MulVT, Op0),
|
||||
DAG.getBitcast(MulVT, Op1)));
|
||||
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
|
||||
// => <2 x i64> <bf|dh>
|
||||
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
|
||||
DAG.getBitcast(MulVT, Odd0),
|
||||
DAG.getBitcast(MulVT, Odd1)));
|
||||
|
||||
// Shuffle it back into the right order.
|
||||
SmallVector<int, 16> HighMask(NumElts);
|
||||
SmallVector<int, 16> LowMask(NumElts);
|
||||
for (int i = 0; i != NumElts; ++i) {
|
||||
HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
|
||||
LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
|
||||
}
|
||||
|
||||
SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
|
||||
SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
|
||||
|
||||
// If we have a signed multiply but no PMULDQ fix up the high parts of an
|
||||
// unsigned multiply.
|
||||
if (IsSigned && !Subtarget.hasSSE41()) {
|
||||
SDValue ShAmt = DAG.getConstant(
|
||||
31, dl,
|
||||
DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
|
||||
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
|
||||
DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
|
||||
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
|
||||
DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
|
||||
|
||||
SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
|
||||
Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
|
||||
}
|
||||
|
||||
// The first result of MUL_LOHI is actually the low value, followed by the
|
||||
// high value.
|
||||
SDValue Ops[] = {Lows, Highs};
|
||||
return DAG.getMergeValues(Ops, dl);
|
||||
}
|
||||
|
||||
// Return true if the required (according to Opcode) shift-imm form is natively
|
||||
// supported by the Subtarget
|
||||
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
|
||||
|
@ -25579,8 +25548,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
|||
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
|
||||
case ISD::MULHS:
|
||||
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
|
||||
case ISD::UMUL_LOHI:
|
||||
case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
|
||||
case ISD::ROTL:
|
||||
case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
|
||||
case ISD::SRA:
|
||||
|
|
|
@ -78,22 +78,19 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
|
|||
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
|
||||
; SSE2-LABEL: test_div7_4i32:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm2
|
||||
; SSE2-NEXT: psrad $31, %xmm2
|
||||
; SSE2-NEXT: pand %xmm1, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
||||
; SSE2-NEXT: pmuludq %xmm1, %xmm3
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
|
||||
; SSE2-NEXT: psrad $31, %xmm1
|
||||
; SSE2-NEXT: pand %xmm0, %xmm1
|
||||
; SSE2-NEXT: paddd %xmm1, %xmm2
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm1
|
||||
; SSE2-NEXT: pmuludq %xmm2, %xmm1
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; SSE2-NEXT: pmuludq %xmm4, %xmm3
|
||||
; SSE2-NEXT: pmuludq %xmm2, %xmm3
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
|
||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
|
||||
; SSE2-NEXT: psubd %xmm2, %xmm1
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
||||
; SSE2-NEXT: psrad $31, %xmm3
|
||||
; SSE2-NEXT: pand %xmm2, %xmm3
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm3
|
||||
; SSE2-NEXT: psubd %xmm3, %xmm1
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm1
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE2-NEXT: psrld $31, %xmm0
|
||||
|
@ -134,13 +131,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
|
|||
;
|
||||
; AVX2-LABEL: test_div7_4i32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
|
||||
; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
|
||||
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
|
||||
; AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
|
||||
|
@ -384,33 +380,30 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
|
|||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm2
|
||||
; SSE2-NEXT: psrad $31, %xmm2
|
||||
; SSE2-NEXT: pand %xmm1, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
||||
; SSE2-NEXT: pmuludq %xmm1, %xmm3
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
|
||||
; SSE2-NEXT: psrad $31, %xmm1
|
||||
; SSE2-NEXT: pand %xmm0, %xmm1
|
||||
; SSE2-NEXT: paddd %xmm1, %xmm2
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
|
||||
; SSE2-NEXT: pmuludq %xmm1, %xmm2
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; SSE2-NEXT: pmuludq %xmm4, %xmm3
|
||||
; SSE2-NEXT: pmuludq %xmm1, %xmm3
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
|
||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
|
||||
; SSE2-NEXT: psubd %xmm2, %xmm1
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm1
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm2
|
||||
; SSE2-NEXT: psrld $31, %xmm2
|
||||
; SSE2-NEXT: psrad $2, %xmm1
|
||||
; SSE2-NEXT: paddd %xmm2, %xmm1
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
|
||||
; SSE2-NEXT: pmuludq %xmm2, %xmm1
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
|
||||
; SSE2-NEXT: pmuludq %xmm2, %xmm3
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
|
||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
||||
; SSE2-NEXT: psubd %xmm1, %xmm0
|
||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
||||
; SSE2-NEXT: psrad $31, %xmm3
|
||||
; SSE2-NEXT: pand %xmm1, %xmm3
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm3
|
||||
; SSE2-NEXT: psubd %xmm3, %xmm2
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm1
|
||||
; SSE2-NEXT: psrld $31, %xmm1
|
||||
; SSE2-NEXT: psrad $2, %xmm2
|
||||
; SSE2-NEXT: paddd %xmm1, %xmm2
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [7,7,7,7]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
|
||||
; SSE2-NEXT: pmuludq %xmm1, %xmm2
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
||||
; SSE2-NEXT: pmuludq %xmm1, %xmm3
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
|
||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
||||
; SSE2-NEXT: psubd %xmm2, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: test_rem7_4i32:
|
||||
|
@ -448,13 +441,12 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
|
|||
;
|
||||
; AVX2-LABEL: test_rem7_4i32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
|
||||
; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
|
||||
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
|
||||
; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
|
||||
|
|
|
@ -88,41 +88,37 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
|
|||
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
|
||||
; AVX1-LABEL: test_div7_8i32:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
|
||||
; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
|
||||
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
|
||||
; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
|
||||
; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
|
||||
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
|
||||
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
|
||||
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
|
||||
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
|
||||
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
|
||||
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm2
|
||||
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_div7_8i32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
|
||||
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
|
||||
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vpsrld $31, %ymm0, %ymm1
|
||||
; AVX2-NEXT: vpsrad $2, %ymm0, %ymm0
|
||||
|
@ -363,46 +359,42 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
|
|||
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
|
||||
; AVX1-LABEL: test_rem7_8i32:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
|
||||
; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
|
||||
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm4
|
||||
; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7]
|
||||
; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
|
||||
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
|
||||
; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
|
||||
; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
|
||||
; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
|
||||
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm4
|
||||
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_rem7_8i32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
|
||||
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
|
||||
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2
|
||||
; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1
|
||||
|
|
|
@ -86,7 +86,6 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
|
|||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
|
||||
; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
|
||||
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
|
||||
; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
|
||||
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
|
||||
|
@ -313,7 +312,6 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
|
|||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
|
||||
; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
|
||||
; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
|
||||
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
|
||||
; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
|
||||
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
|
||||
|
|
|
@ -128,13 +128,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
|
|||
;
|
||||
; AVX2-LABEL: test_div7_4i32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
|
||||
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
|
||||
; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
|
||||
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
|
@ -422,13 +421,12 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
|
|||
;
|
||||
; AVX2-LABEL: test_rem7_4i32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
|
||||
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
|
||||
; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
|
||||
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
|
||||
; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
|
||||
; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
|
||||
|
|
|
@ -96,41 +96,37 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
|
|||
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
|
||||
; AVX1-LABEL: test_div7_8i32:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
|
||||
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
|
||||
; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
|
||||
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm3
|
||||
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
|
||||
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
|
||||
; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
|
||||
; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
|
||||
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
|
||||
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3
|
||||
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_div7_8i32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
|
||||
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
|
||||
; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
|
||||
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
|
||||
|
@ -371,46 +367,42 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
|
|||
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
|
||||
; AVX1-LABEL: test_rem7_8i32:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
|
||||
; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm3
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
|
||||
; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm4
|
||||
; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
|
||||
; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
|
||||
; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7]
|
||||
; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3
|
||||
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
|
||||
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
|
||||
; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
|
||||
; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
|
||||
; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm4
|
||||
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
||||
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
|
||||
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4
|
||||
; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
|
||||
; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1
|
||||
; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: test_rem7_8i32:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
|
||||
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
|
||||
; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
|
||||
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2
|
||||
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
|
||||
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
|
||||
|
|
|
@ -94,7 +94,6 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
|
|||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
|
||||
; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
|
||||
; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
|
||||
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
|
||||
; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
|
||||
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
|
||||
|
@ -324,7 +323,6 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
|
|||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
|
||||
; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
|
||||
; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
|
||||
; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
|
||||
; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
|
||||
; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
|
||||
|
|
|
@ -24,24 +24,19 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
|
|||
; SSE2-LABEL: PR20355:
|
||||
; SSE2: # %bb.0: # %entry
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm2
|
||||
; SSE2-NEXT: psrad $31, %xmm2
|
||||
; SSE2-NEXT: pand %xmm0, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
||||
; SSE2-NEXT: psrad $31, %xmm3
|
||||
; SSE2-NEXT: pand %xmm1, %xmm3
|
||||
; SSE2-NEXT: paddd %xmm2, %xmm3
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
|
||||
; SSE2-NEXT: pmuludq %xmm1, %xmm0
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
|
||||
; SSE2-NEXT: pmuludq %xmm2, %xmm0
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
|
||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
|
||||
; SSE2-NEXT: psubd %xmm3, %xmm4
|
||||
; SSE2-NEXT: movdqa %xmm4, %xmm0
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm2
|
||||
; SSE2-NEXT: pmuludq %xmm1, %xmm2
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; SSE2-NEXT: pmuludq %xmm1, %xmm3
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
|
||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; SSE2-NEXT: psrad $31, %xmm0
|
||||
; SSE2-NEXT: pand %xmm1, %xmm0
|
||||
; SSE2-NEXT: psubd %xmm0, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm0
|
||||
; SSE2-NEXT: psrld $31, %xmm0
|
||||
; SSE2-NEXT: paddd %xmm4, %xmm0
|
||||
; SSE2-NEXT: paddd %xmm2, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: PR20355:
|
||||
|
@ -71,13 +66,12 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
|
|||
;
|
||||
; AVX2-LABEL: PR20355:
|
||||
; AVX2: # %bb.0: # %entry
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
|
||||
; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766]
|
||||
; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
|
||||
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
|
||||
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: retq
|
||||
|
|
|
@ -106,13 +106,12 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,
|
|||
;
|
||||
; AVX2-LABEL: test3:
|
||||
; AVX2: ## %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
|
||||
; AVX2-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766]
|
||||
; AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3
|
||||
; AVX2-NEXT: vpmuldq %xmm4, %xmm0, %xmm4
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
|
||||
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
|
||||
; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4
|
||||
; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3]
|
||||
|
|
Loading…
Reference in New Issue