diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 6b4a6993983f..5ee8ea6ec038 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -226,7 +226,7 @@ namespace { SDValue &Index, SDValue &Disp, SDValue &Segment); - // Convience method where P is also root. + // Convenience method where P is also root. bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -234,6 +234,12 @@ namespace { return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); } + // Try to fold a vector load. This makes sure the load isn't non-temporal. + bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); + /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, @@ -449,6 +455,12 @@ namespace { bool matchBEXTRFromAnd(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; + + MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, + const SDLoc &dl, MVT VT, SDNode *Node); + MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, + const SDLoc &dl, MVT VT, SDNode *Node, + SDValue &InFlag); }; } @@ -2006,6 +2018,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, N.getOperand(1), Base, Scale, Index, Disp, Segment); } +bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + if (!ISD::isNON_EXTLoad(N.getNode()) || + useNonTemporalLoad(cast<LoadSDNode>(N)) || + !IsProfitableToFold(N, P, Root) || + !IsLegalToFold(N, P, Root, OptLevel)) + return false; + + return selectAddr(N.getNode(), + N.getOperand(1), Base, Scale, Index, Disp, Segment); +} + /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. @@ -2563,6 +2589,83 @@ bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) { return true; } +// Emit a PCMPISTR(I/M) instruction. +MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, + bool MayFoldLoad, const SDLoc &dl, + MVT VT, SDNode *Node) { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + SDValue Imm = Node->getOperand(2); + const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); + Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); + + // If there is a load, it will be behind a bitcast. We don't need to check + // alignment on this load. + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() && + tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4)) { + SDValue Load = N1.getOperand(0); + SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, + Load.getOperand(0) }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other); + MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + // Update the chain.
+ ReplaceUses(Load.getValue(1), SDValue(CNode, 2)); + // Record the mem-refs + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand(); + CNode->setMemRefs(MemOp, MemOp + 1); + return CNode; + } + + SDValue Ops[] = { N0, N1, Imm }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); + MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); + return CNode; +} + +// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need +// to emit a second instruction after this one. This is needed since we have two +// CopyToReg nodes glued before this one, and we need to continue that glue through. +MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, + bool MayFoldLoad, const SDLoc &dl, + MVT VT, SDNode *Node, + SDValue &InFlag) { + SDValue N0 = Node->getOperand(0); + SDValue N2 = Node->getOperand(2); + SDValue Imm = Node->getOperand(4); + const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); + Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); + + // If there is a load, it will be behind a bitcast. We don't need to check + // alignment on this load. + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() && + tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4)) { + SDValue Load = N2.getOperand(0); + SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, + Load.getOperand(0), InFlag }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue); + MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + InFlag = SDValue(CNode, 3); + // Update the chain. + ReplaceUses(Load.getValue(1), SDValue(CNode, 2)); + // Record the mem-refs + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand(); + CNode->setMemRefs(MemOp, MemOp + 1); + return CNode; + } + + SDValue Ops[] = { N0, N2, Imm, InFlag }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); + MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); + InFlag = SDValue(CNode, 2); + return CNode; +} + /// If the high bits of an 'and' operand are known zero, try setting the /// high bits of an 'and' constant operand to produce a smaller encoding by /// creating a small, sign-extended negative immediate rather than a large @@ -3184,6 +3287,70 @@ void X86DAGToDAGISel::Select(SDNode *Node) { } break; } + case X86ISD::PCMPISTR: { + if (!Subtarget->hasSSE42()) + break; + + bool NeedIndex = !SDValue(Node, 0).use_empty(); + bool NeedMask = !SDValue(Node, 1).use_empty(); + // We can't fold a load if we are going to make two instructions. + bool MayFoldLoad = !NeedIndex || !NeedMask; + + MachineSDNode *CNode; + if (NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm; + CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); + } + if (NeedIndex || !NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm; + CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node); + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + } + + // Connect the flag usage to the last instruction created.
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); + CurDAG->RemoveDeadNode(Node); + return; + } + case X86ISD::PCMPESTR: { + if (!Subtarget->hasSSE42()) + break; + + // Copy the two implicit register inputs. + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX, + Node->getOperand(1), + SDValue()).getValue(1); + InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, + Node->getOperand(3), InFlag).getValue(1); + + bool NeedIndex = !SDValue(Node, 0).use_empty(); + bool NeedMask = !SDValue(Node, 1).use_empty(); + // We can't fold a load if we are going to make two instructions. + bool MayFoldLoad = !NeedIndex || !NeedMask; + + MachineSDNode *CNode; + if (NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm; + CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, + InFlag); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); + } + if (NeedIndex || !NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm; + CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag); + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + } + // Connect the flag usage to the last instruction created. + ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); + CurDAG->RemoveDeadNode(Node); + return; + } + case ISD::STORE: if (foldLoadStoreIntoMemOperand(Node)) return; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ef08b90122db..1ba801edf796 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20947,50 +20947,50 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128: - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpestria128: - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpistric128: - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpestric128: - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpistrio128: - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpestrio128: - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpistris128: - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpestris128: - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpistriz128: - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_E; break; case Intrinsic::x86_sse42_pcmpestriz128: - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_E; break; } SmallVector<SDValue, 8> NewOps(Op->op_begin()+1, Op->op_end()); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); - SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); + SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2); + SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -20998,15 +20998,28 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::x86_sse42_pcmpestri128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistri128) - Opcode = X86ISD::PCMPISTRI; + Opcode = X86ISD::PCMPISTR; else - Opcode = X86ISD::PCMPESTRI; + Opcode = X86ISD::PCMPESTR; SmallVector<SDValue, 8> NewOps(Op->op_begin()+1, Op->op_end()); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } + case Intrinsic::x86_sse42_pcmpistrm128: + case Intrinsic::x86_sse42_pcmpestrm128: { + unsigned Opcode; + if (IntNo == Intrinsic::x86_sse42_pcmpistrm128) + Opcode = X86ISD::PCMPISTR; + else + Opcode = X86ISD::PCMPESTR; + + SmallVector<SDValue, 8> NewOps(Op->op_begin()+1, Op->op_end()); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); + return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1); + } + case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -25794,8 +25807,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND"; case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND"; - case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; - case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; + case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR"; + case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; case X86ISD::EXPAND: return "X86ISD::EXPAND"; @@ -26179,79 +26192,6 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI,
MachineBasicBlock *MBB, return sinkMBB; } -// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 -// or XMM0_V32I8 in AVX all of this code can be replaced with that -// in the .td file. -static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB, - const TargetInstrInfo *TII) { - unsigned Opc; - switch (MI.getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; - case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; - case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; - case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; - case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; - case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; - case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; - case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; - } - - DebugLoc dl = MI.getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); - - unsigned NumArgs = MI.getNumOperands(); - for (unsigned i = 1; i < NumArgs; ++i) { - MachineOperand &Op = MI.getOperand(i); - if (!(Op.isReg() && Op.isImplicit())) - MIB.add(Op); - } - if (MI.hasOneMemOperand()) - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) - .addReg(X86::XMM0); - - MI.eraseFromParent(); - return BB; -} - -// FIXME: Custom handling because TableGen doesn't support multiple implicit -// defs in an instruction pattern -static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB, - const TargetInstrInfo *TII) { - unsigned Opc; - switch (MI.getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; - case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; - case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; - case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; - case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; - case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; - case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; - case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; - } - - DebugLoc dl = MI.getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); - - unsigned NumArgs = MI.getNumOperands(); // remove the results - for (unsigned i = 1; i < NumArgs; ++i) { - MachineOperand &Op = MI.getOperand(i); - if (!(Op.isReg() && Op.isImplicit())) - MIB.add(Op); - } - if (MI.hasOneMemOperand()) - MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) - .addReg(X86::ECX); - - MI.eraseFromParent(); - return BB; -} - static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI.getDebugLoc(); @@ -28167,32 +28107,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } - // String/text processing lowering. - case X86::PCMPISTRM128REG: - case X86::VPCMPISTRM128REG: - case X86::PCMPISTRM128MEM: - case X86::VPCMPISTRM128MEM: - case X86::PCMPESTRM128REG: - case X86::VPCMPESTRM128REG: - case X86::PCMPESTRM128MEM: - case X86::VPCMPESTRM128MEM: - assert(Subtarget.hasSSE42() && - "Target must have SSE4.2 or AVX features enabled"); - return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo()); - - // String/text processing lowering. 
- case X86::PCMPISTRIREG: - case X86::VPCMPISTRIREG: - case X86::PCMPISTRIMEM: - case X86::VPCMPISTRIMEM: - case X86::PCMPESTRIREG: - case X86::VPCMPESTRIREG: - case X86::PCMPESTRIMEM: - case X86::VPCMPESTRIMEM: - assert(Subtarget.hasSSE42() && - "Target must have SSE4.2 or AVX features enabled"); - return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo()); - // Thread synchronization. case X86::MONITOR: return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index c159de49867c..6af7b8da01a5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -576,8 +576,13 @@ namespace llvm { RDSEED, // SSE42 string comparisons. - PCMPISTRI, - PCMPESTRI, + // These nodes produce 3 results: index, mask, and flags. X86ISelDAGToDAG + // will emit one or two instructions based on which results are used. If + // only the flags and either the index or the mask are needed, this lets us + // use a single instruction since we won't have to pick an opcode for the + // flags. Instead we can rely on the DAG to CSE everything and decide at + // isel. + PCMPISTR, + PCMPESTR, // Test if in transactional execution. XTEST, diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 70051393c802..6c5e12c18562 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -555,17 +555,6 @@ def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>; def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>; def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>; -def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, - SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, - SDTCisVT<4, i8>]>; -def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, - SDTCisVT<2, v16i8>, SDTCisVT<3, i32>, - SDTCisVT<4, v16i8>, SDTCisVT<5, i32>, - SDTCisVT<6, i8>]>; - -def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>; -def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>; - def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 4301fb3444c7..3261b88b95b2 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -632,9 +632,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 }, { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 }, { X86::PCMPESTRIrr, X86::PCMPESTRIrm, 0 }, - { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, 0 }, + { X86::PCMPESTRMrr, X86::PCMPESTRMrm, 0 }, { X86::PCMPISTRIrr, X86::PCMPISTRIrm, 0 }, - { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, 0 }, + { X86::PCMPISTRMrr, X86::PCMPISTRMrm, 0 }, { X86::PHMINPOSUWrr, X86::PHMINPOSUWrm, TB_ALIGN_16 }, { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE }, { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE }, @@ -736,10 +736,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPABSDrr, X86::VPABSDrm, 0 }, { X86::VPABSWrr, X86::VPABSWrm, 0 }, { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, - { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, + { X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0 }, { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, - { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 }, - { X86::VPHMINPOSUWrr, X86::VPHMINPOSUWrm, 0 }, +
{ X86::VPCMPISTRMrr, X86::VPCMPISTRMrm, 0 }, + { X86::VPHMINPOSUWrr, X86::VPHMINPOSUWrm, 0 }, { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE }, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 8f88b9b333b4..5eb2fd095251 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -6383,25 +6383,6 @@ let Constraints = "$src1 = $dst" in // SSE4.2 - String/text Processing Instructions //===----------------------------------------------------------------------===// -// Packed Compare Implicit Length Strings, Return Mask -multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> { - def REG : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, u8imm:$src3), - [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, - imm:$src3))]>; - def MEM : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, u8imm:$src3), - [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, - (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; -} - -let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { - defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>, - Requires<[HasAVX]>, VEX_WIG; - defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", loadv2i64>, - Requires<[UseSSE42]>; -} - multiclass pcmpistrm_SS42AI<string asm> { def rr : SS42AI<0x62, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, u8imm:$src3), @@ -6416,27 +6397,8 @@ multiclass pcmpistrm_SS42AI<string asm> { let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { let Predicates = [HasAVX] in - defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; - defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ; -} - -// Packed Compare Explicit Length Strings, Return Mask -multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> { - def REG : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, VR128:$src3, u8imm:$src5), - [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 - VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; - def MEM : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, i128mem:$src3, u8imm:$src5), - [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, - (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>; -} - -let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { - defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>, - Requires<[HasAVX]>; - defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", loadv2i64>, - Requires<[UseSSE42]>; + defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; + defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ; } multiclass SS42AI_pcmpestrm<string asm> { def rr : SS42AI<0x60, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, u8imm:$src5), @@ -6453,27 +6415,8 @@ multiclass SS42AI_pcmpestrm<string asm> { let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { let Predicates = [HasAVX] in - defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; - defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">; -} - -// Packed Compare Implicit Length Strings, Return Index -multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> { - def REG : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, VR128:$src2, u8imm:$src3), - [(set GR32:$dst, EFLAGS, - (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>; - def MEM : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, i128mem:$src2, u8imm:$src3), - [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1, - (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; -} - -let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { - defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>, - Requires<[HasAVX]>, VEX_WIG; - defm PCMPISTRI
: pseudo_pcmpistri<"#PCMPISTRI", loadv2i64>, - Requires<[UseSSE42]>; + defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; + defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">; } multiclass SS42AI_pcmpistri<string asm> { def rr : SS42AI<0x63, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, u8imm:$src3), @@ -6494,26 +6437,6 @@ let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; } -// Packed Compare Explicit Length Strings, Return Index -multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> { - def REG : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, VR128:$src3, u8imm:$src5), - [(set GR32:$dst, EFLAGS, - (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; - def MEM : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, i128mem:$src3, u8imm:$src5), - [(set GR32:$dst, EFLAGS, - (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX, - imm:$src5))]>; -} - -let Defs = [EFLAGS], Uses = [EAX, EDX], hasNoSchedulingInfo = 1, usesCustomInserter = 1 in { - defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>, - Requires<[HasAVX]>; - defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", loadv2i64>, - Requires<[UseSSE42]>; -} - multiclass SS42AI_pcmpestri<string asm> { def rr : SS42AI<0x61, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, u8imm:$src5), diff --git a/llvm/test/CodeGen/X86/sse42.ll b/llvm/test/CodeGen/X86/sttni.ll similarity index 70% rename from llvm/test/CodeGen/X86/sse42.ll rename to llvm/test/CodeGen/X86/sttni.ll index c53511fcc532..21a34969cfb6 100644 --- a/llvm/test/CodeGen/X86/sse42.ll +++ b/llvm/test/CodeGen/X86/sttni.ll @@ -4,8 +4,10 @@ declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) +declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8) declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8) +declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8) define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind { ; X32-LABEL: pcmpestri_reg_eq_i8: @@ -962,3 +964,374 @@ exit: %result_ext = zext i16 %result to i32 ret i32 %result_ext } + +define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpestr_index_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT: setb %bl +; X32-NEXT: movl %ecx, (%edi) +; X32-NEXT: movl %ebx, (%esi) +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx +; X32-NEXT: retl +; +; X64-LABEL: pcmpestr_index_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: xorl %r10d, %r10d +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT: setb %r10b +; X64-NEXT: movl %ecx, (%r9) +; X64-NEXT: movl %r10d, (%r8) +; X64-NEXT: retq entry: + %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + store i32 %index, i32* %iptr + store i32 %flag, i32* %fptr + ret void +} + +define
void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpestr_mask_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X32-NEXT: setb %bl +; X32-NEXT: movdqa %xmm0, (%esi) +; X32-NEXT: movl %ebx, (%ecx) +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebx +; X32-NEXT: retl +; +; X64-LABEL: pcmpestr_mask_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: xorl %r9d, %r9d +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X64-NEXT: setb %r9b +; X64-NEXT: movdqa %xmm0, (%r8) +; X64-NEXT: movl %r9d, (%rcx) +; X64-NEXT: retq +entry: + %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %flag, i32* %fptr + ret void +} + +define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind { +; X32-LABEL: pcmpestr_mask_index: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movdqa %xmm0, %xmm2 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: pcmpestri $24, %xmm1, %xmm2 +; X32-NEXT: movdqa %xmm0, (%edi) +; X32-NEXT: movl %ecx, (%esi) +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: retl +; +; X64-LABEL: pcmpestr_mask_index: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X64-NEXT: pcmpestri $24, %xmm1, %xmm2 +; X64-NEXT: movdqa %xmm0, (%r9) +; X64-NEXT: movl %ecx, (%r8) +; X64-NEXT: retq +entry: + %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %index, i32* %iptr + ret void +} + +define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpestr_mask_index_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movdqa %xmm0, %xmm2 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm2 +; X32-NEXT: setb %bl +; X32-NEXT: movdqa %xmm0, (%ebp) +; X32-NEXT: movl %ecx, (%edi) +; X32-NEXT: movl %ebx, (%esi) +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: 
pcmpestr_mask_index_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %edx +; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0 +; X64-NEXT: xorl %esi, %esi +; X64-NEXT: pcmpestri $24, %xmm1, %xmm2 +; X64-NEXT: setb %sil +; X64-NEXT: movdqa %xmm0, (%r10) +; X64-NEXT: movl %ecx, (%r9) +; X64-NEXT: movl %esi, (%r8) +; X64-NEXT: retq +entry: + %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %index, i32* %iptr + store i32 %flag, i32* %fptr + ret void +} + +define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_index_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: setb %dl +; X32-NEXT: movl %ecx, (%esi) +; X32-NEXT: movl %edx, (%eax) +; X32-NEXT: popl %esi +; X32-NEXT: retl +; +; X64-LABEL: pcmpistr_index_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT: setb %al +; X64-NEXT: movl %ecx, (%rdi) +; X64-NEXT: movl %eax, (%rsi) +; X64-NEXT: retq +entry: + %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + store i32 %index, i32* %iptr + store i32 %flag, i32* %fptr + ret void +} + +define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_mask_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: setb %cl +; X32-NEXT: movdqa %xmm0, (%edx) +; X32-NEXT: movl %ecx, (%eax) +; X32-NEXT: retl +; +; X64-LABEL: pcmpistr_mask_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X64-NEXT: setb %al +; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: movl %eax, (%rsi) +; X64-NEXT: retq +entry: + %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %flag, i32* %fptr + ret void +} + +define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind { +; X32-LABEL: pcmpistr_mask_index: +; X32: # %bb.0: # %entry +; X32-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movdqa %xmm0, (%edx) +; X32-NEXT: movl %ecx, (%eax) +; X32-NEXT: retl +; +; X64-LABEL: pcmpistr_mask_index: +; X64: # %bb.0: # %entry +; X64-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: movl %ecx, (%rsi) +; X64-NEXT: retq +entry: + %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> 
%rhs, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %index, i32* %iptr + ret void +} + +define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_mask_index_flag: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %esi +; X32-NEXT: movdqa %xmm0, %xmm2 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpistri $24, %xmm1, %xmm2 +; X32-NEXT: setb %bl +; X32-NEXT: movdqa %xmm0, (%esi) +; X32-NEXT: movl %ecx, (%edx) +; X32-NEXT: movl %ebx, (%eax) +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebx +; X32-NEXT: retl +; +; X64-LABEL: pcmpistr_mask_index_flag: +; X64: # %bb.0: # %entry +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: pcmpistri $24, %xmm1, %xmm2 +; X64-NEXT: setb %al +; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: movl %ecx, (%rsi) +; X64-NEXT: movl %eax, (%rdx) +; X64-NEXT: retq +entry: + %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %index, i32* %iptr + store i32 %flag, i32* %fptr + ret void +} + +; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri. +define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_mask_index_flag_load: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %esi +; X32-NEXT: movdqa %xmm0, %xmm1 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movdqu (%ecx), %xmm2 +; X32-NEXT: pcmpistrm $24, %xmm2, %xmm0 +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpistri $24, %xmm2, %xmm1 +; X32-NEXT: setb %bl +; X32-NEXT: movdqa %xmm0, (%esi) +; X32-NEXT: movl %ecx, (%edx) +; X32-NEXT: movl %ebx, (%eax) +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebx +; X32-NEXT: retl +; +; X64-LABEL: pcmpistr_mask_index_flag_load: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: movdqu (%rdi), %xmm2 +; X64-NEXT: pcmpistrm $24, %xmm2, %xmm0 +; X64-NEXT: xorl %edi, %edi +; X64-NEXT: pcmpistri $24, %xmm2, %xmm1 +; X64-NEXT: setb %dil +; X64-NEXT: movdqa %xmm0, (%rsi) +; X64-NEXT: movl %ecx, (%rdx) +; X64-NEXT: movl %edi, (%rax) +; X64-NEXT: retq +entry: + %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1 + %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) + store <16 x i8> %mask, <16 x i8>* %mptr + store i32 %index, i32* %iptr + store i32 %flag, i32* %fptr + ret void +} + +; Make sure we don't fold nontemporal loads. 
+define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_nontemporal: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movntdqa (%ecx), %xmm1 +; X32-NEXT: xorl %ebx, %ebx +; X32-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: popl %ebx +; X32-NEXT: retl +; +; X64-LABEL: pcmpestri_nontemporal: +; X64: # %bb.0: # %entry +; X64-NEXT: movntdqa (%rsi), %xmm1 +; X64-NEXT: xorl %esi, %esi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT: setb %sil +; X64-NEXT: movl %esi, %eax +; X64-NEXT: retq +entry: + %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0 + %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) + ret i32 %flag +} + +!0 = !{ i32 1 }
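
Note (editorial illustration, not part of the patch): the user-visible payoff of merging the ISD nodes is that a pair of SSE4.2 string intrinsics sharing operands now selects to a single instruction. Below is a minimal C sketch assuming clang with SSE4.2 enabled (-msse4.2); find_byte is a hypothetical helper, while _mm_cmpistri and _mm_cmpistrc are the standard <nmmintrin.h> wrappers for the pcmpistri128/pcmpistric128 intrinsics exercised by the tests above.

#include <nmmintrin.h>

/* Both calls lower to X86ISD::PCMPISTR with identical operands, so the DAG
   CSEs them into one node and isel emits a single pcmpistri; the setb that
   materializes the flag reads EFLAGS produced by that same instruction. */
int find_byte(__m128i needle, __m128i hay, int *found) {
  int idx = _mm_cmpistri(needle, hay, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
  *found = _mm_cmpistrc(needle, hay, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
  return idx;
}

As the pcmpistr_mask_index_flag_load test above demonstrates, a load feeding the comparison is only folded when a single instruction suffices; folding it into two instructions would repeat the memory access.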