[X86] Make the STTNI flag intrinsics use the flags from pcmpestrm/pcmpistrm if the mask instrinsics are also used in the same basic block.

Summary:
Previously the flag intrinsics always used the index instructions even if a mask instruction also exists.

To fix fix this I've created a single ISD node type that returns index, mask, and flags. The SelectionDAG CSE process will merge all flavors of intrinsics with the same inputs to a s ingle node. Then during isel we just have to look at which results are used to know what instruction to generate. If both mask and index are used we'll need to emit two instructions. But for all other cases we can emit a single instruction.

Since I had to do manual isel anyway, I've removed the pseudo instructions and custom inserter code that was working around tablegen limitations with multiple implicit defs.

I've also renamed the recently added sse42.ll test case to sttni.ll since it focuses on that subset of the sse4.2 instructions.

Reviewers: chandlerc, RKSimon, spatel

Reviewed By: chandlerc

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D46202

llvm-svn: 331091
This commit is contained in:
Craig Topper 2018-04-27 22:15:33 +00:00
parent 4b542c6e64
commit d656410293
7 changed files with 588 additions and 217 deletions

View File

@ -226,7 +226,7 @@ namespace {
SDValue &Index, SDValue &Disp,
SDValue &Segment);
// Convience method where P is also root.
// Convenience method where P is also root.
bool tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
@ -234,6 +234,12 @@ namespace {
return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
}
// Try to fold a vector load. This makes sure the load isn't non-temporal.
bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment);
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
@ -449,6 +455,12 @@ namespace {
bool matchBEXTRFromAnd(SDNode *Node);
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node);
MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node,
SDValue &InFlag);
};
}
@ -2006,6 +2018,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
if (!ISD::isNON_EXTLoad(N.getNode()) ||
useNonTemporalLoad(cast<LoadSDNode>(N)) ||
!IsProfitableToFold(N, P, Root) ||
!IsLegalToFold(N, P, Root, OptLevel))
return false;
return selectAddr(N.getNode(),
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
@ -2563,6 +2589,83 @@ bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
return true;
}
// Emit a PCMISTR(I/M) instruction.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
bool MayFoldLoad, const SDLoc &dl,
MVT VT, SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
SDValue Imm = Node->getOperand(2);
const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
// If there is a load, it will be behind a bitcast. We don't need to check
// alignment on this load.
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
Tmp3, Tmp4)) {
SDValue Load = N1.getOperand(0);
SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
Load.getOperand(0) };
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
// Record the mem-refs
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
CNode->setMemRefs(MemOp, MemOp + 1);
return CNode;
}
SDValue Ops[] = { N0, N1, Imm };
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
return CNode;
}
// Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
// to emit a second instruction after this one. This is needed since we have two
// copyToReg nodes glued before this and we need to continue that glue through.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
bool MayFoldLoad, const SDLoc &dl,
MVT VT, SDNode *Node,
SDValue &InFlag) {
SDValue N0 = Node->getOperand(0);
SDValue N2 = Node->getOperand(2);
SDValue Imm = Node->getOperand(4);
const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
// If there is a load, it will be behind a bitcast. We don't need to check
// alignment on this load.
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
Tmp3, Tmp4)) {
SDValue Load = N2.getOperand(0);
SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
Load.getOperand(0), InFlag };
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
InFlag = SDValue(CNode, 3);
// Update the chain.
ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
// Record the mem-refs
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
CNode->setMemRefs(MemOp, MemOp + 1);
return CNode;
}
SDValue Ops[] = { N0, N2, Imm, InFlag };
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
InFlag = SDValue(CNode, 2);
return CNode;
}
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
@ -3184,6 +3287,70 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
case X86ISD::PCMPISTR: {
if (!Subtarget->hasSSE42())
break;
bool NeedIndex = !SDValue(Node, 0).use_empty();
bool NeedMask = !SDValue(Node, 1).use_empty();
// We can't fold a load if we are going to make two instructions.
bool MayFoldLoad = !NeedIndex || !NeedMask;
MachineSDNode *CNode;
if (NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
}
if (NeedIndex || !NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
}
// Connect the flag usage to the last instruction created.
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 0));
CurDAG->RemoveDeadNode(Node);
return;
}
case X86ISD::PCMPESTR: {
if (!Subtarget->hasSSE42())
break;
// Copy the two implicit register inputs.
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
Node->getOperand(1),
SDValue()).getValue(1);
InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
Node->getOperand(3), InFlag).getValue(1);
bool NeedIndex = !SDValue(Node, 0).use_empty();
bool NeedMask = !SDValue(Node, 1).use_empty();
// We can't fold a load if we are going to make two instructions.
bool MayFoldLoad = !NeedIndex || !NeedMask;
MachineSDNode *CNode;
if (NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
InFlag);
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
}
if (NeedIndex || !NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
}
// Connect the flag usage to the last instruction created.
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
CurDAG->RemoveDeadNode(Node);
return;
}
case ISD::STORE:
if (foldLoadStoreIntoMemOperand(Node))
return;

View File

@ -20947,50 +20947,50 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
Opcode = X86ISD::PCMPISTRI;
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpestria128:
Opcode = X86ISD::PCMPESTRI;
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpistric128:
Opcode = X86ISD::PCMPISTRI;
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpestric128:
Opcode = X86ISD::PCMPESTRI;
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpistrio128:
Opcode = X86ISD::PCMPISTRI;
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpestrio128:
Opcode = X86ISD::PCMPESTRI;
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpistris128:
Opcode = X86ISD::PCMPISTRI;
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpestris128:
Opcode = X86ISD::PCMPESTRI;
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpistriz128:
Opcode = X86ISD::PCMPISTRI;
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_sse42_pcmpestriz128:
Opcode = X86ISD::PCMPESTRI;
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_E;
break;
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
@ -20998,15 +20998,28 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
Opcode = X86ISD::PCMPISTRI;
Opcode = X86ISD::PCMPISTR;
else
Opcode = X86ISD::PCMPESTRI;
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
case Intrinsic::x86_sse42_pcmpistrm128:
case Intrinsic::x86_sse42_pcmpestrm128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
Opcode = X86ISD::PCMPISTR;
else
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
}
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@ -25794,8 +25807,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
@ -26179,79 +26192,6 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
return sinkMBB;
}
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
const TargetInstrInfo *TII) {
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
}
DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
unsigned NumArgs = MI.getNumOperands();
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
.addReg(X86::XMM0);
MI.eraseFromParent();
return BB;
}
// FIXME: Custom handling because TableGen doesn't support multiple implicit
// defs in an instruction pattern
static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
const TargetInstrInfo *TII) {
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
}
DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
unsigned NumArgs = MI.getNumOperands(); // remove the results
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
.addReg(X86::ECX);
MI.eraseFromParent();
return BB;
}
static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
const X86Subtarget &Subtarget) {
DebugLoc dl = MI.getDebugLoc();
@ -28167,32 +28107,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
// String/text processing lowering.
case X86::PCMPISTRM128REG:
case X86::VPCMPISTRM128REG:
case X86::PCMPISTRM128MEM:
case X86::VPCMPISTRM128MEM:
case X86::PCMPESTRM128REG:
case X86::VPCMPESTRM128REG:
case X86::PCMPESTRM128MEM:
case X86::VPCMPESTRM128MEM:
assert(Subtarget.hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
case X86::VPCMPISTRIREG:
case X86::PCMPISTRIMEM:
case X86::VPCMPISTRIMEM:
case X86::PCMPESTRIREG:
case X86::VPCMPESTRIREG:
case X86::PCMPESTRIMEM:
case X86::VPCMPESTRIMEM:
assert(Subtarget.hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);

View File

@ -576,8 +576,13 @@ namespace llvm {
RDSEED,
// SSE42 string comparisons.
PCMPISTRI,
PCMPESTRI,
// These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
// will emit one or two instructions based on which results are used. If
// flags and index/mask this allows us to use a single instruction since
// we won't have to pick and opcode for flags. Instead we can rely on the
// DAG to CSE everything and decide at isel.
PCMPISTR,
PCMPESTR,
// Test if in transactional execution.
XTEST,

View File

@ -555,17 +555,6 @@ def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>;
def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>;
def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>;
def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
SDTCisVT<4, i8>]>;
def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
SDTCisVT<6, i8>]>;
def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,

View File

@ -632,9 +632,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
{ X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
{ X86::PCMPESTRIrr, X86::PCMPESTRIrm, 0 },
{ X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, 0 },
{ X86::PCMPESTRMrr, X86::PCMPESTRMrm, 0 },
{ X86::PCMPISTRIrr, X86::PCMPISTRIrm, 0 },
{ X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, 0 },
{ X86::PCMPISTRMrr, X86::PCMPISTRMrm, 0 },
{ X86::PHMINPOSUWrr, X86::PHMINPOSUWrm, TB_ALIGN_16 },
{ X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
{ X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
@ -736,9 +736,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPABSDrr, X86::VPABSDrm, 0 },
{ X86::VPABSWrr, X86::VPABSWrm, 0 },
{ X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
{ X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
{ X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0 },
{ X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
{ X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
{ X86::VPCMPISTRMrr, X86::VPCMPISTRMrm, 0 },
{ X86::VPHMINPOSUWrr, X86::VPHMINPOSUWrm, 0 },
{ X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
{ X86::VPERMILPSri, X86::VPERMILPSmi, 0 },

View File

@ -6383,25 +6383,6 @@ let Constraints = "$src1 = $dst" in
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//
// Packed Compare Implicit Length Strings, Return Mask
multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
def REG : PseudoI<(outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
[(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
imm:$src3))]>;
def MEM : PseudoI<(outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
[(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
(bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}
let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
Requires<[HasAVX]>, VEX_WIG;
defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", loadv2i64>,
Requires<[UseSSE42]>;
}
multiclass pcmpistrm_SS42AI<string asm> {
def rr : SS42AI<0x62, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
@ -6416,27 +6397,8 @@ multiclass pcmpistrm_SS42AI<string asm> {
let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
// Packed Compare Explicit Length Strings, Return Mask
multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
def REG : PseudoI<(outs VR128:$dst),
(ins VR128:$src1, VR128:$src3, u8imm:$src5),
[(set VR128:$dst, (int_x86_sse42_pcmpestrm128
VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
def MEM : PseudoI<(outs VR128:$dst),
(ins VR128:$src1, i128mem:$src3, u8imm:$src5),
[(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
(bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
}
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
Requires<[HasAVX]>;
defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", loadv2i64>,
Requires<[UseSSE42]>;
defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
multiclass SS42AI_pcmpestrm<string asm> {
@ -6453,27 +6415,8 @@ multiclass SS42AI_pcmpestrm<string asm> {
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
}
// Packed Compare Implicit Length Strings, Return Index
multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
def REG : PseudoI<(outs GR32:$dst),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
[(set GR32:$dst, EFLAGS,
(X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
def MEM : PseudoI<(outs GR32:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
[(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
(bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}
let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
Requires<[HasAVX]>, VEX_WIG;
defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", loadv2i64>,
Requires<[UseSSE42]>;
defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}
multiclass SS42AI_pcmpistri<string asm> {
@ -6494,26 +6437,6 @@ let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}
// Packed Compare Explicit Length Strings, Return Index
multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
def REG : PseudoI<(outs GR32:$dst),
(ins VR128:$src1, VR128:$src3, u8imm:$src5),
[(set GR32:$dst, EFLAGS,
(X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
def MEM : PseudoI<(outs GR32:$dst),
(ins VR128:$src1, i128mem:$src3, u8imm:$src5),
[(set GR32:$dst, EFLAGS,
(X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
imm:$src5))]>;
}
let Defs = [EFLAGS], Uses = [EAX, EDX], hasNoSchedulingInfo = 1, usesCustomInserter = 1 in {
defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
Requires<[HasAVX]>;
defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", loadv2i64>,
Requires<[UseSSE42]>;
}
multiclass SS42AI_pcmpestri<string asm> {
def rr : SS42AI<0x61, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src3, u8imm:$src5),

View File

@ -4,8 +4,10 @@
declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8)
define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
; X32-LABEL: pcmpestri_reg_eq_i8:
@ -962,3 +964,374 @@ exit:
%result_ext = zext i16 %result to i32
ret i32 %result_ext
}
define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind {
; X32-LABEL: pcmpestr_index_flag:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
; X32-NEXT: setb %bl
; X32-NEXT: movl %ecx, (%edi)
; X32-NEXT: movl %ebx, (%esi)
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; X64-LABEL: pcmpestr_index_flag:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rcx, %r8
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: xorl %r10d, %r10d
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
; X64-NEXT: setb %r10b
; X64-NEXT: movl %ecx, (%r9)
; X64-NEXT: movl %r10d, (%r8)
; X64-NEXT: retq
entry:
%flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
%index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
store i32 %index, i32* %iptr
store i32 %flag, i32* %fptr
ret void
}
define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind {
; X32-LABEL: pcmpestr_mask_flag:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X32-NEXT: setb %bl
; X32-NEXT: movdqa %xmm0, (%esi)
; X32-NEXT: movl %ebx, (%ecx)
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; X64-LABEL: pcmpestr_mask_flag:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: xorl %r9d, %r9d
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT: setb %r9b
; X64-NEXT: movdqa %xmm0, (%r8)
; X64-NEXT: movl %r9d, (%rcx)
; X64-NEXT: retq
entry:
%flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
%mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
store <16 x i8> %mask, <16 x i8>* %mptr
store i32 %flag, i32* %fptr
ret void
}
define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind {
; X32-LABEL: pcmpestr_mask_index:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: movdqa %xmm0, %xmm2
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: pcmpestri $24, %xmm1, %xmm2
; X32-NEXT: movdqa %xmm0, (%edi)
; X32-NEXT: movl %ecx, (%esi)
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: retl
;
; X64-LABEL: pcmpestr_mask_index:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rcx, %r8
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT: pcmpestri $24, %xmm1, %xmm2
; X64-NEXT: movdqa %xmm0, (%r9)
; X64-NEXT: movl %ecx, (%r8)
; X64-NEXT: retq
entry:
%index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
%mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
store <16 x i8> %mask, <16 x i8>* %mptr
store i32 %index, i32* %iptr
ret void
}
define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
; X32-LABEL: pcmpestr_mask_index_flag:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: movdqa %xmm0, %xmm2
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: pcmpestri $24, %xmm1, %xmm2
; X32-NEXT: setb %bl
; X32-NEXT: movdqa %xmm0, (%ebp)
; X32-NEXT: movl %ecx, (%edi)
; X32-NEXT: movl %ebx, (%esi)
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: pcmpestr_mask_index_flag:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rcx, %r9
; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT: xorl %esi, %esi
; X64-NEXT: pcmpestri $24, %xmm1, %xmm2
; X64-NEXT: setb %sil
; X64-NEXT: movdqa %xmm0, (%r10)
; X64-NEXT: movl %ecx, (%r9)
; X64-NEXT: movl %esi, (%r8)
; X64-NEXT: retq
entry:
%index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
%mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
%flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
store <16 x i8> %mask, <16 x i8>* %mptr
store i32 %index, i32* %iptr
store i32 %flag, i32* %fptr
ret void
}
define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind {
; X32-LABEL: pcmpistr_index_flag:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: setb %dl
; X32-NEXT: movl %ecx, (%esi)
; X32-NEXT: movl %edx, (%eax)
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; X64-LABEL: pcmpistr_index_flag:
; X64: # %bb.0: # %entry
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
; X64-NEXT: setb %al
; X64-NEXT: movl %ecx, (%rdi)
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
entry:
%flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
%index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
store i32 %index, i32* %iptr
store i32 %flag, i32* %fptr
ret void
}
define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind {
; X32-LABEL: pcmpistr_mask_flag:
; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl %ecx, %ecx
; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: setb %cl
; X32-NEXT: movdqa %xmm0, (%edx)
; X32-NEXT: movl %ecx, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: pcmpistr_mask_flag:
; X64: # %bb.0: # %entry
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
; X64-NEXT: setb %al
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
entry:
%flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
%mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
store <16 x i8> %mask, <16 x i8>* %mptr
store i32 %flag, i32* %fptr
ret void
}
define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind {
; X32-LABEL: pcmpistr_mask_index:
; X32: # %bb.0: # %entry
; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movdqa %xmm0, (%edx)
; X32-NEXT: movl %ecx, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: pcmpistr_mask_index:
; X64: # %bb.0: # %entry
; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: movl %ecx, (%rsi)
; X64-NEXT: retq
entry:
%index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
%mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
store <16 x i8> %mask, <16 x i8>* %mptr
store i32 %index, i32* %iptr
ret void
}
define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
; X32-LABEL: pcmpistr_mask_index_flag:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %esi
; X32-NEXT: movdqa %xmm0, %xmm2
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0
; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: pcmpistri $24, %xmm1, %xmm2
; X32-NEXT: setb %bl
; X32-NEXT: movdqa %xmm0, (%esi)
; X32-NEXT: movl %ecx, (%edx)
; X32-NEXT: movl %ebx, (%eax)
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; X64-LABEL: pcmpistr_mask_index_flag:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: pcmpistri $24, %xmm1, %xmm2
; X64-NEXT: setb %al
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: movl %ecx, (%rsi)
; X64-NEXT: movl %eax, (%rdx)
; X64-NEXT: retq
entry:
%index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
%mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
%flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
store <16 x i8> %mask, <16 x i8>* %mptr
store i32 %index, i32* %iptr
store i32 %flag, i32* %fptr
ret void
}
; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri.
define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
; X32-LABEL: pcmpistr_mask_index_flag_load:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %esi
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movdqu (%ecx), %xmm2
; X32-NEXT: pcmpistrm $24, %xmm2, %xmm0
; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: pcmpistri $24, %xmm2, %xmm1
; X32-NEXT: setb %bl
; X32-NEXT: movdqa %xmm0, (%esi)
; X32-NEXT: movl %ecx, (%edx)
; X32-NEXT: movl %ebx, (%eax)
; X32-NEXT: popl %esi
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; X64-LABEL: pcmpistr_mask_index_flag_load:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: movdqu (%rdi), %xmm2
; X64-NEXT: pcmpistrm $24, %xmm2, %xmm0
; X64-NEXT: xorl %edi, %edi
; X64-NEXT: pcmpistri $24, %xmm2, %xmm1
; X64-NEXT: setb %dil
; X64-NEXT: movdqa %xmm0, (%rsi)
; X64-NEXT: movl %ecx, (%rdx)
; X64-NEXT: movl %edi, (%rax)
; X64-NEXT: retq
entry:
%rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1
%index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
%mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
%flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
store <16 x i8> %mask, <16 x i8>* %mptr
store i32 %index, i32* %iptr
store i32 %flag, i32* %fptr
ret void
}
; Make sure we don't fold nontemporal loads.
define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind {
; X32-LABEL: pcmpestri_nontemporal:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movntdqa (%ecx), %xmm1
; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
; X32-NEXT: setb %bl
; X32-NEXT: movl %ebx, %eax
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; X64-LABEL: pcmpestri_nontemporal:
; X64: # %bb.0: # %entry
; X64-NEXT: movntdqa (%rsi), %xmm1
; X64-NEXT: xorl %esi, %esi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
; X64-NEXT: setb %sil
; X64-NEXT: movl %esi, %eax
; X64-NEXT: retq
entry:
%rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0
%flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
ret i32 %flag
}
!0 = !{ i32 1 }