[X86] Use setcc ISD opcode for AVX512 integer comparisons all the way to isel
I don't believe there is any real reason to have separate X86-specific opcodes for vector compares. SETCC has the same behavior; it just uses a different encoding for the condition code. I had to change the CondCodeAction for SETLT and SETLE to prevent some transforms from changing the SETGT lowering.

Differential Revision: https://reviews.llvm.org/D43608

llvm-svn: 335173
This commit is contained in:
parent 11b02759a3
commit c2696d577b
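In practice the change swaps an explicitly-encoded target node for the generic one. A before/after sketch of the DAG construction, assuming a typical SelectionDAG lowering context (DAG, dl, MaskVT, LHS, RHS are stand-ins, not names from this patch):

    // Before: the condition rides along as an i8 immediate on a target node
    // (6 is the VPCMP encoding for "greater than").
    SDValue OldCmp = DAG.getNode(X86ISD::CMPM, dl, MaskVT, LHS, RHS,
                                 DAG.getConstant(6, dl, MVT::i8));

    // After: the generic node carries an ISD::CondCode, and the immediate is
    // only materialized during isel via X86::getVPCMPImmForCond.
    SDValue NewCmp = DAG.getSetCC(dl, MaskVT, LHS, RHS, ISD::SETGT);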
@@ -470,7 +470,7 @@ namespace {
   // type.
   static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
     unsigned Opcode = N->getOpcode();
-    if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMU ||
+    if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
         Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
       // We can get 256-bit 8 element types here without VLX being enabled. When
       // this happens we will use 512-bit operations and the mask will not be

@@ -814,6 +814,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::CTPOP, VT, Custom);
       setOperationAction(ISD::CTTZ, VT, Custom);
+
+      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+      // setcc all the way to isel and prefer SETGT in some isel patterns.
+      setCondCodeAction(ISD::SETLT, VT, Custom);
+      setCondCodeAction(ISD::SETLE, VT, Custom);
     }

     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
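This hunk implements the CondCodeAction change called out in the commit message: SETLT and SETLE stop reporting as Legal, so DAG transforms will not canonicalize compares into forms the SETGT-preferring isel patterns cannot match. A minimal sketch of the query side, assuming a TargetLowering reference `TLI` and a vector type `VT` (names are stand-ins, not code from this patch):

    // Combines guard condition-code rewrites with queries like this one;
    // reporting Custom instead of Legal makes the guard fail, so the SETGT
    // form produced during lowering survives to instruction selection.
    if (!TLI.isCondCodeLegal(ISD::SETLT, VT)) {
      // leave the setcc alone; the target handles it in its own lowering
    }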
@@ -1056,6 +1061,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::CTPOP, VT, Custom);
       setOperationAction(ISD::CTTZ, VT, Custom);
       setOperationAction(ISD::CTLZ, VT, Custom);
+
+      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+      // setcc all the way to isel and prefer SETGT in some isel patterns.
+      setCondCodeAction(ISD::SETLT, VT, Custom);
+      setCondCodeAction(ISD::SETLE, VT, Custom);
     }

     if (Subtarget.hasAnyFMA()) {
@@ -1338,6 +1348,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::CTTZ, VT, Custom);
       setOperationAction(ISD::ROTL, VT, Custom);
       setOperationAction(ISD::ROTR, VT, Custom);
+      setOperationAction(ISD::SETCC, VT, Custom);
+
+      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+      // setcc all the way to isel and prefer SETGT in some isel patterns.
+      setCondCodeAction(ISD::SETLT, VT, Custom);
+      setCondCodeAction(ISD::SETLE, VT, Custom);
     }

     // Need to promote to 64-bit even though we have 32-bit masked instructions
@@ -1551,6 +1567,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::UMAX, VT, Legal);
       setOperationAction(ISD::SMIN, VT, Legal);
       setOperationAction(ISD::UMIN, VT, Legal);
+      setOperationAction(ISD::SETCC, VT, Custom);

       setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
       setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
@@ -5180,8 +5197,8 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
   default:
     return false;
   case X86ISD::CMPM:
-  case X86ISD::CMPMU:
   case X86ISD::CMPM_RND:
+  case ISD::SETCC:
    return true;
  }
}
@@ -6978,17 +6995,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
         BOperand = ZeroExtended.getOperand(0);
       else
         BOperand = Ld.getOperand(0).getOperand(0);
       if (BOperand.getValueType().isVector() &&
           BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
-        MVT MaskVT = BOperand.getSimpleValueType();
-        if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
-            (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
+        if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
+                                     NumElts == 8)) || // for broadcastmb2q
+            (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
+                                     NumElts == 16))) { // for broadcastmw2d
           SDValue Brdcst =
               DAG.getNode(X86ISD::VBROADCASTM, dl,
                           MVT::getVectorVT(EltType, NumElts), BOperand);
           return DAG.getBitcast(VT, Brdcst);
         }
       }
@@ -14757,8 +14770,8 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   int NumElems = VT.getVectorNumElements();
   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
       (Subtarget.hasDQI() && (NumElems < 32)))
-    return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, ExtVT),
-                       Shuffle, DAG.getConstant(6, DL, MVT::i8));
+    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
+                        Shuffle, ISD::SETGT);

  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
@@ -14996,9 +15009,9 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
          "Should have a size-matched integer condition!");
   // Build a mask by testing the condition against zero.
   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
-  SDValue Mask = DAG.getNode(X86ISD::CMPM, dl, MaskVT, Cond,
-                             getZeroVector(VT, Subtarget, DAG, dl),
-                             DAG.getConstant(4, dl, MVT::i8));
+  SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
+                              getZeroVector(VT, Subtarget, DAG, dl),
+                              ISD::SETNE);
   // Now return a new VSELECT using the mask.
   return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
 }
@@ -16962,8 +16975,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                        DAG.getConstant(ShiftInx, DL, ExtVT));
       In = DAG.getBitcast(InVT, In);
     }
-    return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, InVT),
-                       In, DAG.getConstant(6, DL, MVT::i8));
+    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
+                        In, ISD::SETGT);
   }
   // Use TESTD/Q, extended vector to packed dword/qword.
   assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
@@ -17010,11 +17023,10 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
   }
   // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
   if (Subtarget.hasDQI())
-    return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, InVT),
-                       In, DAG.getConstant(6, DL, MVT::i8));
-  return DAG.getNode(X86ISD::CMPM, DL, VT, In,
-                     getZeroVector(InVT, Subtarget, DAG, DL),
-                     DAG.getConstant(4, DL, MVT::i8));
+    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
+                        In, ISD::SETGT);
+  return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
+                      ISD::SETNE);
 }

 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
@@ -18091,28 +18103,13 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
       ISD::isBuildVectorAllZeros(Op0.getNode()))
     std::swap(Op0, Op1);

-  bool Swap = false;
-  unsigned SSECC;
-  switch (SetCCOpcode) {
-  default: llvm_unreachable("Unexpected SETCC condition");
-  case ISD::SETNE:  SSECC = 4; break;
-  case ISD::SETEQ:  SSECC = 0; break;
-  case ISD::SETULT: SSECC = 1; break;
-  case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
-  case ISD::SETUGT:
-  case ISD::SETGT: SSECC = 6; break;
-  case ISD::SETUGE:
-  case ISD::SETGE: SSECC = 5; break;
-  case ISD::SETULE:
-  case ISD::SETLE: SSECC = 2; break;
-  }
-  if (Swap)
+  // Prefer SETGT over SETLT.
+  if (SetCCOpcode == ISD::SETLT) {
+    SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
     std::swap(Op0, Op1);
+  }

-  unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) ? X86ISD::CMPMU
-                                                      : X86ISD::CMPM;
-  return DAG.getNode(Opc, dl, VT, Op0, Op1,
-                     DAG.getConstant(SSECC, dl, MVT::i8));
+  return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
 }

 /// Try to turn a VSETULT into a VSETULE by modifying its second
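The replacement body above rests on the usual comparison identity: swapping operands mirrors the condition, so SETLT can always be retired in favor of SETGT before isel. A tiny self-contained illustration of that identity (plain C++, not LLVM code):

    #include <cassert>

    // a < b holds exactly when b > a; LowerIntVSETCC_AVX512 uses the same
    // fact, via ISD::getSetCCSwappedOperands, to emit only SETGT compares.
    int main() {
      int a = 3, b = 7;
      assert((a < b) == (b > a));   // SETLT(a, b) == SETGT(b, a)
      assert((a <= b) == (b >= a)); // and SETLE(a, b) == SETGE(b, a)
    }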
@@ -20167,7 +20164,6 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
   default: break;
   case X86ISD::CMPM:
   case X86ISD::CMPM_RND:
-  case X86ISD::CMPMU:
   case X86ISD::VPSHUFBITQMB:
   case X86ISD::VFPCLASS:
     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
@@ -22978,8 +22974,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
       SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
       if (VT.is512BitVector()) {
         assert(VT == MVT::v64i8 && "Unexpected element type!");
-        SDValue CMP = DAG.getNode(X86ISD::CMPM, dl, MVT::v64i1, Zeros, R,
-                                  DAG.getConstant(6, dl, MVT::i8));
+        SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R,
+                                   ISD::SETGT);
         return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
       }
       return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
@@ -23499,9 +23495,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
         V0 = DAG.getBitcast(VT, V0);
         V1 = DAG.getBitcast(VT, V1);
         Sel = DAG.getBitcast(VT, Sel);
-        Sel = DAG.getNode(X86ISD::CMPM, dl, MaskVT,
-                          DAG.getConstant(0, dl, VT), Sel,
-                          DAG.getConstant(6, dl, MVT::i8));
+        Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
+                           ISD::SETGT);
         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
       } else if (Subtarget.hasSSE41()) {
         // On SSE41 targets we make use of the fact that VSELECT lowers
@@ -25716,7 +25711,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::COMI: return "X86ISD::COMI";
   case X86ISD::UCOMI: return "X86ISD::UCOMI";
   case X86ISD::CMPM: return "X86ISD::CMPM";
-  case X86ISD::CMPMU: return "X86ISD::CMPMU";
   case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
   case X86ISD::SETCC: return "X86ISD::SETCC";
   case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";

@@ -345,7 +345,6 @@ namespace llvm {
       /// Vector comparison generating mask bits for fp and
       /// integer signed and unsigned data types.
       CMPM,
-      CMPMU,
       // Vector comparison with rounding mode for FP values
       CMPM_RND,

@@ -2160,7 +2160,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
                 (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
-                                          (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+                                          (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
                 EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
   let isCommutable = IsCommutable in
   def rrk : AVX512BI<opc, MRMSrcReg,
@@ -2240,11 +2240,15 @@ multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,

 // This fragment treats X86cmpm as commutable to help match loads in both
 // operands for PCMPEQ.
+def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>;
 def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2),
-                           (X86cmpm_c node:$src1, node:$src2, (i8 0))>;
+                           (X86setcc_commute node:$src1, node:$src2, SETEQ)>;
 def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
-                         (X86cmpm node:$src1, node:$src2, (i8 6))>;
+                         (setcc node:$src1, node:$src2, SETGT)>;

+// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+// increase the pattern complexity the way an immediate would.
+let AddedComplexity = 2 in {
 // FIXME: Is there a better scheduler class for VPCMP?
 defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c,
                                       SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>,
@@ -2277,33 +2281,29 @@ defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
 defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
                                           SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
                                           T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+}

-// Transforms to swizzle an immediate to help matching memory operand in first
-// operand.
-def CommutePCMPCC : SDNodeXForm<imm, [{
-  uint8_t Imm = N->getZExtValue() & 0x7;
-  Imm = X86::getSwappedVPCMPImm(Imm);
-  return getI8Imm(Imm, SDLoc(N));
-}]>;
-
-multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
-                          X86FoldableSchedWrite sched, X86VectorVTInfo _,
-                          string Name> {
+multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
+                          PatFrag CommFrag, X86FoldableSchedWrite sched,
+                          X86VectorVTInfo _, string Name> {
   let isCommutable = 1 in
   def rri : AVX512AIi8<opc, MRMSrcReg,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
              !strconcat("vpcmp${cc}", Suffix,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                       imm:$cc))]>,
+             [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1),
+                                                (_.VT _.RC:$src2),
+                                                cond)))]>,
              EVEX_4V, Sched<[sched]>;
   def rmi : AVX512AIi8<opc, MRMSrcMem,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
              !strconcat("vpcmp${cc}", Suffix,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
-                                       (_.VT (bitconvert (_.LdFrag addr:$src2))),
-                                       imm:$cc))]>,
+             [(set _.KRC:$dst, (_.KVT
+                                (Frag:$cc
+                                 (_.VT _.RC:$src1),
+                                 (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                                 cond)))]>,
              EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
   let isCommutable = 1 in
   def rrik : AVX512AIi8<opc, MRMSrcReg,
@@ -2313,8 +2313,9 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
                         "\t{$src2, $src1, $dst {${mask}}|",
                         "$dst {${mask}}, $src1, $src2}"),
              [(set _.KRC:$dst, (and _.KRCWM:$mask,
-                                (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                        imm:$cc)))]>,
+                                (_.KVT (Frag:$cc (_.VT _.RC:$src1),
+                                                 (_.VT _.RC:$src2),
+                                                 cond))))]>,
              EVEX_4V, EVEX_K, Sched<[sched]>;
   def rmik : AVX512AIi8<opc, MRMSrcMem,
              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
@@ -2323,9 +2324,12 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
                         "\t{$src2, $src1, $dst {${mask}}|",
                         "$dst {${mask}}, $src1, $src2}"),
              [(set _.KRC:$dst, (and _.KRCWM:$mask,
-                                (OpNode (_.VT _.RC:$src1),
-                                        (_.VT (bitconvert (_.LdFrag addr:$src2))),
-                                        imm:$cc)))]>,
+                                (_.KVT
+                                 (Frag:$cc
+                                  (_.VT _.RC:$src1),
+                                  (_.VT (bitconvert
+                                         (_.LdFrag addr:$src2))),
+                                  cond))))]>,
              EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;

   // Accept explicit immediate argument form instead of comparison code.
@@ -2359,31 +2363,34 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
                 NotMemoryFoldable;
   }

-  def : Pat<(OpNode (bitconvert (_.LdFrag addr:$src2)),
-                    (_.VT _.RC:$src1), imm:$cc),
-            (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
-                                                      (CommutePCMPCC imm:$cc))>;
+  def : Pat<(_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+                                 (_.VT _.RC:$src1), cond)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmi")
+             _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;

-  def : Pat<(and _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src2)),
-                                        (_.VT _.RC:$src1), imm:$cc)),
-            (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
-                                                       _.RC:$src1, addr:$src2,
-                                                       (CommutePCMPCC imm:$cc))>;
+  def : Pat<(and _.KRCWM:$mask,
+                 (_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+                                      (_.VT _.RC:$src1), cond))),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmik")
+             _.KRCWM:$mask, _.RC:$src1, addr:$src2,
+             (CommFrag.OperandTransform $cc))>;
 }

-multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
-                              X86FoldableSchedWrite sched, X86VectorVTInfo _,
-                              string Name> :
-         avx512_icmp_cc<opc, Suffix, OpNode, sched, _, Name> {
+multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
+                              PatFrag CommFrag, X86FoldableSchedWrite sched,
+                              X86VectorVTInfo _, string Name> :
+         avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched, _, Name> {
   def rmib : AVX512AIi8<opc, MRMSrcMem,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
                                      AVX512ICC:$cc),
              !strconcat("vpcmp${cc}", Suffix,
                         "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
                         "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
-             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
-                                       (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
-                                       imm:$cc))]>,
+             [(set _.KRC:$dst, (_.KVT (Frag:$cc
+                                       (_.VT _.RC:$src1),
+                                       (X86VBroadcast
+                                        (_.ScalarLdFrag addr:$src2)),
+                                       cond)))]>,
              EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
   def rmibk : AVX512AIi8<opc, MRMSrcMem,
              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
@@ -2392,9 +2399,11 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
                         "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
                         "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
              [(set _.KRC:$dst, (and _.KRCWM:$mask,
-                                (OpNode (_.VT _.RC:$src1),
-                                        (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
-                                        imm:$cc)))]>,
+                                (_.KVT (Frag:$cc
+                                        (_.VT _.RC:$src1),
+                                        (X86VBroadcast
+                                         (_.ScalarLdFrag addr:$src2)),
+                                        cond))))]>,
              EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;

   // Accept explicit immediate argument form instead of comparison code.
@@ -2417,77 +2426,118 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
                 NotMemoryFoldable;
   }

-  def : Pat<(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
-                    (_.VT _.RC:$src1), imm:$cc),
-            (!cast<Instruction>(Name#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2,
-                                                       (CommutePCMPCC imm:$cc))>;
+  def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+                                 (_.VT _.RC:$src1), cond)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmib")
+             _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;

-  def : Pat<(and _.KRCWM:$mask, (OpNode (X86VBroadcast
-                                         (_.ScalarLdFrag addr:$src2)),
-                                        (_.VT _.RC:$src1), imm:$cc)),
-            (!cast<Instruction>(Name#_.ZSuffix#"rmibk") _.KRCWM:$mask,
-                                                        _.RC:$src1, addr:$src2,
-                                                        (CommutePCMPCC imm:$cc))>;
+  def : Pat<(and _.KRCWM:$mask,
+                 (_.KVT (CommFrag:$cc (X86VBroadcast
+                                       (_.ScalarLdFrag addr:$src2)),
+                                      (_.VT _.RC:$src1), cond))),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmibk")
+             _.KRCWM:$mask, _.RC:$src1, addr:$src2,
+             (CommFrag.OperandTransform $cc))>;
 }

-multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
-                             X86SchedWriteWidths sched,
+multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag,
+                             PatFrag CommFrag, X86SchedWriteWidths sched,
                              AVX512VLVectorVTInfo VTInfo, Predicate prd> {
   let Predicates = [prd] in
-  defm Z : avx512_icmp_cc<opc, Suffix, OpNode, sched.ZMM, VTInfo.info512, NAME>,
-           EVEX_V512;
+  defm Z : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.ZMM,
+                          VTInfo.info512, NAME>, EVEX_V512;

   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, sched.YMM, VTInfo.info256,
-                               NAME>,
-                EVEX_V256;
-    defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, sched.XMM, VTInfo.info128,
-                               NAME>,
-                EVEX_V128;
+    defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.YMM,
+                               VTInfo.info256, NAME>, EVEX_V256;
+    defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.XMM,
+                               VTInfo.info128, NAME>, EVEX_V128;
   }
 }

-multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
-                                 X86SchedWriteWidths sched,
+multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, PatFrag Frag,
+                                 PatFrag CommFrag, X86SchedWriteWidths sched,
                                  AVX512VLVectorVTInfo VTInfo, Predicate prd> {
   let Predicates = [prd] in
-  defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, sched.ZMM,
+  defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.ZMM,
                               VTInfo.info512, NAME>, EVEX_V512;

   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, sched.YMM,
+    defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.YMM,
                                    VTInfo.info256, NAME>, EVEX_V256;
-    defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, sched.XMM,
+    defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.XMM,
                                    VTInfo.info128, NAME>, EVEX_V128;
   }
 }

-// FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, SchedWriteVecALU,
-                                avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
-defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, SchedWriteVecALU,
-                                 avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
+def X86pcmpm_imm : SDNodeXForm<setcc, [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+  return getI8Imm(SSECC, SDLoc(N));
+}]>;

-defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, SchedWriteVecALU,
-                                avx512vl_i16_info, HasBWI>,
-                                VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, SchedWriteVecALU,
-                                 avx512vl_i16_info, HasBWI>,
-                                 VEX_W, EVEX_CD8<16, CD8VF>;
+// Swapped operand version of the above.
+def X86pcmpm_imm_commute : SDNodeXForm<setcc, [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+  SSECC = X86::getSwappedVPCMPImm(SSECC);
+  return getI8Imm(SSECC, SDLoc(N));
+}]>;
+
+def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                       (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  return !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+// Same as above, but commutes immediate. Use for load folding.
+def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                               (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  return !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
+def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                        (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  return ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+// Same as above, but commutes immediate. Use for load folding.
+def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                                (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  return ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
+// FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_commute,
+                                SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+                                EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_commute,
+                                 SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+                                 EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_commute,
+                                SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
+                                VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_commute,
+                                 SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
+                                 VEX_W, EVEX_CD8<16, CD8VF>;

-defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, SchedWriteVecALU,
-                                    avx512vl_i32_info, HasAVX512>,
-                                    EVEX_CD8<32, CD8VF>;
-defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, SchedWriteVecALU,
-                                     avx512vl_i32_info, HasAVX512>,
-                                     EVEX_CD8<32, CD8VF>;
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_commute,
+                                    SchedWriteVecALU, avx512vl_i32_info,
+                                    HasAVX512>, EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_commute,
+                                     SchedWriteVecALU, avx512vl_i32_info,
+                                     HasAVX512>, EVEX_CD8<32, CD8VF>;

-defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, SchedWriteVecALU,
-                                    avx512vl_i64_info, HasAVX512>,
-                                    VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, SchedWriteVecALU,
-                                     avx512vl_i64_info, HasAVX512>,
-                                     VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_commute,
+                                    SchedWriteVecALU, avx512vl_i64_info,
+                                    HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_commute,
+                                     SchedWriteVecALU, avx512vl_i64_info,
+                                     HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;

 multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
                               string Name> {
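The commuted fragments above exist for load folding: VPCMP can only fold a memory operand in its second source, so a compare whose load is on the left must swap operands, and swapping operands requires swapping the immediate too. A standalone model of that immediate rewrite (assumption: mirrors X86::getSwappedVPCMPImm from this patch; the enum names are illustrative):

    #include <cassert>

    // VPCMP immediate encodings used by the patterns above.
    enum VPCMPImm { EQ = 0, LT = 1, LE = 2, NE = 4, NLT = 5, NLE = 6 };

    // Swapping the compare's operands maps each condition to its mirror.
    unsigned swappedVPCMPImm(unsigned Imm) {
      switch (Imm) {
      case LT:  return NLE; // a <  b  ==  b >  a
      case LE:  return NLT; // a <= b  ==  b >= a
      case NLT: return LE;  // a >= b  ==  b <= a
      case NLE: return LT;  // a >  b  ==  b <  a
      default:  return Imm; // EQ and NE are symmetric
      }
    }

    int main() {
      assert(swappedVPCMPImm(LT) == NLE); // fold "load < reg" as "reg > load"
      assert(swappedVPCMPImm(EQ) == EQ);  // equality needs no rewrite
    }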
@@ -3085,6 +3135,7 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
 defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShuffle>;
 defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>;

+// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction.
 multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr,
                                               X86VectorVTInfo Narrow,
                                               X86VectorVTInfo Wide> {
@@ -3107,9 +3158,34 @@ multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr,
                                              Narrow.KRC)>;
 }

-multiclass axv512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
+// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction.
+multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag,
+                                                 string InstStr,
                                                  X86VectorVTInfo Narrow,
                                                  X86VectorVTInfo Wide> {
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+                                (Narrow.VT Narrow.RC:$src2), cond)),
+          (COPY_TO_REGCLASS
+           (!cast<Instruction>(InstStr##Zrri)
+            (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+            (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+            (Frag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+                           (Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+                                                 (Narrow.VT Narrow.RC:$src2),
+                                                 cond)))),
+          (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+           (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+           (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+           (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+           (Frag.OperandTransform $cc)), Narrow.KRC)>;
+}
+
+// Same as above, but for fp types which don't use PatFrags.
+multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
+                                                X86VectorVTInfo Narrow,
+                                                X86VectorVTInfo Wide> {
 def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1),
                               (Narrow.VT Narrow.RC:$src2), imm:$cc)),
           (COPY_TO_REGCLASS
@@ -3129,6 +3205,9 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
 }

 let Predicates = [HasAVX512, NoVLX] in {
+  // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+  // increase the pattern complexity the way an immediate would.
+  let AddedComplexity = 2 in {
   defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v8i32x_info, v16i32_info>;
   defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v8i32x_info, v16i32_info>;
|
@ -3140,25 +3219,30 @@ let Predicates = [HasAVX512, NoVLX] in {
|
|||
|
||||
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v2i64x_info, v8i64_info>;
|
||||
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v2i64x_info, v8i64_info>;
|
||||
}
|
||||
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v8f32x_info, v16f32_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", v8i32x_info, v16i32_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUD", v8i32x_info, v16i32_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v8i32x_info, v16i32_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v8i32x_info, v16i32_info>;
|
||||
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v4f32x_info, v16f32_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", v4i32x_info, v16i32_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUD", v4i32x_info, v16i32_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v4i32x_info, v16i32_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v4i32x_info, v16i32_info>;
|
||||
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v4f64x_info, v8f64_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPQ", v4i64x_info, v8i64_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUQ", v4i64x_info, v8i64_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v4i64x_info, v8i64_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v4i64x_info, v8i64_info>;
|
||||
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v2f64x_info, v8f64_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPQ", v2i64x_info, v8i64_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUQ", v2i64x_info, v8i64_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v2i64x_info, v8i64_info>;
|
||||
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v2i64x_info, v8i64_info>;
|
||||
|
||||
defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v8f32x_info, v16f32_info>;
|
||||
defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v4f32x_info, v16f32_info>;
|
||||
defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v4f64x_info, v8f64_info>;
|
||||
defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v2f64x_info, v8f64_info>;
|
||||
}
|
||||
|
||||
let Predicates = [HasBWI, NoVLX] in {
|
||||
// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
|
||||
// increase the pattern complexity the way an immediate would.
|
||||
let AddedComplexity = 2 in {
|
||||
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v32i8x_info, v64i8_info>;
|
||||
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v32i8x_info, v64i8_info>;
|
||||
|
||||
|
@@ -3170,18 +3254,19 @@ let Predicates = [HasBWI, NoVLX] in {

   defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v8i16x_info, v32i16_info>;
   defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v8i16x_info, v32i16_info>;
+  }

-  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPB", v32i8x_info, v64i8_info>;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUB", v32i8x_info, v64i8_info>;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v32i8x_info, v64i8_info>;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v32i8x_info, v64i8_info>;

-  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPB", v16i8x_info, v64i8_info>;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUB", v16i8x_info, v64i8_info>;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v16i8x_info, v64i8_info>;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v16i8x_info, v64i8_info>;

-  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPW", v16i16x_info, v32i16_info>;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUW", v16i16x_info, v32i16_info>;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v16i16x_info, v32i16_info>;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v16i16x_info, v32i16_info>;

-  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPW", v8i16x_info, v32i16_info>;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUW", v8i16x_info, v32i16_info>;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v8i16x_info, v32i16_info>;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v8i16x_info, v32i16_info>;
 }

 // Mask setting all 0s or 1s
@@ -5701,9 +5786,9 @@ multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
 // as commutable here because we already canonicalized all zeros vectors to the
 // RHS during lowering.
 def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
-                         (X86cmpm node:$src1, node:$src2, (i8 0))>;
+                         (setcc node:$src1, node:$src2, SETEQ)>;
 def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
-                         (X86cmpm node:$src1, node:$src2, (i8 4))>;
+                         (setcc node:$src1, node:$src2, SETNE)>;

 multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
                                    PatFrag OpNode, X86SchedWriteWidths sched> :

@@ -174,7 +174,6 @@ def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
 // Hack to make CMPM commutable in tablegen patterns for load folding.
 def X86cmpm_c : SDNode<"X86ISD::CMPM", X86CmpMaskCC, [SDNPCommutative]>;
 def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
-def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
 def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
 def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;

@@ -7874,6 +7874,23 @@ unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
   }
 }

+/// Get the VPCMP immediate for the given condition.
+unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE: return 4;
+  case ISD::SETEQ: return 0;
+  case ISD::SETULT:
+  case ISD::SETLT: return 1;
+  case ISD::SETUGT:
+  case ISD::SETGT: return 6;
+  case ISD::SETUGE:
+  case ISD::SETGE: return 5;
+  case ISD::SETULE:
+  case ISD::SETLE: return 2;
+  }
+}
+
 /// Get the VPCMP immediate if the opcodes are swapped.
 unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
   switch (Imm) {
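At isel time, the SDNodeXForms defined in the .td changes read the condition straight off the setcc node and call this helper to build the immediate. A fragment-level sketch, lifted from the X86pcmpm_imm transform shown earlier (N is the setcc SDNode; not standalone code):

    // Operand 2 of a setcc node is its CondCodeSDNode.
    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
    unsigned Imm = X86::getVPCMPImmForCond(CC); // e.g. ISD::SETULE -> 2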
@@ -18,6 +18,7 @@
 #include "X86InstrFMA3Info.h"
 #include "X86RegisterInfo.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"

 #define GET_INSTRINFO_HEADER
@@ -96,6 +97,9 @@ CondCode getCondFromCMovOpc(unsigned Opc);
 /// e.g. turning COND_E to COND_NE.
 CondCode GetOppositeBranchCondition(CondCode CC);

+/// Get the VPCMP immediate for the given condition.
+unsigned getVPCMPImmForCond(ISD::CondCode CC);
+
 /// Get the VPCMP immediate if the opcodes are swapped.
 unsigned getSwappedVPCMPImm(unsigned Imm);

@@ -1571,8 +1571,7 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
 ; KNL-LABEL: store_v2i1:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1589,8 +1588,7 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
 ; AVX512BW-LABEL: store_v2i1:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vpsllq $63, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, (%rdi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -1622,8 +1620,7 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 ; KNL-LABEL: store_v4i1:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpslld $31, %xmm0, %xmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1640,8 +1637,7 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 ; AVX512BW-LABEL: store_v4i1:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, (%rdi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -1674,8 +1670,7 @@ define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1727,8 +1722,7 @@ define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq

@@ -140,10 +140,9 @@ define <4 x i64> @test_mm256_epi64(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512CD-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512CD-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512CD-NEXT:    kmovw %k0, %eax
-; AVX512CD-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX512CD-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512CD-NEXT:    movzbl %al, %eax
+; AVX512CD-NEXT:    vmovq %rax, %xmm0
+; AVX512CD-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VLCDBW-LABEL: test_mm256_epi64: