From c2696d577b25fdb9cf5ca791527d038f97fa92ab Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Wed, 20 Jun 2018 21:05:02 +0000
Subject: [PATCH] [X86] Use setcc ISD opcode for AVX512 integer comparisons
 all the way to isel

I don't believe there is any real reason to have separate X86-specific
opcodes for vector compares. Setcc has the same behavior; it just uses a
different encoding for the condition code.

I had to change the CondCodeAction for SETLT and SETLE to prevent some
transforms from changing SETGT lowering.

Differential Revision: https://reviews.llvm.org/D43608

llvm-svn: 335173
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp      |   2 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp      |  96 +++---
 llvm/lib/Target/X86/X86ISelLowering.h        |   1 -
 llvm/lib/Target/X86/X86InstrAVX512.td        | 309 ++++++++++++-------
 llvm/lib/Target/X86/X86InstrFragmentsSIMD.td |   1 -
 llvm/lib/Target/X86/X86InstrInfo.cpp         |  17 +
 llvm/lib/Target/X86/X86InstrInfo.h           |   4 +
 llvm/test/CodeGen/X86/avx512-mask-op.ll      |  18 +-
 llvm/test/CodeGen/X86/broadcastm-lowering.ll |   7 +-
 9 files changed, 273 insertions(+), 182 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 41c15b737b21..f7fd221b5170 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -470,7 +470,7 @@ namespace {
   // type.
   static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
     unsigned Opcode = N->getOpcode();
-    if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMU ||
+    if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
         Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
       // We can get 256-bit 8 element types here without VLX being enabled. When
       // this happens we will use 512-bit operations and the mask will not be
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b2ee417c2390..033a343089d3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -814,6 +814,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::CTPOP, VT, Custom);
       setOperationAction(ISD::CTTZ, VT, Custom);
+
+      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+      // setcc all the way to isel and prefer SETGT in some isel patterns.
+      setCondCodeAction(ISD::SETLT, VT, Custom);
+      setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
@@ -1056,6 +1061,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
       setOperationAction(ISD::CTLZ, VT, Custom);
+
+      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+      // setcc all the way to isel and prefer SETGT in some isel patterns.
+      setCondCodeAction(ISD::SETLT, VT, Custom);
+      setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
     if (Subtarget.hasAnyFMA()) {
@@ -1338,6 +1348,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::CTTZ, VT, Custom);
       setOperationAction(ISD::ROTL, VT, Custom);
       setOperationAction(ISD::ROTR, VT, Custom);
+      setOperationAction(ISD::SETCC, VT, Custom);
+
+      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+      // setcc all the way to isel and prefer SETGT in some isel patterns.
+      setCondCodeAction(ISD::SETLT, VT, Custom);
+      setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
     // Need to promote to 64-bit even though we have 32-bit masked instructions
@@ -1551,6 +1567,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::UMAX, VT, Legal);
       setOperationAction(ISD::SMIN, VT, Legal);
       setOperationAction(ISD::UMIN, VT, Legal);
+      setOperationAction(ISD::SETCC, VT, Custom);
 
       setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
       setOperationPromotedToType(ISD::OR,  VT, MVT::v8i64);
@@ -5180,8 +5197,8 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
   default:
     return false;
   case X86ISD::CMPM:
-  case X86ISD::CMPMU:
   case X86ISD::CMPM_RND:
+  case ISD::SETCC:
     return true;
   }
 }
@@ -6978,17 +6995,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
         BOperand = ZeroExtended.getOperand(0);
       else
         BOperand = Ld.getOperand(0).getOperand(0);
-      if (BOperand.getValueType().isVector() &&
-          BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
-        if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
-                                     NumElts == 8)) ||  // for broadcastmb2q
-            (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
-                                     NumElts == 16))) { // for broadcastmw2d
-          SDValue Brdcst =
-              DAG.getNode(X86ISD::VBROADCASTM, dl,
-                          MVT::getVectorVT(EltType, NumElts), BOperand);
-          return DAG.getBitcast(VT, Brdcst);
-        }
+      MVT MaskVT = BOperand.getSimpleValueType();
+      if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||  // for broadcastmb2q
+          (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
+        SDValue Brdcst =
+            DAG.getNode(X86ISD::VBROADCASTM, dl,
+                        MVT::getVectorVT(EltType, NumElts), BOperand);
+        return DAG.getBitcast(VT, Brdcst);
       }
     }
   }
@@ -14757,8 +14770,8 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   int NumElems = VT.getVectorNumElements();
   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
       (Subtarget.hasDQI() && (NumElems < 32)))
-    return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, ExtVT),
-                       Shuffle, DAG.getConstant(6, DL, MVT::i8));
+    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
+                        Shuffle, ISD::SETGT);
   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
 }
@@ -14996,9 +15009,9 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
            "Should have a size-matched integer condition!");
     // Build a mask by testing the condition against zero.
     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
-    SDValue Mask = DAG.getNode(X86ISD::CMPM, dl, MaskVT, Cond,
-                               getZeroVector(VT, Subtarget, DAG, dl),
-                               DAG.getConstant(4, dl, MVT::i8));
+    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
+                                getZeroVector(VT, Subtarget, DAG, dl),
+                                ISD::SETNE);
     // Now return a new VSELECT using the mask.
     return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
   }
@@ -16962,8 +16975,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                        DAG.getConstant(ShiftInx, DL, ExtVT));
       In = DAG.getBitcast(InVT, In);
     }
-    return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, InVT),
-                       In, DAG.getConstant(6, DL, MVT::i8));
+    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
+                        In, ISD::SETGT);
   }
   // Use TESTD/Q, extended vector to packed dword/qword.
   assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
@@ -17010,11 +17023,10 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
   }
 
   // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
   if (Subtarget.hasDQI())
-    return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, InVT),
-                       In, DAG.getConstant(6, DL, MVT::i8));
-  return DAG.getNode(X86ISD::CMPM, DL, VT, In,
-                     getZeroVector(InVT, Subtarget, DAG, DL),
-                     DAG.getConstant(4, DL, MVT::i8));
+    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
+                        In, ISD::SETGT);
+  return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
+                      ISD::SETNE);
 }
 
 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
@@ -18091,28 +18103,13 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
       ISD::isBuildVectorAllZeros(Op0.getNode()))
     std::swap(Op0, Op1);
 
-  bool Swap = false;
-  unsigned SSECC;
-  switch (SetCCOpcode) {
-  default: llvm_unreachable("Unexpected SETCC condition");
-  case ISD::SETNE:  SSECC = 4; break;
-  case ISD::SETEQ:  SSECC = 0; break;
-  case ISD::SETULT: SSECC = 1; break;
-  case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
-  case ISD::SETUGT:
-  case ISD::SETGT:  SSECC = 6; break;
-  case ISD::SETUGE:
-  case ISD::SETGE:  SSECC = 5; break;
-  case ISD::SETULE:
-  case ISD::SETLE:  SSECC = 2; break;
-  }
-  if (Swap)
+  // Prefer SETGT over SETLT.
+  if (SetCCOpcode == ISD::SETLT) {
+    SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
     std::swap(Op0, Op1);
+  }
 
-  unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) ? X86ISD::CMPMU
-                                                      : X86ISD::CMPM;
-  return DAG.getNode(Opc, dl, VT, Op0, Op1,
-                     DAG.getConstant(SSECC, dl, MVT::i8));
+  return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
 }
 
 /// Try to turn a VSETULT into a VSETULE by modifying its second
@@ -20167,7 +20164,6 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
   default: break;
   case X86ISD::CMPM:
   case X86ISD::CMPM_RND:
-  case X86ISD::CMPMU:
   case X86ISD::VPSHUFBITQMB:
   case X86ISD::VFPCLASS:
     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
@@ -22978,8 +22974,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
       SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
       if (VT.is512BitVector()) {
         assert(VT == MVT::v64i8 && "Unexpected element type!");
-        SDValue CMP = DAG.getNode(X86ISD::CMPM, dl, MVT::v64i1, Zeros, R,
-                                  DAG.getConstant(6, dl, MVT::i8));
+        SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R,
+                                   ISD::SETGT);
         return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
       }
       return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
@@ -23499,9 +23495,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
         V0 = DAG.getBitcast(VT, V0);
         V1 = DAG.getBitcast(VT, V1);
         Sel = DAG.getBitcast(VT, Sel);
-        Sel = DAG.getNode(X86ISD::CMPM, dl, MaskVT,
-                          DAG.getConstant(0, dl, VT), Sel,
-                          DAG.getConstant(6, dl, MVT::i8));
+        Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
+                           ISD::SETGT);
         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
       } else if (Subtarget.hasSSE41()) {
         // On SSE41 targets we make use of the fact that VSELECT lowers
@@ -25716,7 +25711,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::COMI:               return "X86ISD::COMI";
   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   case X86ISD::CMPM:               return "X86ISD::CMPM";
-  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
   case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
   case X86ISD::SETCC:              return "X86ISD::SETCC";
   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
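The LowerIntVSETCC_AVX512 rewrite above leans on one invariant: a signed
less-than never reaches isel, because it is rewritten as a greater-than with
swapped operands. A minimal standalone C++ sketch of that canonicalization;
the enum and helpers below are illustrative stand-ins, not the real ISD types:

#include <cassert>
#include <utility>

// Illustrative stand-in for ISD::CondCode; only the codes relevant here.
enum CondCode { SETEQ, SETNE, SETLT, SETGT, SETLE, SETGE };

// Mirrors ISD::getSetCCSwappedOperands for these signed codes.
static CondCode getSetCCSwappedOperands(CondCode CC) {
  switch (CC) {
  case SETLT: return SETGT;
  case SETGT: return SETLT;
  case SETLE: return SETGE;
  case SETGE: return SETLE;
  default:    return CC; // SETEQ/SETNE compare symmetrically.
  }
}

// The canonicalization performed in LowerIntVSETCC_AVX512: rewrite
// "a < b" as "b > a" so isel only ever needs SETGT patterns.
static void preferSETGT(int &LHS, int &RHS, CondCode &CC) {
  if (CC == SETLT) {
    CC = getSetCCSwappedOperands(CC);
    std::swap(LHS, RHS);
  }
}

int main() {
  int A = 1, B = 2;
  CondCode CC = SETLT;
  preferSETGT(A, B, CC);
  assert(CC == SETGT && A == 2 && B == 1); // "1 < 2" became "2 > 1"
  return 0;
}

This is also why the patch marks SETLT/SETLE as Custom via setCondCodeAction:
it keeps generic DAG combines from undoing the SETGT preference.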
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index d09be38c3b85..dc20d1c25666 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -345,7 +345,6 @@ namespace llvm {
       /// Vector comparison generating mask bits for fp and
       /// integer signed and unsigned data types.
       CMPM,
-      CMPMU,
 
       // Vector comparison with rounding mode for FP values
       CMPM_RND,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 2aee9e0977df..80eca2487c62 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2160,7 +2160,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
                 (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
-                                          (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+                                      (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
                 EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
   let isCommutable = IsCommutable in
   def rrk : AVX512BI<opc, MRMSrcReg,
@@ ... @@ multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
 
 // This fragment treats X86cmpm as commutable to help match loads in both
 // operands for PCMPEQ.
+def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>;
 def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2),
-                           (X86cmpm_c node:$src1, node:$src2, (i8 0))>;
+                           (X86setcc_commute node:$src1, node:$src2, SETEQ)>;
 def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
-                         (X86cmpm node:$src1, node:$src2, (i8 6))>;
+                         (setcc node:$src1, node:$src2, SETGT)>;
 
+// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+// increase the pattern complexity the way an immediate would.
+let AddedComplexity = 2 in {
 // FIXME: Is there a better scheduler class for VPCMP?
 defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c,
                                       SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>,
@@ -2277,33 +2281,29 @@ defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
 
 defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
                                           SchedWriteVecALU, avx512vl_i64_info,
                                           HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+}
 
-// Transforms to swizzle an immediate to help matching memory operand in first
-// operand.
-def CommutePCMPCC : SDNodeXForm<imm, [{
-  uint8_t Imm = N->getZExtValue() & 0x7;
-  Imm = X86::getSwappedVPCMPImm(Imm);
-  return getI8Imm(Imm, SDLoc(N));
-}]>;
-
-multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
-                          X86FoldableSchedWrite sched, X86VectorVTInfo _,
-                          string Name> {
+multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
+                          PatFrag CommFrag, X86FoldableSchedWrite sched,
+                          X86VectorVTInfo _, string Name> {
   let isCommutable = 1 in
   def rri : AVX512AIi8<opc, MRMSrcReg,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
              !strconcat("vpcmp${cc}", Suffix,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                       imm:$cc))]>,
+             [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1),
+                                                (_.VT _.RC:$src2),
+                                                cond)))]>,
              EVEX_4V, Sched<[sched]>;
   def rmi : AVX512AIi8<opc, MRMSrcMem,
             (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
             !strconcat("vpcmp${cc}", Suffix,
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-            [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
-                                      (_.VT (bitconvert (_.LdFrag addr:$src2))),
-                                      imm:$cc))]>,
+            [(set _.KRC:$dst, (_.KVT
+                               (Frag:$cc
+                                (_.VT _.RC:$src1),
+                                (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                                cond)))]>,
             EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>;
   let isCommutable = 1 in
   def rrik : AVX512AIi8<opc, MRMSrcReg,
@@ ... @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
                         "\t{$src2, $src1, $dst {${mask}}|",
                         "$dst {${mask}}, $src1, $src2}"),
              [(set _.KRC:$dst, (and _.KRCWM:$mask,
-                                    (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                                            imm:$cc)))]>,
+                                    (_.KVT (Frag:$cc (_.VT _.RC:$src1),
+                                                     (_.VT _.RC:$src2),
+                                                     cond))))]>,
              EVEX_4V, EVEX_K, Sched<[sched]>;
   def rmik : AVX512AIi8<opc, MRMSrcMem,
@@ ... @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
                         "\t{$src2, $src1, $dst {${mask}}|",
                         "$dst {${mask}}, $src1, $src2}"),
              [(set _.KRC:$dst, (and _.KRCWM:$mask,
-                                    (OpNode (_.VT _.RC:$src1),
-                                            (_.VT (bitconvert (_.LdFrag addr:$src2))),
-                                            imm:$cc)))]>,
+                                    (_.KVT
+                                     (Frag:$cc
+                                      (_.VT _.RC:$src1),
+                                      (_.VT (bitconvert
+                                             (_.LdFrag addr:$src2))),
+                                      cond))))]>,
              EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>;
 
   // Accept explicit immediate argument form instead of comparison code.
@@ -2359,31 +2363,34 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
                   NotMemoryFoldable;
   }
 
-  def : Pat<(OpNode (bitconvert (_.LdFrag addr:$src2)),
-                    (_.VT _.RC:$src1), imm:$cc),
-            (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
-             (CommutePCMPCC imm:$cc))>;
+  def : Pat<(_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+                                 (_.VT _.RC:$src1), cond)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmi")
+             _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
 
-  def : Pat<(and _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src2)),
-                                        (_.VT _.RC:$src1), imm:$cc)),
-            (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
-             _.RC:$src1, addr:$src2,
-             (CommutePCMPCC imm:$cc))>;
+  def : Pat<(and _.KRCWM:$mask,
+                 (_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+                                      (_.VT _.RC:$src1), cond))),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmik")
+             _.KRCWM:$mask, _.RC:$src1, addr:$src2,
+             (CommFrag.OperandTransform $cc))>;
 }
 
-multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
-                              X86FoldableSchedWrite sched, X86VectorVTInfo _,
-                              string Name> :
-           avx512_icmp_cc<opc, Suffix, OpNode, sched, _, Name> {
+multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
+                              PatFrag CommFrag, X86FoldableSchedWrite sched,
+                              X86VectorVTInfo _, string Name> :
+           avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched, _, Name> {
   def rmib : AVX512AIi8<opc, MRMSrcMem,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
                                      AVX512ICC:$cc),
             !strconcat("vpcmp${cc}", Suffix,
                        "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
                        "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
-            [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
-                                      (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
-                                      imm:$cc))]>,
+            [(set _.KRC:$dst, (_.KVT (Frag:$cc
+                                      (_.VT _.RC:$src1),
+                                      (X86VBroadcast
+                                       (_.ScalarLdFrag addr:$src2)),
+                                      cond)))]>,
             EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
   def rmibk : AVX512AIi8<opc, MRMSrcMem,
@@ ... @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
                        "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
                        "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
             [(set _.KRC:$dst, (and _.KRCWM:$mask,
-                                   (OpNode (_.VT _.RC:$src1),
-                                           (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
-                                           imm:$cc)))]>,
+                                   (_.KVT (Frag:$cc
+                                           (_.VT _.RC:$src1),
+                                           (X86VBroadcast
+                                            (_.ScalarLdFrag addr:$src2)),
+                                           cond))))]>,
             EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>;
 
   // Accept explicit immediate argument form instead of comparison code.
@@ -2417,77 +2426,118 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
                   NotMemoryFoldable;
   }
 
-  def : Pat<(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
-                    (_.VT _.RC:$src1), imm:$cc),
-            (!cast<Instruction>(Name#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2,
-             (CommutePCMPCC imm:$cc))>;
+  def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+                                 (_.VT _.RC:$src1), cond)),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmib")
+             _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
 
-  def : Pat<(and _.KRCWM:$mask, (OpNode (X86VBroadcast
-                                         (_.ScalarLdFrag addr:$src2)),
-                                        (_.VT _.RC:$src1), imm:$cc)),
-            (!cast<Instruction>(Name#_.ZSuffix#"rmibk") _.KRCWM:$mask,
-             _.RC:$src1, addr:$src2,
-             (CommutePCMPCC imm:$cc))>;
+  def : Pat<(and _.KRCWM:$mask,
+                 (_.KVT (CommFrag:$cc (X86VBroadcast
+                                       (_.ScalarLdFrag addr:$src2)),
+                                      (_.VT _.RC:$src1), cond))),
+            (!cast<Instruction>(Name#_.ZSuffix#"rmibk")
+             _.KRCWM:$mask, _.RC:$src1, addr:$src2,
+             (CommFrag.OperandTransform $cc))>;
 }
 
-multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
-                             X86SchedWriteWidths sched,
+multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag,
+                             PatFrag CommFrag, X86SchedWriteWidths sched,
                              AVX512VLVectorVTInfo VTInfo, Predicate prd> {
   let Predicates = [prd] in
-  defm Z : avx512_icmp_cc<opc, Suffix, OpNode, sched.ZMM, VTInfo.info512, NAME>,
-           EVEX_V512;
+  defm Z : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.ZMM,
+                          VTInfo.info512, NAME>, EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, sched.YMM, VTInfo.info256, NAME>,
-                EVEX_V256;
-    defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, sched.XMM, VTInfo.info128, NAME>,
-                EVEX_V128;
+    defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.YMM,
+                               VTInfo.info256, NAME>, EVEX_V256;
+    defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.XMM,
+                               VTInfo.info128, NAME>, EVEX_V128;
   }
 }
 
-multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
-                                 X86SchedWriteWidths sched,
+multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, PatFrag Frag,
+                                 PatFrag CommFrag, X86SchedWriteWidths sched,
                                  AVX512VLVectorVTInfo VTInfo, Predicate prd> {
   let Predicates = [prd] in
-  defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, sched.ZMM,
-                              VTInfo.info512, NAME>, EVEX_V512;
+  defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.ZMM,
+                              VTInfo.info512, NAME>, EVEX_V512;
 
   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, sched.YMM,
-                                   VTInfo.info256, NAME>, EVEX_V256;
-    defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, sched.XMM,
-                                   VTInfo.info128, NAME>, EVEX_V128;
+    defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.YMM,
+                                   VTInfo.info256, NAME>, EVEX_V256;
+    defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.XMM,
+                                   VTInfo.info128, NAME>, EVEX_V128;
   }
 }
 
-// FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, SchedWriteVecALU,
-                                avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
-defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, SchedWriteVecALU,
-                                 avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
+def X86pcmpm_imm : SDNodeXForm<setcc, [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+  return getI8Imm(SSECC, SDLoc(N));
+}]>;
 
-defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, SchedWriteVecALU,
-                                avx512vl_i16_info, HasBWI>,
+// Swapped operand version of the above.
+def X86pcmpm_imm_commute : SDNodeXForm<setcc, [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  uint8_t SSECC = X86::getVPCMPImmForCond(CC);
+  SSECC = X86::getSwappedVPCMPImm(SSECC);
+  return getI8Imm(SSECC, SDLoc(N));
+}]>;
+
+def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                       (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  return !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+// Same as above, but commutes immediate. Use for load folding.
+def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                               (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  return !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
+def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                        (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  return ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
+// Same as above, but commutes immediate. Use for load folding.
+def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                                (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  return ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
+// FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_commute,
+                                SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+                                EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_commute,
+                                 SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
+                                 EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_commute,
+                                SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
                                 VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, SchedWriteVecALU,
-                                 avx512vl_i16_info, HasBWI>,
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_commute,
+                                 SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
                                  VEX_W, EVEX_CD8<16, CD8VF>;
 
-defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, SchedWriteVecALU,
-                                    avx512vl_i32_info, HasAVX512>,
-                                    EVEX_CD8<32, CD8VF>;
-defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, SchedWriteVecALU,
-                                     avx512vl_i32_info, HasAVX512>,
-                                     EVEX_CD8<32, CD8VF>;
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_commute,
                                    SchedWriteVecALU, avx512vl_i32_info,
+                                    HasAVX512>, EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_commute,
+                                     SchedWriteVecALU, avx512vl_i32_info,
+                                     HasAVX512>, EVEX_CD8<32, CD8VF>;
 
-defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, SchedWriteVecALU,
-                                    avx512vl_i64_info, HasAVX512>,
-                                    VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, SchedWriteVecALU,
-                                     avx512vl_i64_info, HasAVX512>,
-                                     VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_commute,
+                                    SchedWriteVecALU, avx512vl_i64_info,
+                                    HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_commute,
+                                     SchedWriteVecALU, avx512vl_i64_info,
+                                     HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
 
 multiclass avx512_vcmp_common<X86VectorVTInfo _> {
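The X86pcmpm and X86pcmpum fragments above steer one and the same setcc node
to the VPCMP or VPCMPU family purely on the signedness of the condition code.
A rough self-contained C++ sketch of that selection, assuming a hypothetical
CondCode enum in place of ISD::CondCode:

#include <cassert>
#include <string>

// Illustrative stand-in for ISD::CondCode.
enum CondCode { SETEQ, SETNE, SETLT, SETGT, SETLE, SETGE,
                SETULT, SETUGT, SETULE, SETUGE };

// Mirrors ISD::isUnsignedIntSetCC: true only for the SETU* codes.
static bool isUnsignedIntSetCC(CondCode CC) {
  return CC == SETULT || CC == SETUGT || CC == SETULE || CC == SETUGE;
}

// X86pcmpm accepts signed (and sign-agnostic) codes, X86pcmpum the
// unsigned ones; the two predicates partition every condition code.
static std::string compareFamily(CondCode CC) {
  return isUnsignedIntSetCC(CC) ? "VPCMPU" : "VPCMP";
}

int main() {
  assert(compareFamily(SETGT) == "VPCMP");
  assert(compareFamily(SETULE) == "VPCMPU");
  assert(compareFamily(SETEQ) == "VPCMP"); // equality is sign-agnostic
  return 0;
}

Because the predicates are mutually exclusive, no explicit priority between
the VPCMP and VPCMPU patterns is needed; only the SETEQ/SETGT patterns need
the AddedComplexity bump noted earlier.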
@@ -3085,6 +3135,7 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
 defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShuffle>;
 defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>;
 
+// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction.
 multiclass axv512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr,
                                               X86VectorVTInfo Narrow,
                                               X86VectorVTInfo Wide> {
@@ -3107,9 +3158,34 @@ multiclass axv512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr,
                              Narrow.KRC)>;
 }
 
-multiclass axv512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
-                                                 X86VectorVTInfo Narrow,
-                                                 X86VectorVTInfo Wide> {
+multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, string InstStr,
+                                                 X86VectorVTInfo Narrow,
+                                                 X86VectorVTInfo Wide> {
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+                                (Narrow.VT Narrow.RC:$src2), cond)),
+          (COPY_TO_REGCLASS
+           (!cast<Instruction>(InstStr##Zrri)
+            (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+            (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+            (Frag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+                           (Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+                                                 (Narrow.VT Narrow.RC:$src2),
+                                                 cond)))),
+          (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+                             (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+                             (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+                             (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
+                             (Frag.OperandTransform $cc)), Narrow.KRC)>;
+}
+
+// Same as above, but for fp types which don't use PatFrags.
+multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
+                                                X86VectorVTInfo Narrow,
+                                                X86VectorVTInfo Wide> {
 def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1),
                               (Narrow.VT Narrow.RC:$src2), imm:$cc)),
           (COPY_TO_REGCLASS
@@ -3129,6 +3205,9 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
 }
 
 let Predicates = [HasAVX512, NoVLX] in {
+  // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+  // increase the pattern complexity the way an immediate would.
+  let AddedComplexity = 2 in {
   defm : axv512_icmp_packed_no_vlx_lowering;
   defm : axv512_icmp_packed_no_vlx_lowering;
@@ -3140,25 +3219,30 @@ let Predicates = [HasAVX512, NoVLX] in {
   defm : axv512_icmp_packed_no_vlx_lowering;
   defm : axv512_icmp_packed_no_vlx_lowering;
+  }
 
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
 
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
 
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
 
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+
+  defm : axv512_cmp_packed_cc_no_vlx_lowering;
+  defm : axv512_cmp_packed_cc_no_vlx_lowering;
+  defm : axv512_cmp_packed_cc_no_vlx_lowering;
+  defm : axv512_cmp_packed_cc_no_vlx_lowering;
 }
 
 let Predicates = [HasBWI, NoVLX] in {
+  // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
+  // increase the pattern complexity the way an immediate would.
+  let AddedComplexity = 2 in {
   defm : axv512_icmp_packed_no_vlx_lowering;
   defm : axv512_icmp_packed_no_vlx_lowering;
@@ -3170,18 +3254,19 @@ let Predicates = [HasBWI, NoVLX] in {
   defm : axv512_icmp_packed_no_vlx_lowering;
   defm : axv512_icmp_packed_no_vlx_lowering;
+  }
 
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
 
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
 
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
 
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
-  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
+  defm : axv512_icmp_packed_cc_no_vlx_lowering;
 }
 
 // Mask setting all 0s or 1s
@@ -5701,9 +5786,9 @@ multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
 // as commutable here because we already canonicalized all zeros vectors to the
 // RHS during lowering.
 def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
-                         (X86cmpm node:$src1, node:$src2, (i8 0))>;
+                         (setcc node:$src1, node:$src2, SETEQ)>;
 def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
-                         (X86cmpm node:$src1, node:$src2, (i8 4))>;
+                         (setcc node:$src1, node:$src2, SETNE)>;
 
 multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
                                    PatFrag OpNode, X86SchedWriteWidths sched> :
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 91b2a568f4de..1f4247bcb46d 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -174,7 +174,6 @@ def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
 // Hack to make CMPM commutable in tablegen patterns for load folding.
 def X86cmpm_c : SDNode<"X86ISD::CMPM", X86CmpMaskCC, [SDNPCommutative]>;
 def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
-def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
 def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
 def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index e618f7cbd48c..cfa25731c406 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7874,6 +7874,23 @@ unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
   }
 }
 
+/// Get the VPCMP immediate for the given condition.
+unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:  return 4;
+  case ISD::SETEQ:  return 0;
+  case ISD::SETULT:
+  case ISD::SETLT:  return 1;
+  case ISD::SETUGT:
+  case ISD::SETGT:  return 6;
+  case ISD::SETUGE:
+  case ISD::SETGE:  return 5;
+  case ISD::SETULE:
+  case ISD::SETLE:  return 2;
+  }
+}
+
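getVPCMPImmForCond together with the pre-existing X86::getSwappedVPCMPImm is
what the X86pcmpm_imm and X86pcmpm_imm_commute transforms reduce to at isel
time. Below is a self-contained C++ sketch of the two tables and their round
trip; the swap values follow from exchanging the compare operands, and the
helper names are illustrative rather than LLVM's:

#include <cassert>

// Illustrative stand-in for ISD::CondCode (signed codes only).
enum CondCode { SETEQ, SETNE, SETLT, SETGT, SETLE, SETGE };

// Same mapping as the getVPCMPImmForCond switch above:
// 0=EQ, 1=LT, 2=LE, 4=NE, 5=NLT (>=), 6=NLE (>).
static unsigned vpcmpImmForCond(CondCode CC) {
  switch (CC) {
  case SETEQ: return 0;
  case SETLT: return 1;
  case SETLE: return 2;
  case SETNE: return 4;
  case SETGE: return 5;
  case SETGT: return 6;
  }
  return 0; // unreachable
}

// Exchanging the compare operands turns LT into NLE and LE into NLT
// (and back); EQ and NE are unaffected. X86pcmpm_imm_commute applies
// exactly this when a load has to fold into the second operand.
static unsigned swappedVPCMPImm(unsigned Imm) {
  switch (Imm) {
  case 1: return 6; // LT  -> NLE
  case 2: return 5; // LE  -> NLT
  case 5: return 2; // NLT -> LE
  case 6: return 1; // NLE -> LT
  default: return Imm; // EQ (0) and NE (4) are symmetric
  }
}

int main() {
  // "a < b" with the memory operand moved first becomes "b > a":
  assert(swappedVPCMPImm(vpcmpImmForCond(SETLT)) == vpcmpImmForCond(SETGT));
  // Swapping twice restores the original immediate.
  assert(swappedVPCMPImm(swappedVPCMPImm(vpcmpImmForCond(SETLE))) == 2);
  return 0;
}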
 /// Get the VPCMP immediate if the opcodes are swapped.
 unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
   switch (Imm) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 79f74108694d..8d2c592be43a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -18,6 +18,7 @@
 #include "X86InstrFMA3Info.h"
 #include "X86RegisterInfo.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
@@ -96,6 +97,9 @@ CondCode getCondFromCMovOpc(unsigned Opc);
 /// e.g. turning COND_E to COND_NE.
 CondCode GetOppositeBranchCondition(CondCode CC);
 
+/// Get the VPCMP immediate for the given condition.
+unsigned getVPCMPImmForCond(ISD::CondCode CC);
+
 /// Get the VPCMP immediate if the opcodes are swapped.
 unsigned getSwappedVPCMPImm(unsigned Imm);
 
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 3d7545313ef7..a99b8bfc6e89 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1571,8 +1571,7 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
 ; KNL-LABEL: store_v2i1:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1589,8 +1588,7 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
 ; AVX512BW-LABEL: store_v2i1:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vpsllq $63, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, (%rdi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -1622,8 +1620,7 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 ; KNL-LABEL: store_v4i1:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpslld $31, %xmm0, %xmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1640,8 +1637,7 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 ; AVX512BW-LABEL: store_v4i1:
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movb %al, (%rdi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -1674,8 +1670,7 @@ define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1727,8 +1722,7 @@ define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/broadcastm-lowering.ll b/llvm/test/CodeGen/X86/broadcastm-lowering.ll
index 243ff145c181..f8d670154882 100644
--- a/llvm/test/CodeGen/X86/broadcastm-lowering.ll
+++ b/llvm/test/CodeGen/X86/broadcastm-lowering.ll
@@ -140,10 +140,9 @@ define <4 x i64> @test_mm256_epi64(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512CD-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512CD-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512CD-NEXT:    kmovw %k0, %eax
-; AVX512CD-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX512CD-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512CD-NEXT:    movzbl %al, %eax
+; AVX512CD-NEXT:    vmovq %rax, %xmm0
+; AVX512CD-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VLCDBW-LABEL: test_mm256_epi64: