diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index bfd8c89599ba..2ebaec778e32 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9882,11 +9882,7 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); - // We have to cast V2 around. - MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, - DAG.getBitcast(MaskVT, V1Mask), - DAG.getBitcast(MaskVT, V2))); + V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } @@ -35055,13 +35051,13 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); - EVT VT = N->getValueType(0); - if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64) + MVT VT = N->getSimpleValueType(0); + if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); SDValue X, Y; - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); + SDValue N0 = peekThroughBitcasts(N->getOperand(0)); + SDValue N1 = peekThroughBitcasts(N->getOperand(1)); if (N0.getOpcode() == ISD::XOR && ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) { X = N0.getOperand(0); @@ -35073,6 +35069,8 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { } else return SDValue(); + X = DAG.getBitcast(VT, X); + Y = DAG.getBitcast(VT, Y); return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y); } @@ -35402,27 +35400,6 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp); } -// This promotes vectors and/or/xor to a vXi64 type. We used to do this during -// op legalization, but DAG combine yields better results. -// TODO: This is largely just to reduce the number of isel patterns. Maybe we -// can just add all the patterns or do C++ based selection in X86ISelDAGToDAG? -static SDValue promoteVecLogicOp(SDNode *N, SelectionDAG &DAG) { - MVT VT = N->getSimpleValueType(0); - - if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) - return SDValue(); - - // Already correct type. - if (VT.getVectorElementType() == MVT::i64) - return SDValue(); - - MVT NewVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - SDValue Op0 = DAG.getBitcast(NewVT, N->getOperand(0)); - SDValue Op1 = DAG.getBitcast(NewVT, N->getOperand(1)); - return DAG.getBitcast(VT, DAG.getNode(N->getOpcode(), SDLoc(N), NewVT, - Op0, Op1)); -} - static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -35457,9 +35434,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (SDValue V = promoteVecLogicOp(N, DAG)) - return V; - if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; @@ -35647,7 +35621,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, if (!Subtarget.hasSSE41()) return SDValue(); - MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; + MVT BlendVT = VT.is256BitVector() ? 
MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); @@ -35782,9 +35756,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (SDValue V = promoteVecLogicOp(N, DAG)) - return V; - if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; @@ -37760,7 +37731,9 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, if ((VT.isVector() || VT == MVT::f128) && Subtarget.hasSSE2()) { SDLoc dl(N); - MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + unsigned IntBits = std::min(VT.getScalarSizeInBits(), 64U); + MVT IntSVT = MVT::getIntegerVT(IntBits); + MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits); SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0)); SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); @@ -37813,9 +37786,6 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (SDValue V = promoteVecLogicOp(N, DAG)) - return V; - if (SDValue SetCC = foldXor1SetCC(N, DAG)) return SetCC; @@ -38043,15 +38013,22 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + MVT VT = N->getSimpleValueType(0); + // ANDNP(0, x) -> x if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) return N->getOperand(1); // ANDNP(x, 0) -> 0 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) - return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N)); + return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); - EVT VT = N->getValueType(0); + // Turn ANDNP back to AND if input is inverted. + if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) { + return DAG.getNode(ISD::AND, SDLoc(N), VT, + N->getOperand(0).getOperand(0), N->getOperand(1)); + } // Attempt to recursively combine a bitmask ANDNP with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 5550eb0061fa..ec314f329fdd 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -68,13 +68,6 @@ class X86VectorVTInfo("load" # VTName); - PatFrag i64LdFrag = !cast("load" # - !if (!eq (TypeVariantName, "i"), - !if (!eq (Size, 128), "v2i64", - !if (!eq (Size, 256), "v4i64", - !if (!eq (Size, 512), "v8i64", - VTName))), VTName)); - PatFrag AlignedLdFrag = !cast("alignedload" # VTName); PatFrag ScalarLdFrag = !cast("load" # EltVT); @@ -102,10 +95,6 @@ class X86VectorVTInfo("v" # !srl(Size, 6) # "i64"); - // A vector type of the same width with element type i32. This is used to // create the canonical constant zero node ImmAllZerosV. ValueType i32VT = !cast("v" # !srl(Size, 5) # "i32"); @@ -5094,152 +5083,147 @@ let Predicates = [HasAVX512, NoVLX] in { // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// -// OpNodeMsk is the OpNode to use when element size is important. OpNode will -// be set to null_frag for 32-bit elements. 
-multiclass avx512_logic_rm opc, string OpcodeStr, - SDPatternOperator OpNode, - SDNode OpNodeMsk, X86FoldableSchedWrite sched, - X86VectorVTInfo _, bit IsCommutable = 0> { - let hasSideEffects = 0 in - defm rr : AVX512_maskable_logic, AVX512BIBase, EVEX_4V, - Sched<[sched]>; - - let hasSideEffects = 0, mayLoad = 1 in - defm rm : AVX512_maskable_logic, - AVX512BIBase, EVEX_4V, - Sched<[sched.Folded, sched.ReadAfterFold]>; -} - -// OpNodeMsk is the OpNode to use where element size is important. So use -// for all of the broadcast patterns. -multiclass avx512_logic_rmb opc, string OpcodeStr, - SDPatternOperator OpNode, - SDNode OpNodeMsk, X86FoldableSchedWrite sched, X86VectorVTInfo _, - bit IsCommutable = 0> : - avx512_logic_rm { - defm rmb : AVX512_maskable_logic, - AVX512BIBase, EVEX_4V, EVEX_B, - Sched<[sched.Folded, sched.ReadAfterFold]>; -} - -multiclass avx512_logic_rmb_vl opc, string OpcodeStr, - SDPatternOperator OpNode, - SDNode OpNodeMsk, X86SchedWriteWidths sched, - AVX512VLVectorVTInfo VTInfo, - bit IsCommutable = 0> { - let Predicates = [HasAVX512] in - defm Z : avx512_logic_rmb, EVEX_V512; - - let Predicates = [HasAVX512, HasVLX] in { - defm Z256 : avx512_logic_rmb, EVEX_V256; - defm Z128 : avx512_logic_rmb, EVEX_V128; - } -} - -multiclass avx512_logic_rm_vl_dq opc_d, bits<8> opc_q, string OpcodeStr, - SDNode OpNode, X86SchedWriteWidths sched, - bit IsCommutable = 0> { - defm Q : avx512_logic_rmb_vl, - VEX_W, EVEX_CD8<64, CD8VF>; - defm D : avx512_logic_rmb_vl, - EVEX_CD8<32, CD8VF>; -} - -defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, - SchedWriteVecLogic, 1>; -defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, - SchedWriteVecLogic, 1>; -defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, - SchedWriteVecLogic, 1>; -defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, - SchedWriteVecLogic>; +defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and, + SchedWriteVecLogic, HasAVX512, 1>; +defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or, + SchedWriteVecLogic, HasAVX512, 1>; +defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, + SchedWriteVecLogic, HasAVX512, 1>; +defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, + SchedWriteVecLogic, HasAVX512>; let Predicates = [HasVLX] in { def : Pat<(v16i8 (and VR128X:$src1, VR128X:$src2)), (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>; def : Pat<(v8i16 (and VR128X:$src1, VR128X:$src2)), (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>; - def : Pat<(v4i32 (and VR128X:$src1, VR128X:$src2)), - (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>; def : Pat<(v16i8 (or VR128X:$src1, VR128X:$src2)), (VPORQZ128rr VR128X:$src1, VR128X:$src2)>; def : Pat<(v8i16 (or VR128X:$src1, VR128X:$src2)), (VPORQZ128rr VR128X:$src1, VR128X:$src2)>; - def : Pat<(v4i32 (or VR128X:$src1, VR128X:$src2)), - (VPORQZ128rr VR128X:$src1, VR128X:$src2)>; def : Pat<(v16i8 (xor VR128X:$src1, VR128X:$src2)), (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>; def : Pat<(v8i16 (xor VR128X:$src1, VR128X:$src2)), (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>; - def : Pat<(v4i32 (xor VR128X:$src1, VR128X:$src2)), - (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>; def : Pat<(v16i8 (X86andnp VR128X:$src1, VR128X:$src2)), (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>; def : Pat<(v8i16 (X86andnp VR128X:$src1, VR128X:$src2)), (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>; - def : Pat<(v4i32 (X86andnp VR128X:$src1, VR128X:$src2)), - (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>; + + def : Pat<(and VR128X:$src1, (loadv16i8 addr:$src2)), + 
(VPANDQZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(and VR128X:$src1, (loadv8i16 addr:$src2)), + (VPANDQZ128rm VR128X:$src1, addr:$src2)>; + + def : Pat<(or VR128X:$src1, (loadv16i8 addr:$src2)), + (VPORQZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(or VR128X:$src1, (loadv8i16 addr:$src2)), + (VPORQZ128rm VR128X:$src1, addr:$src2)>; + + def : Pat<(xor VR128X:$src1, (loadv16i8 addr:$src2)), + (VPXORQZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(xor VR128X:$src1, (loadv8i16 addr:$src2)), + (VPXORQZ128rm VR128X:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR128X:$src1, (loadv16i8 addr:$src2)), + (VPANDNQZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128X:$src1, (loadv8i16 addr:$src2)), + (VPANDNQZ128rm VR128X:$src1, addr:$src2)>; + + def : Pat<(and VR128X:$src1, + (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDDZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(or VR128X:$src1, + (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPORDZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(xor VR128X:$src1, + (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPXORDZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128X:$src1, + (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDNDZ128rmb VR128X:$src1, addr:$src2)>; + + def : Pat<(and VR128X:$src1, + (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDQZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(or VR128X:$src1, + (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPORQZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(xor VR128X:$src1, + (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPXORQZ128rmb VR128X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128X:$src1, + (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDNQZ128rmb VR128X:$src1, addr:$src2)>; def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)), (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>; def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)), (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>; - def : Pat<(v8i32 (and VR256X:$src1, VR256X:$src2)), - (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>; def : Pat<(v32i8 (or VR256X:$src1, VR256X:$src2)), (VPORQZ256rr VR256X:$src1, VR256X:$src2)>; def : Pat<(v16i16 (or VR256X:$src1, VR256X:$src2)), (VPORQZ256rr VR256X:$src1, VR256X:$src2)>; - def : Pat<(v8i32 (or VR256X:$src1, VR256X:$src2)), - (VPORQZ256rr VR256X:$src1, VR256X:$src2)>; def : Pat<(v32i8 (xor VR256X:$src1, VR256X:$src2)), (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>; def : Pat<(v16i16 (xor VR256X:$src1, VR256X:$src2)), (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>; - def : Pat<(v8i32 (xor VR256X:$src1, VR256X:$src2)), - (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>; def : Pat<(v32i8 (X86andnp VR256X:$src1, VR256X:$src2)), (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>; def : Pat<(v16i16 (X86andnp VR256X:$src1, VR256X:$src2)), (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>; - def : Pat<(v8i32 (X86andnp VR256X:$src1, VR256X:$src2)), - (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>; + + def : Pat<(and VR256X:$src1, (loadv32i8 addr:$src2)), + (VPANDQZ256rm VR256X:$src1, addr:$src2)>; + def : Pat<(and VR256X:$src1, (loadv16i16 addr:$src2)), + (VPANDQZ256rm VR256X:$src1, addr:$src2)>; + + def : Pat<(or VR256X:$src1, (loadv32i8 addr:$src2)), + (VPORQZ256rm VR256X:$src1, addr:$src2)>; + def : Pat<(or VR256X:$src1, (loadv16i16 addr:$src2)), + (VPORQZ256rm VR256X:$src1, addr:$src2)>; + + def : Pat<(xor VR256X:$src1, (loadv32i8 addr:$src2)), + (VPXORQZ256rm VR256X:$src1, addr:$src2)>; + def : 
Pat<(xor VR256X:$src1, (loadv16i16 addr:$src2)), + (VPXORQZ256rm VR256X:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR256X:$src1, (loadv32i8 addr:$src2)), + (VPANDNQZ256rm VR256X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256X:$src1, (loadv16i16 addr:$src2)), + (VPANDNQZ256rm VR256X:$src1, addr:$src2)>; + + def : Pat<(and VR256X:$src1, + (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDDZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(or VR256X:$src1, + (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPORDZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(xor VR256X:$src1, + (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPXORDZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256X:$src1, + (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDNDZ256rmb VR256X:$src1, addr:$src2)>; + + def : Pat<(and VR256X:$src1, + (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDQZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(or VR256X:$src1, + (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPORQZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(xor VR256X:$src1, + (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPXORQZ256rmb VR256X:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256X:$src1, + (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDNQZ256rmb VR256X:$src1, addr:$src2)>; } let Predicates = [HasAVX512] in { @@ -5247,31 +5231,209 @@ let Predicates = [HasAVX512] in { (VPANDQZrr VR512:$src1, VR512:$src2)>; def : Pat<(v32i16 (and VR512:$src1, VR512:$src2)), (VPANDQZrr VR512:$src1, VR512:$src2)>; - def : Pat<(v16i32 (and VR512:$src1, VR512:$src2)), - (VPANDQZrr VR512:$src1, VR512:$src2)>; def : Pat<(v64i8 (or VR512:$src1, VR512:$src2)), (VPORQZrr VR512:$src1, VR512:$src2)>; def : Pat<(v32i16 (or VR512:$src1, VR512:$src2)), (VPORQZrr VR512:$src1, VR512:$src2)>; - def : Pat<(v16i32 (or VR512:$src1, VR512:$src2)), - (VPORQZrr VR512:$src1, VR512:$src2)>; def : Pat<(v64i8 (xor VR512:$src1, VR512:$src2)), (VPXORQZrr VR512:$src1, VR512:$src2)>; def : Pat<(v32i16 (xor VR512:$src1, VR512:$src2)), (VPXORQZrr VR512:$src1, VR512:$src2)>; - def : Pat<(v16i32 (xor VR512:$src1, VR512:$src2)), - (VPXORQZrr VR512:$src1, VR512:$src2)>; def : Pat<(v64i8 (X86andnp VR512:$src1, VR512:$src2)), (VPANDNQZrr VR512:$src1, VR512:$src2)>; def : Pat<(v32i16 (X86andnp VR512:$src1, VR512:$src2)), (VPANDNQZrr VR512:$src1, VR512:$src2)>; - def : Pat<(v16i32 (X86andnp VR512:$src1, VR512:$src2)), - (VPANDNQZrr VR512:$src1, VR512:$src2)>; + + def : Pat<(and VR512:$src1, (loadv64i8 addr:$src2)), + (VPANDQZrm VR512:$src1, addr:$src2)>; + def : Pat<(and VR512:$src1, (loadv32i16 addr:$src2)), + (VPANDQZrm VR512:$src1, addr:$src2)>; + + def : Pat<(or VR512:$src1, (loadv64i8 addr:$src2)), + (VPORQZrm VR512:$src1, addr:$src2)>; + def : Pat<(or VR512:$src1, (loadv32i16 addr:$src2)), + (VPORQZrm VR512:$src1, addr:$src2)>; + + def : Pat<(xor VR512:$src1, (loadv64i8 addr:$src2)), + (VPXORQZrm VR512:$src1, addr:$src2)>; + def : Pat<(xor VR512:$src1, (loadv32i16 addr:$src2)), + (VPXORQZrm VR512:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR512:$src1, (loadv64i8 addr:$src2)), + (VPANDNQZrm VR512:$src1, addr:$src2)>; + def : Pat<(X86andnp VR512:$src1, (loadv32i16 addr:$src2)), + (VPANDNQZrm VR512:$src1, addr:$src2)>; + + def : Pat<(and VR512:$src1, + (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDDZrmb VR512:$src1, addr:$src2)>; + def : Pat<(or VR512:$src1, + (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 
addr:$src2))))), + (VPORDZrmb VR512:$src1, addr:$src2)>; + def : Pat<(xor VR512:$src1, + (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPXORDZrmb VR512:$src1, addr:$src2)>; + def : Pat<(X86andnp VR512:$src1, + (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), + (VPANDNDZrmb VR512:$src1, addr:$src2)>; + + def : Pat<(and VR512:$src1, + (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDQZrmb VR512:$src1, addr:$src2)>; + def : Pat<(or VR512:$src1, + (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPORQZrmb VR512:$src1, addr:$src2)>; + def : Pat<(xor VR512:$src1, + (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPXORQZrmb VR512:$src1, addr:$src2)>; + def : Pat<(X86andnp VR512:$src1, + (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), + (VPANDNQZrmb VR512:$src1, addr:$src2)>; } +// Patterns to catch vselect with different type than logic op. +multiclass avx512_logical_lowering { + // Masked register-register logical operations. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))), + _.RC:$src0)), + (!cast(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask, + _.RC:$src1, _.RC:$src2)>; + + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))), + _.ImmAllZerosV)), + (!cast(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1, + _.RC:$src2)>; + + // Masked register-memory logical operations. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (IntInfo.VT (OpNode _.RC:$src1, + (load addr:$src2)))), + _.RC:$src0)), + (!cast(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask, + _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (IntInfo.VT (OpNode _.RC:$src1, + (load addr:$src2)))), + _.ImmAllZerosV)), + (!cast(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1, + addr:$src2)>; +} + +multiclass avx512_logical_lowering_bcast { + // Register-broadcast logical operations. 
+ def : Pat<(IntInfo.VT (OpNode _.RC:$src1, + (bitconvert (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))), + (!cast(InstrStr#rmb) _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert + (IntInfo.VT (OpNode _.RC:$src1, + (bitconvert (_.VT + (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))))), + _.RC:$src0)), + (!cast(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, + _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert + (IntInfo.VT (OpNode _.RC:$src1, + (bitconvert (_.VT + (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))))), + _.ImmAllZerosV)), + (!cast(InstrStr#rmbkz) _.KRCWM:$mask, + _.RC:$src1, addr:$src2)>; +} + +multiclass avx512_logical_lowering_sizes { +let Predicates = [HasVLX] in { + defm : avx512_logical_lowering; + defm : avx512_logical_lowering; +} +let Predicates = [HasAVX512] in { + defm : avx512_logical_lowering; +} +} + +multiclass avx512_logical_lowering_sizes_bcast { +let Predicates = [HasVLX] in { + defm : avx512_logical_lowering_bcast; + defm : avx512_logical_lowering_bcast; +} +let Predicates = [HasAVX512] in { + defm : avx512_logical_lowering_bcast; +} +} + +multiclass avx512_logical_lowering_types { + // i64 vselect with i32/i16/i8 logic op + defm : avx512_logical_lowering_sizes; + defm : avx512_logical_lowering_sizes; + defm : avx512_logical_lowering_sizes; + + // i32 vselect with i64/i16/i8 logic op + defm : avx512_logical_lowering_sizes; + defm : avx512_logical_lowering_sizes; + defm : avx512_logical_lowering_sizes; + + // f32 vselect with i64/i32/i16/i8 logic op + defm : avx512_logical_lowering_sizes; + defm : avx512_logical_lowering_sizes; + defm : avx512_logical_lowering_sizes; + defm : avx512_logical_lowering_sizes; + + // f64 vselect with i64/i32/i16/i8 logic op + defm : avx512_logical_lowering_sizes; + defm : avx512_logical_lowering_sizes; + defm : avx512_logical_lowering_sizes; + defm : avx512_logical_lowering_sizes; + + defm : avx512_logical_lowering_sizes_bcast; + defm : avx512_logical_lowering_sizes_bcast; +} + +defm : avx512_logical_lowering_types<"VPAND", and>; +defm : avx512_logical_lowering_types<"VPOR", or>; +defm : avx512_logical_lowering_types<"VPXOR", xor>; +defm : avx512_logical_lowering_types<"VPANDN", X86andnp>; + //===----------------------------------------------------------------------===// // AVX-512 FP arithmetic //===----------------------------------------------------------------------===// @@ -5575,73 +5737,6 @@ defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI, defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI, SchedWriteFLogicSizes, 1>; -// Patterns catch floating point selects with bitcasted integer logic ops. -multiclass avx512_fp_logical_lowering { -let Predicates = [prd] in { - // Masked register-register logical operations. - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))), - _.RC:$src0)), - (!cast(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask, - _.RC:$src1, _.RC:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))), - _.ImmAllZerosV)), - (!cast(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1, - _.RC:$src2)>; - // Masked register-memory logical operations. 
- def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert (_.i64VT (OpNode _.RC:$src1, - (load addr:$src2)))), - _.RC:$src0)), - (!cast(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask, - _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert (_.i64VT (OpNode _.RC:$src1, (load addr:$src2)))), - _.ImmAllZerosV)), - (!cast(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1, - addr:$src2)>; - // Register-broadcast logical operations. - def : Pat<(_.i64VT (OpNode _.RC:$src1, - (bitconvert (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))), - (!cast(InstrStr#rmb) _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert - (_.i64VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), - _.RC:$src0)), - (!cast(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, - _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (bitconvert - (_.i64VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), - _.ImmAllZerosV)), - (!cast(InstrStr#rmbkz) _.KRCWM:$mask, - _.RC:$src1, addr:$src2)>; -} -} - -multiclass avx512_fp_logical_lowering_sizes { - defm : avx512_fp_logical_lowering; - defm : avx512_fp_logical_lowering; - defm : avx512_fp_logical_lowering; - defm : avx512_fp_logical_lowering; - defm : avx512_fp_logical_lowering; - defm : avx512_fp_logical_lowering; -} - -defm : avx512_fp_logical_lowering_sizes<"VPAND", and>; -defm : avx512_fp_logical_lowering_sizes<"VPOR", or>; -defm : avx512_fp_logical_lowering_sizes<"VPXOR", xor>; -defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>; - let Predicates = [HasVLX,HasDQI] in { // Use packed logical operations for scalar ops. def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)), @@ -5771,15 +5866,12 @@ multiclass avx512_vptest opc, string OpcodeStr, PatFrag OpNode, defm rr : AVX512_maskable_cmp, + (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)>, EVEX_4V, Sched<[sched]>; defm rm : AVX512_maskable_cmp, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -5813,7 +5905,7 @@ multiclass avx512_vptest_mb opc, string OpcodeStr, PatFrag OpNode, // Use 512bit version to implement 128/256 bit in case NoVLX. 
multiclass avx512_vptest_lowering { - def : Pat<(_.KVT (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))), + def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)), (_.KVT (COPY_TO_REGCLASS (!cast(Name # "Zrr") @@ -5824,7 +5916,7 @@ multiclass avx512_vptest_lowering; def : Pat<(_.KVT (and _.KRC:$mask, - (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))), + (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV))), (COPY_TO_REGCLASS (!cast(Name # "Zrrk") @@ -5927,6 +6019,125 @@ defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem, defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm, SchedWriteVecLogic>, T8XS; + +multiclass avx512_vptest_lowering_pats { + def : Pat<(_.KVT (OpNode (bitconvert + (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), + _.ImmAllZerosV)), + (!cast(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>; + + def : Pat<(_.KVT (and _.KRC:$mask, + (OpNode (bitconvert + (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), + _.ImmAllZerosV))), + (!cast(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1, + _.RC:$src2)>; + + def : Pat<(_.KVT (OpNode (bitconvert + (AndInfo.VT (and _.RC:$src1, + (AndInfo.LdFrag addr:$src2)))), + _.ImmAllZerosV)), + (!cast(InstrStr # "rm") _.RC:$src1, addr:$src2)>; + + def : Pat<(_.KVT (and _.KRC:$mask, + (OpNode (bitconvert + (AndInfo.VT (and _.RC:$src1, + (AndInfo.LdFrag addr:$src2)))), + _.ImmAllZerosV))), + (!cast(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1, + addr:$src2)>; +} + +// Patterns to use 512-bit instructions when 128/256 are not available. +multiclass avx512_vptest_lowering_wide_pats { + def : Pat<(_.KVT (OpNode (bitconvert + (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), + _.ImmAllZerosV)), + (_.KVT (COPY_TO_REGCLASS + (!cast(InstrStr#"rr") + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src1, _.SubRegIdx), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src2, _.SubRegIdx)), + _.KRC))>; + + def : Pat<(_.KVT (and _.KRC:$mask, + (OpNode (bitconvert + (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), + _.ImmAllZerosV))), + (COPY_TO_REGCLASS + (!cast(InstrStr#"rrk") + (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src1, _.SubRegIdx), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src2, _.SubRegIdx)), + _.KRC)>; +} + +multiclass avx512_vptest_lowering_sizes { +let Predicates = [prd, HasVLX] in { + defm : avx512_vptest_lowering_pats; + defm : avx512_vptest_lowering_pats; +} +let Predicates = [prd] in { + defm : avx512_vptest_lowering_pats; +} + +let Predicates = [prd, NoVLX] in { + defm : avx512_vptest_lowering_wide_pats; + defm : avx512_vptest_lowering_wide_pats; +} +} + +multiclass avx512_vptest_lowering_types { + defm : avx512_vptest_lowering_sizes; + defm : avx512_vptest_lowering_sizes; + defm : avx512_vptest_lowering_sizes; + + defm : avx512_vptest_lowering_sizes; + defm : avx512_vptest_lowering_sizes; + defm : avx512_vptest_lowering_sizes; + + defm : avx512_vptest_lowering_sizes; + defm : avx512_vptest_lowering_sizes; + defm : avx512_vptest_lowering_sizes; + + defm : avx512_vptest_lowering_sizes; + defm : avx512_vptest_lowering_sizes; + defm : avx512_vptest_lowering_sizes; +} + +defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem>; +defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm>; + //===----------------------------------------------------------------------===// // AVX-512 Shift instructions //===----------------------------------------------------------------------===// @@ -11443,19 +11654,68 @@ defm 
VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU, // TODO: We should maybe have a more generalized algorithm for folding to // vpternlog. let Predicates = [HasAVX512] in { - def : Pat<(v8i64 (xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV)))), + def : Pat<(xor VR512:$src, (bc_v64i8 (v16i32 immAllOnesV))), + (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; + def : Pat<(xor VR512:$src, (bc_v32i16 (v16i32 immAllOnesV))), + (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; + def : Pat<(xor VR512:$src, (bc_v16i32 (v16i32 immAllOnesV))), + (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; + def : Pat<(xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV))), (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; } let Predicates = [HasAVX512, NoVLX] in { - def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))), + def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (i8 15)), sub_xmm)>; - def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))), + def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (i8 15)), sub_xmm)>; + def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (i8 15)), sub_xmm)>; + def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (i8 15)), sub_xmm)>; + + def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (i8 15)), sub_ymm)>; + def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (i8 15)), sub_ymm)>; + def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (i8 15)), sub_ymm)>; + def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))), (EXTRACT_SUBREG (VPTERNLOGQZrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), @@ -11465,9 +11725,22 @@ let Predicates = [HasAVX512, NoVLX] in { } let Predicates = [HasVLX] in { - def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))), + def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))), (VPTERNLOGQZ128rri 
VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; - def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))), + def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))), + (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; + def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))), + (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; + def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))), + (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; + + def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))), + (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; + def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))), + (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; + def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))), + (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; + def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))), (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; } diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 7e31527a8776..7bc8d0aa5309 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -855,6 +855,7 @@ def bc_v4f64 : PatFrag<(ops node:$in), (v4f64 (bitconvert node:$in))>; // 512-bit bitconvert pattern fragments def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>; +def bc_v32i16 : PatFrag<(ops node:$in), (v32i16 (bitconvert node:$in))>; def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>; def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>; def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 8f97ce370684..85e4fd385633 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -2417,6 +2417,34 @@ let Predicates = [HasAVX2, NoVLX] in { (VPANDNYrr VR256:$src1, VR256:$src2)>; def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)), (VPANDNYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)), + (VPANDYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)), + (VPANDYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)), + (VPANDYrm VR256:$src1, addr:$src2)>; + + def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)), + (VPORYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)), + (VPORYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)), + (VPORYrm VR256:$src1, addr:$src2)>; + + def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)), + (VPXORYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)), + (VPXORYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)), + (VPXORYrm VR256:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)), + (VPANDNYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)), + (VPANDNYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)), + (VPANDNYrm VR256:$src1, addr:$src2)>; } // If only AVX1 is supported, we need to handle integer operations with @@ -2458,12 +2486,39 @@ let Predicates = [HasAVX1Only] in { def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)), (VANDNPSYrr 
VR256:$src1, VR256:$src2)>; + def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), (VANDPSYrm VR256:$src1, addr:$src2)>; + + def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)), (VORPSYrm VR256:$src1, addr:$src2)>; + + def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)), (VXORPSYrm VR256:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)), + (VANDNPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)), + (VANDNPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)), + (VANDNPSYrm VR256:$src1, addr:$src2)>; def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)), (VANDNPSYrm VR256:$src1, addr:$src2)>; } @@ -2589,6 +2644,34 @@ let Predicates = [HasAVX, NoVLX] in { (VPANDNrr VR128:$src1, VR128:$src2)>; def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)), (VPANDNrr VR128:$src1, VR128:$src2)>; + + def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)), + (VPANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)), + (VPANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)), + (VPANDrm VR128:$src1, addr:$src2)>; + + def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)), + (VPORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)), + (VPORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)), + (VPORrm VR128:$src1, addr:$src2)>; + + def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)), + (VPXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)), + (VPXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)), + (VPXORrm VR128:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)), + (VPANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)), + (VPANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)), + (VPANDNrm VR128:$src1, addr:$src2)>; } let Predicates = [UseSSE2] in { @@ -2619,6 +2702,34 @@ let Predicates = [UseSSE2] in { (PANDNrr VR128:$src1, VR128:$src2)>; def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)), (PANDNrr VR128:$src1, VR128:$src2)>; + + def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)), + (PANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)), + (PANDrm VR128:$src1, addr:$src2)>; + def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)), + (PANDrm VR128:$src1, addr:$src2)>; + + def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)), + (PORrm VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)), + (PORrm 
VR128:$src1, addr:$src2)>; + def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)), + (PORrm VR128:$src1, addr:$src2)>; + + def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)), + (PXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)), + (PXORrm VR128:$src1, addr:$src2)>; + def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)), + (PXORrm VR128:$src1, addr:$src2)>; + + def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)), + (PANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)), + (PANDNrm VR128:$src1, addr:$src2)>; + def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)), + (PANDNrm VR128:$src1, addr:$src2)>; } // Patterns for packed operations when we don't have integer type available. diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td index 39f50c10ae11..9d810a675e3b 100644 --- a/llvm/lib/Target/X86/X86InstrXOP.td +++ b/llvm/lib/Target/X86/X86InstrXOP.td @@ -350,6 +350,7 @@ multiclass xop4op_int opc, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V, Sched<[sched]>; + // FIXME: This pattern can't match. def rrm : IXOPi8Reg, VEX_L; } +let Predicates = [HasXOP] in { + def : Pat<(v16i8 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v8i16 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v4i32 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>; + + def : Pat<(or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, (bc_v16i8 (loadv2i64 addr:$src2)))), + (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>; + def : Pat<(or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, (bc_v8i16 (loadv2i64 addr:$src2)))), + (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>; + def : Pat<(or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>; + + def : Pat<(v32i8 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>; + def : Pat<(v16i16 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>; + def : Pat<(v8i32 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>; + + def : Pat<(or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, (bc_v32i8 (loadv4i64 addr:$src2)))), + (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>; + def : Pat<(or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, (bc_v16i16 (loadv4i64 addr:$src2)))), + (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>; + def : Pat<(or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>; +} + multiclass xop_vpermil2 Opc, string OpcodeStr, RegisterClass RC, X86MemOperand intmemop, X86MemOperand fpmemop, ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag, diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll index d836e9e439be..6bc69213d427 100644 --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll 
@@ -601,17 +601,17 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind { define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { ; AVX512F-LABEL: andd512fold: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vpandd (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: andd512fold: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vpandd (%rdi), %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: andd512fold: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandd (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: andd512fold: diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index aa89ee7c3906..d888ea4ac440 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -3614,8 +3614,8 @@ define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> ; CHECK-LABEL: test_mm512_fnmsub_round_ps: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0] -; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4 -; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: vpxord %zmm3, %zmm0, %zmm4 +; CHECK-NEXT: vpxord %zmm3, %zmm2, %zmm0 ; CHECK-NEXT: vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} entry: @@ -3837,8 +3837,8 @@ define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, ; CHECK-LABEL: test_mm512_fnmsub_ps: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0] -; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4 -; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: vpxord %zmm3, %zmm0, %zmm4 +; CHECK-NEXT: vpxord %zmm3, %zmm2, %zmm0 ; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0 ; CHECK-NEXT: ret{{[l|q]}} entry: diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index a70ff9bc1b16..26e8636df8f8 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1658,7 +1658,7 @@ define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) { define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_xor_epi32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xef,0xc1] +; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xef,0xc1] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) ret < 16 x i32> %res @@ -1687,7 +1687,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_or_epi32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xeb,0xc1] +; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xeb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) ret < 16 x i32> %res @@ -1716,7 +1716,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x define <16 x 
i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: test_and_epi32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xdb,0xc1] +; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xdb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) ret < 16 x i32> %res diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll index bb1e8550ba23..65d9d67b2caa 100644 --- a/llvm/test/CodeGen/X86/avx512-logic.ll +++ b/llvm/test/CodeGen/X86/avx512-logic.ll @@ -7,7 +7,7 @@ define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon ; ALL-LABEL: vpandd: ; ALL: ## %bb.0: ## %entry ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq entry: ; Force the execution domain with an add. @@ -21,7 +21,7 @@ define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readno ; ALL-LABEL: vpandnd: ; ALL: ## %bb.0: ## %entry ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; ALL-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vpandnd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq entry: ; Force the execution domain with an add. @@ -37,7 +37,7 @@ define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ; ALL-LABEL: vpord: ; ALL: ## %bb.0: ## %entry ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; ALL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq entry: ; Force the execution domain with an add. @@ -51,7 +51,7 @@ define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon ; ALL-LABEL: vpxord: ; ALL: ## %bb.0: ## %entry ; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; ALL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq entry: ; Force the execution domain with an add. 
@@ -132,7 +132,7 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind { define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { ; KNL-LABEL: andd512fold: ; KNL: ## %bb.0: ## %entry -; KNL-NEXT: vpandq (%rdi), %zmm0, %zmm0 +; KNL-NEXT: vpandd (%rdi), %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: andd512fold: diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 1449f5cf7b42..13ffb9f65bf7 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -3177,7 +3177,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) { ; KNL: ## %bb.0: ; KNL-NEXT: pushq %rax ; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testw %ax, %ax @@ -3196,7 +3196,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) { ; SKX: ## %bb.0: ; SKX-NEXT: pushq %rax ; SKX-NEXT: .cfi_def_cfa_offset 16 -; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vpord %zmm1, %zmm0, %zmm0 ; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testw %ax, %ax @@ -3215,7 +3215,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: pushq %rax ; AVX512BW-NEXT: .cfi_def_cfa_offset 16 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testw %ax, %ax @@ -3234,7 +3234,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: pushq %rax ; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: kmovw %k0, %eax ; AVX512DQ-NEXT: testw %ax, %ax @@ -3253,7 +3253,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) { ; X86: ## %bb.0: ; X86-NEXT: subl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 16 -; X86-NEXT: vporq %zmm1, %zmm0, %zmm0 +; X86-NEXT: vpord %zmm1, %zmm0, %zmm0 ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: testw %ax, %ax @@ -3287,7 +3287,7 @@ define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) { ; CHECK: ## %bb.0: ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kortestw %k0, %k0 ; CHECK-NEXT: jb LBB65_2 @@ -3303,7 +3303,7 @@ define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) { ; X86: ## %bb.0: ; X86-NEXT: subl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 16 -; X86-NEXT: vporq %zmm1, %zmm0, %zmm0 +; X86-NEXT: vpord %zmm1, %zmm0, %zmm0 ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kortestw %k0, %k0 ; X86-NEXT: jb LBB65_2 diff --git a/llvm/test/CodeGen/X86/avx512-schedule.ll b/llvm/test/CodeGen/X86/avx512-schedule.ll index e0237ff0d830..7ea2aef6ba4c 100755 --- a/llvm/test/CodeGen/X86/avx512-schedule.ll +++ b/llvm/test/CodeGen/X86/avx512-schedule.ll @@ -5029,13 +5029,13 @@ define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon ; GENERIC-LABEL: vpandd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpandd %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: 
retq # sched: [1:1.00] ; ; SKX-LABEL: vpandd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vpandd %zmm1, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. @@ -5049,13 +5049,13 @@ define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readno ; GENERIC-LABEL: vpandnd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpandnd %zmm0, %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpandnd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vpandnd %zmm0, %zmm1, %zmm0 # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. @@ -5071,13 +5071,13 @@ define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ; GENERIC-LABEL: vpord: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpord %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpord: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vpord %zmm1, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. @@ -5091,13 +5091,13 @@ define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon ; GENERIC-LABEL: vpxord: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpxord %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpxord: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vpxord %zmm1, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. 
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll index 008a3b44ce03..90e533c09b7f 100644 --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -11,7 +11,7 @@ define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind { ; X86-NEXT: # %bb.1: ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 ; X86-NEXT: .LBB0_2: -; X86-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; X86-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: select00: @@ -22,7 +22,7 @@ define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind { ; X64-NEXT: # %bb.1: ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 ; X64-NEXT: .LBB0_2: -; X64-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; X64-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq %cmpres = icmp eq i32 %a, 255 %selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll index 2964c9059468..b1a63ffedf3f 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -657,7 +657,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) { ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index 139fabd25c95..c524021866d6 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -845,7 +845,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) { ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll index c43b19a71791..3f78d0c9c5cc 100644 --- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll @@ -708,7 +708,6 @@ define i64 @v16i8_widened_with_ones(<16 x i8> %a, <16 x i8> %b) { ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpmovmskb %ymm0, %ecx ; AVX2-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000 ; AVX2-NEXT: orq %rcx, %rax diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index bb79efcbad4c..19d3b47a659e 100644 --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ 
b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -359,7 +359,8 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) { ; AVX-LABEL: f64i8_i32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -367,7 +368,7 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) { ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -391,7 +392,8 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) { ; AVX-64-LABEL: f64i8_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -399,7 +401,7 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) { ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -425,12 +427,13 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) { } +; FIXME the load should be folded with the MOVDDUP with AVX1. 
PR39454 define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -438,7 +441,7 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -462,8 +465,8 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-64-LABEL: f64xi8_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275] -; AVX-64-NEXT: # xmm3 = mem[0,0] +; AVX-64-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -471,7 +474,7 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -496,7 +499,6 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ret <64 x i8> %res2 } - define <64 x i8> @f64xi8_i128(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i128: ; AVX: # %bb.0: @@ -509,7 +511,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) { ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -543,7 +545,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) { ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -844,7 +846,8 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] +; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -852,7 +855,7 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) 
{ ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -876,7 +879,8 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) { ; AVX-64-LABEL: f32xi16_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] +; AVX-64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -884,7 +888,7 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) { ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -910,12 +914,13 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) { } +; FIXME the load should be folded with the MOVDDUP with AVX1. PR39454 define <32 x i16> @f32xi16_i64(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [4.1720559249406128E-309,4.1720559249406128E-309] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -923,7 +928,7 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) { ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -947,8 +952,8 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) { ; AVX-64-LABEL: f32xi16_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = [4.1720559249406128E-309,4.1720559249406128E-309] -; AVX-64-NEXT: # xmm3 = mem[0,0] +; AVX-64-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -956,7 +961,7 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) { ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -994,7 +999,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) { ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-NEXT: 
vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1028,7 +1033,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) { ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1252,12 +1257,13 @@ define <8 x i32> @f8xi32_i128(<8 x i32> %a) { } +; FIXME the load should be folded with the MOVDDUP with AVX1. PR39454 define <16 x i32> @f16xi32_i64(<16 x i32> %a) { ; AVX-LABEL: f16xi32_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [2.1219957909652723E-314,2.1219957909652723E-314] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -1265,7 +1271,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) { ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1283,14 +1289,14 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl ; ; AVX-64-LABEL: f16xi32_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = [2.1219957909652723E-314,2.1219957909652723E-314] -; AVX-64-NEXT: # xmm3 = mem[0,0] +; AVX-64-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] ; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -1298,7 +1304,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) { ; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1316,7 +1322,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) { ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296] ; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq %res1 = add <16 x i32> , %a %res2 = and <16 x i32> , %res1 @@ -1336,7 +1342,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) { ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: 
vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1356,7 +1362,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) { ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retl ; ; AVX-64-LABEL: f16xi32_i128: @@ -1370,7 +1376,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) { ; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1390,7 +1396,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) { ; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512F-64-NEXT: retq %res1 = add <16 x i32> , %a %res2 = and <16 x i32> , %res1 diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index 4b33b04a8b2d..452676e19db4 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -2338,8 +2338,7 @@ define i1 @allones_v4i32_and1(<4 x i32> %arg) { ; ; SKX-LABEL: allones_v4i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; SKX-NEXT: vptestmd %xmm1, %xmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: andb $15, %al ; SKX-NEXT: cmpb $15, %al @@ -2382,8 +2381,7 @@ define i1 @allzeros_v4i32_and1(<4 x i32> %arg) { ; ; SKX-LABEL: allzeros_v4i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; SKX-NEXT: vptestmd %xmm1, %xmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testb $15, %al ; SKX-NEXT: sete %al @@ -2444,8 +2442,7 @@ define i1 @allones_v8i32_and1(<8 x i32> %arg) { ; ; SKX-LABEL: allones_v8i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; SKX-NEXT: vptestmd %ymm1, %ymm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0 ; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: setb %al ; SKX-NEXT: vzeroupper @@ -2506,8 +2503,7 @@ define i1 @allzeros_v8i32_and1(<8 x i32> %arg) { ; ; SKX-LABEL: allzeros_v8i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; SKX-NEXT: vptestmd %ymm1, %ymm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0 ; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper @@ -2584,8 +2580,7 @@ define i1 @allones_v16i32_and1(<16 x i32> %arg) { ; ; KNL-LABEL: allones_v16i32_and1: ; KNL: # %bb.0: -; KNL-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: setb %al ; KNL-NEXT: vzeroupper @@ -2593,8 +2588,7 @@ define i1 @allones_v16i32_and1(<16 x i32> %arg) { ; ; SKX-LABEL: 
allones_v16i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SKX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; SKX-NEXT: kortestw %k0, %k0 ; SKX-NEXT: setb %al ; SKX-NEXT: vzeroupper @@ -2671,8 +2665,7 @@ define i1 @allzeros_v16i32_and1(<16 x i32> %arg) { ; ; KNL-LABEL: allzeros_v16i32_and1: ; KNL: # %bb.0: -; KNL-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper @@ -2680,8 +2673,7 @@ define i1 @allzeros_v16i32_and1(<16 x i32> %arg) { ; ; SKX-LABEL: allzeros_v16i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SKX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; SKX-NEXT: kortestw %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper @@ -4010,8 +4002,7 @@ define i1 @allones_v4i32_and4(<4 x i32> %arg) { ; ; SKX-LABEL: allones_v4i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; SKX-NEXT: vptestmd %xmm1, %xmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: andb $15, %al ; SKX-NEXT: cmpb $15, %al @@ -4054,8 +4045,7 @@ define i1 @allzeros_v4i32_and4(<4 x i32> %arg) { ; ; SKX-LABEL: allzeros_v4i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; SKX-NEXT: vptestmd %xmm1, %xmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testb $15, %al ; SKX-NEXT: sete %al @@ -4116,8 +4106,7 @@ define i1 @allones_v8i32_and4(<8 x i32> %arg) { ; ; SKX-LABEL: allones_v8i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; SKX-NEXT: vptestmd %ymm1, %ymm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0 ; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: setb %al ; SKX-NEXT: vzeroupper @@ -4178,8 +4167,7 @@ define i1 @allzeros_v8i32_and4(<8 x i32> %arg) { ; ; SKX-LABEL: allzeros_v8i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; SKX-NEXT: vptestmd %ymm1, %ymm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0 ; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper @@ -4256,8 +4244,7 @@ define i1 @allones_v16i32_and4(<16 x i32> %arg) { ; ; KNL-LABEL: allones_v16i32_and4: ; KNL: # %bb.0: -; KNL-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: setb %al ; KNL-NEXT: vzeroupper @@ -4265,8 +4252,7 @@ define i1 @allones_v16i32_and4(<16 x i32> %arg) { ; ; SKX-LABEL: allones_v16i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SKX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; SKX-NEXT: kortestw %k0, %k0 ; SKX-NEXT: setb %al ; SKX-NEXT: vzeroupper @@ -4343,8 +4329,7 @@ define i1 @allzeros_v16i32_and4(<16 x i32> %arg) { ; ; KNL-LABEL: allzeros_v16i32_and4: ; KNL: # %bb.0: -; KNL-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper @@ -4352,8 +4337,7 @@ define i1 
@allzeros_v16i32_and4(<16 x i32> %arg) { ; ; SKX-LABEL: allzeros_v16i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; SKX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; SKX-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; SKX-NEXT: kortestw %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index a6bdfe9780cf..6f4a3812ffaa 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -531,18 +531,16 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind { ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm7 -; SSE41-NEXT: pmaxud %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pshufb %xmm6, %xmm7 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pmaxud %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: packssdw %xmm6, %xmm0 ; SSE41-NEXT: psubd %xmm2, %xmm3 ; SSE41-NEXT: psubd %xmm1, %xmm4 -; SSE41-NEXT: pshufb %xmm6, %xmm4 -; SSE41-NEXT: pshufb %xmm6, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: pshufb %xmm1, %xmm3 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE41-NEXT: pandn %xmm4, %xmm0 ; SSE41-NEXT: retq @@ -916,18 +914,16 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind { ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm7 -; SSE41-NEXT: pminud %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pshufb %xmm6, %xmm7 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pminud %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: packssdw %xmm6, %xmm4 ; SSE41-NEXT: psubd %xmm2, %xmm3 ; SSE41-NEXT: psubd %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm6, %xmm0 -; SSE41-NEXT: pshufb %xmm6, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm1, %xmm3 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: retq @@ -1052,18 +1048,16 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind { ; SSE41-NEXT: pcmpeqd %xmm1, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm7 -; SSE41-NEXT: pmaxud %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pshufb %xmm6, %xmm7 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: pmaxud %xmm3, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: packssdw %xmm6, %xmm4 ; SSE41-NEXT: psubd %xmm2, %xmm3 ; SSE41-NEXT: psubd %xmm1, %xmm0 -; SSE41-NEXT: pshufb %xmm6, 
%xmm0 -; SSE41-NEXT: pshufb %xmm6, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm1, %xmm3 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll index ec160c94f5e9..f0989e8b081a 100644 --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -746,15 +746,16 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32> ; SSE2-LABEL: unsigned_sat_variable_v4i32_using_min: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: pxor %xmm0, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647] ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll index f109d69621c6..7cb0d3ff58f4 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll @@ -132,9 +132,9 @@ define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, ; ; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm2 +; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm2 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 @@ -142,9 +142,9 @@ define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, ; ; CHECK-XOP-LABEL: in_constant_varx_mone_invmask: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0 ; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vpxor (%rdi), %xmm1, %xmm2 +; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm2 ; CHECK-XOP-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec-copysign-avx512.ll b/llvm/test/CodeGen/X86/vec-copysign-avx512.ll index 6fb0033e7504..b08b15ce004e 100644 --- a/llvm/test/CodeGen/X86/vec-copysign-avx512.ll +++ b/llvm/test/CodeGen/X86/vec-copysign-avx512.ll @@ -43,7 +43,7 @@ define <16 x float> @v16f32(<16 x float> %a, <16 x float> %b) nounwind { ; AVX512VL: ## %bb.0: ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512VLDQ-LABEL: v16f32: diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index fa4c8abe6d2d..b249eed2fc76 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -2046,27 +2046,27 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> 
%a) nounwind { ; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1 ; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm2 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpord %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpslld $24, %zmm0, %zmm2 ; AVX512F-NEXT: vpslld $8, %zmm0, %zmm0 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 ; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $4, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 ; AVX512F-NEXT: vpslld $2, %zmm1, %zmm1 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $2, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 ; AVX512F-NEXT: vpslld $1, %zmm1, %zmm1 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_bitreverse_v16i32: diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll index 10db0aeb25eb..71a9ba193968 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -172,15 +172,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-LABEL: testv16i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $2, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -201,15 +201,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512DQ-LABEL: testv16i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrld $1, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $4, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, 
%zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -257,15 +257,15 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW-LABEL: testv16i32u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $2, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -286,15 +286,15 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512DQ-LABEL: testv16i32u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpsrld $1, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $4, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll index 89ae9510cdcd..560bceb2d05f 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -309,13 +309,13 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, 
%eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -372,15 +372,15 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll index 04ec6cfc970e..169394040bf5 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -309,13 +309,13 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -372,15 +372,15 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll index cb69ee80ee46..d1bf3e99c2c6 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -309,13 +309,13 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd 
%xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -372,15 +372,15 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index eb51c1029dee..30de8d7c9083 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -876,7 +876,7 @@ define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind { ; AVX512-LABEL: splatconstant_rotate_mask_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vprold $4, %zmm0, %zmm0 -; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: retq %shl = shl <16 x i32> %a, %lshr = lshr <16 x i32> %a, @@ -980,10 +980,8 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -991,10 +989,8 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index e552f5f40360..7dc850391bca 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -3505,7 +3505,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; ; AVX512-LABEL: trunc_and_v16i32_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -4309,7 +4309,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; ; AVX512-LABEL: trunc_xor_v16i32_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -5113,7 +5113,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> 
%a1) nounwind ; ; AVX512-LABEL: trunc_or_v16i32_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll index 1de03463e191..501d7e96835d 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll @@ -128,7 +128,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512CD: # %bb.0: ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 @@ -138,7 +138,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512CDBW: # %bb.0: ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 @@ -148,7 +148,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -169,7 +169,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -177,7 +177,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; BITALG: # %bb.0: ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; BITALG-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] @@ -195,7 +195,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CD: # %bb.0: ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CD-NEXT: vpsubd 
%zmm0, %zmm1, %zmm0 @@ -205,7 +205,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CDBW: # %bb.0: ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 @@ -215,7 +215,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -236,7 +236,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -244,7 +244,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; BITALG: # %bb.0: ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1 -; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; BITALG-NEXT: vpandnd %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
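
The tzcnt tests above show the same effect for the inverted-operand AND: the ~x & (x - 1) idiom now checks for vpandnd rather than vpandnq on <16 x i32>. A minimal reproducer under the same assumptions (invented function name and RUN line, not part of this patch; the checked operand order reflects the pattern seen in the tests but is not guaranteed):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s
define <16 x i32> @andnot_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; CHECK-LABEL: andnot_v16i32:
; CHECK: vpandnd %zmm1, %zmm0, %zmm0
  ; ~x & y should fold into a single dword-domain ANDN.
  %not = xor <16 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %r = and <16 x i32> %not, %y
  ret <16 x i32> %r
}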