From 6015f5c8237c259ac04c539d55d200baa885a807 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Tue, 15 Dec 2015 08:40:41 +0000
Subject: [PATCH] Type legalizer for masked gather and scatter intrinsics.

A full type legalizer that works with all vector lengths from 2 to 16 elements
(i32, i64, float, double). For example, the intrinsic

  void @llvm.masked.scatter.v2f32(<2 x float> %data, <2 x float*> %ptrs, i32 align, <2 x i1> %mask)

requires type widening for the data operand and type promotion for the mask.

Differential Revision: http://reviews.llvm.org/D13633

llvm-svn: 255629
---
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h |   20 +-
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  129 +-
 .../CodeGen/SelectionDAG/LegalizeTypes.cpp    |   17 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   16 +-
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  182 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  183 +-
 llvm/lib/Target/X86/X86InstrAVX512.td         |   26 +-
 .../test/CodeGen/X86/masked_gather_scatter.ll | 1642 ++++++++++++++++-
 llvm/test/CodeGen/X86/masked_memop.ll         |   91 +-
 9 files changed, 2085 insertions(+), 221 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 548549ab1353..23816bde07c0 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -2122,12 +2122,13 @@ public: : MaskedGatherScatterSDNode(ISD::MGATHER, Order, dl, Operands, VTs, MemVT, MMO) { assert(getValue().getValueType() == getValueType(0) && - "Incompatible type of the PathThru value in MaskedGatherSDNode"); + "Incompatible type of the PassThru value in MaskedGatherSDNode"); assert(getMask().getValueType().getVectorNumElements() == - getValueType(0).getVectorNumElements() && - "Vector width mismatch between mask and data"); - assert(getMask().getValueType().getScalarType() == MVT::i1 && + getValueType(0).getVectorNumElements() && "Vector width mismatch between mask and data"); + assert(getIndex().getValueType().getVectorNumElements() == + getValueType(0).getVectorNumElements() && + "Vector width mismatch between index and data"); } static bool classof(const SDNode *N) {
@@ -2143,13 +2144,14 @@ public: friend class SelectionDAG; MaskedScatterSDNode(unsigned Order, DebugLoc dl, ArrayRef<SDValue> Operands, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) - : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, Operands, VTs, - MemVT, MMO) { + : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, Operands, VTs, MemVT, + MMO) { assert(getMask().getValueType().getVectorNumElements() == - getValue().getValueType().getVectorNumElements() && - "Vector width mismatch between mask and data"); - assert(getMask().getValueType().getScalarType() == MVT::i1 && + getValue().getValueType().getVectorNumElements() && "Vector width mismatch between mask and data"); + assert(getIndex().getValueType().getVectorNumElements() == + getValue().getValueType().getVectorNumElements() && + "Vector width mismatch between index and data"); } static bool classof(const SDNode *N) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 2cfcf77b17a5..63c9cc528710 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -66,8 +66,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break; -
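
As an illustration of the case named in the commit message (this snippet is not part of the patch; the function name is invented, and the intrinsic signature follows the declarations already used in the tests below), a minimal IR function of that shape that now legalizes via widening of the data and promotion of the mask:

declare void @llvm.masked.scatter.v2f32(<2 x float>, <2 x float*>, i32, <2 x i1>)

define void @scatter_v2f32(<2 x float> %data, <2 x float*> %ptrs, <2 x i1> %mask) {
  ; The <2 x float> data operand is widened (e.g. to <4 x float>) and the
  ; <2 x i1> mask is promoted and widened with zeroes by the new code below.
  call void @llvm.masked.scatter.v2f32(<2 x float> %data, <2 x float*> %ptrs, i32 4, <2 x i1> %mask)
  ret void
}
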
case ISD::LOAD: Res = PromoteIntRes_LOAD(cast(N));break; - case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast(N));break; + case ISD::LOAD: Res = PromoteIntRes_LOAD(cast(N)); break; + case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast(N)); + break; + case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast(N)); + break; case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break; case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break; case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break; @@ -181,7 +184,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) { N->getChain(), N->getBasePtr(), N->getMemOperand(), N->getOrdering(), N->getSynchScope()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -194,7 +197,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) { N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(), N->getOrdering(), N->getSynchScope()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -479,7 +482,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(), N->getMemoryVT(), N->getMemOperand()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -489,20 +492,34 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0()); - SDValue Mask = N->getMask(); - EVT NewMaskVT = getSetCCResultType(NVT); - if (NewMaskVT != N->getMask().getValueType()) - Mask = PromoteTargetBoolean(Mask, NewMaskVT); SDLoc dl(N); - SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), - Mask, ExtSrc0, N->getMemoryVT(), + N->getMask(), ExtSrc0, N->getMemoryVT(), N->getMemOperand(), ISD::SEXTLOAD); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } + +SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue ExtSrc0 = GetPromotedInteger(N->getValue()); + assert(NVT == ExtSrc0.getValueType() && + "Gather result type and the passThru agrument type should be the same"); + + SDLoc dl(N); + SDValue Ops[] = {N->getChain(), ExtSrc0, N->getMask(), N->getBasePtr(), + N->getIndex()}; + SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + /// Promote the overflow flag of an overflowing arithmetic node. SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Simply change the return type of the boolean result. 
@@ -889,6 +906,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { OpNo); break; case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast(N), OpNo); break; + case ISD::MGATHER: Res = PromoteIntOp_MGATHER(cast(N), + OpNo); break; + case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast(N), + OpNo); break; case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::FP16_TO_FP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; @@ -1157,56 +1178,49 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){ N->getMemoryVT(), N->getMemOperand()); } -SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){ +SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, + unsigned OpNo) { SDValue DataOp = N->getValue(); EVT DataVT = DataOp.getValueType(); SDValue Mask = N->getMask(); - EVT MaskVT = Mask.getValueType(); SDLoc dl(N); bool TruncateStore = false; - if (!TLI.isTypeLegal(DataVT)) { - if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) { - DataOp = GetPromotedInteger(DataOp); - if (!TLI.isTypeLegal(MaskVT)) - Mask = PromoteTargetBoolean(Mask, DataOp.getValueType()); - TruncateStore = true; - } + if (OpNo == 2) { + // Mask comes before the data operand. If the data operand is legal, we just + // promote the mask. + // When the data operand has illegal type, we should legalize the data + // operand first. The mask will be promoted/splitted/widened according to + // the data operand type. + if (TLI.isTypeLegal(DataVT)) + Mask = PromoteTargetBoolean(Mask, DataVT); else { - assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector && - "Unexpected data legalization in MSTORE"); - DataOp = GetWidenedVector(DataOp); + if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) + return PromoteIntOp_MSTORE(N, 3); + + else if (getTypeAction(DataVT) == TargetLowering::TypeWidenVector) + return WidenVecOp_MSTORE(N, 3); - if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) - Mask = GetWidenedVector(Mask); else { - EVT BoolVT = getSetCCResultType(DataOp.getValueType()); - - // We can't use ModifyToType() because we should fill the mask with - // zeroes - unsigned WidenNumElts = BoolVT.getVectorNumElements(); - unsigned MaskNumElts = MaskVT.getVectorNumElements(); - - unsigned NumConcat = WidenNumElts / MaskNumElts; - SmallVector Ops(NumConcat); - SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT); - Ops[0] = Mask; - for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = ZeroVal; - - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops); + assert (getTypeAction(DataVT) == TargetLowering::TypeSplitVector); + return SplitVecOp_MSTORE(N, 3); } } + } else { // Data operand + assert(OpNo == 3 && "Unexpected operand for promotion"); + DataOp = GetPromotedInteger(DataOp); + Mask = PromoteTargetBoolean(Mask, DataOp.getValueType()); + TruncateStore = true; } - else - Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType()); + return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask, N->getMemoryVT(), N->getMemOperand(), TruncateStore); } -SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){ +SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, + unsigned OpNo) { assert(OpNo == 2 && "Only know how to promote the mask!"); EVT DataVT = N->getValueType(0); SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); @@ -1215,6 +1229,31 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo) 
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N, + unsigned OpNo) { + + SmallVector NewOps(N->op_begin(), N->op_end()); + if (OpNo == 2) { + // The Mask + EVT DataVT = N->getValueType(0); + NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); + } else + NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + +SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, + unsigned OpNo) { + SmallVector NewOps(N->op_begin(), N->op_end()); + if (OpNo == 2) { + // The Mask + EVT DataVT = N->getValue().getValueType(); + NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); + } else + NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op); @@ -2071,7 +2110,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, } } - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Ch); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index d6b4f7921f2b..2a0b0aa44794 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -1127,6 +1127,23 @@ SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) { return DAG.getNode(ExtendCode, dl, BoolVT, Bool); } +/// WidenTargetBoolean - Widen the given target boolean to a target boolean +/// of the given type. The boolean vector is widened and then promoted to match +/// the target boolean type of the given ValVT. +SDValue DAGTypeLegalizer::WidenTargetBoolean(SDValue Bool, EVT ValVT, + bool WithZeroes) { + SDLoc dl(Bool); + EVT BoolVT = Bool.getValueType(); + + assert(ValVT.getVectorNumElements() > BoolVT.getVectorNumElements() && + TLI.isTypeLegal(ValVT) && + "Unexpected types in WidenTargetBoolean"); + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), BoolVT.getScalarType(), + ValVT.getVectorNumElements()); + Bool = ModifyToType(Bool, WideVT, WithZeroes); + return PromoteTargetBoolean(Bool, ValVT); +} + /// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT /// bits in Hi. void DAGTypeLegalizer::SplitInteger(SDValue Op, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 84ea374345e9..7994f4dad4a9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -187,6 +187,11 @@ private: std::pair ExpandAtomic(SDNode *Node); SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT); + + /// Modify Bit Vector to match SetCC result type of ValVT. + /// The bit vector is widened with zeroes when WithZeroes is true. 
+ SDValue WidenTargetBoolean(SDValue Bool, EVT ValVT, bool WithZeroes = false); + void ReplaceValueWith(SDValue From, SDValue To); void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi); void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT, @@ -261,6 +266,7 @@ private: SDValue PromoteIntRes_INT_EXTEND(SDNode *N); SDValue PromoteIntRes_LOAD(LoadSDNode *N); SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); + SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N); SDValue PromoteIntRes_Overflow(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_SDIV(SDNode *N); @@ -307,6 +313,8 @@ private: SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N); SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -710,6 +718,7 @@ private: SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); + SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N); SDValue WidenVecRes_SELECT(SDNode* N); SDValue WidenVecRes_SELECT_CC(SDNode* N); @@ -737,6 +746,7 @@ private: SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_STORE(SDNode* N); SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo); + SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo); SDValue WidenVecOp_SETCC(SDNode* N); SDValue WidenVecOp_Convert(SDNode *N); @@ -776,8 +786,10 @@ private: /// Modifies a vector input (widen or narrows) to a vector of NVT. The /// input vector must have the same element type as NVT. - SDValue ModifyToType(SDValue InOp, EVT WidenVT); - + /// When FillWithZeroes is "on" the vector will be widened with + /// zeroes. + /// By default, the vector will be widened with undefined values. + SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false); //===--------------------------------------------------------------------===// // Generic Splitting: LegalizeTypesGeneric.cpp diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5e4d469df37f..d0187d36dee2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -235,7 +235,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { N->isInvariant(), N->getOriginalAlignment(), N->getAAInfo()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Result.getValue(1)); return Result; @@ -1020,7 +1020,7 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(LD, 1), Ch); } @@ -1034,6 +1034,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue Ch = MLD->getChain(); SDValue Ptr = MLD->getBasePtr(); SDValue Mask = MLD->getMask(); + SDValue Src0 = MLD->getSrc0(); unsigned Alignment = MLD->getOriginalAlignment(); ISD::LoadExtType ExtType = MLD->getExtensionType(); @@ -1043,16 +1044,22 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? Alignment/2 : Alignment; + // Split Mask operand SDValue MaskLo, MaskHi; - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MLD->getMemoryVT(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); - SDValue Src0 = MLD->getSrc0(); SDValue Src0Lo, Src0Hi; - std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); + if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Src0, Src0Lo, Src0Hi); + else + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), @@ -1080,7 +1087,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(MLD, 1), Ch); @@ -1095,20 +1102,33 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue Ch = MGT->getChain(); SDValue Ptr = MGT->getBasePtr(); SDValue Mask = MGT->getMask(); + SDValue Src0 = MGT->getValue(); + SDValue Index = MGT->getIndex(); unsigned Alignment = MGT->getOriginalAlignment(); + // Split Mask operand SDValue MaskLo, MaskHi; - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MGT->getMemoryVT(); EVT LoMemVT, HiMemVT; + // Split MemoryVT std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Src0Lo, Src0Hi; - std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(MGT->getValue(), dl); + if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Src0, Src0Lo, Src0Hi); + else + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); SDValue IndexHi, IndexLo; - std::tie(IndexLo, IndexHi) = DAG.SplitVector(MGT->getIndex(), dl); + if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Index, IndexLo, IndexHi); + else + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), @@ -1128,7 +1148,7 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(MGT, 1), Ch); } @@ -1599,23 +1619,31 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, SDValue Ptr = MGT->getBasePtr(); SDValue Index = MGT->getIndex(); SDValue Mask = MGT->getMask(); + SDValue Src0 = MGT->getValue(); unsigned Alignment = MGT->getOriginalAlignment(); SDValue MaskLo, MaskHi; - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + // Split Mask operand + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MGT->getMemoryVT(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Src0Lo, Src0Hi; - std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(MGT->getValue(), dl); + if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Src0, Src0Lo, Src0Hi); + else + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); SDValue IndexHi, IndexLo; - if (Index.getNode()) - std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); + if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Index, IndexLo, IndexHi); else - IndexLo = IndexHi = Index; + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), @@ -1641,7 +1669,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(MGT, 1), Ch); @@ -1665,9 +1693,21 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; - GetSplitVector(Data, DataLo, DataHi); + if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) + // Split Data operand + GetSplitVector(Data, DataLo, DataHi); + else + std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); + SDValue MaskLo, MaskHi; - GetSplitVector(Mask, MaskLo, MaskHi); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + // Split Mask operand + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + + MaskLo = PromoteTargetBoolean(MaskLo, DataLo.getValueType()); + MaskHi = PromoteTargetBoolean(MaskHi, DataHi.getValueType()); // if Alignment is equal to the vector size, // take the half of it for the second part @@ -1712,25 +1752,29 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned Alignment = N->getOriginalAlignment(); SDLoc DL(N); + // Split all operands EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; - GetSplitVector(Data, DataLo, DataHi); - SDValue MaskLo, MaskHi; - GetSplitVector(Mask, MaskLo, MaskHi); - - SDValue PtrLo, PtrHi; - if (Ptr.getValueType().isVector()) // gather form vector of pointers - std::tie(PtrLo, PtrHi) = DAG.SplitVector(Ptr, DL); + if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) + // Split Data operand + GetSplitVector(Data, DataLo, DataHi); else - PtrLo = PtrHi = Ptr; + std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); + + SDValue MaskLo, MaskHi; + if (getTypeAction(Mask.getValueType()) == 
TargetLowering::TypeSplitVector) + // Split Mask operand + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); SDValue IndexHi, IndexLo; - if (Index.getNode()) - std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); + if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Index, IndexLo, IndexHi); else - IndexLo = IndexHi = Index; + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); SDValue Lo, Hi; MachineMemOperand *MMO = DAG.getMachineFunction(). @@ -1738,7 +1782,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - SDValue OpsLo[] = {Ch, DataLo, MaskLo, PtrLo, IndexLo}; + SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo}; Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), DL, OpsLo, MMO); @@ -1747,7 +1791,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, MachineMemOperand::MOStore, HiMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - SDValue OpsHi[] = {Ch, DataHi, MaskHi, PtrHi, IndexHi}; + SDValue OpsHi[] = {Ch, DataHi, MaskHi, Ptr, IndexHi}; Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), DL, OpsHi, MMO); @@ -1975,6 +2019,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::MLOAD: Res = WidenVecRes_MLOAD(cast(N)); break; + case ISD::MGATHER: + Res = WidenVecRes_MGATHER(cast(N)); + break; case ISD::ADD: case ISD::AND: @@ -2728,7 +2775,35 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(), Mask, Src0, N->getMemoryVT(), N->getMemOperand(), ExtType); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + +SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) { + + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Mask = N->getMask(); + SDValue Src0 = GetWidenedVector(N->getValue()); + unsigned NumElts = WideVT.getVectorNumElements(); + SDLoc dl(N); + + // The mask should be widened as well + Mask = WidenTargetBoolean(Mask, WideVT, true); + + // Widen the Index operand + SDValue Index = N->getIndex(); + EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(), + Index.getValueType().getScalarType(), + NumElts); + Index = ModifyToType(Index, WideIndexVT); + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -2890,6 +2965,7 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::STORE: Res = WidenVecOp_STORE(N); break; case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break; + case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break; case ISD::SETCC: Res = WidenVecOp_SETCC(N); break; case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break; @@ -3137,6 +3213,34 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) { false); } +SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) { + assert(OpNo == 1 && "Can widen only data operand of mscatter"); + MaskedScatterSDNode *MSC = cast(N); + SDValue DataOp = MSC->getValue(); + SDValue Mask = MSC->getMask(); + + // Widen the value + SDValue WideVal = GetWidenedVector(DataOp); + EVT WideVT = WideVal.getValueType(); + unsigned NumElts = WideVal.getValueType().getVectorNumElements(); + SDLoc dl(N); + + // The mask should be widened as well + Mask = WidenTargetBoolean(Mask, WideVT, true); + + // Widen index + SDValue Index = MSC->getIndex(); + EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(), + Index.getValueType().getScalarType(), + NumElts); + Index = ModifyToType(Index, WideIndexVT); + + SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index}; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), + MSC->getMemoryVT(), dl, Ops, + MSC->getMemOperand()); +} + SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { SDValue InOp0 = GetWidenedVector(N->getOperand(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(1)); @@ -3600,7 +3704,9 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl &StChain, /// Modifies a vector input (widen or narrows) to a vector of NVT. The /// input vector must have the same element type as NVT. -SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { +/// FillWithZeroes specifies that the vector should be widened with zeroes. +SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT, + bool FillWithZeroes) { // Note that InOp might have been widened so it might already have // the right width or it might need be narrowed. EVT InVT = InOp.getValueType(); @@ -3617,10 +3723,11 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) { unsigned NumConcat = WidenNumElts / InNumElts; SmallVector Ops(NumConcat); - SDValue UndefVal = DAG.getUNDEF(InVT); + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) : + DAG.getUNDEF(InVT); Ops[0] = InOp; for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = UndefVal; + Ops[i] = FillVal; return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops); } @@ -3640,8 +3747,9 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - SDValue UndefVal = DAG.getUNDEF(EltVT); + SDValue FillVal = FillWithZeroes ? 
DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); for ( ; Idx < WidenNumElts; ++Idx) - Ops[Idx] = UndefVal; + Ops[Idx] = FillVal; return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 829d6302bcee..3765d64617bc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1579,7 +1579,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); } - if (EltSize >= 32 && VT.getSizeInBits() <= 512) { + if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } @@ -1605,6 +1605,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Legal); + setOperationAction(ISD::MSCATTER, VT, Custom); } } for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { @@ -1813,6 +1815,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::MSCATTER); + setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -19760,6 +19764,16 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, EVT EltVT = NVT.getVectorElementType(); SDLoc dl(InOp); + if (InOp.getOpcode() == ISD::CONCAT_VECTORS && + InOp.getNumOperands() == 2) { + SDValue N1 = InOp.getOperand(1); + if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || + N1.isUndef()) { + InOp = InOp.getOperand(0); + InVT = InOp.getSimpleValueType(); + InNumElts = InVT.getVectorNumElements(); + } + } if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { SmallVector Ops; @@ -19783,28 +19797,93 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, assert(Subtarget->hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); + // X86 scatter kills mask register, so its type should be added to + // the list of return values. + // If the "scatter" has 2 return values, it is already handled. 
+ if (Op.getNode()->getNumValues() == 2) + return Op; + MaskedScatterSDNode *N = cast(Op.getNode()); - MVT VT = N->getValue().getSimpleValueType(); + SDValue Src = N->getValue(); + MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); - // X86 scatter kills mask register, so its type should be added to - // the list of return values - if (N->getNumValues() == 1) { - SDValue Index = N->getIndex(); - if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getSimpleValueType().is512BitVector()) - Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + SDValue NewScatter; + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Chain = N->getChain(); + SDValue BasePtr = N->getBasePtr(); + MVT MemVT = N->getMemoryVT().getSimpleVT(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); - SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; + if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { + // The v2i32 value was promoted to v2i64. + // Now we "redo" the type legalizer's work and widen the original + // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 + // with a shuffle. + assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && + "Unexpected memory type"); + int ShuffleMask[] = {0, 2, -1, -1}; + Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), + DAG.getUNDEF(MVT::v4i32), ShuffleMask); + // Now we have 4 elements instead of 2. + // Expand the index. + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); + Index = ExtendToType(Index, NewIndexVT, DAG); - SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); - return SDValue(NewScatter.getNode(), 0); + // Expand the mask with zeroes + // Mask may be <2 x i64> or <2 x i1> at this moment + assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && + "Unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + VT = MVT::v4i32; } - return Op; + + unsigned NumElts = VT.getVectorNumElements(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. 
If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (IndexVT == MVT::v8i32) + // Just extend index + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + else { + // The minimal number of elts in scatter is 8 + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + // Use original index here, do not modify the index twice + Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + // Use the original mask here, do not modify the mask twice + Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); + + // The value that should be stored + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src = ExtendToType(Src, NewVT, DAG); + } + } + // If the mask is "wide" at this point - truncate it to i1 vector + MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); + Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); + + // The mask is killed by scatter, add it to the values + SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; + NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); + return SDValue(NewScatter.getNode(), 0); } static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, @@ -19869,17 +19948,59 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedGatherSDNode *N = cast(Op.getNode()); - MVT VT = Op.getSimpleValueType(); - assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); SDLoc dl(Op); - + MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Src0 = N->getValue(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + unsigned NumElts = VT.getVectorNumElements(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { - Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; - DAG.UpdateNodeOperands(N, Ops); + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. 
If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (NumElts == 8) { + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), Index }; + DAG.UpdateNodeOperands(N, Ops); + return Op; + } + + // Minimal number of elements in Gather + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + Index = ExtendToType(Index, NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); + + // The pass-thru value + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src0 = ExtendToType(Src0, NewVT, DAG); + + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewGather.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewGather.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); } return Op; } @@ -26907,6 +27028,20 @@ static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + // Gather and Scatter instructions use k-registers for masks. The type of + // the masks is v*i1. So the mask will be truncated anyway. + // The SIGN_EXTEND_INREG my be dropped. + SDValue Mask = N->getOperand(2); + if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { + SmallVector NewOps(N->op_begin(), N->op_end()); + NewOps[2] = Mask.getOperand(0); + DAG.UpdateNodeOperands(N, NewOps); + } + return SDValue(); +} + // Helper function of PerformSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. 
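
To make the PerformGatherScatterCombine added in the hunk above concrete, here is a sketch of the kind of input that feeds it (illustrative only, not part of the patch; the function name is invented, and the gather declaration matches the one used in the tests below). The i1 mask produced by the compare lives in a k-register on AVX-512, so a sign-extension of the mask introduced during type legalization adds no information and can be dropped:

declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)

define <8 x i32> @gather_cmp_mask(<8 x i32*> %ptrs, <8 x i32> %a, <8 x i32> %b) {
  ; The compare yields an <8 x i1> mask, which becomes a k-register operand
  ; of the gather after legalization.
  %mask = icmp sgt <8 x i32> %a, %b
  %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %mask, <8 x i32> undef)
  ret <8 x i32> %res
}
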
@@ -27348,6 +27483,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG); + case ISD::MGATHER: + case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG); } return SDValue(); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 62f28b79ecdb..48918d66d240 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2176,17 +2176,19 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_16bit)>; - def : Pat<(v16i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK16)>; - def : Pat<(v8i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK8)>; -} -let Predicates = [HasBWI] in { - def : Pat<(v32i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK32)>; - def : Pat<(v64i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK64)>; } +def : Pat<(v16i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK16)>; +def : Pat<(v8i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK8)>; +def : Pat<(v4i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK4)>; +def : Pat<(v2i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK2)>; +def : Pat<(v32i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK32)>; +def : Pat<(v64i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK64)>; // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. @@ -2489,6 +2491,9 @@ def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))), + (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>; + def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; @@ -2497,6 +2502,7 @@ def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 19d2c891f85a..84f04c01efc2 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -1,29 +1,51 @@ -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=KNL +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64 +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = 
"x86_64-unknown-linux-gnu" -; KNL-LABEL: test1 -; KNL: kxnorw %k1, %k1, %k1 -; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; SCALAR-LABEL: test1 -; SCALAR: extractelement <16 x float*> +; SCALAR: extractelement <16 x float*> ; SCALAR-NEXT: load float ; SCALAR-NEXT: insertelement <16 x float> ; SCALAR-NEXT: extractelement <16 x float*> ; SCALAR-NEXT: load float define <16 x float> @test1(float* %base, <16 x i32> %ind) { +; KNL_64-LABEL: test1: +; KNL_64: # BB#0: +; KNL_64-NEXT: kxnorw %k1, %k1, %k1 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test1: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: kxnorw %k1, %k1, %k1 +; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test1: +; SKX: # BB#0: +; SKX-NEXT: kxnorw %k1, %k1, %k1 +; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind - + %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> , <16 x float> undef) ret <16 x float>%res } @@ -31,23 +53,41 @@ define <16 x float> @test1(float* %base, <16 x i32> %ind) { declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>) declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>) declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> ) - -; KNL-LABEL: test2 -; KNL: kmovw %esi, %k1 -; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} + ; SCALAR-LABEL: test2 -; SCALAR: extractelement <16 x float*> +; SCALAR: extractelement <16 x float*> ; SCALAR-NEXT: load float ; SCALAR-NEXT: insertelement <16 x float> ; SCALAR-NEXT: br label %else ; SCALAR: else: -; SCALAR-NEXT: %res.phi.else = phi +; SCALAR-NEXT: %res.phi.else = phi ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1 ; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true ; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) { +; KNL_64-LABEL: test2: +; KNL_64: # BB#0: +; KNL_64-NEXT: kmovw %esi, %k1 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test2: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test2: +; SKX: # BB#0: +; SKX-NEXT: kmovw %esi, %k1 +; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer @@ -59,10 +99,28 @@ define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) { ret <16 x float> %res } -; KNL-LABEL: test3 -; KNL: kmovw %esi, %k1 -; KNL: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} define <16 x i32> @test3(i32* %base, <16 x i32> %ind, 
i16 %mask) { +; KNL_64-LABEL: test3: +; KNL_64: # BB#0: +; KNL_64-NEXT: kmovw %esi, %k1 +; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test3: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test3: +; SKX: # BB#0: +; SKX-NEXT: kmovw %esi, %k1 +; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer @@ -74,13 +132,38 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) { ret <16 x i32> %res } -; KNL-LABEL: test4 -; KNL: kmovw %esi, %k1 -; KNL: kmovw -; KNL: vpgatherdd -; KNL: vpgatherdd define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) { +; KNL_64-LABEL: test4: +; KNL_64: # BB#0: +; KNL_64-NEXT: kmovw %esi, %k1 +; KNL_64-NEXT: kmovw %k1, %k2 +; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} +; KNL_64-NEXT: vmovaps %zmm1, %zmm2 +; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} +; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test4: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; KNL_32-NEXT: kmovw %k1, %k2 +; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} +; KNL_32-NEXT: vmovaps %zmm1, %zmm2 +; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} +; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test4: +; SKX: # BB#0: +; SKX-NEXT: kmovw %esi, %k1 +; SKX-NEXT: kmovw %k1, %k2 +; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} +; SKX-NEXT: vmovaps %zmm1, %zmm2 +; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0 +; SKX-NEXT: retq %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer @@ -93,10 +176,6 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) { ret <16 x i32> %res } -; KNL-LABEL: test5 -; KNL: kmovw %k1, %k2 -; KNL: vpscatterdd {{.*}}%k2 -; KNL: vpscatterdd {{.*}}%k1 ; SCALAR-LABEL: test5 ; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0 @@ -113,6 +192,30 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) { ; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; KNL_64-LABEL: test5: +; KNL_64: # BB#0: +; KNL_64-NEXT: kmovw %esi, %k1 +; KNL_64-NEXT: kmovw %k1, %k2 +; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} +; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test5: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; KNL_32-NEXT: kmovw %k1, %k2 +; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2} +; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test5: +; SKX: # BB#0: +; SKX-NEXT: kmovw %esi, %k1 +; SKX-NEXT: kmovw %k1, %k2 +; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} +; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} +; SKX-NEXT: retq 
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer @@ -127,11 +230,6 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> ) declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> ) -; KNL-LABEL: test6 -; KNL: kxnorw %k1, %k1, %k1 -; KNL: kxnorw %k2, %k2, %k2 -; KNL: vpgatherqd (,%zmm{{.*}}), %ymm{{.*}} {%k2} -; KNL: vpscatterqd %ymm{{.*}}, (,%zmm{{.*}}) {%k1} ; SCALAR-LABEL: test6 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4 @@ -143,6 +241,33 @@ declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) { +; KNL_64-LABEL: test6: +; KNL_64: # BB#0: +; KNL_64-NEXT: kxnorw %k1, %k1, %k1 +; KNL_64-NEXT: kxnorw %k2, %k2, %k2 +; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test6: +; KNL_32: # BB#0: +; KNL_32-NEXT: kxnorw %k1, %k1, %k1 +; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2 +; KNL_32-NEXT: kxnorw %k2, %k2, %k2 +; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2} +; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1} +; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test6: +; SKX: # BB#0: +; SKX-NEXT: kxnorw %k1, %k1, %k1 +; SKX-NEXT: kxnorw %k2, %k2, %k2 +; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; SKX-NEXT: vmovaps %zmm2, %zmm0 +; SKX-NEXT: retq %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> , <8 x i32> undef) @@ -150,13 +275,41 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) { ret <8 x i32>%a } -; In this case the index should be promoted to <8 x i64> for KNL -; KNL-LABEL: test7 -; KNL: vpmovsxdq %ymm0, %zmm0 -; KNL: kmovw %k1, %k2 -; KNL: vpgatherqd {{.*}} {%k2} -; KNL: vpgatherqd {{.*}} {%k1} define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) { +; +; KNL_64-LABEL: test7: +; KNL_64: # BB#0: +; KNL_64-NEXT: movzbl %sil, %eax +; KNL_64-NEXT: kmovw %eax, %k1 +; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 +; KNL_64-NEXT: kmovw %k1, %k2 +; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2} +; KNL_64-NEXT: vmovaps %zmm1, %zmm2 +; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1} +; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test7: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 +; KNL_32-NEXT: kmovw %k1, %k2 +; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2} +; KNL_32-NEXT: vmovaps %zmm1, %zmm2 +; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1} +; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test7: +; SKX: # BB#0: +; SKX-NEXT: kmovb %esi, %k1 +; SKX-NEXT: kmovw %k1, %k2 +; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2} +; SKX-NEXT: vmovaps %zmm1, %zmm2 +; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1} +; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; SKX-NEXT: retq %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer @@ -171,15 +324,57 @@ define <8 x i32> 
@test7(i32* %base, <8 x i32> %ind, i8 %mask) { ; No uniform base in this case, index <8 x i64> contains addresses, ; each gather call will be split into two -; KNL-LABEL: test8 -; KNL: kshiftrw $8, %k1, %k2 -; KNL: vpgatherqd -; KNL: vpgatherqd -; KNL: vinserti64x4 -; KNL: vpgatherqd -; KNL: vpgatherqd -; KNL: vinserti64x4 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) { +; KNL_64-LABEL: test8: +; KNL_64: # BB#0: +; KNL_64-NEXT: kmovw %edi, %k1 +; KNL_64-NEXT: kshiftrw $8, %k1, %k2 +; KNL_64-NEXT: kmovw %k2, %k3 +; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3} +; KNL_64-NEXT: kmovw %k1, %k3 +; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3} +; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4 +; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} +; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 +; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test8: +; KNL_32: # BB#0: +; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; KNL_32-NEXT: kmovw %k1, %k2 +; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2} +; KNL_32-NEXT: vmovaps %zmm1, %zmm2 +; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} +; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test8: +; SKX: # BB#0: +; SKX-NEXT: kmovw %edi, %k1 +; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: kmovw %k2, %k3 +; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3} +; SKX-NEXT: kmovw %k1, %k3 +; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3} +; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4 +; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} +; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0 +; SKX-NEXT: retq +; +; SKX_32-LABEL: test8: +; SKX_32: # BB#0: +; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; SKX_32-NEXT: kmovw %k1, %k2 +; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2} +; SKX_32-NEXT: vmovaps %zmm1, %zmm2 +; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} +; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 +; SKX_32-NEXT: retl + %imask = bitcast i16 %mask to <16 x i1> %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef) %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) @@ -193,18 +388,60 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) { ; Masked gather for agregate types ; Test9 and Test10 should give the same result (scalar and vector indices in GEP) -; KNL-LABEL: test9 -; KNL: vpbroadcastq %rdi, %zmm -; KNL: vpmovsxdq -; KNL: vpbroadcastq -; KNL: vpmuludq -; KNL: vpaddq -; KNL: vpaddq -; KNL: vpaddq -; KNL: vpaddq -; KNL: vpgatherqd (,%zmm define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) { +; KNL_64-LABEL: test9: +; KNL_64: # BB#0: # %entry +; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2 +; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 +; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 +; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 +; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1 +; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 +; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1 +; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1 +; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 +; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4 +; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 +; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 +; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 +; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; KNL_64-NEXT: vpaddq %zmm0, %zmm2, 
%zmm0 +; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; KNL_64-NEXT: kxnorw %k1, %k1, %k1 +; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test9: +; KNL_32: # BB#0: # %entry +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2 +; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3 +; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 +; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3 +; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 +; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1 +; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 +; KNL_32-NEXT: kxnorw %k1, %k1, %k1 +; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test9: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpbroadcastq %rdi, %zmm2 +; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 +; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; SKX-NEXT: kxnorw %k1, %k1, %k1 +; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} +; SKX-NEXT: retq entry: %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer @@ -214,17 +451,59 @@ entry: ret <8 x i32> %res } -; KNL-LABEL: test10 -; KNL: vpbroadcastq %rdi, %zmm -; KNL: vpmovsxdq -; KNL: vpbroadcastq -; KNL: vpmuludq -; KNL: vpaddq -; KNL: vpaddq -; KNL: vpaddq -; KNL: vpaddq -; KNL: vpgatherqd (,%zmm define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) { +; KNL_64-LABEL: test10: +; KNL_64: # BB#0: # %entry +; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2 +; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 +; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 +; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 +; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1 +; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 +; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1 +; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1 +; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 +; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4 +; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 +; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 +; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 +; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; KNL_64-NEXT: kxnorw %k1, %k1, %k1 +; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test10: +; KNL_32: # BB#0: # %entry +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2 +; KNL_32-NEXT: vpbroadcastd .LCPI9_0, %ymm3 +; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 +; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3 +; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 +; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1 +; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 +; KNL_32-NEXT: kxnorw %k1, %k1, %k1 +; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test10: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpbroadcastq %rdi, %zmm2 +; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, 
%zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 +; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; SKX-NEXT: kxnorw %k1, %k1, %k1 +; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} +; SKX-NEXT: retq entry: %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer @@ -235,10 +514,28 @@ entry: } ; Splat index in GEP, requires broadcast -; KNL-LABEL: test11 -; KNL: vpbroadcastd %esi, %zmm -; KNL: vgatherdps (%rdi,%zmm define <16 x float> @test11(float* %base, i32 %ind) { +; KNL_64-LABEL: test11: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpbroadcastd %esi, %zmm1 +; KNL_64-NEXT: kxnorw %k1, %k1, %k1 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test11: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1 +; KNL_32-NEXT: kxnorw %k1, %k1, %k1 +; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test11: +; SKX: # BB#0: +; SKX-NEXT: vpbroadcastd %esi, %zmm1 +; SKX-NEXT: kxnorw %k1, %k1, %k1 +; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} +; SKX-NEXT: retq %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer @@ -250,10 +547,28 @@ define <16 x float> @test11(float* %base, i32 %ind) { } ; We are checking the uniform base here. It is taken directly from input to vgatherdps -; KNL-LABEL: test12 -; KNL: kxnorw %k1, %k1, %k1 -; KNL: vgatherdps (%rdi,%zmm define <16 x float> @test12(float* %base, <16 x i32> %ind) { +; KNL_64-LABEL: test12: +; KNL_64: # BB#0: +; KNL_64-NEXT: kxnorw %k1, %k1, %k1 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test12: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: kxnorw %k1, %k1, %k1 +; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test12: +; SKX: # BB#0: +; SKX-NEXT: kxnorw %k1, %k1, %k1 +; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind @@ -263,10 +578,25 @@ define <16 x float> @test12(float* %base, <16 x i32> %ind) { } ; The same as the previous, but the mask is undefined -; KNL-LABEL: test13 -; KNL-NOT: kxnorw -; KNL: vgatherdps (%rdi,%zmm define <16 x float> @test13(float* %base, <16 x i32> %ind) { +; KNL_64-LABEL: test13: +; KNL_64: # BB#0: +; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test13: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test13: +; SKX: # BB#0: +; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind @@ -276,10 +606,58 @@ define 
<16 x float> @test13(float* %base, <16 x i32> %ind) { } ; The base pointer is not splat, can't find unform base -; KNL-LABEL: test14 -; KNL: vgatherqps (,%zmm0) -; KNL: vgatherqps (,%zmm0) define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) { +; KNL_64-LABEL: test14: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 +; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0 +; KNL_64-NEXT: vmovd %esi, %xmm1 +; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1 +; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 +; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1 +; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; KNL_64-NEXT: kshiftrw $8, %k0, %k1 +; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1} +; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1} +; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test14: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1 +; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0 +; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1 +; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test14: +; SKX: # BB#0: +; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 +; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0 +; SKX-NEXT: vpbroadcastq %xmm0, %zmm0 +; SKX-NEXT: vmovd %esi, %xmm1 +; SKX-NEXT: vpbroadcastd %xmm1, %ymm1 +; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 +; SKX-NEXT: vpsllq $2, %zmm1, %zmm1 +; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: kshiftrw $8, %k0, %k1 +; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1} +; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1} +; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0 +; SKX-NEXT: retq +; +; SKX_32-LABEL: test14: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1 +; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0 +; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1 +; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1} +; SKX_32-NEXT: retl %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer @@ -290,19 +668,585 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) { ret <16 x float>%res } +declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>) +declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>) -; KNL-LABEL: test15 -; KNL: kmovw %eax, %k1 -; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; Gather smaller than existing instruction +define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { +; +; KNL_64-LABEL: test15: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2 +; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0 +; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test15: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; KNL_32-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2 +; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0 +; KNL_32-NEXT: vpandq .LCPI14_0, %zmm0, %zmm0 +; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1 +; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test15: +; SKX: # BB#0: +; SKX-NEXT: vpmovd2m %xmm1, %k1 +; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq -; SCALAR-LABEL: test15 -; SCALAR: extractelement <16 x float*> + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind + %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float>%res +} + +; Gather smaller than existing instruction +define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) { +; +; KNL_64-LABEL: test16: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 +; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 +; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 +; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test16: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 +; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 +; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 +; KNL_32-NEXT: vpandq .LCPI15_0, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test16: +; SKX: # BB#0: +; SKX-NEXT: vpmovd2m %xmm1, %k1 +; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1} +; SKX-NEXT: vmovaps %zmm2, %zmm0 +; SKX-NEXT: retq + + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind + %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0) + ret <4 x double>%res +} + +define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) { +; +; KNL_64-LABEL: test17: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 +; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test17: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpandq .LCPI16_0, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test17: +; SKX: # BB#0: +; SKX-NEXT: vpmovq2m %xmm1, %k1 +; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vmovaps %zmm2, %zmm0 +; SKX-NEXT: retq + + %sext_ind = sext <2 x i32> %ind to <2 x i64> + %gep.random = getelementptr double, double* %base, 
<2 x i64> %sext_ind + %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0) + ret <2 x double>%res +} + +declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> ) +declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> ) +declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> ) +declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> ) +declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> ) + +define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { +; +; KNL_64-LABEL: test18: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2 +; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test18: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 +; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2 +; KNL_32-NEXT: vpandq .LCPI17_0, %zmm2, %zmm2 +; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test18: +; SKX: # BB#0: +; SKX-NEXT: vpmovd2m %xmm2, %k1 +; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} +; SKX-NEXT: retq + call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) + ret void +} + +define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) { +; +; KNL_64-LABEL: test19: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 +; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 +; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test19: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 +; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 +; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpandq .LCPI18_0, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test19: +; SKX: # BB#0: +; SKX-NEXT: vpmovd2m %xmm1, %k1 +; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1} +; SKX-NEXT: retq +; +; SKX_32-LABEL: test19: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpmovd2m %xmm1, %k1 +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1} +; SKX_32-NEXT: retl + %gep = getelementptr double, double* %ptr, <4 x i64> %ind + call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask) + ret void +} + +; Data type requires widening +define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { +; +; KNL_64-LABEL: test20: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL_64-NEXT: vmovq %xmm2, %xmm2 +; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm3[4,5,6,7] +; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2 +; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test20: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL_32-NEXT: vmovq %xmm2, %xmm2 +; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 +; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2 +; KNL_32-NEXT: vpandq .LCPI19_0, %zmm2, %zmm2 +; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test20: +; SKX: # BB#0: +; SKX-NEXT: vpmovq2m %xmm2, %k0 +; SKX-NEXT: kshiftlw $2, %k0, %k0 +; SKX-NEXT: kshiftrw $2, %k0, %k1 +; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1} +; SKX-NEXT: retq + call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask) + ret void +} + +; Data type requires promotion +define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { +; +; KNL_64-LABEL: test21: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 +; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test21: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 +; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: vpandq .LCPI20_0, %zmm2, %zmm2 +; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test21: +; SKX: # BB#0: +; SKX-NEXT: vpmovq2m %xmm2, %k0 +; SKX-NEXT: kshiftlw $2, %k0, %k0 +; SKX-NEXT: kshiftrw $2, %k0, %k1 +; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} +; SKX-NEXT: retq + call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) + ret void +} + +; The result type requires widening +declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>) + +define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) { +; +; +; KNL_64-LABEL: test22: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_64-NEXT: vmovq %xmm1, %xmm1 +; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 +; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 +; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} +; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test22: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL_32-NEXT: vmovq %xmm1, %xmm1 +; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 +; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 +; KNL_32-NEXT: 
vpandq .LCPI21_0, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1} +; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test22: +; SKX: # BB#0: +; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX-NEXT: vpmovq2m %xmm1, %k0 +; SKX-NEXT: kshiftlw $2, %k0, %k0 +; SKX-NEXT: kshiftrw $2, %k0, %k1 +; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vmovaps %zmm2, %zmm0 +; SKX-NEXT: retq + %sext_ind = sext <2 x i32> %ind to <2 x i64> + %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind + %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0) + ret <2 x float>%res +} + +declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) +declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>) + +define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) { +; +; KNL_64-LABEL: test23: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 +; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test23: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpandq .LCPI22_0, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test23: +; SKX: # BB#0: +; SKX-NEXT: vpmovq2m %xmm1, %k1 +; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vmovaps %zmm2, %zmm0 +; SKX-NEXT: retq + %sext_ind = sext <2 x i32> %ind to <2 x i64> + %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind + %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) + ret <2 x i32>%res +} + +define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { +; +; +; KNL_64-LABEL: test24: +; KNL_64: # BB#0: +; KNL_64-NEXT: movb $3, %al +; KNL_64-NEXT: movzbl %al, %eax +; KNL_64-NEXT: kmovw %eax, %k1 +; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test24: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1 +; KNL_32-NEXT: vpandq .LCPI23_1, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test24: +; SKX: # BB#0: +; SKX-NEXT: kxnorw %k1, %k1, %k1 +; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq + %sext_ind = sext <2 x i32> %ind to <2 x i64> + %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind + %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> , <2 x i32> undef) + ret <2 x i32>%res +} + +define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) { +; +; KNL_64-LABEL: test25: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 +; 
KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test25: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpandq .LCPI24_0, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test25: +; SKX: # BB#0: +; SKX-NEXT: vpmovq2m %xmm1, %k1 +; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vmovaps %zmm2, %zmm0 +; SKX-NEXT: retq + %sext_ind = sext <2 x i32> %ind to <2 x i64> + %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind + %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0) + ret <2 x i64>%res +} + +define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { +; +; KNL_64-LABEL: test26: +; KNL_64: # BB#0: +; KNL_64-NEXT: movb $3, %al +; KNL_64-NEXT: movzbl %al, %eax +; KNL_64-NEXT: kmovw %eax, %k1 +; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test26: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2 +; KNL_32-NEXT: vpandq .LCPI25_1, %zmm2, %zmm2 +; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test26: +; SKX: # BB#0: +; SKX-NEXT: kxnorw %k1, %k1, %k1 +; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq + %sext_ind = sext <2 x i32> %ind to <2 x i64> + %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind + %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> , <2 x i64> %src0) + ret <2 x i64>%res +} + +; Result type requires widening; all-ones mask +define <2 x float> @test27(float* %base, <2 x i32> %ind) { +; +; KNL_64-LABEL: test27: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1 +; KNL_64-NEXT: movb $3, %al +; KNL_64-NEXT: movzbl %al, %eax +; KNL_64-NEXT: kmovw %eax, %k1 +; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test27: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 +; KNL_32-NEXT: movb $3, %cl +; KNL_32-NEXT: movzbl %cl, %ecx +; KNL_32-NEXT: kmovw %ecx, %k1 +; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test27: +; SKX: # BB#0: +; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SKX-NEXT: movb $3, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1} +; SKX-NEXT: retq + %sext_ind = sext <2 x i32> %ind to <2 x i64> + %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind + %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> , <2 x float> undef) + ret <2 x float>%res +} + +; Data type requires promotion, mask is all-ones +define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { +; +; +; KNL_64-LABEL: 
test28: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_64-NEXT: movb $3, %al +; KNL_64-NEXT: movzbl %al, %eax +; KNL_64-NEXT: kmovw %eax, %k1 +; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test28: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2 +; KNL_32-NEXT: vpandq .LCPI27_1, %zmm2, %zmm2 +; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 +; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test28: +; SKX: # BB#0: +; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX-NEXT: movb $3, %al +; SKX-NEXT: kmovb %eax, %k1 +; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} +; SKX-NEXT: retq + call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> ) + ret void +} + + +; SCALAR-LABEL: test29 +; SCALAR: extractelement <16 x float*> ; SCALAR-NEXT: load float ; SCALAR-NEXT: insertelement <16 x float> ; SCALAR-NEXT: extractelement <16 x float*> ; SCALAR-NEXT: load float -define <16 x float> @test15(float* %base, <16 x i32> %ind) { +define <16 x float> @test29(float* %base, <16 x i32> %ind) { +; KNL_64-LABEL: test29: +; KNL_64: # BB#0: +; KNL_64-NEXT: movw $44, %ax +; KNL_64-NEXT: kmovw %eax, %k1 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test29: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: movw $44, %cx +; KNL_32-NEXT: kmovw %ecx, %k1 +; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test29: +; SKX: # BB#0: +; SKX-NEXT: movw $44, %ax +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: retq %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer @@ -316,14 +1260,136 @@ define <16 x float> @test15(float* %base, <16 x i32> %ind) { ; Check non-power-of-2 case. It should be scalarized. 
declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>) -; KNL-LABEL: test16 -; KNL: testb -; KNL: je -; KNL: testb -; KNL: je -; KNL: testb -; KNL: je -define <3 x i32> @test16(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { +define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { +; KNL_64-LABEL: test30: +; KNL_64: # BB#0: +; KNL_64-NEXT: andl $1, %edx +; KNL_64-NEXT: kmovw %edx, %k1 +; KNL_64-NEXT: andl $1, %esi +; KNL_64-NEXT: kmovw %esi, %k2 +; KNL_64-NEXT: movl %edi, %eax +; KNL_64-NEXT: andl $1, %eax +; KNL_64-NEXT: kmovw %eax, %k0 +; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 +; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1 +; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; KNL_64-NEXT: # implicit-def: %XMM0 +; KNL_64-NEXT: testb $1, %dil +; KNL_64-NEXT: je .LBB29_2 +; KNL_64-NEXT: # BB#1: # %cond.load +; KNL_64-NEXT: vmovq %xmm1, %rax +; KNL_64-NEXT: vmovd (%rax), %xmm0 +; KNL_64-NEXT: .LBB29_2: # %else +; KNL_64-NEXT: kmovw %k2, %eax +; KNL_64-NEXT: movl %eax, %ecx +; KNL_64-NEXT: andl $1, %ecx +; KNL_64-NEXT: testb %cl, %cl +; KNL_64-NEXT: je .LBB29_4 +; KNL_64-NEXT: # BB#3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx +; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 +; KNL_64-NEXT: .LBB29_4: # %else2 +; KNL_64-NEXT: kmovw %k1, %ecx +; KNL_64-NEXT: movl %ecx, %edx +; KNL_64-NEXT: andl $1, %edx +; KNL_64-NEXT: testb %dl, %dl +; KNL_64-NEXT: je .LBB29_6 +; KNL_64-NEXT: # BB#5: # %cond.load4 +; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL_64-NEXT: vmovq %xmm1, %rdx +; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0 +; KNL_64-NEXT: .LBB29_6: # %else5 +; KNL_64-NEXT: kmovw %k0, %edx +; KNL_64-NEXT: vmovd %edx, %xmm1 +; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test30: +; KNL_32: # BB#0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: andl $1, %eax +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: andl $1, %eax +; KNL_32-NEXT: kmovw %eax, %k2 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: movl %eax, %ecx +; KNL_32-NEXT: andl $1, %ecx +; KNL_32-NEXT: kmovw %ecx, %k0 +; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 +; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; KNL_32-NEXT: # implicit-def: %XMM0 +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: je .LBB29_2 +; KNL_32-NEXT: # BB#1: # %cond.load +; KNL_32-NEXT: vmovd %xmm1, %eax +; KNL_32-NEXT: vmovd (%eax), %xmm0 +; KNL_32-NEXT: .LBB29_2: # %else +; KNL_32-NEXT: kmovw %k2, %eax +; KNL_32-NEXT: movl %eax, %ecx +; KNL_32-NEXT: andl $1, %ecx +; KNL_32-NEXT: testb %cl, %cl +; KNL_32-NEXT: je .LBB29_4 +; KNL_32-NEXT: # BB#3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx +; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0 +; KNL_32-NEXT: .LBB29_4: # %else2 +; KNL_32-NEXT: kmovw %k1, %ecx +; KNL_32-NEXT: movl %ecx, %edx +; KNL_32-NEXT: andl $1, %edx +; KNL_32-NEXT: testb %dl, %dl +; KNL_32-NEXT: je .LBB29_6 +; KNL_32-NEXT: # BB#5: # %cond.load4 +; KNL_32-NEXT: vpextrd $2, %xmm1, %edx +; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0 +; KNL_32-NEXT: .LBB29_6: # %else5 +; KNL_32-NEXT: kmovw %k0, %edx +; KNL_32-NEXT: vmovd %edx, %xmm1 +; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; KNL_32-NEXT: 
retl +; +; SKX-LABEL: test30: +; SKX: # BB#0: +; SKX-NEXT: vpmovd2m %xmm2, %k1 +; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) +; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 +; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 +; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SKX-NEXT: # implicit-def: %XMM0 +; SKX-NEXT: andb $1, %al +; SKX-NEXT: je .LBB29_2 +; SKX-NEXT: # BB#1: # %cond.load +; SKX-NEXT: vmovq %xmm1, %rax +; SKX-NEXT: vmovd (%rax), %xmm0 +; SKX-NEXT: .LBB29_2: # %else +; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) +; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SKX-NEXT: andb $1, %al +; SKX-NEXT: je .LBB29_4 +; SKX-NEXT: # BB#3: # %cond.load1 +; SKX-NEXT: vpextrq $1, %xmm1, %rax +; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0 +; SKX-NEXT: .LBB29_4: # %else2 +; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) +; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SKX-NEXT: andb $1, %al +; SKX-NEXT: je .LBB29_6 +; SKX-NEXT: # BB#5: # %cond.load4 +; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1 +; SKX-NEXT: vmovq %xmm1, %rax +; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0 +; SKX-NEXT: .LBB29_6: # %else5 +; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1} +; SKX-NEXT: vmovaps %zmm3, %zmm0 +; SKX-NEXT: retq + %sext_ind = sext <3 x i32> %ind to <3 x i64> %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0) @@ -332,11 +1398,405 @@ define <3 x i32> @test16(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>) -; KNL-LABEL: test17 +; KNL-LABEL: test31 ; KNL: vpgatherqq ; KNL: vpgatherqq -define <16 x float*> @test17(<16 x float**> %ptrs) { +define <16 x float*> @test31(<16 x float**> %ptrs) { +; KNL_64-LABEL: test31: +; KNL_64: # BB#0: +; KNL_64-NEXT: kxnorw %k1, %k1, %k1 +; KNL_64-NEXT: kxnorw %k2, %k2, %k2 +; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2} +; KNL_64-NEXT: kshiftrw $8, %k1, %k1 +; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1} +; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: vmovaps %zmm3, %zmm1 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test31: +; KNL_32: # BB#0: +; KNL_32-NEXT: kxnorw %k1, %k1, %k1 +; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} +; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test31: +; SKX: # BB#0: +; SKX-NEXT: kxnorw %k1, %k1, %k1 +; SKX-NEXT: kxnorw %k2, %k2, %k2 +; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2} +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1} +; SKX-NEXT: vmovaps %zmm2, %zmm0 +; SKX-NEXT: vmovaps %zmm3, %zmm1 +; SKX-NEXT: retq +; +; SKX_32-LABEL: test31: +; SKX_32: # BB#0: +; SKX_32-NEXT: kxnorw %k1, %k1, %k1 +; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} +; SKX_32-NEXT: vmovaps %zmm1, %zmm0 +; SKX_32-NEXT: retl %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> , <16 x float*> undef) ret <16 x float*>%res } + +define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) { +; KNL_64-LABEL: test_gather_16i32: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; KNL_64-NEXT: kshiftrw $8, %k1, %k2 +; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} +; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 +; KNL_64-NEXT: retq +; +; 
KNL_32-LABEL: test_gather_16i32: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} +; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test_gather_16i32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 +; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 +; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2 +; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} +; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 +; SKX-NEXT: retq +; +; SKX_32-LABEL: test_gather_16i32: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; SKX_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1 +; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} +; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: retl + %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0) + ret <16 x i32> %res +} +define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) { +; KNL_64-LABEL: test_gather_16i64: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL_64-NEXT: kshiftrw $8, %k1, %k2 +; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} +; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2} +; KNL_64-NEXT: vmovaps %zmm3, %zmm0 +; KNL_64-NEXT: vmovaps %zmm4, %zmm1 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test_gather_16i64: +; KNL_32: # BB#0: +; KNL_32-NEXT: pushl %ebp +; KNL_32-NEXT: .Ltmp0: +; KNL_32-NEXT: .cfi_def_cfa_offset 8 +; KNL_32-NEXT: .Ltmp1: +; KNL_32-NEXT: .cfi_offset %ebp, -8 +; KNL_32-NEXT: movl %esp, %ebp +; KNL_32-NEXT: .Ltmp2: +; KNL_32-NEXT: .cfi_def_cfa_register %ebp +; KNL_32-NEXT: andl $-64, %esp +; KNL_32-NEXT: subl $64, %esp +; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL_32-NEXT: vpandd .LCPI32_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1 +; KNL_32-NEXT: kshiftrw $8, %k1, %k2 +; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1} +; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2} +; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: movl %ebp, %esp +; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: retl +; +; SKX-LABEL: test_gather_16i64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 +; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 +; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} +; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2} +; SKX-NEXT: vmovaps %zmm3, %zmm0 +; SKX-NEXT: vmovaps %zmm4, %zmm1 +; SKX-NEXT: retq + %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) + ret <16 x i64> %res +} +declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) +define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) { +; KNL_64-LABEL: test_gather_16f32: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2 +; KNL_64-NEXT: kshiftrw $8, %k1, %k2 +; KNL_64-NEXT: 
vgatherqps (,%zmm1), %ymm2 {%k2} +; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1} +; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test_gather_16f32: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL_32-NEXT: vpandd .LCPI33_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1} +; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: retl +; +; SKX-LABEL: test_gather_16f32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 +; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 +; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2 +; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2} +; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1} +; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0 +; SKX-NEXT: retq + %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0) + ret <16 x float> %res +} +define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) { +; KNL_64-LABEL: test_gather_16f64: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL_64-NEXT: kshiftrw $8, %k1, %k2 +; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1} +; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2} +; KNL_64-NEXT: vmovaps %zmm3, %zmm0 +; KNL_64-NEXT: vmovaps %zmm4, %zmm1 +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test_gather_16f64: +; KNL_32: # BB#0: +; KNL_32-NEXT: pushl %ebp +; KNL_32-NEXT: .Ltmp3: +; KNL_32-NEXT: .cfi_def_cfa_offset 8 +; KNL_32-NEXT: .Ltmp4: +; KNL_32-NEXT: .cfi_offset %ebp, -8 +; KNL_32-NEXT: movl %esp, %ebp +; KNL_32-NEXT: .Ltmp5: +; KNL_32-NEXT: .cfi_def_cfa_register %ebp +; KNL_32-NEXT: andl $-64, %esp +; KNL_32-NEXT: subl $64, %esp +; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL_32-NEXT: vpandd .LCPI34_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1 +; KNL_32-NEXT: kshiftrw $8, %k1, %k2 +; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1} +; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2} +; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: movl %ebp, %esp +; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: retl +; +; SKX-LABEL: test_gather_16f64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 +; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 +; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1} +; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2} +; SKX-NEXT: vmovaps %zmm3, %zmm0 +; SKX-NEXT: vmovaps %zmm4, %zmm1 +; SKX-NEXT: retq + %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) + ret <16 x double> %res +} +declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) +define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) { +; KNL_64-LABEL: test_scatter_16i32: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL_64-NEXT: kshiftrw $8, %k1, %k2 +; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} +; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0 +; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2} +; KNL_64-NEXT: retq +; +; 
KNL_32-LABEL: test_scatter_16i32: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} +; KNL_32-NEXT: retl +; +; SKX-LABEL: test_scatter_16i32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 +; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 +; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} +; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0 +; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2} +; SKX-NEXT: retq +; +; SKX_32-LABEL: test_scatter_16i32: +; SKX_32: # BB#0: +; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; SKX_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1 +; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} +; SKX_32-NEXT: retl + call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask) + ret void +} +define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) { +; KNL_64-LABEL: test_scatter_16i64: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL_64-NEXT: kshiftrw $8, %k1, %k2 +; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} +; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test_scatter_16i64: +; KNL_32: # BB#0: +; KNL_32-NEXT: pushl %ebp +; KNL_32-NEXT: .Ltmp6: +; KNL_32-NEXT: .cfi_def_cfa_offset 8 +; KNL_32-NEXT: .Ltmp7: +; KNL_32-NEXT: .cfi_offset %ebp, -8 +; KNL_32-NEXT: movl %esp, %ebp +; KNL_32-NEXT: .Ltmp8: +; KNL_32-NEXT: .cfi_def_cfa_register %ebp +; KNL_32-NEXT: andl $-64, %esp +; KNL_32-NEXT: subl $64, %esp +; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL_32-NEXT: vpandd .LCPI36_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1 +; KNL_32-NEXT: kshiftrw $8, %k1, %k2 +; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1} +; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} +; KNL_32-NEXT: movl %ebp, %esp +; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: retl +; +; SKX-LABEL: test_scatter_16i64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 +; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 +; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} +; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2} +; SKX-NEXT: retq + call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask) + ret void +} +declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask) +define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) { +; KNL_64-LABEL: test_scatter_16f32: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL_64-NEXT: kshiftrw $8, %k1, %k2 +; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} +; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0 +; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test_scatter_16f32: +; KNL_32: # BB#0: +; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL_32-NEXT: vpandd .LCPI37_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} 
+; KNL_32-NEXT: retl +; +; SKX-LABEL: test_scatter_16f32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 +; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 +; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} +; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0 +; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} +; SKX-NEXT: retq + call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask) + ret void +} +declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask) +define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) { +; KNL_64-LABEL: test_scatter_16f64: +; KNL_64: # BB#0: +; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 +; KNL_64-NEXT: kshiftrw $8, %k1, %k2 +; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} +; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: test_scatter_16f64: +; KNL_32: # BB#0: +; KNL_32-NEXT: pushl %ebp +; KNL_32-NEXT: .Ltmp9: +; KNL_32-NEXT: .cfi_def_cfa_offset 8 +; KNL_32-NEXT: .Ltmp10: +; KNL_32-NEXT: .cfi_offset %ebp, -8 +; KNL_32-NEXT: movl %esp, %ebp +; KNL_32-NEXT: .Ltmp11: +; KNL_32-NEXT: .cfi_def_cfa_register %ebp +; KNL_32-NEXT: andl $-64, %esp +; KNL_32-NEXT: subl $64, %esp +; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL_32-NEXT: vpandd .LCPI38_0{1to16}, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1 +; KNL_32-NEXT: kshiftrw $8, %k1, %k2 +; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1} +; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} +; KNL_32-NEXT: movl %ebp, %esp +; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: retl +; +; SKX-LABEL: test_scatter_16f64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 +; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 +; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} +; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} +; SKX-NEXT: retq + call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask) + ret void +} +declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask) diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll index 1a9cf008e869..0662d1b22eda 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_memop.ll @@ -1,7 +1,7 @@ -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2 -; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s -check-prefix=SKX +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s --check-prefix=AVX512 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX2 +; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s --check-prefix=AVX_SCALAR +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s --check-prefix=SKX ; AVX512-LABEL: test1 ; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z} @@ -274,6 
+274,15 @@ define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { ; AVX2-NOT: blend ; AVX2: ret define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) { +; SKX-LABEL: test18: +; SKX: ## BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; SKX-NEXT: kshiftlw $2, %k0, %k0 +; SKX-NEXT: kshiftrw $2, %k0, %k1 +; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} +; SKX-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef) ret <2 x float> %res @@ -363,3 +372,77 @@ define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) { %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer) ret <16 x %mystruct*> %res } + +define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) { +; SKX-LABEL: test_store_16i64: +; SKX: ## BB#0: +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1} +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1} +; SKX-NEXT: retq + call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask) + ret void +} +declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask) +define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) { +; SKX-LABEL: test_store_16f64: +; SKX: ## BB#0: +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vmovupd %zmm1, (%rdi) {%k1} +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1} +; SKX-NEXT: retq + call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask) + ret void +} +declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask) +define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) { +; SKX-LABEL: test_load_16i64: +; SKX: ## BB#0: +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: vmovaps %zmm2, %zmm1 +; SKX-NEXT: retq + %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) + ret <16 x i64> %res +} +declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) +define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) { +; SKX-LABEL: test_load_16f64: +; SKX: ## BB#0: +; SKX-NEXT: vpmovb2m %xmm0, %k1 +; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: vmovaps %zmm2, %zmm1 +; SKX-NEXT: retq + %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) + ret <16 x double> %res +} +declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) + +define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) { +; SKX-LABEL: test_load_32f64: +; SKX: ## BB#0: +; SKX-NEXT: vpmovb2m %ymm0, %k1 +; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1} +; SKX-NEXT: kshiftrd $16, %k1, 
%k2 +; SKX-NEXT: vmovupd 128(%rdi), %zmm3 {%k2} +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} +; SKX-NEXT: kshiftrw $8, %k2, %k1 +; SKX-NEXT: vmovupd 192(%rdi), %zmm4 {%k1} +; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: vmovaps %zmm2, %zmm1 +; SKX-NEXT: vmovaps %zmm3, %zmm2 +; SKX-NEXT: vmovaps %zmm4, %zmm3 +; SKX-NEXT: retq + %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) + ret <32 x double> %res +} +declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
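
For reference, a minimal IR sketch that exercises the narrow-vector path covered by test20/test22 above; it reuses the v2f32 gather/scatter declarations from masked_gather_scatter.ll, while the function name and values below are illustrative and not part of the patch.

declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
declare void @llvm.masked.scatter.v2f32(<2 x float>, <2 x float*>, i32, <2 x i1>)

; Illustrative only: a <2 x float> gather feeding a <2 x float> scatter. Both calls
; take the "data type requires widening" path shown in test20/test22 above, where the
; KNL checks rebuild the <2 x i1> mask via vpmovsxdq/vpandq/vptestmq into %k1 and
; select vgatherqps/vscatterqps.
define void @copy_v2f32(<2 x float*> %src, <2 x float*> %dst, <2 x i1> %mask) {
  %v = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %src, i32 4, <2 x i1> %mask, <2 x float> undef)
  call void @llvm.masked.scatter.v2f32(<2 x float> %v, <2 x float*> %dst, i32 4, <2 x i1> %mask)
  ret void
}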