From 1ada137854be55265f1b8128e1a19c57c48582f9 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 20 Aug 2019 06:58:00 +0000
Subject: [PATCH] [X86] Add back the -x86-experimental-vector-widening-legalization command line flag and all associated code, but leave it enabled by default

Google is reporting performance issues with the new default behavior and has
asked for a way to switch back to the old behavior while we investigate and
make fixes.

I've restored all of the code that had since been removed and added
additional checks of the command line flag to code paths that are not
otherwise guarded by a check of getTypeAction.

I've also modified the cost model tables to hopefully get us back to the
previous costs.

Hopefully we won't need to support this for very long, since we have no test
coverage of the old behavior and so could very easily break it.

llvm-svn: 369332
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 1309 +++++++++++++++--
 .../lib/Target/X86/X86TargetTransformInfo.cpp |   65 +-
 2 files changed, 1229 insertions(+), 145 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a519c200e49b..9be4350ef98e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -65,6 +65,12 @@ using namespace llvm;
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 
+cl::opt<bool> ExperimentalVectorWideningLegalization(
+    "x86-experimental-vector-widening-legalization", cl::init(true),
+    cl::desc("Enable an experimental vector type legalization through widening "
+             "rather than promotion."),
+    cl::Hidden);
+
 static cl::opt ExperimentalPrefLoopAlignment(
     "x86-experimental-pref-loop-alignment", cl::init(4),
     cl::desc("Sets the preferable loop alignment for experiments "
@@ -816,6 +822,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::UREM, VT, Custom);
     }
 
+    if (!ExperimentalVectorWideningLegalization) {
+      setOperationAction(ISD::MUL, MVT::v2i16, Custom);
+      setOperationAction(ISD::MUL, MVT::v2i32, Custom);
+      setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+    }
+
     setOperationAction(ISD::MUL, MVT::v2i8, Custom);
     setOperationAction(ISD::MUL, MVT::v4i8, Custom);
     setOperationAction(ISD::MUL, MVT::v8i8, Custom);
@@ -854,10 +866,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
     setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
 
+    if (!ExperimentalVectorWideningLegalization) {
+      // Use widening instead of promotion.
+      for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
+                       MVT::v4i16, MVT::v2i16 }) {
+        setOperationAction(ISD::UADDSAT, VT, Custom);
+        setOperationAction(ISD::SADDSAT, VT, Custom);
+        setOperationAction(ISD::USUBSAT, VT, Custom);
+        setOperationAction(ISD::SSUBSAT, VT, Custom);
+      }
+    }
+
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 
+    // Provide custom widening for v2f32 setcc. This is really for VLX when
+    // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
+    // type legalization changing the result type to v4i1 during widening.
+    // It works fine for SSE2 and is probably faster so no need to qualify with
+    // VLX support.
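// Illustrative sketch, not part of the patch: the comment above relies on the
// fact that a plain SSE compare of two floats kept in the low lanes of an XMM
// register already produces a full-width integer lane mask, so no VLX-specific
// handling is required. The helper name below is invented for the example.
#include <emmintrin.h>
static inline __m128i sketch_cmp_v2f32(__m128 a, __m128 b) {
  // Lanes 0 and 1 carry the v2f32 data; lanes 2 and 3 are don't-care.
  return _mm_castps_si128(_mm_cmplt_ps(a, b)); // 0xFFFFFFFF in each true lane
}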
+ if (!ExperimentalVectorWideningLegalization) + setOperationAction(ISD::SETCC, MVT::v2i32, Custom); + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); @@ -877,6 +908,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } + // We support custom legalizing of sext and anyext loads for specific + // memory vector types which we can load as a scalar (or sequence of + // scalars) and extend in-register to a legal 128-bit vector type. For sext + // loads these must work with a single scalar load. + if (!ExperimentalVectorWideningLegalization) { + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); + } + } + for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); @@ -950,14 +996,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + if (ExperimentalVectorWideningLegalization) { + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); + } else { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); + } // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. @@ -1024,10 +1074,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } + if (!ExperimentalVectorWideningLegalization) { + // Avoid narrow result types when widening. The legal types are listed + // in the next loop. 
+ for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); + } + } + // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); + if (!ExperimentalVectorWideningLegalization) + setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); @@ -1376,10 +1438,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - // Need to custom widen this if we don't have AVX512BW. - setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); + if (ExperimentalVectorWideningLegalization) { + // Need to custom widen this if we don't have AVX512BW. + setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); + } for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); @@ -1883,7 +1947,8 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; - if (VT.getVectorNumElements() != 1 && + if (ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; @@ -18776,7 +18841,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { - if (VT != MVT::v8i64) + if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), @@ -19110,6 +19175,22 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(In, DL); + if (!ExperimentalVectorWideningLegalization) { + // Without vector widening we need to manually construct X86 specific + // nodes and an unpcklqdq. + Lo = DAG.getNode(X86ISD::VTRUNC, DL, VT, Lo); + Hi = DAG.getNode(X86ISD::VTRUNC, DL, VT, Hi); + + // Manually concat the truncates using a shuffle. + unsigned NumElts = VT.getVectorNumElements(); + SmallVector ShufMask(NumElts); + for (unsigned i = 0; i != NumElts / 2; ++i) + ShufMask[i] = i; + for (unsigned i = NumElts / 2; i != NumElts; ++i) + ShufMask[i] = i + (NumElts / 2); + return DAG.getVectorShuffle(VT, DL, Lo, Hi, ShufMask); + } + EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); @@ -21322,7 +21403,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, // Custom legalize v8i8->v8i64 on CPUs without avx512bw. 
if (InVT == MVT::v8i8) { - if (VT != MVT::v8i64) + if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), @@ -21469,13 +21550,14 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); - assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) == - TargetLowering::TypeWidenVector && "Unexpected type action!"); + if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != + TargetLowering::TypeWidenVector) + return SDValue(); - EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT); + MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), + StoreVT.getVectorNumElements() * 2); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); @@ -21515,10 +21597,11 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); + EVT MemVT = Ld->getMemoryVT(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. if (RegVT.getVectorElementType() == MVT::i1) { - assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load"); + assert(EVT(RegVT) == MemVT && "Expected non-extending load"); assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); @@ -21537,7 +21620,179 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); } - return SDValue(); + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + // Nothing useful we can do without SSE2 shuffles. + assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); + + ISD::LoadExtType Ext = Ld->getExtensionType(); + + assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) + && "Only anyext and sext are currently implemented."); + assert(MemVT != RegVT && "Cannot extend to the same type"); + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + + if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { + // The only way in which we have a legal 256-bit vector result but not the + // integer 256-bit operations needed to directly lower a sextload is if we + // have AVX1 but not AVX2. In that case, we can always emit a sextload to + // a 128-bit vector and a normal sign_extend to 256-bits that should get + // correctly legalized. We do this late to allow the canonical form of + // sextload to persist throughout the rest of the DAG combiner -- it wants + // to fold together any extensions it can, and so will fuse a sign_extend + // of an sextload into a sextload targeting a wider value. + SDValue Load; + if (MemSz == 128) { + // Just switch this to a normal load. 
+ assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " + "it must be a legal 128-bit vector " + "type!"); + Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + } else { + assert(MemSz < 128 && + "Can't extend a type wider than 128 bits to a 256 bit vector!"); + // Do an sext load to a 128-bit vector type. We want to use the same + // number of elements, but elements half as wide. This will end up being + // recursively lowered by this routine, but will succeed as we definitely + // have all the necessary features if we're using AVX1. + EVT HalfEltVT = + EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); + EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); + Load = + DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), MemVT, Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + } + + // Replace chain users with the new chain. + assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); + + // Finally, do a normal sign-extend to the desired register. + SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT); + return DAG.getMergeValues({SExt, Load.getValue(1)}, dl); + } + + // All sizes must be a power of two. + assert(isPowerOf2_32(RegSz * MemSz * NumElems) && + "Non-power-of-two elements are not custom lowered!"); + + // Attempt to load the original value using scalar loads. + // Find the largest scalar type that divides the total loaded size. + MVT SclrLoadTy = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { + SclrLoadTy = Tp; + } + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && + (64 <= MemSz)) + SclrLoadTy = MVT::f64; + + // Calculate the number of scalar loads that we need to perform + // in order to load our vector from memory. + unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + + assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && + "Can only lower sext loads with a single scalar load!"); + + unsigned loadRegSize = RegSz; + if (Ext == ISD::SEXTLOAD && RegSz >= 256) + loadRegSize = 128; + + // If we don't have BWI we won't be able to create the shuffle needed for + // v8i8->v8i64. + if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && + MemVT == MVT::v8i8) + loadRegSize = 128; + + // Represent our vector as a sequence of elements which are the + // largest scalar that we can load. + EVT LoadUnitVecVT = EVT::getVectorVT( + *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits()); + + // Represent the data using the same element type that is stored in + // memory. In practice, we ''widen'' MemVT. + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + loadRegSize / MemVT.getScalarSizeInBits()); + + assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && + "Invalid vector type"); + + // We can't shuffle using an illegal type. 
+ assert(TLI.isTypeLegal(WideVecVT) && + "We only lower types that form legal widened vector types"); + + SmallVector Chains; + SDValue Ptr = Ld->getBasePtr(); + unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8; + SDValue Increment = DAG.getConstant(OffsetInc, dl, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue Res = DAG.getUNDEF(LoadUnitVecVT); + + unsigned Offset = 0; + for (unsigned i = 0; i < NumLoads; ++i) { + unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset); + + // Perform a single load. + SDValue ScalarLoad = + DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo().getWithOffset(Offset), + NewAlign, Ld->getMemOperand()->getFlags()); + Chains.push_back(ScalarLoad.getValue(1)); + // Create the first element type using SCALAR_TO_VECTOR in order to avoid + // another round of DAGCombining. + if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i, dl)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + Offset += OffsetInc; + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + + // Bitcast the loaded value to a vector of the original element type, in + // the size of the target vector type. + SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); + unsigned SizeRatio = RegSz / MemSz; + + if (Ext == ISD::SEXTLOAD) { + SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG); + return DAG.getMergeValues({Sext, TF}, dl); + } + + if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && + MemVT == MVT::v8i8) { + SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG); + return DAG.getMergeValues({Sext, TF}, dl); + } + + // Redistribute the loaded elements into the different locations. + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), ShuffleVec); + + // Bitcast to the requested type. + Shuff = DAG.getBitcast(RegVT, Shuff); + return DAG.getMergeValues({Shuff, TF}, dl); } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes @@ -26910,13 +27165,12 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Chain = N->getChain(); SDValue BasePtr = N->getBasePtr(); - if (VT == MVT::v2f32 || VT == MVT::v2i32) { + if (VT == MVT::v2f32) { assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); // If the index is v2i64 and we have VLX we can use xmm for data and index. 
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT)); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getUNDEF(MVT::v2f32)); SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( @@ -26926,6 +27180,30 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } + if (VT == MVT::v2i32) { + assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, + DAG.getUNDEF(MVT::v2i32)); + // If the index is v2i64 and we have VLX we can use xmm for data and index. + if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { + SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; + SDValue NewScatter = DAG.getTargetMemSDNode( + VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); + return SDValue(NewScatter.getNode(), 1); + } + // Custom widen all the operands to avoid promotion. + EVT NewIndexVT = EVT::getVectorVT( + *DAG.getContext(), Index.getValueType().getVectorElementType(), 4); + Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, + DAG.getUNDEF(Index.getValueType())); + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getConstant(0, dl, MVT::v2i1)); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl, + Ops, N->getMemOperand(), N->getIndexType()); + } + MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); @@ -27348,22 +27626,37 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::MUL: { EVT VT = N->getValueType(0); - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - VT.getVectorElementType() == MVT::i8 && "Unexpected VT!"); - // Pre-promote these to vXi16 to avoid op legalization thinking all 16 - // elements are needed. - MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); - SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); - SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); - Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); - unsigned NumConcats = 16 / VT.getVectorNumElements(); - SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); - ConcatOps[0] = Res; - Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); - Results.push_back(Res); + assert(VT.isVector() && "Unexpected VT"); + if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger && + VT.getVectorNumElements() == 2) { + // Promote to a pattern that will be turned into PMULUDQ. + SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, + N->getOperand(0)); + SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, + N->getOperand(1)); + SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1); + Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul)); + } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + VT.getVectorElementType() == MVT::i8) { + // Pre-promote these to vXi16 to avoid op legalization thinking all 16 + // elements are needed. 
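// Illustrative sketch, not part of the patch: the pre-promotion described
// above amounts to doing the narrow i8 multiplies in i16 lanes and then
// truncating, roughly as below (the helper name and the fixed v8i8 shape are
// assumptions for the example).
#include <emmintrin.h>
static inline __m128i sketch_mul_v8i8_via_i16(__m128i a, __m128i b) {
  __m128i a16 = _mm_unpacklo_epi8(a, _mm_setzero_si128()); // widen i8 -> i16
  __m128i b16 = _mm_unpacklo_epi8(b, _mm_setzero_si128());
  __m128i m16 = _mm_mullo_epi16(a16, b16);                 // low 16 bits of each product
  m16 = _mm_and_si128(m16, _mm_set1_epi16(0x00FF));        // keep only the i8 result bits
  return _mm_packus_epi16(m16, m16);                       // truncate back to i8 lanes
}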
+ MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); + SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); + SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + unsigned NumConcats = 16 / VT.getVectorNumElements(); + SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); + ConcatOps[0] = Res; + Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); + Results.push_back(Res); + } return; } + case ISD::UADDSAT: + case ISD::SADDSAT: + case ISD::USUBSAT: + case ISD::SSUBSAT: case X86ISD::VPMADDWD: case X86ISD::AVG: { // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and @@ -27374,8 +27667,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT InVT = N->getOperand(0).getValueType(); assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && "Expected a VT that divides into 128 bits."); - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); unsigned NumConcat = 128 / InVT.getSizeInBits(); EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), @@ -27392,6 +27683,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, + DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } @@ -27420,6 +27714,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Hi); return; } + case ISD::SETCC: { + // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when + // setCC result type is v2i1 because type legalzation will end up with + // a v4i1 setcc plus an extend. + assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type"); + if (N->getOperand(0).getValueType() != MVT::v2f32 || + getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector) + return; + SDValue UNDEF = DAG.getUNDEF(MVT::v2f32); + SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(0), UNDEF); + SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(1), UNDEF); + SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS, + N->getOperand(2)); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + return; + } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: @@ -27440,9 +27754,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SREM: case ISD::UREM: { EVT VT = N->getValueType(0); - if (VT.isVector()) { - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); + if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) { // If this RHS is a constant splat vector we can widen this and let // division/remainder by constant optimize it. // TODO: Can we do something for non-splat? @@ -27460,6 +27772,17 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } + if (VT == MVT::v2i32) { + // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the + // v2i64 and unroll later. But then we create i64 scalar ops which + // might be slow in 64-bit mode or require a libcall in 32-bit mode. 
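// Illustrative sketch, not part of the patch: "unrolling" the v2i32 divide
// simply means performing two independent i32 divides rather than widening
// each lane to i64 first (the struct and helper are invented for the example).
struct SketchV2i32 { unsigned Elt[2]; };
static inline SketchV2i32 sketchUDivV2i32(SketchV2i32 A, SketchV2i32 B) {
  return { { A.Elt[0] / B.Elt[0], A.Elt[1] / B.Elt[1] } };
}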
+ Results.push_back(DAG.UnrollVectorOp(N)); + return; + } + + if (VT.isVector()) + return; + LLVM_FALLTHROUGH; } case ISD::SDIVREM: @@ -27470,8 +27793,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::TRUNCATE: { MVT VT = N->getSimpleValueType(0); - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; // The generic legalizer will try to widen the input type to the same // number of elements as the widened result type. But this isn't always @@ -27519,15 +27842,56 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::SIGN_EXTEND_VECTOR_INREG: { + if (ExperimentalVectorWideningLegalization) + return; + + EVT VT = N->getValueType(0); + SDValue In = N->getOperand(0); + EVT InVT = In.getValueType(); + if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && + (InVT == MVT::v16i16 || InVT == MVT::v32i8)) { + // Custom split this so we can extend i8/i16->i32 invec. This is better + // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using + // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting + // we allow the sra from the extend to i32 to be shared by the split. + EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(), + InVT.getVectorElementType(), + InVT.getVectorNumElements() / 2); + MVT ExtendVT = MVT::getVectorVT(MVT::i32, + VT.getVectorNumElements()); + In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT, + In, DAG.getIntPtrConstant(0, dl)); + In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In); + + // Fill a vector with sign bits for each element. + SDValue Zero = DAG.getConstant(0, dl, ExtendVT); + SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + + // Create an unpackl and unpackh to interleave the sign bits then bitcast + // to vXi64. + SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits); + Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); + SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits); + Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); + + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Results.push_back(Res); + return; + } + return; + } case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v4i16 || InVT == MVT::v4i8)){ - assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector && - "Unexpected type action!"); + (InVT == MVT::v4i16 || InVT == MVT::v4i8) && + getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) { assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"); // Custom split this so we can extend i8/i16->i32 invec. This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using @@ -27598,9 +27962,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); + // Promote these manually to avoid over promotion to v2i64. Type + // legalization will revisit the v2i32 operation for more cleanup. + if ((VT == MVT::v2i8 || VT == MVT::v2i16) && + getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) { + // AVX512DQ provides instructions that produce a v2i64 result. 
+ if (Subtarget.hasDQI()) + return; + + SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src); + Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext + : ISD::AssertSext, + dl, MVT::v2i32, Res, + DAG.getValueType(VT.getVectorElementType())); + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + Results.push_back(Res); + return; + } + if (VT.isVector() && VT.getScalarSizeInBits() < 32) { - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; // Try to create a 128 bit vector, but don't exceed a 32 bit element. unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); @@ -27635,18 +28017,35 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); + bool Widenv2i32 = + getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector; if (Src.getValueType() == MVT::v2f64) { - if (!IsSigned && !Subtarget.hasVLX()) { - // If we have VLX we can emit a target specific FP_TO_UINT node, - // otherwise we can defer to the generic legalizer which will widen - // the input as well. This will be further widened during op - // legalization to v8i32<-v8f64. - return; - } unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + if (!IsSigned && !Subtarget.hasVLX()) { + // If v2i32 is widened, we can defer to the generic legalizer. + if (Widenv2i32) + return; + // Custom widen by doubling to a legal vector with. Isel will + // further widen to v8f64. + Opc = ISD::FP_TO_UINT; + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, + Src, DAG.getUNDEF(MVT::v2f64)); + } SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); + if (!Widenv2i32) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + return; + } + if (SrcVT == MVT::v2f32 && + getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { + SDValue Idx = DAG.getIntPtrConstant(0, dl); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getUNDEF(MVT::v2f32)); + Res = DAG.getNode(IsSigned ? 
ISD::FP_TO_SINT + : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } @@ -27656,8 +28055,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - assert(!VT.isVector() && "Vectors should have been handled above!"); - if (Subtarget.hasDQI() && VT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); @@ -27942,33 +28339,42 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if (DstVT.isVector() && SrcVT == MVT::x86mmx) { - assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && - "Unexpected type action!"); + if (DstVT.isVector() && SrcVT == MVT::x86mmx && + getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) { EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT); SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0)); Results.push_back(Res); return; } + if (SrcVT != MVT::f64 || + (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) || + getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) + return; + + unsigned NumElts = DstVT.getVectorNumElements(); + EVT SVT = DstVT.getVectorElementType(); + EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue Res; + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); + Res = DAG.getBitcast(WiderVT, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); return; } case ISD::MGATHER: { EVT VT = N->getValueType(0); - if ((VT == MVT::v2f32 || VT == MVT::v2i32) && - (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { + if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) return; - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); - EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, + SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Gather->getPassThru(), - DAG.getUNDEF(VT)); + DAG.getUNDEF(MVT::v2f32)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. @@ -27979,12 +28385,67 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getTargetMemSDNode( - DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl, + DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(2)); return; } + if (VT == MVT::v2i32) { + auto *Gather = cast(N); + SDValue Index = Gather->getIndex(); + SDValue Mask = Gather->getMask(); + assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); + SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, + Gather->getPassThru(), + DAG.getUNDEF(MVT::v2i32)); + // If the index is v2i64 we can use it directly. 
+ if (Index.getValueType() == MVT::v2i64 && + (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { + if (!Subtarget.hasVLX()) { + // We need to widen the mask, but the instruction will only use 2 + // of its elements. So we can use undef. + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getUNDEF(MVT::v2i1)); + Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); + } + SDValue Ops[] = { Gather->getChain(), PassThru, Mask, + Gather->getBasePtr(), Index, Gather->getScale() }; + SDValue Res = DAG.getTargetMemSDNode( + DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, + Gather->getMemoryVT(), Gather->getMemOperand()); + SDValue Chain = Res.getValue(2); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Chain); + return; + } + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { + EVT IndexVT = Index.getValueType(); + EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), + IndexVT.getScalarType(), 4); + // Otherwise we need to custom widen everything to avoid promotion. + Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, + DAG.getUNDEF(IndexVT)); + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getConstant(0, dl, MVT::v2i1)); + SDValue Ops[] = { Gather->getChain(), PassThru, Mask, + Gather->getBasePtr(), Index, Gather->getScale() }; + SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), + Gather->getMemoryVT(), dl, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); + SDValue Chain = Res.getValue(1); + if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Chain); + return; + } + } return; } case ISD::LOAD: { @@ -27993,8 +28454,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // cast since type legalization will try to use an i64 load. MVT VT = N->getSimpleValueType(0); assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); - assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - "Unexpected type action!"); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast(N); @@ -28004,10 +28465,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); - MVT VecVT = MVT::getVectorVT(LdVT, 2); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res); - EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); - Res = DAG.getBitcast(WideVT, Res); + MVT WideVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); + MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + Res = DAG.getBitcast(CastVT, Res); Results.push_back(Res); Results.push_back(Chain); return; @@ -33545,6 +34007,67 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, return HAddSub; } + // During Type Legalization, when promoting illegal vector types, + // the backend might introduce new shuffle dag nodes and bitcasts. 
+ // + // This code performs the following transformation: + // fold: (shuffle (bitcast (BINOP A, B)), Undef, ) -> + // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, ) + // + // We do this only if both the bitcast and the BINOP dag nodes have + // one use. Also, perform this transformation only if the new binary + // operation is legal. This is to avoid introducing dag nodes that + // potentially need to be further expanded (or custom lowered) into a + // less optimal sequence of dag nodes. + if (!ExperimentalVectorWideningLegalization && + !DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && + N->getOpcode() == ISD::VECTOR_SHUFFLE && + N->getOperand(0).getOpcode() == ISD::BITCAST && + N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + SDValue BC0 = N0.getOperand(0); + EVT SVT = BC0.getValueType(); + unsigned Opcode = BC0.getOpcode(); + unsigned NumElts = VT.getVectorNumElements(); + + if (BC0.hasOneUse() && SVT.isVector() && + SVT.getVectorNumElements() * 2 == NumElts && + TLI.isOperationLegal(Opcode, VT)) { + bool CanFold = false; + switch (Opcode) { + default : break; + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + // isOperationLegal lies for integer ops on floating point types. + CanFold = VT.isInteger(); + break; + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + // isOperationLegal lies for floating point ops on integer types. + CanFold = VT.isFloatingPoint(); + break; + } + + unsigned SVTNumElts = SVT.getVectorNumElements(); + ShuffleVectorSDNode *SVOp = cast(N); + for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) == (int)(i * 2); + for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) < 0; + + if (CanFold) { + SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); + SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); + SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); + return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); + } + } + } + // Attempt to combine into a vector load/broadcast. if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true)) return LD; @@ -33640,6 +34163,54 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, } } + + // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the + // operands is an extend from v2i32 to v2i64. Turn it into a pmulld. + // FIXME: This can probably go away once we default to widening legalization. 
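// Illustrative sketch, not part of the patch: the combine described above
// trades the even-lane 64-bit multiply pattern for a direct 32-bit multiply
// once SSE4.1 is available (helper names are invented for the example).
#include <smmintrin.h>
static inline __m128i sketch_mul_lanes_pmulld(__m128i a, __m128i b) {
  return _mm_mullo_epi32(a, b); // SSE4.1 PMULLD: low 32 bits of every product
}
static inline __m128i sketch_mul_even_lanes_pmuludq(__m128i a, __m128i b) {
  return _mm_mul_epu32(a, b);   // SSE2 PMULUDQ: full 64-bit products of even lanes
}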
+ if (!ExperimentalVectorWideningLegalization && + Subtarget.hasSSE41() && VT == MVT::v4i32 && + N->getOpcode() == ISD::VECTOR_SHUFFLE && + N->getOperand(0).getOpcode() == ISD::BITCAST && + N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) { + SDValue BC = N->getOperand(0); + SDValue MULUDQ = BC.getOperand(0); + ShuffleVectorSDNode *SVOp = cast(N); + ArrayRef Mask = SVOp->getMask(); + if (BC.hasOneUse() && MULUDQ.hasOneUse() && + Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) { + SDValue Op0 = MULUDQ.getOperand(0); + SDValue Op1 = MULUDQ.getOperand(1); + if (Op0.getOpcode() == ISD::BITCAST && + Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && + Op0.getOperand(0).getValueType() == MVT::v4i32) { + ShuffleVectorSDNode *SVOp0 = + cast(Op0.getOperand(0)); + ArrayRef Mask2 = SVOp0->getMask(); + if (Mask2[0] == 0 && Mask2[1] == -1 && + Mask2[2] == 1 && Mask2[3] == -1) { + Op0 = SVOp0->getOperand(0); + Op1 = DAG.getBitcast(MVT::v4i32, Op1); + Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask); + return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); + } + } + if (Op1.getOpcode() == ISD::BITCAST && + Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && + Op1.getOperand(0).getValueType() == MVT::v4i32) { + ShuffleVectorSDNode *SVOp1 = + cast(Op1.getOperand(0)); + ArrayRef Mask2 = SVOp1->getMask(); + if (Mask2[0] == 0 && Mask2[1] == -1 && + Mask2[2] == 1 && Mask2[3] == -1) { + Op0 = DAG.getBitcast(MVT::v4i32, Op0); + Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask); + Op1 = SVOp1->getOperand(0); + return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); + } + } + } + } + return SDValue(); } @@ -35532,7 +36103,7 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, SDLoc DL(ExtElt); - if (VecVT == MVT::v8i8) { + if (ExperimentalVectorWideningLegalization && VecVT == MVT::v8i8) { // Pad with undef. Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx, DAG.getUNDEF(VecVT)); @@ -36229,6 +36800,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Since SKX these selects have a proper lowering. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && + (ExperimentalVectorWideningLegalization || + VT.getVectorNumElements() > 4) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); @@ -37104,45 +37677,98 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, if ((NumElts % 2) != 0) return SDValue(); + unsigned RegSize = 128; + MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); - // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the - // lower part is needed. - SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); - if (Mode == MULU8 || Mode == MULS8) - return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, - DL, VT, MulLo); + if (ExperimentalVectorWideningLegalization || + NumElts >= OpsVT.getVectorNumElements()) { + // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the + // lower part is needed. + SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); + if (Mode == MULU8 || Mode == MULS8) + return DAG.getNode((Mode == MULU8) ? 
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, + DL, VT, MulLo); - MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); - // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, - // the higher part is also needed. + MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); + // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, + // the higher part is also needed. + SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, + ReducedVT, NewN0, NewN1); + + // Repack the lower part and higher part result of mul into a wider + // result. + // Generate shuffle functioning as punpcklwd. + SmallVector ShuffleMask(NumElts); + for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i; + ShuffleMask[2 * i + 1] = i + NumElts; + } + SDValue ResLo = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResLo = DAG.getBitcast(ResVT, ResLo); + // Generate shuffle functioning as punpckhwd. + for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i + NumElts / 2; + ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; + } + SDValue ResHi = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResHi = DAG.getBitcast(ResVT, ResHi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); + } + + // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want + // to legalize the mul explicitly because implicit legalization for type + // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack + // instructions which will not exist when we explicitly legalize it by + // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with + // <4 x i16> undef). + // + // Legalize the operands of mul. + // FIXME: We may be able to handle non-concatenated vectors by insertion. + unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); + if ((RegSize % ReducedSizeInBits) != 0) + return SDValue(); + + SmallVector Ops(RegSize / ReducedSizeInBits, + DAG.getUNDEF(ReducedVT)); + Ops[0] = NewN0; + NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); + Ops[0] = NewN1; + NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); + + if (Mode == MULU8 || Mode == MULS8) { + // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower + // part is needed. + SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); + + // convert the type of mul result to VT. + MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG + : ISD::SIGN_EXTEND_VECTOR_INREG, + DL, ResVT, Mul); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + } + + // Generate the lower and higher part of mul: pmulhw/pmulhuw. For + // MULU16/MULS16, both parts are needed. + SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, - ReducedVT, NewN0, NewN1); + OpsVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider - // result. - // Generate shuffle functioning as punpcklwd. - SmallVector ShuffleMask(NumElts); - for (unsigned i = 0, e = NumElts / 2; i < e; i++) { - ShuffleMask[2 * i] = i; - ShuffleMask[2 * i + 1] = i + NumElts; - } - SDValue ResLo = - DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResLo = DAG.getBitcast(ResVT, ResLo); - // Generate shuffle functioning as punpckhwd. 
- for (unsigned i = 0, e = NumElts / 2; i < e; i++) { - ShuffleMask[2 * i] = i + NumElts / 2; - ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; - } - SDValue ResHi = - DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResHi = DAG.getBitcast(ResVT, ResHi); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); + // result. Make sure the type of mul result is VT. + MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi); + Res = DAG.getBitcast(ResVT, Res); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); } static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, @@ -37250,7 +37876,8 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. // Also allow v2i32 if it will be widened. MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); - if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT)) + if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) || + DAG.getTargetLoweringInfo().isTypeLegal(WVT))) return SDValue(); SDValue N0 = N->getOperand(0); @@ -39506,7 +40133,93 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return Blend; } - return SDValue(); + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + if (Mld->getExtensionType() != ISD::EXTLOAD) + return SDValue(); + + // Resolve extending loads. + EVT VT = Mld->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + EVT LdVT = Mld->getMemoryVT(); + SDLoc dl(Mld); + + assert(LdVT != VT && "Cannot extend to the same type"); + unsigned ToSz = VT.getScalarSizeInBits(); + unsigned FromSz = LdVT.getScalarSizeInBits(); + // From/To sizes and ElemCount must be pow of two. + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for extending masked load"); + + unsigned SizeRatio = ToSz / FromSz; + assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + LdVT.getScalarType(), NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + // Convert PassThru value. + SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru()); + if (!Mld->getPassThru().isUndef()) { + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru, + DAG.getUNDEF(WideVecVT), ShuffleVec); + } + + // Prepare the new mask. + SDValue NewMask; + SDValue Mask = Mld->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type. 
+ NewMask = DAG.getBitcast(WideVecVT, Mask); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, dl, WideVecVT), + ShuffleVec); + } else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); + SmallVector Ops(NumConcat, ZeroVal); + Ops[0] = Mask; + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), + Mld->getBasePtr(), NewMask, WidePassThru, + Mld->getMemoryVT(), Mld->getMemOperand(), + ISD::NON_EXTLOAD); + + SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + // Can't shuffle using an illegal type. + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), ShuffleVec); + SlicedVec = DAG.getBitcast(VT, SlicedVec); + + return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true); } /// If exactly one element of the mask is set for a non-truncating masked store, @@ -39544,34 +40257,111 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = Mst->getValue().getValueType(); + EVT StVT = Mst->getMemoryVT(); SDLoc dl(Mst); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (Mst->isTruncatingStore()) + if (!Mst->isTruncatingStore()) { + if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) + return ScalarStore; + + // If the mask value has been legalized to a non-boolean vector, try to + // simplify ops leading up to it. We only demand the MSB of each lane. + SDValue Mask = Mst->getMask(); + if (Mask.getScalarValueSizeInBits() != 1) { + APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + return SDValue(N, 0); + } + + SDValue Value = Mst->getValue(); + if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && + TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), + Mst->getMemoryVT())) { + return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), + Mst->getBasePtr(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), true); + } + + return SDValue(); + } + + if (ExperimentalVectorWideningLegalization) return SDValue(); - if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) - return ScalarStore; + // Resolve truncating stores. + unsigned NumElems = VT.getVectorNumElements(); - // If the mask value has been legalized to a non-boolean vector, try to - // simplify ops leading up to it. We only demand the MSB of each lane. + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getScalarSizeInBits(); + unsigned ToSz = StVT.getScalarSizeInBits(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. 
+ // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + + // From/To sizes and ElemCount must be pow of two. + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for truncating masked store"); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + assert (((NumElems * FromSz) % ToSz) == 0 && + "Unexpected ratio for truncating masked store"); + + unsigned SizeRatio = FromSz / ToSz; + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + + SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + ShuffleVec); + + SDValue NewMask; SDValue Mask = Mst->getMask(); - if (Mask.getScalarValueSizeInBits() != 1) { - APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) - return SDValue(N, 0); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type. + NewMask = DAG.getBitcast(WideVecVT, Mask); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) + ShuffleVec[i] = NumElems*SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, dl, WideVecVT), + ShuffleVec); + } else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); + SmallVector Ops(NumConcat, ZeroVal); + Ops[0] = Mask; + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - SDValue Value = Mst->getValue(); - if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && - TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), - Mst->getMemoryVT())) { - return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), - Mst->getBasePtr(), Mask, - Mst->getMemoryVT(), Mst->getMemOperand(), true); - } - - return SDValue(); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, @@ -39699,6 +40489,41 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, MVT::v16i8, St->getMemOperand()); } + // Look for a truncating store to a less than 128 bit vector that has been + // truncated from an any_extend_inreg from a 128 bit vector with the same + // element size. We can use a 64/32/16-bit extractelement and store that. + // Disabling this when widening legalization is in effect since the trunc + // store would have been unlikely to be created in that case. 
Only doing this + // when truncstore is legal since it would otherwise be decomposed below and + // then combined away. + if (St->isTruncatingStore() && TLI.isTruncStoreLegal(VT, StVT) && + StoredVal.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG && + StoredVal.getValueType().is128BitVector() && + !ExperimentalVectorWideningLegalization) { + EVT OrigVT = StoredVal.getOperand(0).getValueType(); + if (OrigVT.is128BitVector() && + OrigVT.getVectorElementType() == StVT.getVectorElementType()) { + unsigned StoreSize = StVT.getSizeInBits(); + assert((128 % StoreSize == 0) && "Unexpected store size!"); + MVT IntVT = MVT::getIntegerVT(StoreSize); + MVT CastVT = MVT::getVectorVT(IntVT, 128 / StoreSize); + StoredVal = DAG.getBitcast(CastVT, StoredVal.getOperand(0)); + // Use extract_store for the 64-bit case to support 32-bit targets. + if (IntVT == MVT::i64) { + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()}; + return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, + IntVT, St->getMemOperand()); + } + + // Otherwise just use an extract and store. + StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, IntVT, StoredVal, + DAG.getIntPtrConstant(0, dl)); + return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), + St->getMemOperand()); + } + } + // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. @@ -39725,7 +40550,85 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemoryVT(), St->getMemOperand(), DAG); } - return SDValue(); + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getScalarSizeInBits(); + unsigned ToSz = StVT.getScalarSizeInBits(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) + return SDValue(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromSz) % ToSz) return SDValue(); + + unsigned SizeRatio = FromSz / ToSz; + + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + ShuffleVec); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. 
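// --- Illustrative aside (not from the patch): a standalone model of the
// packing shuffle built just above. For a truncating store of
// <NumElems x iFromSz> down to iToSz, the value is bitcast to
// NumElems * SizeRatio narrow lanes and, assuming a little-endian layout,
// lane i * SizeRatio of that wide vector holds the truncated bits of element
// i. The mask below gathers those lanes to the bottom of the register; the
// concrete v4i32 -> v4i8 shape is only an example.
#include <cstdio>
#include <vector>

int main() {
  const unsigned NumElems = 4, FromSz = 32, ToSz = 8; // e.g. v4i32 stored as v4i8
  const unsigned SizeRatio = FromSz / ToSz;           // 4 narrow lanes per element

  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1); // -1 == undef lane
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;                       // 0, 4, 8, 12

  for (int M : ShuffleVec)
    std::printf("%d ", M);                               // 0 4 8 12 -1 -1 ...
  std::printf("\n");
  return 0;
}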
+ + // Find the largest store unit + MVT StoreType = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) + StoreType = Tp; + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && + (64 <= NumElems * ToSz)) + StoreType = MVT::f64; + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); + SmallVector Chains; + SDValue Ptr = St->getBasePtr(); + + // Perform one or more big stores into memory. + for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + StoreType, ShuffWide, + DAG.getIntPtrConstant(i, dl)); + SDValue Ch = + DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), + St->getAlignment(), St->getMemOperand()->getFlags()); + Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); + Chains.push_back(Ch); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering @@ -39741,7 +40644,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); - if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) && + if (((VT.isVector() && !VT.isFloatingPoint() && + !ExperimentalVectorWideningLegalization) || + (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && isa(St->getValue()) && !cast(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { @@ -40256,7 +41161,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, // Only handle vXi16 types that are at least 128-bits unless they will be // widened. - if (!VT.isVector() || VT.getVectorElementType() != MVT::i16) + if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 || + (!ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() < 8)) return SDValue(); // Input type should be vXi32. @@ -41320,6 +42227,127 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, DAG.getConstant(EltSizeInBits - 1, DL, VT)); } +/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or +/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating +/// with UNDEFs) of the input to vectors of the same size as the target type +/// which then extends the lowest elements. +static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (ExperimentalVectorWideningLegalization) + return SDValue(); + + unsigned Opcode = N->getOpcode(); + // TODO - add ANY_EXTEND support. 
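// --- Illustrative aside (not from the patch): a scalar model of what
// ISD::SIGN_EXTEND_VECTOR_INREG computes once the input has been widened, as
// the function comment above describes. Only the low result-count elements of
// the wider source are sign extended; the v8i16 -> v4i32 shape is only an
// example.
#include <array>
#include <cstdint>
#include <cstdio>

// Sign-extend the low 4 lanes of a v8i16-shaped input into a v4i32 result.
static std::array<int32_t, 4>
signExtendVectorInReg(const std::array<int16_t, 8> &In) {
  std::array<int32_t, 4> Out{};
  for (unsigned i = 0; i != Out.size(); ++i)
    Out[i] = static_cast<int32_t>(In[i]); // lanes 4..7 are ignored
  return Out;
}

int main() {
  std::array<int16_t, 8> In = {-1, 2, -3, 4, 0, 0, 0, 0}; // high half is don't-care
  for (int32_t V : signExtendVectorInReg(In))
    std::printf("%d ", V);                                // -1 2 -3 4
  std::printf("\n");
  return 0;
}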
+ if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) + return SDValue(); + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + if (!Subtarget.hasSSE2()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + EVT InVT = N0.getValueType(); + EVT InSVT = InVT.getScalarType(); + + // FIXME: Generic DAGCombiner previously had a bug that would cause a + // sign_extend of setcc to sometimes return the original node and tricked it + // into thinking CombineTo was used which prevented the target combines from + // running. + // Earlying out here to avoid regressions like this + // (v4i32 (sext (v4i1 (setcc (v4i16))))) + // Becomes + // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef)))) + // Type legalized to + // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32))))))) + // Leading to a packssdw+pmovsxwd + // We could write a DAG combine to fix this, but really we shouldn't be + // creating sext_invec that's forcing v8i16 into the DAG. + if (N0.getOpcode() == ISD::SETCC) + return SDValue(); + + // Input type must be a vector and we must be extending legal integer types. + if (!VT.isVector() || VT.getVectorNumElements() < 2) + return SDValue(); + if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) + return SDValue(); + if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) + return SDValue(); + + // If the input/output types are both legal then we have at least AVX1 and + // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly. + if (DAG.getTargetLoweringInfo().isTypeLegal(VT) && + DAG.getTargetLoweringInfo().isTypeLegal(InVT)) + return SDValue(); + + SDLoc DL(N); + + auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { + EVT SrcVT = N.getValueType(); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), + Size / SrcVT.getScalarSizeInBits()); + SmallVector Opnds(Size / SrcVT.getSizeInBits(), + DAG.getUNDEF(SrcVT)); + Opnds[0] = N; + return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds); + }; + + // If target-size is less than 128-bits, extend to a type that would extend + // to 128 bits, extend that and extract the original target vector. + if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { + unsigned Scale = 128 / VT.getSizeInBits(); + EVT ExVT = + EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); + SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); + SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, + DAG.getIntPtrConstant(0, DL)); + } + + // If target-size is 128-bits (or 256-bits on AVX target), then convert to + // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. + // Also use this if we don't have SSE41 to allow the legalizer do its job. 
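// --- Illustrative aside (not from the patch): the size arithmetic used by the
// sub-128-bit path above, shown standalone. For a v2i16 -> v2i32 sign extend
// the source is first widened by Scale so that extending it yields a full
// 128-bit vector, and the original narrow result is then extracted from
// element 0. The concrete bit widths are only an example.
#include <cstdio>

int main() {
  const unsigned VTBits = 2 * 32;   // v2i32 result, 64 bits
  const unsigned SVTBits = 32;      // i32 result elements
  const unsigned InVTBits = 2 * 16; // v2i16 source, 32 bits

  if (VTBits < 128 && 128 % VTBits == 0) {
    unsigned Scale = 128 / VTBits;             // 2
    unsigned ExElts = 128 / SVTBits;           // v4i32 intermediate
    unsigned WidenedInBits = Scale * InVTBits; // widen the source to 64 bits
    std::printf("widen source to %u bits, extend to a %u x i%u vector, "
                "extract the low %u bits\n",
                WidenedInBits, ExElts, SVTBits, VTBits);
  }
  return 0;
}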
+ if (!Subtarget.hasSSE41() || VT.is128BitVector() || + (VT.is256BitVector() && Subtarget.hasAVX()) || + (VT.is512BitVector() && Subtarget.useAVX512Regs())) { + SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); + Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); + return DAG.getNode(Opcode, DL, VT, ExOp); + } + + auto SplitAndExtendInReg = [&](unsigned SplitSize) { + unsigned NumVecs = VT.getSizeInBits() / SplitSize; + unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); + + unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode); + SmallVector Opnds; + for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, + DAG.getIntPtrConstant(Offset, DL)); + SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); + SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec); + Opnds.push_back(SrcVec); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); + }; + + // On pre-AVX targets, split into 128-bit nodes of + // ISD::*_EXTEND_VECTOR_INREG. + if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128)) + return SplitAndExtendInReg(128); + + // On pre-AVX512 targets, split into 256-bit nodes of + // ISD::*_EXTEND_VECTOR_INREG. + if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256)) + return SplitAndExtendInReg(256); + + return SDValue(); +} + // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm // result type. static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, @@ -41390,6 +42418,9 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); } + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) + return V; + if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -41562,6 +42593,9 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) + return V; + if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -41738,6 +42772,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, // go through type promotion to a 128-bit vector. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() && VT.getVectorElementType() == MVT::i1 && + (ExperimentalVectorWideningLegalization || + VT.getVectorNumElements() > 4) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16)) { SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS, @@ -43596,6 +44632,15 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, } } + // Combine (ext_invec (ext_invec X)) -> (ext_invec X) + // Disabling for widening legalization for now. We can enable if we find a + // case that needs it. Otherwise it can be deleted when we switch to + // widening legalization. + if (!ExperimentalVectorWideningLegalization && + In.getOpcode() == N->getOpcode() && + TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType())) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0)); + // Attempt to combine as a shuffle. 
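// --- Illustrative aside (not from the patch): a scalar check of the
// (ext_invec (ext_invec X)) -> (ext_invec X) fold added above. Sign extending
// i8 -> i16 -> i32 produces the same lanes as sign extending i8 -> i32
// directly, so the inner node can be dropped; the byte values are arbitrary.
#include <cassert>
#include <cstdint>

int main() {
  const int8_t Bytes[] = {-128, -1, 0, 127};
  for (int8_t B : Bytes) {
    int32_t TwoSteps = static_cast<int32_t>(static_cast<int16_t>(B));
    int32_t OneStep = static_cast<int32_t>(B);
    assert(TwoSteps == OneStep);
  }
  return 0;
}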
// TODO: SSE41 support if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 0ff08776c071..1181181a3c5a 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -50,6 +50,8 @@ using namespace llvm; #define DEBUG_TYPE "x86tti" +extern cl::opt ExperimentalVectorWideningLegalization; + //===----------------------------------------------------------------------===// // // X86 cost model. @@ -918,7 +920,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, // FIXME: We can use permq for 64-bit or larger extracts from 256-bit // vectors. int OrigSubElts = SubTp->getVectorNumElements(); - if (NumSubElts > OrigSubElts && + if (ExperimentalVectorWideningLegalization && + NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 && LT.second.getVectorElementType() == SubLT.second.getVectorElementType() && @@ -1330,6 +1333,12 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and // 256-bit wide vectors. + // Used with widening legalization + static const TypeConversionCostTblEntry AVX512FConversionTblWide[] = { + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, + }; + static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, @@ -1347,8 +1356,6 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, @@ -1401,19 +1408,28 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 }, }; + static const TypeConversionCostTblEntry AVX2ConversionTblWide[] = { + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, + }; + static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, - { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, 
MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, @@ -1432,18 +1448,24 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, }; + static const TypeConversionCostTblEntry AVXConversionTblWide[] = { + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 }, + }; + static const TypeConversionCostTblEntry AVXConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, @@ -1642,18 +1664,35 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, SimpleDstTy, SimpleSrcTy)) return Entry->Cost; + if (ST->hasAVX512() && ExperimentalVectorWideningLegalization) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTblWide, ISD, + SimpleDstTy, SimpleSrcTy)) + return Entry->Cost; + if (ST->hasAVX512()) if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) return Entry->Cost; } + if (ST->hasAVX2() && ExperimentalVectorWideningLegalization) { + if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTblWide, ISD, + SimpleDstTy, SimpleSrcTy)) + return Entry->Cost; + } + if (ST->hasAVX2()) { if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) return Entry->Cost; } + if (ST->hasAVX() && ExperimentalVectorWideningLegalization) { + if (const auto *Entry = ConvertCostTableLookup(AVXConversionTblWide, ISD, + SimpleDstTy, SimpleSrcTy)) + return Entry->Cost; + } + if (ST->hasAVX()) { if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) @@ -2520,7 +2559,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, // in the table. // FIXME: Is there a better way to do this? EVT VT = TLI->getValueType(DL, ValTy); - if (VT.isSimple()) { + if (VT.isSimple() && ExperimentalVectorWideningLegalization) { MVT MTy = VT.getSimpleVT(); if (IsPairwise) { if (ST->hasAVX())
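// --- Illustrative aside (not from the patch): a standalone model of the
// lookup order used in the cost-model changes above. When the widening flag is
// set, the small *Wide override table is consulted before the base conversion
// table, so entries moved into the Wide tables only take effect under widening
// legalization. The table contents and the lookup helper here are invented
// placeholders, not the real cost values or the LLVM cost-table API.
#include <cstdio>
#include <optional>

struct CostEntry { int ISD, Dst, Src, Cost; };

static std::optional<int> lookup(const CostEntry *Tbl, unsigned N,
                                 int ISD, int Dst, int Src) {
  for (unsigned i = 0; i != N; ++i)
    if (Tbl[i].ISD == ISD && Tbl[i].Dst == Dst && Tbl[i].Src == Src)
      return Tbl[i].Cost;
  return std::nullopt;
}

int main() {
  const bool WideningLegalization = true;       // models the cl::opt flag
  const CostEntry WideTbl[] = {{1, 10, 20, 1}}; // placeholder override entry
  const CostEntry BaseTbl[] = {{1, 10, 20, 3}}; // placeholder base entry

  int ISD = 1, Dst = 10, Src = 20, Cost = -1;
  if (WideningLegalization)
    if (auto C = lookup(WideTbl, 1, ISD, Dst, Src))
      Cost = *C;                                // override table wins: cost 1
  if (Cost < 0)
    if (auto C = lookup(BaseTbl, 1, ISD, Dst, Src))
      Cost = *C;                                // otherwise fall back: cost 3
  std::printf("cost = %d\n", Cost);
  return 0;
}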