diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 00b4976b505a..dfd41b717f43 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3872,11 +3872,13 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, /// specifies a shuffle of elements that is suitable for input to UNPCKL. static bool isUNPCKLMask(ArrayRef Mask, EVT VT, bool HasInt256, bool V2IsSplat = false) { - unsigned NumElts = VT.getVectorNumElements(); + if (VT.is512BitVector()) + return false; assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); + unsigned NumElts = VT.getVectorNumElements(); if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; @@ -3911,6 +3913,8 @@ static bool isUNPCKHMask(ArrayRef Mask, EVT VT, bool HasInt256, bool V2IsSplat = false) { unsigned NumElts = VT.getVectorNumElements(); + if (VT.is512BitVector()) + return false; assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); @@ -3948,6 +3952,8 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef Mask, EVT VT, bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); bool Is256BitVec = VT.is256BitVector(); + if (VT.is512BitVector()) + return false; assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); @@ -3988,6 +3994,9 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef Mask, EVT VT, bool HasInt256) { static bool isUNPCKH_v_undef_Mask(ArrayRef Mask, EVT VT, bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); + if (VT.is512BitVector()) + return false; + assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); @@ -4093,6 +4102,44 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { return (FstHalf | (SndHalf << 4)); } +// Symetric in-lane mask. Each lane has 4 elements (for imm8) +static bool isPermImmMask(ArrayRef Mask, EVT VT, unsigned& Imm8) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize < 32) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + Imm8 = 0; + if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) { + for (unsigned i = 0; i != NumElts; ++i) { + if (Mask[i] < 0) + continue; + Imm8 |= Mask[i] << (i*2); + } + return true; + } + + unsigned LaneSize = 4; + SmallVector MaskVal(LaneSize, -1); + + for (unsigned l = 0; l != NumElts; l += LaneSize) { + for (unsigned i = 0; i != LaneSize; ++i) { + if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) + return false; + if (Mask[i+l] < 0) + continue; + if (MaskVal[i] < 0) { + MaskVal[i] = Mask[i+l] - l; + Imm8 |= MaskVal[i] << (i*2); + continue; + } + if (Mask[i+l] != (signed)(MaskVal[i]+l)) + return false; + } + } + return true; +} + /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. /// Note that VPERMIL mask matching is different depending whether theunderlying @@ -4163,7 +4210,8 @@ static bool isMOVSHDUPMask(ArrayRef Mask, EVT VT, unsigned NumElems = VT.getVectorNumElements(); if ((VT.is128BitVector() && NumElems != 4) || - (VT.is256BitVector() && NumElems != 8)) + (VT.is256BitVector() && NumElems != 8) || + (VT.is512BitVector() && NumElems != 16)) return false; // "i+1" is the value the indexed mask element must have @@ -4186,7 +4234,8 @@ static bool isMOVSLDUPMask(ArrayRef Mask, EVT VT, unsigned NumElems = VT.getVectorNumElements(); if ((VT.is128BitVector() && NumElems != 4) || - (VT.is256BitVector() && NumElems != 8)) + (VT.is256BitVector() && NumElems != 8) || + (VT.is512BitVector() && NumElems != 16)) return false; // "i" is the value the indexed mask element must have @@ -4449,27 +4498,6 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } -/// getShuffleCLImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. -/// Handles 256-bit. -static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getValueType(0).getSimpleVT(); - - unsigned NumElts = VT.getVectorNumElements(); - - assert((VT.is256BitVector() && NumElts == 4) && - "Unsupported vector type for VPERMQ/VPERMPD"); - - unsigned Mask = 0; - for (unsigned i = 0; i != NumElts; ++i) { - int Elt = N->getMaskElt(i); - if (Elt < 0) - continue; - Mask |= Elt << (i*2); - } - - return Mask; -} /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { @@ -5288,7 +5316,10 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl, LD->getPointerInfo().getWithOffset(StartOffset), false, false, false, 0); - SmallVector Mask(NumElems, EltNo); + SmallVector Mask; + for (unsigned i = 0; i != NumElems; ++i) + Mask.push_back(EltNo); + return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } @@ -5720,7 +5751,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. - if (VT == MVT::v4i32 || VT == MVT::v8i32) + if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getZeroVector(VT, Subtarget, DAG, dl); @@ -7413,21 +7444,30 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (BlendOp.getNode()) return BlendOp; - if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { - SmallVector permclMask; - for (unsigned i = 0; i != 8; ++i) { - permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32)); - } - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, - &permclMask[0], 8); - // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 - return DAG.getNode(X86ISD::VPERMV, dl, VT, - DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); - } + unsigned Imm8; + if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) + return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); - if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64)) - return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, - getShuffleCLImmediate(SVOp), DAG); + if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) || + VT.is512BitVector()) { + EVT MaskEltVT = EVT::getIntegerVT(*DAG.getContext(), + VT.getVectorElementType().getSizeInBits()); + EVT MaskVectorVT = + EVT::getVectorVT(*DAG.getContext(),MaskEltVT, NumElems); + SmallVector permclMask; + for (unsigned i = 0; i != NumElems; ++i) { + permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT)); + } + + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, + &permclMask[0], NumElems); + if (V2IsUndef) + // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 + return DAG.getNode(X86ISD::VPERMV, dl, VT, + DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); + return DAG.getNode(X86ISD::VPERMV3, dl, VT, + DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2); + } //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, @@ -10149,6 +10189,36 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); } +SDValue X86TargetLowering::LowerSIGN_EXTEND_AVX512(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op->getValueType(0); + SDValue In = Op->getOperand(0); + EVT InVT = In.getValueType(); + SDLoc dl(Op); + + if (InVT.getVectorElementType().getSizeInBits() >=8 && + VT.getVectorElementType().getSizeInBits() >= 32) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + if (InVT.getVectorElementType() == MVT::i1) { + unsigned int NumElts = InVT.getVectorNumElements(); + assert ((NumElts == 8 || NumElts == 16) && + "Unsupported SIGN_EXTEND operation"); + if (VT.getVectorElementType().getSizeInBits() >= 32) { + Constant *C = + ConstantInt::get(*DAG.getContext(), + (NumElts == 8)? APInt(64, ~0ULL): APInt(32, ~0U)); + SDValue CP = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast(CP)->getAlignment(); + SDValue Ld = DAG.getLoad(VT.getScalarType(), dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + return DAG.getNode(X86ISD::VBROADCASTM, dl, VT, In, Ld); + } + } + return SDValue(); +} + SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op->getValueType(0).getSimpleVT(); @@ -10156,6 +10226,9 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, MVT InVT = In.getValueType().getSimpleVT(); SDLoc dl(Op); + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) + return LowerSIGN_EXTEND_AVX512(Op, DAG); + if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16)) return SDValue(); @@ -13239,6 +13312,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; + case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e3db491cccc2..c931b9b6667b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -318,6 +318,7 @@ namespace llvm { UNPCKH, VPERMILP, VPERMV, + VPERMV3, VPERMI, VPERM2X128, VBROADCAST, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 8abae14f946d..7fed783b35fc 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -473,6 +473,98 @@ defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, VK8, v8i64, v8i1>, EVEX_V512, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - VPERM +// +// -- immediate form -- +multiclass avx512_perm_imm opc, string OpcodeStr, RegisterClass RC, + SDNode OpNode, PatFrag mem_frag, + X86MemOperand x86memop, ValueType OpVT> { + def ri : AVX512AIi8, + EVEX; + def mi : AVX512AIi8, EVEX; +} + +defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64, + i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +let ExeDomain = SSEPackedDouble in +defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, + f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +// -- VPERM - register form -- +multiclass avx512_perm opc, string OpcodeStr, RegisterClass RC, + PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { + + def rr : AVX5128I, EVEX_4V; + + def rm : AVX5128I, EVEX_4V; +} + +defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv8i64, i512mem, + v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, + v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +let ExeDomain = SSEPackedSingle in +defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv8f64, f512mem, + v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; +let ExeDomain = SSEPackedDouble in +defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, + v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +// -- VPERM2I - 3 source operands form -- +multiclass avx512_perm_3src opc, string OpcodeStr, RegisterClass RC, + PatFrag mem_frag, X86MemOperand x86memop, + ValueType OpVT> { +let Constraints = "$src1 = $dst" in { + def rr : AVX5128I, + EVEX_4V; + + def rm : AVX5128I, EVEX_4V; + } +} +defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem, + v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem, + v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem, + v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem, + v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + // Mask register copy, including // - copy between mask registers // - load/store mask registers @@ -713,3 +805,148 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Aligned and unaligned load and store +// + +multiclass avx512_mov_packed opc, RegisterClass RC, RegisterClass KRC, + X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d> { +let neverHasSideEffects = 1 in + def rr : AVX512PI, + EVEX; +let canFoldAsLoad = 1 in + def rm : AVX512PI, EVEX; +let Constraints = "$src1 = $dst" in { + def rrk : AVX512PI, + EVEX, EVEX_K; + def rmk : AVX512PI, EVEX, EVEX_K; +} +} + +defm VMOVAPSZ : avx512_mov_packed<0x28, VR512, VK16WM, f512mem, alignedloadv16f32, + "vmovaps", SSEPackedSingle>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VMOVAPDZ : avx512_mov_packed<0x28, VR512, VK8WM, f512mem, alignedloadv8f64, + "vmovapd", SSEPackedDouble>, + OpSize, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; +defm VMOVUPSZ : avx512_mov_packed<0x10, VR512, VK16WM, f512mem, loadv16f32, + "vmovups", SSEPackedSingle>, + TB, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VMOVUPDZ : avx512_mov_packed<0x10, VR512, VK8WM, f512mem, loadv8f64, + "vmovupd", SSEPackedDouble>, + OpSize, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; +def VMOVAPSZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src), + "vmovaps\t{$src, $dst|$dst, $src}", + [(alignedstore512 (v16f32 VR512:$src), addr:$dst)], + SSEPackedSingle>, EVEX, EVEX_V512, TB, + EVEX_CD8<32, CD8VF>; +def VMOVAPDZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src), + "vmovapd\t{$src, $dst|$dst, $src}", + [(alignedstore512 (v8f64 VR512:$src), addr:$dst)], + SSEPackedDouble>, EVEX, EVEX_V512, + OpSize, TB, VEX_W, EVEX_CD8<64, CD8VF>; +def VMOVUPSZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src), + "vmovups\t{$src, $dst|$dst, $src}", + [(store (v16f32 VR512:$src), addr:$dst)], + SSEPackedSingle>, EVEX, EVEX_V512, TB, + EVEX_CD8<32, CD8VF>; +def VMOVUPDZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src), + "vmovupd\t{$src, $dst|$dst, $src}", + [(store (v8f64 VR512:$src), addr:$dst)], + SSEPackedDouble>, EVEX, EVEX_V512, + OpSize, TB, VEX_W, EVEX_CD8<64, CD8VF>; + +// Use vmovaps/vmovups for AVX-512 integer load/store. +// 512-bit load/store +def : Pat<(alignedloadv8i64 addr:$src), + (VMOVAPSZrm addr:$src)>; +def : Pat<(loadv8i64 addr:$src), + (VMOVUPSZrm addr:$src)>; + +def : Pat<(alignedstore512 (v8i64 VR512:$src), addr:$dst), + (VMOVAPSZmr addr:$dst, VR512:$src)>; +def : Pat<(alignedstore512 (v16i32 VR512:$src), addr:$dst), + (VMOVAPSZmr addr:$dst, VR512:$src)>; + +def : Pat<(store (v8i64 VR512:$src), addr:$dst), + (VMOVUPDZmr addr:$dst, VR512:$src)>; +def : Pat<(store (v16i32 VR512:$src), addr:$dst), + (VMOVUPSZmr addr:$dst, VR512:$src)>; + +let neverHasSideEffects = 1 in { + def VMOVDQA32rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src), + "vmovdqa32\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512; + def VMOVDQA64rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src), + "vmovdqa64\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512, VEX_W; +let mayStore = 1 in { + def VMOVDQA32mr : AVX512BI<0x7F, MRMDestMem, (outs), + (ins i512mem:$dst, VR512:$src), + "vmovdqa32\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; + def VMOVDQA64mr : AVX512BI<0x7F, MRMDestMem, (outs), + (ins i512mem:$dst, VR512:$src), + "vmovdqa64\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} +let mayLoad = 1 in { +def VMOVDQA32rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), + "vmovdqa32\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; +def VMOVDQA64rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), + "vmovdqa64\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} +} + +multiclass avx512_mov_int opc, string asm, RegisterClass RC, + RegisterClass KRC, + PatFrag ld_frag, X86MemOperand x86memop> { +let neverHasSideEffects = 1 in + def rr : AVX512XSI, + EVEX; +let canFoldAsLoad = 1 in + def rm : AVX512XSI, + EVEX; +let Constraints = "$src1 = $dst" in { + def rrk : AVX512XSI, + EVEX, EVEX_K; + def rmk : AVX512XSI, EVEX, EVEX_K; +} +} + +defm VMOVDQU32 : avx512_mov_int<0x6F, "vmovdqu32", VR512, VK16WM, memopv16i32, i512mem>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VMOVDQU64 : avx512_mov_int<0x6F, "vmovdqu64", VR512, VK8WM, memopv8i64, i512mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 0b51521feea9..8587d382b2b3 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -151,6 +151,8 @@ def X86pmuludq : SDNode<"X86ISD::PMULUDQ", def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>; +def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisInt<2>]>; @@ -194,6 +196,7 @@ def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>; def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; +def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; @@ -262,9 +265,16 @@ def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; -// 128-/256-bit extload pattern fragments +// 512-bit load pattern fragments +def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; +def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; +def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>; +def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; + +// 128-/256-/512-bit extload pattern fragments def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; +def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; // Like 'store', but always requires 128-bit vector alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -278,6 +288,12 @@ def alignedstore256 : PatFrag<(ops node:$val, node:$ptr), return cast(N)->getAlignment() >= 32; }]>; +// Like 'store', but always requires 512-bit vector alignment. +def alignedstore512 : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast(N)->getAlignment() >= 64; +}]>; + // Like 'load', but always requires 128-bit vector alignment. def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast(N)->getAlignment() >= 16; @@ -293,6 +309,11 @@ def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast(N)->getAlignment() >= 32; }]>; +// Like 'load', but always requires 512-bit vector alignment. +def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast(N)->getAlignment() >= 64; +}]>; + def alignedloadfsf32 : PatFrag<(ops node:$ptr), (f32 (alignedload node:$ptr))>; def alignedloadfsf64 : PatFrag<(ops node:$ptr), @@ -316,6 +337,16 @@ def alignedloadv4f64 : PatFrag<(ops node:$ptr), def alignedloadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (alignedload256 node:$ptr))>; +// 512-bit aligned load pattern fragments +def alignedloadv16f32 : PatFrag<(ops node:$ptr), + (v16f32 (alignedload512 node:$ptr))>; +def alignedloadv8f64 : PatFrag<(ops node:$ptr), + (v8f64 (alignedload512 node:$ptr))>; +def alignedloadv16i32 : PatFrag<(ops node:$ptr), + (v16i32 (alignedload512 node:$ptr))>; +def alignedloadv8i64 : PatFrag<(ops node:$ptr), + (v8i64 (alignedload512 node:$ptr))>; + // Like 'load', but uses special alignment checks suitable for use in // memory operands in most SSE instructions, which are required to // be naturally aligned on some targets but not on others. If the subtarget @@ -339,9 +370,16 @@ def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; // 256-bit memop pattern fragments // NOTE: all 256-bit integer vector loads are promoted to v4i64 def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>; +def memopv8i32 : PatFrag<(ops node:$ptr), (v8i32 (memop node:$ptr))>; def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>; def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; +// 512-bit memop pattern fragments +def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop node:$ptr))>; +def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop node:$ptr))>; +def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop node:$ptr))>; +def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop node:$ptr))>; + // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. // FIXME: 8 byte alignment for mmx reads is not required diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 0443a93137b4..b7737685ffce 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -81,6 +81,7 @@ enum { TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, + TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT, TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT }; @@ -1177,6 +1178,14 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::PDEP64rr, X86::PDEP64rm, 0 }, { X86::PEXT32rr, X86::PEXT32rm, 0 }, { X86::PEXT64rr, X86::PEXT64rm, 0 }, + + // AVX-512 foldable instructions + { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, + { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, + { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, + { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, + { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, + { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { @@ -1454,6 +1463,8 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::VMOVDQAYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: + case X86::VMOVDQA32rm: + case X86::VMOVDQA64rm: return true; } } @@ -2890,12 +2901,15 @@ static bool isHReg(unsigned Reg) { // Try and copy between VR128/VR64 and GR64 registers. static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, - bool HasAVX) { + const X86Subtarget& Subtarget) { + + // SrcReg(VR128) -> DestReg(GR64) // SrcReg(VR64) -> DestReg(GR64) // SrcReg(GR64) -> DestReg(VR128) // SrcReg(GR64) -> DestReg(VR64) + bool HasAVX = Subtarget.hasAVX(); if (X86::GR64RegClass.contains(DestReg)) { if (X86::VR128RegClass.contains(SrcReg)) // Copy from a VR128 register to a GR64 register. @@ -2926,13 +2940,31 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return 0; } +static +unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { + if (X86::VR128XRegClass.contains(DestReg, SrcReg) || + X86::VR256XRegClass.contains(DestReg, SrcReg) || + X86::VR512RegClass.contains(DestReg, SrcReg)) { + DestReg = get512BitSuperRegister(DestReg); + SrcReg = get512BitSuperRegister(SrcReg); + return X86::VMOVAPSZrr; + } + if ((X86::VK8RegClass.contains(DestReg) || + X86::VK16RegClass.contains(DestReg)) && + (X86::VK8RegClass.contains(SrcReg) || + X86::VK16RegClass.contains(SrcReg))) + return X86::KMOVWkk; + return 0; +} + void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { // First deal with the normal symmetric copies. bool HasAVX = TM.getSubtarget().hasAVX(); - unsigned Opc; + bool HasAVX512 = TM.getSubtarget().hasAVX512(); + unsigned Opc = 0; if (X86::GR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV64rr; else if (X86::GR32RegClass.contains(DestReg, SrcReg)) @@ -2950,14 +2982,17 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, "8-bit H register can not be copied outside GR8_NOREX"); } else Opc = X86::MOV8rr; - } else if (X86::VR128RegClass.contains(DestReg, SrcReg)) + } + else if (X86::VR64RegClass.contains(DestReg, SrcReg)) + Opc = X86::MMX_MOVQ64rr; + else if (HasAVX512) + Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg); + else if (X86::VR128RegClass.contains(DestReg, SrcReg)) Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; else if (X86::VR256RegClass.contains(DestReg, SrcReg)) Opc = X86::VMOVAPSYrr; - else if (X86::VR64RegClass.contains(DestReg, SrcReg)) - Opc = X86::MMX_MOVQ64rr; - else - Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX); + if (!Opc) + Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget()); if (Opc) { BuildMI(MBB, MI, DL, get(Opc), DestReg) diff --git a/llvm/test/CodeGen/X86/avx512-shuffle.ll b/llvm/test/CodeGen/X86/avx512-shuffle.ll new file mode 100644 index 000000000000..9f3d86a5e646 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-shuffle.ll @@ -0,0 +1,66 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; CHECK: LCP +; CHECK: .long 2 +; CHECK: .long 5 +; CHECK: .long 0 +; CHECK: .long 0 +; CHECK: .long 7 +; CHECK: .long 0 +; CHECK: .long 10 +; CHECK: .long 1 +; CHECK: .long 0 +; CHECK: .long 5 +; CHECK: .long 0 +; CHECK: .long 4 +; CHECK: .long 7 +; CHECK: .long 0 +; CHECK: .long 10 +; CHECK: .long 1 +; CHECK: test1: +; CHECK: vpermps +; CHECK: ret +define <16 x float> @test1(<16 x float> %a) nounwind { + %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> + ret <16 x float> %c +} + +; CHECK: test2: +; CHECK: vpermd +; CHECK: ret +define <16 x i32> @test2(<16 x i32> %a) nounwind { + %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> + ret <16 x i32> %c +} + +; CHECK: test3: +; CHECK: vpermq +; CHECK: ret +define <8 x i64> @test3(<8 x i64> %a) nounwind { + %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> + ret <8 x i64> %c +} + +; CHECK: test4: +; CHECK: vpermpd +; CHECK: ret +define <8 x double> @test4(<8 x double> %a) nounwind { + %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> + ret <8 x double> %c +} + +; CHECK: test5: +; CHECK: vpermi2pd +; CHECK: ret +define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind { + %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> + ret <8 x double> %c +} + +; CHECK: test6: +; CHECK: vpermq $30 +; CHECK: ret +define <8 x i64> @test6(<8 x i64> %a) nounwind { + %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> + ret <8 x i64> %c +} +