Expandload and Compressstore intrinsics

Two new intrinsics covering the AVX-512 compress/expand functionality. This implementation includes the syntax, the DAG builder, operation lowering, and tests. It does not include handling of illegal data types, the CodeGenPrepare pass, or the cost model.

llvm-svn: 285876

parent 7c7abafd81
commit caaceef4b3
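As a quick orientation (not part of the diff below), here is a minimal LLVM IR sketch of how the two new intrinsics are declared and called. The @demo function, its arguments, and the .v8f32 overload are illustrative assumptions; the operand order (pointer, mask, pass-through for expandload; value, pointer, mask for compressstore) follows the declarations used in the test file added by this commit.

; Expand-load: read consecutive floats starting at %base and place them, in order,
; into the result lanes whose mask bit is set; lanes with a clear mask bit take the
; corresponding element of the pass-through vector.
declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>)

; Compress-store: take the lanes of %val whose mask bit is set and write them
; contiguously to memory starting at %base.
declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>)

; Hypothetical usage combining both intrinsics.
define <8 x float> @demo(float* %base, <8 x i1> %mask, <8 x float> %passthru, <8 x float> %val) {
  %loaded = call <8 x float> @llvm.masked.expandload.v8f32(float* %base, <8 x i1> %mask, <8 x float> %passthru)
  call void @llvm.masked.compressstore.v8f32(<8 x float> %val, float* %base, <8 x i1> %mask)
  ret <8 x float> %loaded
}

On AVX-512 targets these lower to the vexpandps/vcompressps-style instructions checked in the test file at the end of this commit.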
@@ -100,7 +100,7 @@ namespace Intrinsic {
       Void, VarArg, MMX, Token, Metadata, Half, Float, Double,
       Integer, Vector, Pointer, Struct,
       Argument, ExtendArgument, TruncArgument, HalfVecArgument,
-      SameVecWidthArgument, PtrToArgument, VecOfPtrsToElt
+      SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfPtrsToElt
     } Kind;
 
     union {
@@ -123,7 +123,7 @@ namespace Intrinsic {
       assert(Kind == Argument || Kind == ExtendArgument ||
              Kind == TruncArgument || Kind == HalfVecArgument ||
             Kind == SameVecWidthArgument || Kind == PtrToArgument ||
-            Kind == VecOfPtrsToElt);
+            Kind == PtrToElt || Kind == VecOfPtrsToElt);
       return Argument_Info >> 3;
     }
     ArgKind getArgumentKind() const {
@@ -133,6 +133,7 @@ class LLVMVectorSameWidth<int num, LLVMType elty>
   ValueType ElTy = elty.VT;
 }
 class LLVMPointerTo<int num> : LLVMMatchType<num>;
+class LLVMPointerToElt<int num> : LLVMMatchType<num>;
 class LLVMVectorOfPointersToElt<int num> : LLVMMatchType<num>;
 
 // Match the type of another intrinsic parameter that is expected to be a
@@ -718,13 +719,25 @@ def int_masked_gather: Intrinsic<[llvm_anyvector_ty],
                                  [LLVMVectorOfPointersToElt<0>, llvm_i32_ty,
                                   LLVMVectorSameWidth<0, llvm_i1_ty>,
                                   LLVMMatchType<0>],
-                                 [IntrReadMem]>;
+                                  [IntrReadMem]>;
 
 def int_masked_scatter: Intrinsic<[],
                                   [llvm_anyvector_ty,
                                    LLVMVectorOfPointersToElt<0>, llvm_i32_ty,
                                    LLVMVectorSameWidth<0, llvm_i1_ty>]>;
 
+def int_masked_expandload: Intrinsic<[llvm_anyvector_ty],
+                                     [LLVMPointerToElt<0>,
+                                      LLVMVectorSameWidth<0, llvm_i1_ty>,
+                                      LLVMMatchType<0>],
+                                     [IntrReadMem]>;
+
+def int_masked_compressstore: Intrinsic<[],
+                                        [llvm_anyvector_ty,
+                                         LLVMPointerToElt<0>,
+                                         LLVMVectorSameWidth<0, llvm_i1_ty>],
+                                        [IntrArgMemOnly]>;
+
 // Test whether a pointer is associated with a type metadata identifier.
 def int_type_test : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],
                               [IntrNoMem]>;
@@ -5583,7 +5583,7 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
                             Alignment, MST->getAAInfo(), MST->getRanges());
 
     Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
-                            MST->isTruncatingStore());
+                            MST->isTruncatingStore(), MST->isCompressingStore());
 
     unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
     Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -5596,7 +5596,7 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
                            MST->getRanges());
 
     Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
-                            MST->isTruncatingStore());
+                            MST->isTruncatingStore(), MST->isCompressingStore());
 
     AddToWorklist(Lo.getNode());
     AddToWorklist(Hi.getNode());
@@ -1212,7 +1212,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
 
   return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
                             N->getMemoryVT(), N->getMemOperand(),
-                            TruncateStore);
+                            TruncateStore, N->isCompressingStore());
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
@@ -3667,16 +3667,39 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
   DAG.setRoot(StoreNode);
 }
 
-void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
+void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
+                                           bool IsCompressing) {
   SDLoc sdl = getCurSDLoc();
 
-  // llvm.masked.store.*(Src0, Ptr, alignment, Mask)
-  Value *PtrOperand = I.getArgOperand(1);
+  auto getMaskedStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
+                               unsigned& Alignment) {
+    // llvm.masked.store.*(Src0, Ptr, alignment, Mask)
+    Src0 = I.getArgOperand(0);
+    Ptr = I.getArgOperand(1);
+    Alignment = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+    Mask = I.getArgOperand(3);
+  };
+  auto getCompressingStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
+                                    unsigned& Alignment) {
+    // llvm.masked.compressstore.*(Src0, Ptr, Mask)
+    Src0 = I.getArgOperand(0);
+    Ptr = I.getArgOperand(1);
+    Mask = I.getArgOperand(2);
+    Alignment = 0;
+  };
+
+  Value *PtrOperand, *MaskOperand, *Src0Operand;
+  unsigned Alignment;
+  if (IsCompressing)
+    getCompressingStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
+  else
+    getMaskedStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
+
   SDValue Ptr = getValue(PtrOperand);
-  SDValue Src0 = getValue(I.getArgOperand(0));
-  SDValue Mask = getValue(I.getArgOperand(3));
+  SDValue Src0 = getValue(Src0Operand);
+  SDValue Mask = getValue(MaskOperand);
+
   EVT VT = Src0.getValueType();
-  unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
   if (!Alignment)
     Alignment = DAG.getEVTAlignment(VT);
@@ -3689,7 +3712,8 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
                           MachineMemOperand::MOStore, VT.getStoreSize(),
                           Alignment, AAInfo);
   SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
-                                         MMO, false);
+                                         MMO, false /* Truncating */,
+                                         IsCompressing);
   DAG.setRoot(StoreNode);
   setValue(&I, StoreNode);
 }
@@ -3710,7 +3734,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
 // extract the spalt value and use it as a uniform base.
 // In all other cases the function returns 'false'.
 //
-static bool getUniformBase(const Value *& Ptr, SDValue& Base, SDValue& Index,
+static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
                            SelectionDAGBuilder* SDB) {
 
   SelectionDAG& DAG = SDB->DAG;
@@ -3790,18 +3814,38 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
   setValue(&I, Scatter);
 }
 
-void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
+void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
   SDLoc sdl = getCurSDLoc();
 
-  // @llvm.masked.load.*(Ptr, alignment, Mask, Src0)
-  Value *PtrOperand = I.getArgOperand(0);
-  SDValue Ptr = getValue(PtrOperand);
-  SDValue Src0 = getValue(I.getArgOperand(3));
-  SDValue Mask = getValue(I.getArgOperand(2));
+  auto getMaskedLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
+                              unsigned& Alignment) {
+    // @llvm.masked.load.*(Ptr, alignment, Mask, Src0)
+    Ptr = I.getArgOperand(0);
+    Alignment = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+    Mask = I.getArgOperand(2);
+    Src0 = I.getArgOperand(3);
+  };
+  auto getExpandingLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
+                                 unsigned& Alignment) {
+    // @llvm.masked.expandload.*(Ptr, Mask, Src0)
+    Ptr = I.getArgOperand(0);
+    Alignment = 0;
+    Mask = I.getArgOperand(1);
+    Src0 = I.getArgOperand(2);
+  };
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
-  unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
+  Value *PtrOperand, *MaskOperand, *Src0Operand;
+  unsigned Alignment;
+  if (IsExpanding)
+    getExpandingLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
+  else
+    getMaskedLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
+
+  SDValue Ptr = getValue(PtrOperand);
+  SDValue Src0 = getValue(Src0Operand);
+  SDValue Mask = getValue(MaskOperand);
+
+  EVT VT = Src0.getValueType();
   if (!Alignment)
     Alignment = DAG.getEVTAlignment(VT);
@@ -3821,7 +3865,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
                          Alignment, AAInfo, Ranges);
 
   SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
-                                   ISD::NON_EXTLOAD, false);
+                                   ISD::NON_EXTLOAD, IsExpanding);
   if (AddToChain) {
     SDValue OutChain = Load.getValue(1);
     DAG.setRoot(OutChain);
@@ -5054,6 +5098,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::masked_store:
     visitMaskedStore(I);
     return nullptr;
+  case Intrinsic::masked_expandload:
+    visitMaskedLoad(I, true /* IsExpanding */);
+    return nullptr;
+  case Intrinsic::masked_compressstore:
+    visitMaskedStore(I, true /* IsCompressing */);
+    return nullptr;
   case Intrinsic::x86_mmx_pslli_w:
   case Intrinsic::x86_mmx_pslli_d:
   case Intrinsic::x86_mmx_pslli_q:
@@ -874,8 +874,8 @@ private:
   void visitAlloca(const AllocaInst &I);
   void visitLoad(const LoadInst &I);
   void visitStore(const StoreInst &I);
-  void visitMaskedLoad(const CallInst &I);
-  void visitMaskedStore(const CallInst &I);
+  void visitMaskedLoad(const CallInst &I, bool IsExpanding = false);
+  void visitMaskedStore(const CallInst &I, bool IsCompressing = false);
   void visitMaskedGather(const CallInst &I);
   void visitMaskedScatter(const CallInst &I);
   void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
@@ -607,10 +607,11 @@ enum IIT_Info {
   IIT_HALF_VEC_ARG = 30,
   IIT_SAME_VEC_WIDTH_ARG = 31,
   IIT_PTR_TO_ARG = 32,
-  IIT_VEC_OF_PTRS_TO_ELT = 33,
-  IIT_I128 = 34,
-  IIT_V512 = 35,
-  IIT_V1024 = 36
+  IIT_PTR_TO_ELT = 33,
+  IIT_VEC_OF_PTRS_TO_ELT = 34,
+  IIT_I128 = 35,
+  IIT_V512 = 36,
+  IIT_V1024 = 37
 };
 
@@ -744,6 +745,11 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
                                              ArgInfo));
     return;
   }
+  case IIT_PTR_TO_ELT: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::PtrToElt, ArgInfo));
+    return;
+  }
   case IIT_VEC_OF_PTRS_TO_ELT: {
     unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::VecOfPtrsToElt,
@@ -870,6 +876,14 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
     Type *Ty = Tys[D.getArgumentNumber()];
     return PointerType::getUnqual(Ty);
   }
+  case IITDescriptor::PtrToElt: {
+    Type *Ty = Tys[D.getArgumentNumber()];
+    VectorType *VTy = dyn_cast<VectorType>(Ty);
+    if (!VTy)
+      llvm_unreachable("Expected an argument of Vector Type");
+    Type *EltTy = VTy->getVectorElementType();
+    return PointerType::getUnqual(EltTy);
+  }
   case IITDescriptor::VecOfPtrsToElt: {
     Type *Ty = Tys[D.getArgumentNumber()];
     VectorType *VTy = dyn_cast<VectorType>(Ty);
@@ -1048,7 +1062,7 @@ bool Intrinsic::matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor>
       if (D.getArgumentNumber() >= ArgTys.size())
         return true;
       VectorType * ReferenceType =
-        dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+              dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
       VectorType *ThisArgType = dyn_cast<VectorType>(Ty);
       if (!ThisArgType || !ReferenceType ||
           (ReferenceType->getVectorNumElements() !=
@@ -1064,6 +1078,16 @@ bool Intrinsic::matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor>
       PointerType *ThisArgType = dyn_cast<PointerType>(Ty);
       return (!ThisArgType || ThisArgType->getElementType() != ReferenceType);
     }
+    case IITDescriptor::PtrToElt: {
+      if (D.getArgumentNumber() >= ArgTys.size())
+        return true;
+      VectorType * ReferenceType =
+        dyn_cast<VectorType> (ArgTys[D.getArgumentNumber()]);
+      PointerType *ThisArgType = dyn_cast<PointerType>(Ty);
+
+      return (!ThisArgType || !ReferenceType ||
+              ThisArgType->getElementType() != ReferenceType->getElementType());
+    }
     case IITDescriptor::VecOfPtrsToElt: {
       if (D.getArgumentNumber() >= ArgTys.size())
         return true;
@@ -1232,10 +1232,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
       setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
     } else {
-      setOperationAction(ISD::MLOAD, MVT::v8i32, Custom);
-      setOperationAction(ISD::MLOAD, MVT::v8f32, Custom);
-      setOperationAction(ISD::MSTORE, MVT::v8i32, Custom);
-      setOperationAction(ISD::MSTORE, MVT::v8f32, Custom);
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+        setOperationAction(ISD::MLOAD, VT, Custom);
+        setOperationAction(ISD::MSTORE, VT, Custom);
+      }
     }
     setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
@@ -21940,26 +21941,48 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
   SDValue Mask = N->getMask();
   SDLoc dl(Op);
 
+  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
+         "Expanding masked load is supported on AVX-512 target only!");
+
+  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
+         "Expanding masked load is supported for 32 and 64-bit types only!");
+
+  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
+  // VLX. These types for exp-loads are handled here.
+  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
+    return Op;
+
   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
          "Cannot lower masked load op.");
 
-  assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
+  assert((ScalarVT.getSizeInBits() >= 32 ||
           (Subtarget.hasBWI() &&
               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
          "Unsupported masked load op.");
 
   // This operation is legal for targets with VLX, but without
   // VLX the vector should be widened to 512 bit
-  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
-  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
   SDValue Src0 = N->getSrc0();
   Src0 = ExtendToType(Src0, WideDataVT, DAG);
+
+  // Mask element has to be i1
+  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
+  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
+         "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+
+  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+
   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+  if (MaskEltTy != MVT::i1)
+    Mask = DAG.getNode(ISD::TRUNCATE, dl,
+                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
   SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                       N->getBasePtr(), Mask, Src0,
                                       N->getMemoryVT(), N->getMemOperand(),
-                                      N->getExtensionType());
+                                      N->getExtensionType(),
+                                      N->isExpandingLoad());
 
   SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
@@ -21977,10 +22000,20 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
   SDValue Mask = N->getMask();
   SDLoc dl(Op);
 
+  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
+         "Expanding masked load is supported on AVX-512 target only!");
+
+  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
+         "Expanding masked load is supported for 32 and 64-bit types only!");
+
+  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX.
+  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
+    return Op;
+
   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
          "Cannot lower masked store op.");
 
-  assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
+  assert((ScalarVT.getSizeInBits() >= 32 ||
           (Subtarget.hasBWI() &&
               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
          "Unsupported masked store op.");
@@ -21989,12 +22022,22 @@
   // VLX the vector should be widened to 512 bit
   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
-  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+
+  // Mask element has to be i1
+  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
+  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
+         "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+
+  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+
   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+  if (MaskEltTy != MVT::i1)
+    Mask = DAG.getNode(ISD::TRUNCATE, dl,
+                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                             Mask, N->getMemoryVT(), N->getMemOperand(),
-                            N->isTruncatingStore());
+                            N->isTruncatingStore(), N->isCompressingStore());
 }
 
 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
@@ -29881,6 +29924,11 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget &Subtarget) {
   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+
+  // TODO: Expanding load with constant mask may be optimized as well.
+  if (Mld->isExpandingLoad())
+    return SDValue();
+
   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
     if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
       return ScalarLoad;
@@ -29996,6 +30044,10 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+
+  if (Mst->isCompressingStore())
+    return SDValue();
+
   if (!Mst->isTruncatingStore())
     return reduceMaskedStoreToScalarStore(Mst, DAG);
 
@@ -965,28 +965,23 @@ def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
 
 def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                          (X86mstore node:$src1, node:$src2, node:$src3), [{
-  if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
-    return Store->getAlignment() >= 16;
-  return false;
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16;
 }]>;
 
 def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                          (X86mstore node:$src1, node:$src2, node:$src3), [{
-  if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
-    return Store->getAlignment() >= 32;
-  return false;
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32;
 }]>;
 
 def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
                          (X86mstore node:$src1, node:$src2, node:$src3), [{
-  if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
-    return Store->getAlignment() >= 64;
-  return false;
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64;
 }]>;
 
 def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                         (X86mstore node:$src1, node:$src2, node:$src3), [{
-  return isa<MaskedStoreSDNode>(N);
+                         (masked_store node:$src1, node:$src2, node:$src3), [{
+  return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
+         (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
 }]>;
 
 def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -0,0 +1,247 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX
; RUN: llc -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define <16 x float> @test1(float* %base) {
; ALL-LABEL: test1:
; ALL: # BB#0:
; ALL-NEXT: movw $-2049, %ax # imm = 0xF7FF
; ALL-NEXT: kmovw %eax, %k1
; ALL-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
; ALL-NEXT: retq
  %res = call <16 x float> @llvm.masked.expandload.v16f32(float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

define <16 x float> @test2(float* %base, <16 x float> %src0) {
; ALL-LABEL: test2:
; ALL: # BB#0:
; ALL-NEXT: movw $30719, %ax # imm = 0x77FF
; ALL-NEXT: kmovw %eax, %k1
; ALL-NEXT: vexpandps (%rdi), %zmm0 {%k1}
; ALL-NEXT: retq
  %res = call <16 x float> @llvm.masked.expandload.v16f32(float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x float> %src0)
  ret <16 x float>%res
}

define <8 x double> @test3(double* %base, <8 x double> %src0, <8 x i1> %mask) {
; SKX-LABEL: test3:
; SKX: # BB#0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vexpandpd (%rdi), %zmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test3:
; KNL: # BB#0:
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vexpandpd (%rdi), %zmm0 {%k1}
; KNL-NEXT: retq
  %res = call <8 x double> @llvm.masked.expandload.v8f64(double* %base, <8 x i1> %mask, <8 x double> %src0)
  ret <8 x double>%res
}

define <4 x float> @test4(float* %base, <4 x float> %src0) {
; SKX-LABEL: test4:
; SKX: # BB#0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test4:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL-NEXT: movw $7, %ax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1}
; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
  %res = call <4 x float> @llvm.masked.expandload.v4f32(float* %base, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> %src0)
  ret <4 x float>%res
}

define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
; SKX-LABEL: test5:
; SKX: # BB#0:
; SKX-NEXT: movb $2, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vpexpandq (%rdi), %xmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test5:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL-NEXT: movb $2, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandq (%rdi), %zmm0 {%k1}
; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
  %res = call <2 x i64> @llvm.masked.expandload.v2i64(i64* %base, <2 x i1> <i1 false, i1 true>, <2 x i64> %src0)
  ret <2 x i64>%res
}

declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)

define void @test6(float* %base, <16 x float> %V) {
; ALL-LABEL: test6:
; ALL: # BB#0:
; ALL-NEXT: movw $-2049, %ax # imm = 0xF7FF
; ALL-NEXT: kmovw %eax, %k1
; ALL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; ALL-NEXT: retq
  call void @llvm.masked.compressstore.v16f32(<16 x float> %V, float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
; SKX-LABEL: test7:
; SKX: # BB#0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vcompressps %ymm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test7:
; KNL: # BB#0:
; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
; KNL-NEXT: kshiftlw $8, %k0, %k0
; KNL-NEXT: kshiftrw $8, %k0, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; KNL-NEXT: retq
  call void @llvm.masked.compressstore.v8f32(<8 x float> %V, float* %base, <8 x i1> %mask)
  ret void
}

define void @test8(double* %base, <8 x double> %V, <8 x i1> %mask) {
; SKX-LABEL: test8:
; SKX: # BB#0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test8:
; KNL: # BB#0:
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
; KNL-NEXT: retq
  call void @llvm.masked.compressstore.v8f64(<8 x double> %V, double* %base, <8 x i1> %mask)
  ret void
}

define void @test9(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
; SKX-LABEL: test9:
; SKX: # BB#0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test9:
; KNL: # BB#0:
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
; KNL-NEXT: retq
  call void @llvm.masked.compressstore.v8i64(<8 x i64> %V, i64* %base, <8 x i1> %mask)
  ret void
}

define void @test10(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
; SKX-LABEL: test10:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test10:
; KNL: # BB#0:
; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpslld $31, %xmm1, %xmm1
; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
; KNL-NEXT: retq
  call void @llvm.masked.compressstore.v4i64(<4 x i64> %V, i64* %base, <4 x i1> %mask)
  ret void
}

define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
; SKX-LABEL: test11:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpcompressq %xmm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test11:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
; KNL-NEXT: retq
  call void @llvm.masked.compressstore.v2i64(<2 x i64> %V, i64* %base, <2 x i1> %mask)
  ret void
}

define void @test12(float* %base, <4 x float> %V, <4 x i1> %mask) {
; SKX-LABEL: test12:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test12:
; KNL: # BB#0:
; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL-NEXT: vpslld $31, %xmm1, %xmm1
; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; KNL-NEXT: retq
  call void @llvm.masked.compressstore.v4f32(<4 x float> %V, float* %base, <4 x i1> %mask)
  ret void
}

declare void @llvm.masked.compressstore.v16f32(<16 x float>, float* , <16 x i1>)
declare void @llvm.masked.compressstore.v8f32(<8 x float>, float* , <8 x i1>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, double* , <8 x i1>)
declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32* , <16 x i1>)
declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32* , <8 x i1>)
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64* , <8 x i1>)
declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32* , <4 x i1>)
declare void @llvm.masked.compressstore.v4f32(<4 x float>, float* , <4 x i1>)
declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64* , <4 x i1>)
declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64* , <2 x i1>)
@@ -550,8 +550,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
       // overloaded, all the types can be specified directly.
       assert(((!TyEl->isSubClassOf("LLVMExtendedType") &&
                !TyEl->isSubClassOf("LLVMTruncatedType") &&
-               !TyEl->isSubClassOf("LLVMVectorSameWidth") &&
-               !TyEl->isSubClassOf("LLVMPointerToElt")) ||
+               !TyEl->isSubClassOf("LLVMVectorSameWidth")) ||
               VT == MVT::iAny || VT == MVT::vAny) &&
              "Expected iAny or vAny type");
     } else
@@ -213,10 +213,11 @@ enum IIT_Info {
   IIT_HALF_VEC_ARG = 30,
   IIT_SAME_VEC_WIDTH_ARG = 31,
   IIT_PTR_TO_ARG = 32,
-  IIT_VEC_OF_PTRS_TO_ELT = 33,
-  IIT_I128 = 34,
-  IIT_V512 = 35,
-  IIT_V1024 = 36
+  IIT_PTR_TO_ELT = 33,
+  IIT_VEC_OF_PTRS_TO_ELT = 34,
+  IIT_I128 = 35,
+  IIT_V512 = 36,
+  IIT_V1024 = 37
 };
 
@@ -277,6 +278,8 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
      Sig.push_back(IIT_PTR_TO_ARG);
    else if (R->isSubClassOf("LLVMVectorOfPointersToElt"))
      Sig.push_back(IIT_VEC_OF_PTRS_TO_ELT);
+   else if (R->isSubClassOf("LLVMPointerToElt"))
+     Sig.push_back(IIT_PTR_TO_ELT);
    else
      Sig.push_back(IIT_ARG);
    return Sig.push_back((Number << 3) | ArgCodes[Number]);