Scalarizer for masked.gather and masked.scatter intrinsics.

When the target does not support these intrinsics, they should be converted to a chain of scalar load or store operations. If the mask is not constant, the scalarizer builds a chain of conditional basic blocks. I added the isLegalMaskedGather() and isLegalMaskedScatter() APIs.

Differential Revision: http://reviews.llvm.org/D13722

llvm-svn: 251237
This commit is contained in:
parent be187a0a1a
commit 092858588a
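For orientation, here is a minimal sketch (not part of this patch) of how a pass can consult the new TTI hooks before falling back to scalarization; it mirrors the CodeGenPrepare check in the diff below, and the helper name shouldScalarizeMaskedGather is purely illustrative:

// Sketch only: decide whether a masked gather call must be scalarized.
// The legality hooks added by this patch take the data type; for a gather
// that is the call's return type, for a scatter it would be the type of the
// stored value (operand 0).
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

static bool shouldScalarizeMaskedGather(const IntrinsicInst *II,
                                        const TargetTransformInfo &TTI) {
  assert(II->getIntrinsicID() == Intrinsic::masked_gather &&
         "expected a call to @llvm.masked.gather");
  return !TTI.isLegalMaskedGather(II->getType());
}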
@@ -316,6 +316,12 @@ public:
  bool isLegalMaskedStore(Type *DataType) const;
  bool isLegalMaskedLoad(Type *DataType) const;

  /// \brief Return true if the target supports masked gather/scatter
  /// AVX-512 fully supports gather and scatter for vectors with 32 and 64
  /// bits scalar type.
  bool isLegalMaskedScatter(Type *DataType) const;
  bool isLegalMaskedGather(Type *DataType) const;

  /// \brief Return the cost of the scaling factor used in the addressing
  /// mode represented by AM for this target, for a load/store
  /// of the specified type.

@@ -569,6 +575,8 @@ public:
                                     unsigned AddrSpace) = 0;
  virtual bool isLegalMaskedStore(Type *DataType) = 0;
  virtual bool isLegalMaskedLoad(Type *DataType) = 0;
  virtual bool isLegalMaskedScatter(Type *DataType) = 0;
  virtual bool isLegalMaskedGather(Type *DataType) = 0;
  virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                   int64_t BaseOffset, bool HasBaseReg,
                                   int64_t Scale, unsigned AddrSpace) = 0;

@@ -698,6 +706,12 @@ public:
  bool isLegalMaskedLoad(Type *DataType) override {
    return Impl.isLegalMaskedLoad(DataType);
  }
  bool isLegalMaskedScatter(Type *DataType) override {
    return Impl.isLegalMaskedScatter(DataType);
  }
  bool isLegalMaskedGather(Type *DataType) override {
    return Impl.isLegalMaskedGather(DataType);
  }
  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale,
                           unsigned AddrSpace) override {
@@ -213,6 +213,10 @@ public:

  bool isLegalMaskedLoad(Type *DataType) { return false; }

  bool isLegalMaskedScatter(Type *DataType) { return false; }

  bool isLegalMaskedGather(Type *DataType) { return false; }

  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
    // Guess that all legal addressing mode are free.
@@ -121,6 +121,14 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType) const {
  return TTIImpl->isLegalMaskedLoad(DataType);
}

bool TargetTransformInfo::isLegalMaskedGather(Type *DataType) const {
  return TTIImpl->isLegalMaskedGather(DataType);
}

bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType) const {
  return TTIImpl->isLegalMaskedScatter(DataType);
}

int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                              int64_t BaseOffset,
                                              bool HasBaseReg,
@@ -1215,7 +1215,7 @@ static void ScalarizeMaskedLoad(CallInst *CI) {

    Value *Gep =
        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
-   LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+   LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));

    // Create "else" block, fill it in the next iteration
@@ -1353,6 +1353,250 @@ static void ScalarizeMaskedStore(CallInst *CI) {
  CI->eraseFromParent();
}

// Translate a masked gather intrinsic like
// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
//                                         <16 x i1> %Mask, <16 x i32> %Src)
// to a chain of basic blocks, with loading element one-by-one if
// the appropriate mask bit is set
//
// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
// % Mask0 = extractelement <16 x i1> %Mask, i32 0
// % ToLoad0 = icmp eq i1 % Mask0, true
// br i1 % ToLoad0, label %cond.load, label %else
//
// cond.load:
// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
// % Load0 = load i32, i32* % Ptr0, align 4
// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
// br label %else
//
// else:
// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
// % Mask1 = extractelement <16 x i1> %Mask, i32 1
// % ToLoad1 = icmp eq i1 % Mask1, true
// br i1 % ToLoad1, label %cond.load1, label %else2
//
// cond.load1:
// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
// % Load1 = load i32, i32* % Ptr1, align 4
// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
// br label %else2
// . . .
// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
// ret <16 x i32> %Result
static void ScalarizeMaskedGather(CallInst *CI) {
  Value *Ptrs = CI->getArgOperand(0);
  Value *Alignment = CI->getArgOperand(1);
  Value *Mask = CI->getArgOperand(2);
  Value *Src0 = CI->getArgOperand(3);

  VectorType *VecType = dyn_cast<VectorType>(CI->getType());

  assert(VecType && "Unexpected return type of masked load intrinsic");

  IRBuilder<> Builder(CI->getContext());
  Instruction *InsertPt = CI;
  BasicBlock *IfBlock = CI->getParent();
  BasicBlock *CondBlock = nullptr;
  BasicBlock *PrevIfBlock = CI->getParent();
  Builder.SetInsertPoint(InsertPt);
  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();

  Builder.SetCurrentDebugLocation(CI->getDebugLoc());

  Value *UndefVal = UndefValue::get(VecType);

  // The result vector
  Value *VResult = UndefVal;
  unsigned VectorWidth = VecType->getNumElements();

  // Shorten the way if the mask is a vector of constants.
  bool IsConstMask = isa<ConstantVector>(Mask);

  if (IsConstMask) {
    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
        continue;
      Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
                                                "Ptr" + Twine(Idx));
      LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
                                                 "Load" + Twine(Idx));
      VResult = Builder.CreateInsertElement(VResult, Load,
                                            Builder.getInt32(Idx),
                                            "Res" + Twine(Idx));
    }
    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
    CI->replaceAllUsesWith(NewI);
    CI->eraseFromParent();
    return;
  }

  PHINode *Phi = nullptr;
  Value *PrevPhi = UndefVal;

  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {

    // Fill the "else" block, created in the previous iteration
    //
    //  %Mask1 = extractelement <16 x i1> %Mask, i32 1
    //  %ToLoad1 = icmp eq i1 %Mask1, true
    //  br i1 %ToLoad1, label %cond.load, label %else
    //
    if (Idx > 0) {
      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
      Phi->addIncoming(VResult, CondBlock);
      Phi->addIncoming(PrevPhi, PrevIfBlock);
      PrevPhi = Phi;
      VResult = Phi;
    }

    Value *Predicate = Builder.CreateExtractElement(Mask,
                                                    Builder.getInt32(Idx),
                                                    "Mask" + Twine(Idx));
    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
                                    ConstantInt::get(Predicate->getType(), 1),
                                    "ToLoad" + Twine(Idx));

    // Create "cond" block
    //
    //  %EltAddr = getelementptr i32* %1, i32 0
    //  %Elt = load i32* %EltAddr
    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
    //
    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
    Builder.SetInsertPoint(InsertPt);

    Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
                                              "Ptr" + Twine(Idx));
    LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
                                               "Load" + Twine(Idx));
    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
                                          "Res" + Twine(Idx));

    // Create "else" block, fill it in the next iteration
    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
    Builder.SetInsertPoint(InsertPt);
    Instruction *OldBr = IfBlock->getTerminator();
    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
    OldBr->eraseFromParent();
    PrevIfBlock = IfBlock;
    IfBlock = NewIfBlock;
  }

  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
  Phi->addIncoming(VResult, CondBlock);
  Phi->addIncoming(PrevPhi, PrevIfBlock);
  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
  CI->replaceAllUsesWith(NewI);
  CI->eraseFromParent();
}

// Translate a masked scatter intrinsic, like
// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*> %Ptrs, i32 4,
//                                  <16 x i1> %Mask)
// to a chain of basic blocks, that stores element one-by-one if
// the appropriate mask bit is set.
//
// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
// % Mask0 = extractelement <16 x i1> % Mask, i32 0
// % ToStore0 = icmp eq i1 % Mask0, true
// br i1 %ToStore0, label %cond.store, label %else
//
// cond.store:
// % Elt0 = extractelement <16 x i32> %Src, i32 0
// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
// store i32 %Elt0, i32* % Ptr0, align 4
// br label %else
//
// else:
// % Mask1 = extractelement <16 x i1> % Mask, i32 1
// % ToStore1 = icmp eq i1 % Mask1, true
// br i1 % ToStore1, label %cond.store1, label %else2
//
// cond.store1:
// % Elt1 = extractelement <16 x i32> %Src, i32 1
// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
// store i32 % Elt1, i32* % Ptr1, align 4
// br label %else2
// . . .
static void ScalarizeMaskedScatter(CallInst *CI) {
  Value *Src = CI->getArgOperand(0);
  Value *Ptrs = CI->getArgOperand(1);
  Value *Alignment = CI->getArgOperand(2);
  Value *Mask = CI->getArgOperand(3);

  assert(isa<VectorType>(Src->getType()) &&
         "Unexpected data type in masked scatter intrinsic");
  assert(isa<VectorType>(Ptrs->getType()) &&
         isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
         "Vector of pointers is expected in masked scatter intrinsic");

  IRBuilder<> Builder(CI->getContext());
  Instruction *InsertPt = CI;
  BasicBlock *IfBlock = CI->getParent();
  Builder.SetInsertPoint(InsertPt);
  Builder.SetCurrentDebugLocation(CI->getDebugLoc());

  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
  unsigned VectorWidth = Src->getType()->getVectorNumElements();

  // Shorten the way if the mask is a vector of constants.
  bool IsConstMask = isa<ConstantVector>(Mask);

  if (IsConstMask) {
    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
        continue;
      Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
                                                   "Elt" + Twine(Idx));
      Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
                                                "Ptr" + Twine(Idx));
      Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
    }
    CI->eraseFromParent();
    return;
  }
  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
    // Fill the "else" block, created in the previous iteration
    //
    //  % Mask1 = extractelement <16 x i1> % Mask, i32 Idx
    //  % ToStore = icmp eq i1 % Mask1, true
    //  br i1 % ToStore, label %cond.store, label %else
    //
    Value *Predicate = Builder.CreateExtractElement(Mask,
                                                    Builder.getInt32(Idx),
                                                    "Mask" + Twine(Idx));
    Value *Cmp =
        Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
                           ConstantInt::get(Predicate->getType(), 1),
                           "ToStore" + Twine(Idx));

    // Create "cond" block
    //
    //  % Elt1 = extractelement <16 x i32> %Src, i32 1
    //  % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
    //  store i32 % Elt1, i32* % Ptr1
    //
    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
    Builder.SetInsertPoint(InsertPt);

    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
                                                 "Elt" + Twine(Idx));
    Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
                                              "Ptr" + Twine(Idx));
    Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);

    // Create "else" block, fill it in the next iteration
    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
    Builder.SetInsertPoint(InsertPt);
    Instruction *OldBr = IfBlock->getTerminator();
    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
    OldBr->eraseFromParent();
    IfBlock = NewIfBlock;
  }
  CI->eraseFromParent();
}

bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
  BasicBlock *BB = CI->getParent();
@@ -1460,6 +1704,22 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
    }
    return false;
  }
  case Intrinsic::masked_gather: {
    if (!TTI->isLegalMaskedGather(CI->getType())) {
      ScalarizeMaskedGather(CI);
      ModifiedDT = true;
      return true;
    }
    return false;
  }
  case Intrinsic::masked_scatter: {
    if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
      ScalarizeMaskedScatter(CI);
      ModifiedDT = true;
      return true;
    }
    return false;
  }
  case Intrinsic::aarch64_stlxr:
  case Intrinsic::aarch64_stxr: {
    ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
@@ -1203,6 +1203,33 @@ bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
  return isLegalMaskedLoad(DataType);
}

bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
  // This function is called now in two cases: from the Loop Vectorizer
  // and from the Scalarizer.
  // When the Loop Vectorizer asks about legality of the feature,
  // the vectorization factor is not calculated yet. The Loop Vectorizer
  // sends a scalar type and the decision is based on the width of the
  // scalar element.
  // Later on, the cost model will estimate usage of this intrinsic based on
  // the vector type.
  // The Scalarizer asks again about legality. It sends a vector type.
  // In this case we can reject non-power-of-2 vectors.
  if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
    return false;
  Type *ScalarTy = DataTy->getScalarType();
  // TODO: Pointers should also be legal,
  // but it requires additional support in composing intrinsic names.
  // getPrimitiveSizeInBits() returns 0 for PointerType
  int DataWidth = ScalarTy->getPrimitiveSizeInBits();

  // AVX-512 allows gather and scatter
  return DataWidth >= 32 && ST->hasAVX512();
}

bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
  return isLegalMaskedGather(DataType);
}

bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -90,6 +90,8 @@ public:
                                 Type *Ty);
  bool isLegalMaskedLoad(Type *DataType);
  bool isLegalMaskedStore(Type *DataType);
  bool isLegalMaskedGather(Type *DataType);
  bool isLegalMaskedScatter(Type *DataType);
  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;
@@ -1,4 +1,6 @@
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=KNL
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@@ -6,6 +8,14 @@ target triple = "x86_64-unknown-linux-gnu"
; KNL-LABEL: test1
; KNL: kxnorw %k1, %k1, %k1
; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}

; SCALAR-LABEL: test1
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float

define <16 x float> @test1(float* %base, <16 x i32> %ind) {

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
@@ -25,6 +35,18 @@ declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i3
; KNL-LABEL: test2
; KNL: kmovw %esi, %k1
; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}

; SCALAR-LABEL: test2
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %res.phi.else = phi
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2

define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0

@@ -76,6 +98,20 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL: vpscatterdd {{.*}}%k2
; KNL: vpscatterdd {{.*}}%k1

; SCALAR-LABEL: test5
; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true
; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else
; SCALAR: cond.store:
; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0
; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2

define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
@@ -96,6 +132,16 @@ declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x
; KNL: kxnorw %k2, %k2, %k2
; KNL: vpgatherqd (,%zmm{{.*}}), %ymm{{.*}} {%k2}
; KNL: vpscatterqd %ymm{{.*}}, (,%zmm{{.*}}) {%k1}

; SCALAR-LABEL: test6
; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1
; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2
; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4

define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {

  %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)

@@ -245,3 +291,42 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
}

; KNL-LABEL: test15
; KNL: kmovw %eax, %k1
; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}

; SCALAR-LABEL: test15
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float

define <16 x float> @test15(float* %base, <16 x i32> %ind) {

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
  ret <16 x float>%res
}

; Check non-power-of-2 case. It should be scalarized.
declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
; KNL-LABEL: test16
; KNL: testb
; KNL: je
; KNL: testb
; KNL: je
; KNL: testb
; KNL: je
define <3 x i32> @test16(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
  %sext_ind = sext <3 x i32> %ind to <3 x i64>
  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
  %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
  ret <3 x i32>%res
}