[AArch64][SVE] Add cost model for masked gather and scatter for scalable vectors.
A new TTI interface, Optional<unsigned> getMaxVScale(), has been added; it returns the maximum value of vscale for a given target. When the maximum is known, getMaxVScale is used to compute the cost of a masked gather or scatter on scalable vectors.

Depends on D92094

Differential Revision: https://reviews.llvm.org/D93030
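For illustration only, a minimal sketch of how a client might consume the new interface; the function name boundOnRuntimeElements is hypothetical, not code from this patch:

// Sketch (hypothetical helper): bound the runtime element count of a
// scalable vector using the new TTI hook.
unsigned boundOnRuntimeElements(const TargetTransformInfo &TTI,
                                ElementCount LegalVF) {
  // getMaxVScale() returns None when the target declares no architectural
  // maximum vector length, so the caller must handle that case.
  if (Optional<unsigned> MaxVScale = TTI.getMaxVScale())
    return *MaxVScale * LegalVF.getKnownMinValue();
  return LegalVF.getKnownMinValue(); // only the known minimum is safe
}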
commit 060cfd9795
parent d38a0258a5
@@ -928,6 +928,10 @@ public:
  /// \return The width of the smallest vector register type.
  unsigned getMinVectorRegisterBitWidth() const;

  /// \return The maximum value of vscale if the target specifies an
  /// architectural maximum vector length, and None otherwise.
  Optional<unsigned> getMaxVScale() const;

  /// \return True if the vectorization factor should be chosen to
  /// make the vector of the smallest element type match the size of a
  /// vector register. For wider element types, this could result in
@@ -1504,6 +1508,7 @@ public:
  virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
  virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
  virtual unsigned getMinVectorRegisterBitWidth() = 0;
  virtual Optional<unsigned> getMaxVScale() const = 0;
  virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
  virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
  virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
@@ -1921,6 +1926,9 @@ public:
  unsigned getMinVectorRegisterBitWidth() override {
    return Impl.getMinVectorRegisterBitWidth();
  }
  Optional<unsigned> getMaxVScale() const override {
    return Impl.getMaxVScale();
  }
  bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
    return Impl.shouldMaximizeVectorBandwidth(OptSize);
  }
@@ -369,6 +369,8 @@ public:
  unsigned getMinVectorRegisterBitWidth() const { return 128; }

  Optional<unsigned> getMaxVScale() const { return None; }

  bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }

  unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
@@ -571,6 +571,8 @@ public:
  unsigned getRegisterBitWidth(bool Vector) const { return 32; }

  Optional<unsigned> getMaxVScale() const { return None; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
@@ -1239,8 +1241,6 @@ public:
      return thisT()->getMemcpyCost(ICA.getInst());
    case Intrinsic::masked_scatter: {
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      assert(VF.isScalar() && "Can't vectorize types here.");
      const Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
@@ -1250,8 +1250,6 @@ public:
                                             VarMask, Alignment, CostKind, I);
    }
    case Intrinsic::masked_gather: {
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      assert(VF.isScalar() && "Can't vectorize types here.");
      const Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
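(The two hunks above shrink from 8 lines to 6 apiece: the early returns for ScalableVectorType are the lines being dropped, so masked gathers and scatters of scalable vectors now flow through to the target's getGatherScatterOpCost hook instead of falling back to the base intrinsic cost.)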
@@ -627,6 +627,10 @@ unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
  return TTIImpl->getMinVectorRegisterBitWidth();
}

Optional<unsigned> TargetTransformInfo::getMaxVScale() const {
  return TTIImpl->getMaxVScale();
}

bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {
  return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);
}
@@ -770,6 +770,26 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  return Options;
}

unsigned AArch64TTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  auto *VT = cast<VectorType>(DataTy);
  auto LT = TLI->getTypeLegalizationCost(DL, DataTy);
  ElementCount LegalVF = LT.second.getVectorElementCount();
  if (!LegalVF.isScalable())
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  Optional<unsigned> MaxNumVScale = getMaxVScale();
  assert(MaxNumVScale && "Expected valid max vscale value");

  unsigned MemOpCost =
      getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
  unsigned MaxNumElementsPerGather =
      MaxNumVScale.getValue() * LegalVF.getKnownMinValue();
  return LT.first * MaxNumElementsPerGather * MemOpCost;
}

bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
  return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
}
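As a worked check of the formula above (assuming getMemoryOpCost returns 1 for a legal scalar i32 access; the exact value is target-dependent): with SVE, getMaxVScale() is SVEMaxBitsPerVector / SVEBitsPerBlock = 2048 / 128 = 16. A <vscale x 4 x i32> gather legalizes to a single nxv4i32 (LT.first = 1, known-minimum VF of 4), giving 1 * (16 * 4) * 1 = 64; a <vscale x 8 x i32> gather is split in two by legalization (LT.first = 2), doubling the cost to 128. These are the values the new cost-model tests below expect.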
@@ -115,8 +115,19 @@ public:
    return ST->getMinVectorRegisterBitWidth();
  }

  Optional<unsigned> getMaxVScale() const {
    if (ST->hasSVE())
      return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
    return BaseT::getMaxVScale();
  }

  unsigned getMaxInterleaveFactor(unsigned VF);

  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                  const Value *Ptr, bool VariableMask,
                                  Align Alignment, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);

  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                       TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
                       const Instruction *I = nullptr);
@@ -0,0 +1,37 @@
; Check getIntrinsicInstrCost in BasicTTIImpl.h for masked gather

; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s

; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

define <vscale x 4 x i32> @masked_gather_nxv4i32(<vscale x 4 x i32*> %ld, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) {
; CHECK-LABEL: 'masked_gather_nxv4i32'
; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %ld, i32 0, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <vscale x 4 x i32> %res
  %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ld, i32 0, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru)
  ret <vscale x 4 x i32> %res
}

define <vscale x 8 x i32> @masked_gather_nxv8i32(<vscale x 8 x i32*> %ld, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) {
; CHECK-LABEL: 'masked_gather_nxv8i32'
; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ld, i32 0, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <vscale x 8 x i32> %res
  %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> %ld, i32 0, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru)
  ret <vscale x 8 x i32> %res
}

define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ld, <4 x i1> %masks, <4 x i32> %passthru) {
; CHECK-LABEL: 'masked_gather_v4i32'
; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res

  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru)
  ret <4 x i32> %res
}

declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 %align, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru)
declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> %ptrs, i32 %align, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru)
declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthru)
@@ -0,0 +1,40 @@
; Check getIntrinsicInstrCost in BasicTTIImpl.h for masked scatter

; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s

; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

define void @masked_scatter_nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, <vscale x 4 x i1> %masks) {
; CHECK-LABEL: 'masked_scatter_nxv4i32'
; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void

  call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
  ret void
}

define void @masked_scatter_nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, <vscale x 8 x i1> %masks) {
; CHECK-LABEL: 'masked_scatter_nxv8i32'
; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, i32 0, <vscale x 8 x i1> %masks)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void

  call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, i32 0, <vscale x 8 x i1> %masks)
  ret void
}

define void @masked_scatter_v4i32(<4 x i32> %data, <4 x i32*> %ptrs, <4 x i1> %masks) {
; CHECK-LABEL: 'masked_scatter_v4i32'
; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 0, <4 x i1> %masks)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void

  call void @llvm.masked.scatter.v4i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 0, <4 x i1> %masks)
  ret void
}

declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 %align, <vscale x 4 x i1> %masks)
declare void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, i32 %align, <vscale x 8 x i1> %masks)
declare void @llvm.masked.scatter.v4i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 %align, <4 x i1> %masks)