[AArch64] Add AArch64TTIImpl::getMaskedMemoryOpCost function
When vectorising for AArch64 targets, specifying the SVE attribute automatically makes masked loads and stores legal. Also, since we have no cost model for masked memory ops, we assume the masked load/store intrinsics are cheap to use even for fixed-width vectors. This can lead to poor code quality because those intrinsics are currently scalarised in the backend. This patch adds a basic cost model that marks fixed-width masked memory ops as significantly more expensive than scalable-vector ones.

Tests for the cost model are added here:

  Transforms/LoopVectorize/AArch64/masked-op-cost.ll

Differential Revision: https://reviews.llvm.org/D100745

commit a458b7855e (parent 7302fe4328)
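As a quick way to see the effect of the new cost model, the masked-load costs can be inspected with the same cost-model analysis the new tests use. The snippet below is a minimal sketch, not part of the patch: the function names are made up, an SVE-enabled build of opt is assumed, and the expected numbers (25 for the fixed-width <4 x i32> load, 2 for the scalable <vscale x 4 x i32> load) are taken from the cost-model test added further down.

; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve

; Fixed-width masked load: scalarised in the backend, so the base cost model now reports 25.
define <4 x i32> @fixed_example(<4 x i32>* %p, <4 x i1> %mask) {
  %v = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %p, i32 8, <4 x i1> %mask, <4 x i32> undef)
  ret <4 x i32> %v
}

; Scalable masked load: legal with SVE, so AArch64TTIImpl reports LT.first * 2 = 2.
define <vscale x 4 x i32> @scalable_example(<vscale x 4 x i32>* %p, <vscale x 4 x i1> %mask) {
  %v = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %p, i32 8, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  ret <vscale x 4 x i32> %v
}

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)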
@@ -189,6 +189,55 @@ private:
     llvm_unreachable("Unexpected MemIndexedMode");
   }

+  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
+                                              Align Alignment,
+                                              bool VariableMask,
+                                              bool IsGatherScatter,
+                                              TTI::TargetCostKind CostKind) {
+    auto *VT = cast<FixedVectorType>(DataTy);
+    // Assume the target does not have support for gather/scatter operations
+    // and provide a rough estimate.
+    //
+    // First, compute the cost of the individual memory operations.
+    InstructionCost AddrExtractCost =
+        IsGatherScatter
+            ? getVectorInstrCost(Instruction::ExtractElement,
+                                 FixedVectorType::get(
+                                     PointerType::get(VT->getElementType(), 0),
+                                     VT->getNumElements()),
+                                 -1)
+            : 0;
+    InstructionCost LoadCost =
+        VT->getNumElements() *
+        (AddrExtractCost +
+         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
+
+    // Next, compute the cost of packing the result in a vector.
+    int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store,
+                                               Opcode == Instruction::Store);
+
+    InstructionCost ConditionalCost = 0;
+    if (VariableMask) {
+      // Compute the cost of conditionally executing the memory operations with
+      // variable masks. This includes extracting the individual conditions, a
+      // branches and PHIs to combine the results.
+      // NOTE: Estimating the cost of conditionally executing the memory
+      // operations accurately is quite difficult and the current solution
+      // provides a very rough estimate only.
+      ConditionalCost =
+          VT->getNumElements() *
+          (getVectorInstrCost(
+               Instruction::ExtractElement,
+               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
+                                    VT->getNumElements()),
+               -1) +
+           getCFInstrCost(Instruction::Br, CostKind) +
+           getCFInstrCost(Instruction::PHI, CostKind));
+    }
+
+    return LoadCost + PackingCost + ConditionalCost;
+  }
+
 protected:
   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
       : BaseT(DL) {}
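To relate the hook above to the numbers in the cost-model test added below: for a fixed-width <4 x i32> masked load with a variable mask the base implementation returns 25, which is the sum of the four scalar loads (LoadCost), the overhead of inserting the four results back into a vector (PackingCost), and four extract-condition/branch/PHI sequences (ConditionalCost); the exact split between the three terms depends on the target's vector-extract, memory and control-flow costs.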
@@ -1024,50 +1073,20 @@ public:
     return Cost;
   }

+  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
+                                        Align Alignment, unsigned AddressSpace,
+                                        TTI::TargetCostKind CostKind) {
+    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
+                                       CostKind);
+  }
+
   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
                                          Align Alignment,
                                          TTI::TargetCostKind CostKind,
                                          const Instruction *I = nullptr) {
-    auto *VT = cast<FixedVectorType>(DataTy);
-    // Assume the target does not have support for gather/scatter operations
-    // and provide a rough estimate.
-    //
-    // First, compute the cost of extracting the individual addresses and the
-    // individual memory operations.
-    InstructionCost LoadCost =
-        VT->getNumElements() *
-        (getVectorInstrCost(
-             Instruction::ExtractElement,
-             FixedVectorType::get(PointerType::get(VT->getElementType(), 0),
-                                  VT->getNumElements()),
-             -1) +
-         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
-
-    // Next, compute the cost of packing the result in a vector.
-    int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store,
-                                               Opcode == Instruction::Store);
-
-    InstructionCost ConditionalCost = 0;
-    if (VariableMask) {
-      // Compute the cost of conditionally executing the memory operations with
-      // variable masks. This includes extracting the individual conditions, a
-      // branches and PHIs to combine the results.
-      // NOTE: Estimating the cost of conditionally executing the memory
-      // operations accurately is quite difficult and the current solution
-      // provides a very rough estimate only.
-      ConditionalCost =
-          VT->getNumElements() *
-          (getVectorInstrCost(
-               Instruction::ExtractElement,
-               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
-                                    VT->getNumElements()),
-               -1) +
-           getCFInstrCost(Instruction::Br, CostKind) +
-           getCFInstrCost(Instruction::PHI, CostKind));
-    }
-
-    return LoadCost + PackingCost + ConditionalCost;
+    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
+                                       true, CostKind);
   }

   InstructionCost getInterleavedMemoryOpCost(
@@ -1038,6 +1038,17 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   return Options;
 }

+InstructionCost
+AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+                                      Align Alignment, unsigned AddressSpace,
+                                      TTI::TargetCostKind CostKind) {
+  if (!isa<ScalableVectorType>(Src))
+    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+                                        CostKind);
+  auto LT = TLI->getTypeLegalizationCost(DL, Src);
+  return LT.first * 2;
+}
+
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
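The scalable path above is deliberately simple: the cost is twice the number of parts the type is legalized into. Assuming the usual SVE legalization, <vscale x 2 x i64> fits in a single register (LT.first = 1) for a cost of 2, <vscale x 4 x i64> is split into two parts for a cost of 4, and <vscale x 32 x half> into four parts for a cost of 8, matching the values checked in the new cost-model test.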
@@ -133,6 +133,10 @@ public:

   unsigned getMaxInterleaveFactor(unsigned VF);

+  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+                                        Align Alignment, unsigned AddressSpace,
+                                        TTI::TargetCostKind CostKind);
+
   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
                                          Align Alignment,
@@ -0,0 +1,142 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s

define void @fixed() {
; CHECK-LABEL: 'fixed'
; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8>* undef, i32 8, <2 x i1> undef, <2 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* undef, i32 8, <4 x i1> undef, <4 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 8, <8 x i1> undef, <8 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 109 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 8, <16 x i1> undef, <16 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16>* undef, i32 8, <2 x i1> undef, <2 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* undef, i32 8, <4 x i1> undef, <4 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* undef, i32 8, <8 x i1> undef, <8 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* undef, i32 8, <2 x i1> undef, <2 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* undef, i32 8, <4 x i1> undef, <4 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* undef, i32 8, <2 x i1> undef, <2 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half>* undef, i32 8, <2 x i1> undef, <2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* undef, i32 8, <4 x i1> undef, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* undef, i32 8, <8 x i1> undef, <8 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* undef, i32 8, <2 x i1> undef, <2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 8, <4 x i1> undef, <4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* undef, i32 8, <2 x i1> undef, <2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* undef, i32 8, <4 x i1> undef, <4 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* undef, i32 8, <32 x i1> undef, <32 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
entry:
  ; Legal fixed-width integer types
  %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8> *undef, i32 8, <2 x i1> undef, <2 x i8> undef)
  %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8> *undef, i32 8, <4 x i1> undef, <4 x i8> undef)
  %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8> *undef, i32 8, <8 x i1> undef, <8 x i8> undef)
  %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8> *undef, i32 8, <16 x i1> undef, <16 x i8> undef)
  %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16> *undef, i32 8, <2 x i1> undef, <2 x i16> undef)
  %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16> *undef, i32 8, <4 x i1> undef, <4 x i16> undef)
  %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16> *undef, i32 8, <8 x i1> undef, <8 x i16> undef)
  %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32> *undef, i32 8, <2 x i1> undef, <2 x i32> undef)
  %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32> *undef, i32 8, <4 x i1> undef, <4 x i32> undef)
  %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64> *undef, i32 8, <2 x i1> undef, <2 x i64> undef)

  ; Legal fixed-width floating point types
  %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half> *undef, i32 8, <2 x i1> undef, <2 x half> undef)
  %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half> *undef, i32 8, <4 x i1> undef, <4 x half> undef)
  %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half> *undef, i32 8, <8 x i1> undef, <8 x half> undef)
  %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float> *undef, i32 8, <2 x i1> undef, <2 x float> undef)
  %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float> *undef, i32 8, <4 x i1> undef, <4 x float> undef)
  %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double> *undef, i32 8, <2 x i1> undef, <2 x double> undef)

  ; A couple of examples of illegal fixed-width types
  %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64> *undef, i32 8, <4 x i1> undef, <4 x i64> undef)
  %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half> *undef, i32 8, <32 x i1> undef, <32 x half> undef)

  ret void
}


define void @scalable() {
; CHECK-LABEL: 'scalable'
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half>* undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>* undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64>* undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half>* undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
entry:
  ; Legal scalable integer types
  %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef)
  %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef)
  %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef)
  %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8> *undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef)
  %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef)
  %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef)
  %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef)
  %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef)
  %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef)
  %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef)

  ; Legal scalable floating point types
  %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef)
  %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef)
  %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half> *undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef)
  %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef)
  %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef)
  %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double> *undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef)

  ; A couple of examples of illegal scalable types
  %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64> *undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef)
  %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half> *undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef)

  ret void
}

declare <2 x i8> @llvm.masked.load.v2i8.p0v2i8(<2 x i8>*, i32, <2 x i1>, <2 x i8>)
declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
declare <2 x i16> @llvm.masked.load.v2i16.p0v2i16(<2 x i16>*, i32, <2 x i1>, <2 x i16>)
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32, <2 x i1>, <2 x i64>)
declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
declare <2 x half> @llvm.masked.load.v2f16.p0v2f16(<2 x half>*, i32, <2 x i1>, <2 x half>)
declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>)
declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
declare <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>*, i32, <32 x i1>, <32 x half>)
declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)


declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0nxv2i8(<vscale x 2 x i8>*, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0nxv4i8(<vscale x 4 x i8>*, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>*, i32, <vscale x 8 x i1>, <vscale x 8 x i8>)
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0nxv4i64(<vscale x 4 x i64>*, i32, <vscale x 4 x i1>, <vscale x 4 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0nxv32f16(<vscale x 32 x half>*, i32, <vscale x 32 x i1>, <vscale x 32 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
@@ -0,0 +1,92 @@
; REQUIRES: asserts
; RUN: opt -loop-vectorize -force-vector-interleave=1 -S -debug < %s 2>%t | FileCheck %s
; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST

target triple = "aarch64-unknown-linux-gnu"

; CHECK-COST: Checking a loop in "fixed_width"
; CHECK-COST: Found an estimated cost of 11 for VF 2 For instruction: store i32 2, i32* %arrayidx1, align 4
; CHECK-COST: Found an estimated cost of 25 for VF 4 For instruction: store i32 2, i32* %arrayidx1, align 4
; CHECK-COST: Selecting VF: 1.

; We should decide this loop is not worth vectorising using fixed width vectors
define void @fixed_width(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
; CHECK-LABEL: @fixed_width(
; CHECK-NOT: vector.body
entry:
  %cmp6 = icmp sgt i64 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.inc
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.inc
  %i.07 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.07
  %0 = load i32, i32* %arrayidx, align 4
  %tobool.not = icmp eq i32 %0, 0
  br i1 %tobool.not, label %for.inc, label %if.then

if.then:                                          ; preds = %for.body
  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %i.07
  store i32 2, i32* %arrayidx1, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body, %if.then
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}


; CHECK-COST: Checking a loop in "scalable"
; CHECK-COST: Found an estimated cost of 2 for VF vscale x 4 For instruction: store i32 2, i32* %arrayidx1, align 4

define void @scalable(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
; CHECK-LABEL: @scalable(
; CHECK: vector.body
; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32
entry:
  %cmp6 = icmp sgt i64 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.inc
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.inc
  %i.07 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.07
  %0 = load i32, i32* %arrayidx, align 4
  %tobool.not = icmp eq i32 %0, 0
  br i1 %tobool.not, label %for.inc, label %if.then

if.then:                                          ; preds = %for.body
  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %i.07
  store i32 2, i32* %arrayidx1, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body, %if.then
  %inc = add nuw nsw i64 %i.07, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
}

attributes #0 = { "target-features"="+neon,+sve" }

!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.vectorize.width", i32 4}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}
@@ -128,8 +128,9 @@ for.inc:                                          ; preds = %for.body, %if.then
 attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve"}


-!0 = distinct !{!0, !1, !2, !3, !4}
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
 !1 = !{!"llvm.loop.mustprogress"}
 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
 !4 = !{!"llvm.loop.vectorize.enable", i1 true}
+!5 = !{!"llvm.loop.interleave.count", i32 2}