forked from OSchip/llvm-project
1163 lines
46 KiB
C++
1163 lines
46 KiB
C++
//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AArch64TargetTransformInfo.h"
|
|
#include "AArch64ExpandImm.h"
|
|
#include "MCTargetDesc/AArch64AddressingModes.h"
|
|
#include "llvm/Analysis/LoopInfo.h"
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
#include "llvm/CodeGen/BasicTTIImpl.h"
|
|
#include "llvm/CodeGen/CostTable.h"
|
|
#include "llvm/CodeGen/TargetLowering.h"
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
#include "llvm/IR/IntrinsicsAArch64.h"
|
|
#include "llvm/IR/PatternMatch.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include <algorithm>
|
|
using namespace llvm;
|
|
using namespace llvm::PatternMatch;
|
|
|
|
#define DEBUG_TYPE "aarch64tti"
|
|
|
|
// Hidden boolean command-line option "enable-falkor-hwpf-unroll-fix"; it is
// enabled by default.
// NOTE(review): presumably gates the Falkor hardware-prefetcher unrolling
// adjustment implemented further down in this file -- confirm at the use site.
static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
|
|
cl::init(true), cl::Hidden);
|
|
|
|
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
|
|
const Function *Callee) const {
|
|
const TargetMachine &TM = getTLI()->getTargetMachine();
|
|
|
|
const FeatureBitset &CallerBits =
|
|
TM.getSubtargetImpl(*Caller)->getFeatureBits();
|
|
const FeatureBitset &CalleeBits =
|
|
TM.getSubtargetImpl(*Callee)->getFeatureBits();
|
|
|
|
// Inline a callee if its target-features are a subset of the callers
|
|
// target-features.
|
|
return (CallerBits & CalleeBits) == CalleeBits;
|
|
}
|
|
|
|
/// Calculate the cost of materializing a 64-bit value. This helper
|
|
/// method might only calculate a fraction of a larger immediate. Therefore it
|
|
/// is valid to return a cost of ZERO.
|
|
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
|
|
// Check if the immediate can be encoded within an instruction.
|
|
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
|
|
return 0;
|
|
|
|
if (Val < 0)
|
|
Val = ~Val;
|
|
|
|
// Calculate how many moves we will need to materialize this constant.
|
|
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
|
|
AArch64_IMM::expandMOVImm(Val, 64, Insn);
|
|
return Insn.size();
|
|
}
|
|
|
|
/// Calculate the cost of materializing the given constant.
|
|
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
|
|
TTI::TargetCostKind CostKind) {
|
|
assert(Ty->isIntegerTy());
|
|
|
|
unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
|
if (BitSize == 0)
|
|
return ~0U;
|
|
|
|
// Sign-extend all constants to a multiple of 64-bit.
|
|
APInt ImmVal = Imm;
|
|
if (BitSize & 0x3f)
|
|
ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
|
|
|
|
// Split the constant into 64-bit chunks and calculate the cost for each
|
|
// chunk.
|
|
int Cost = 0;
|
|
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
|
|
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
|
|
int64_t Val = Tmp.getSExtValue();
|
|
Cost += getIntImmCost(Val);
|
|
}
|
|
// We need at least one instruction to materialze the constant.
|
|
return std::max(1, Cost);
|
|
}
|
|
|
|
int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
|
|
const APInt &Imm, Type *Ty,
|
|
TTI::TargetCostKind CostKind,
|
|
Instruction *Inst) {
|
|
assert(Ty->isIntegerTy());
|
|
|
|
unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
|
// There is no cost model for constants with a bit size of 0. Return TCC_Free
|
|
// here, so that constant hoisting will ignore this constant.
|
|
if (BitSize == 0)
|
|
return TTI::TCC_Free;
|
|
|
|
unsigned ImmIdx = ~0U;
|
|
switch (Opcode) {
|
|
default:
|
|
return TTI::TCC_Free;
|
|
case Instruction::GetElementPtr:
|
|
// Always hoist the base address of a GetElementPtr.
|
|
if (Idx == 0)
|
|
return 2 * TTI::TCC_Basic;
|
|
return TTI::TCC_Free;
|
|
case Instruction::Store:
|
|
ImmIdx = 0;
|
|
break;
|
|
case Instruction::Add:
|
|
case Instruction::Sub:
|
|
case Instruction::Mul:
|
|
case Instruction::UDiv:
|
|
case Instruction::SDiv:
|
|
case Instruction::URem:
|
|
case Instruction::SRem:
|
|
case Instruction::And:
|
|
case Instruction::Or:
|
|
case Instruction::Xor:
|
|
case Instruction::ICmp:
|
|
ImmIdx = 1;
|
|
break;
|
|
// Always return TCC_Free for the shift value of a shift instruction.
|
|
case Instruction::Shl:
|
|
case Instruction::LShr:
|
|
case Instruction::AShr:
|
|
if (Idx == 1)
|
|
return TTI::TCC_Free;
|
|
break;
|
|
case Instruction::Trunc:
|
|
case Instruction::ZExt:
|
|
case Instruction::SExt:
|
|
case Instruction::IntToPtr:
|
|
case Instruction::PtrToInt:
|
|
case Instruction::BitCast:
|
|
case Instruction::PHI:
|
|
case Instruction::Call:
|
|
case Instruction::Select:
|
|
case Instruction::Ret:
|
|
case Instruction::Load:
|
|
break;
|
|
}
|
|
|
|
if (Idx == ImmIdx) {
|
|
int NumConstants = (BitSize + 63) / 64;
|
|
int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
|
|
return (Cost <= NumConstants * TTI::TCC_Basic)
|
|
? static_cast<int>(TTI::TCC_Free)
|
|
: Cost;
|
|
}
|
|
return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
|
|
}
|
|
|
|
/// Return the cost of the integer immediate \p Imm of type \p Ty when it is
/// passed as argument number \p Idx to the intrinsic \p IID.
int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                        const APInt &Imm, Type *Ty,
                                        TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  const unsigned NumBits = Ty->getPrimitiveSizeInBits();
  // There is no cost model for zero-width constants. Reporting them as free
  // makes constant hoisting ignore them.
  if (NumBits == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so for them the immediate is charged its plain
  // materialization cost.
  const bool IsAArch64Intrinsic =
      IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv;
  if (IsAArch64Intrinsic)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  // Evaluated lazily: getSExtValue() is only reached for widths up to 64.
  auto FitsInInt64 = [&Imm] {
    return Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue());
  };

  switch (IID) {
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      // The second argument is free when it costs no more than one basic
      // instruction per 64-bit piece of the type.
      const int NumChunks = (NumBits + 63) / 64;
      const int MaterializeCost =
          AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      if (MaterializeCost <= NumChunks * TTI::TCC_Basic)
        return TTI::TCC_Free;
      return MaterializeCost;
    }
    break;

  case Intrinsic::experimental_stackmap:
    if (Idx < 2 || FitsInInt64())
      return TTI::TCC_Free;
    break;

  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if (Idx < 4 || FitsInInt64())
      return TTI::TCC_Free;
    break;

  case Intrinsic::experimental_gc_statepoint:
    if (Idx < 5 || FitsInInt64())
      return TTI::TCC_Free;
    break;

  default:
    return TTI::TCC_Free;
  }

  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
|
|
|
|
/// Report the kind of population-count support for integers that are
/// \p TyWidth bits wide. \p TyWidth must be a power of two.
TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  switch (TyWidth) {
  case 32:
  case 64:
    return TTI::PSK_FastHardware;
  default:
    // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
    return TTI::PSK_Software;
  }
}
|
|
|
|
unsigned
|
|
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
|
|
TTI::TargetCostKind CostKind) {
|
|
auto *RetTy = ICA.getReturnType();
|
|
switch (ICA.getID()) {
|
|
case Intrinsic::umin:
|
|
case Intrinsic::umax: {
|
|
auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
|
|
// umin(x,y) -> sub(x,usubsat(x,y))
|
|
// umax(x,y) -> add(x,usubsat(y,x))
|
|
if (LT.second == MVT::v2i64)
|
|
return LT.first * 2;
|
|
LLVM_FALLTHROUGH;
|
|
}
|
|
case Intrinsic::smin:
|
|
case Intrinsic::smax: {
|
|
static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
|
|
MVT::v8i16, MVT::v2i32, MVT::v4i32};
|
|
auto LT = TLI->getTypeLegalizationCost(DL, RetTy);
|
|
if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; }))
|
|
return LT.first;
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
|
|
}
|
|
|
|
/// Return true if an instruction with opcode \p Opcode, result type \p DstTy
/// and operands \p Args can be treated as a widening instruction, i.e. one of
/// the "long" (e.g., usubl) or "wide" (e.g., usubw) forms.
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args) {
  // The result must be a vector whose elements are at least 16 bits wide.
  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
    return false;

  // Only operations with a widening variant qualify:
  //   add -> UADDL(2), SADDL(2), UADDW(2), SADDW(2)
  //   sub -> USUBL(2), SSUBL(2), USUBW(2), SSUBW(2)
  //
  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
  //       verify that their extending operands are eliminated during code
  //       generation.
  if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
    return false;

  // The second operand must be a sign- or zero-extend with a single user;
  // extends with more users may otherwise not be eliminated.
  if (Args.size() != 2)
    return false;
  const Value *SecondOp = Args[1];
  if (!isa<SExtInst>(SecondOp) && !isa<ZExtInst>(SecondOp))
    return false;
  if (!SecondOp->hasOneUse())
    return false;
  auto *Ext = cast<CastInst>(SecondOp);

  // Legalize the destination type; it must remain a vector and keep its
  // element width.
  auto DstLegal = TLI->getTypeLegalizationCost(DL, DstTy);
  unsigned DstEltBits = DstLegal.second.getScalarSizeInBits();
  if (!DstLegal.second.isVector() ||
      DstEltBits != DstTy->getScalarSizeInBits())
    return false;

  // Build a vector of the extend's source element type with the same element
  // count as DstTy, then apply the same legalization check to it.
  auto *SrcVecTy = VectorType::get(Ext->getSrcTy()->getScalarType(),
                                   cast<VectorType>(DstTy)->getElementCount());
  auto SrcLegal = TLI->getTypeLegalizationCost(DL, SrcVecTy);
  unsigned SrcEltBits = SrcLegal.second.getScalarSizeInBits();
  if (!SrcLegal.second.isVector() ||
      SrcEltBits != SrcVecTy->getScalarSizeInBits())
    return false;

  // Total number of vector elements across all legalized parts.
  unsigned DstEltCount =
      DstLegal.first * DstLegal.second.getVectorMinNumElements();
  unsigned SrcEltCount =
      SrcLegal.first * SrcLegal.second.getVectorMinNumElements();

  // Widening requires matching element counts and a destination element
  // exactly twice as wide as the source element.
  if (DstEltCount != SrcEltCount)
    return false;
  return 2 * SrcEltBits == DstEltBits;
}
|
|
|
|
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
|
TTI::CastContextHint CCH,
|
|
TTI::TargetCostKind CostKind,
|
|
const Instruction *I) {
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
|
assert(ISD && "Invalid opcode");
|
|
|
|
// If the cast is observable, and it is used by a widening instruction (e.g.,
|
|
// uaddl, saddw, etc.), it may be free.
|
|
if (I && I->hasOneUse()) {
|
|
auto *SingleUser = cast<Instruction>(*I->user_begin());
|
|
SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
|
|
if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
|
|
// If the cast is the second operand, it is free. We will generate either
|
|
// a "wide" or "long" version of the widening instruction.
|
|
if (I == SingleUser->getOperand(1))
|
|
return 0;
|
|
// If the cast is not the second operand, it will be free if it looks the
|
|
// same as the second operand. In this case, we will generate a "long"
|
|
// version of the widening instruction.
|
|
if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
|
|
if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
|
|
cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
// TODO: Allow non-throughput costs that aren't binary.
|
|
auto AdjustCost = [&CostKind](int Cost) {
|
|
if (CostKind != TTI::TCK_RecipThroughput)
|
|
return Cost == 0 ? 0 : 1;
|
|
return Cost;
|
|
};
|
|
|
|
EVT SrcTy = TLI->getValueType(DL, Src);
|
|
EVT DstTy = TLI->getValueType(DL, Dst);
|
|
|
|
if (!SrcTy.isSimple() || !DstTy.isSimple())
|
|
return AdjustCost(
|
|
BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
|
|
|
|
static const TypeConversionCostTblEntry
|
|
ConversionTbl[] = {
|
|
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
|
|
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
|
|
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
|
|
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
|
|
|
|
// The number of shll instructions for the extension.
|
|
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
|
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
|
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
|
|
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
|
|
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
|
|
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
|
|
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
|
|
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
|
|
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
|
|
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
|
|
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
|
|
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
|
|
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
|
|
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
|
|
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
|
|
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
|
|
|
|
// LowerVectorINT_TO_FP:
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
|
|
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
|
|
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
|
|
|
|
// Complex: to v2f32
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
|
|
|
|
// Complex: to v4f32
|
|
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
|
|
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
|
|
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
|
|
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
|
|
|
|
// Complex: to v8f32
|
|
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
|
|
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
|
|
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
|
|
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
|
|
|
|
// Complex: to v16f32
|
|
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
|
|
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
|
|
|
|
// Complex: to v2f64
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
|
|
|
|
|
|
// LowerVectorFP_TO_INT
|
|
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
|
|
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
|
|
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
|
|
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
|
|
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
|
|
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
|
|
|
|
// Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
|
|
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
|
|
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
|
|
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
|
|
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
|
|
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
|
|
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
|
|
|
|
// Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
|
|
{ ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
|
|
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
|
|
{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
|
|
{ ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
|
|
|
|
// Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
|
|
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
|
|
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
|
|
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
|
|
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
|
|
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
|
|
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
|
|
};
|
|
|
|
if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
|
|
DstTy.getSimpleVT(),
|
|
SrcTy.getSimpleVT()))
|
|
return AdjustCost(Entry->Cost);
|
|
|
|
return AdjustCost(
|
|
BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
|
|
}
|
|
|
|
int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
|
|
VectorType *VecTy,
|
|
unsigned Index) {
|
|
|
|
// Make sure we were given a valid extend opcode.
|
|
assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
|
|
"Invalid opcode");
|
|
|
|
// We are extending an element we extract from a vector, so the source type
|
|
// of the extend is the element type of the vector.
|
|
auto *Src = VecTy->getElementType();
|
|
|
|
// Sign- and zero-extends are for integer types only.
|
|
assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
|
|
|
|
// Get the cost for the extract. We compute the cost (if any) for the extend
|
|
// below.
|
|
auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
|
|
|
|
// Legalize the types.
|
|
auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
|
|
auto DstVT = TLI->getValueType(DL, Dst);
|
|
auto SrcVT = TLI->getValueType(DL, Src);
|
|
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
|
|
|
|
// If the resulting type is still a vector and the destination type is legal,
|
|
// we may get the extension for free. If not, get the default cost for the
|
|
// extend.
|
|
if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
|
|
return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
|
|
CostKind);
|
|
|
|
// The destination type should be larger than the element type. If not, get
|
|
// the default cost for the extend.
|
|
if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
|
|
return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
|
|
CostKind);
|
|
|
|
switch (Opcode) {
|
|
default:
|
|
llvm_unreachable("Opcode should be either SExt or ZExt");
|
|
|
|
// For sign-extends, we only need a smov, which performs the extension
|
|
// automatically.
|
|
case Instruction::SExt:
|
|
return Cost;
|
|
|
|
// For zero-extends, the extend is performed automatically by a umov unless
|
|
// the destination type is i64 and the element type is i8 or i16.
|
|
case Instruction::ZExt:
|
|
if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
|
|
return Cost;
|
|
}
|
|
|
|
// If we are unable to perform the extend for free, get the default cost.
|
|
return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
|
|
CostKind);
|
|
}
|
|
|
|
unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
|
|
TTI::TargetCostKind CostKind) {
|
|
if (CostKind != TTI::TCK_RecipThroughput)
|
|
return Opcode == Instruction::PHI ? 0 : 1;
|
|
assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
|
|
// Branches are assumed to be predicted.
|
|
return 0;
|
|
}
|
|
|
|
int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
|
|
unsigned Index) {
|
|
assert(Val->isVectorTy() && "This must be a vector type");
|
|
|
|
if (Index != -1U) {
|
|
// Legalize the type.
|
|
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
|
|
|
|
// This type is legalized to a scalar type.
|
|
if (!LT.second.isVector())
|
|
return 0;
|
|
|
|
// The type may be split. Normalize the index to the new type.
|
|
unsigned Width = LT.second.getVectorNumElements();
|
|
Index = Index % Width;
|
|
|
|
// The element at index zero is already inside the vector.
|
|
if (Index == 0)
|
|
return 0;
|
|
}
|
|
|
|
// All other insert/extracts cost this much.
|
|
return ST->getVectorInsertExtractBaseCost();
|
|
}
|
|
|
|
int AArch64TTIImpl::getArithmeticInstrCost(
|
|
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
|
|
TTI::OperandValueKind Opd1Info,
|
|
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
|
|
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
|
|
const Instruction *CxtI) {
|
|
// TODO: Handle more cost kinds.
|
|
if (CostKind != TTI::TCK_RecipThroughput)
|
|
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
|
|
Opd2Info, Opd1PropInfo,
|
|
Opd2PropInfo, Args, CxtI);
|
|
|
|
// Legalize the type.
|
|
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
|
|
|
|
// If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
|
|
// add in the widening overhead specified by the sub-target. Since the
|
|
// extends feeding widening instructions are performed automatically, they
|
|
// aren't present in the generated code and have a zero cost. By adding a
|
|
// widening overhead here, we attach the total cost of the combined operation
|
|
// to the widening instruction.
|
|
int Cost = 0;
|
|
if (isWideningInstruction(Ty, Opcode, Args))
|
|
Cost += ST->getWideningBaseCost();
|
|
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
|
|
|
switch (ISD) {
|
|
default:
|
|
return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
|
|
Opd2Info,
|
|
Opd1PropInfo, Opd2PropInfo);
|
|
case ISD::SDIV:
|
|
if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
|
|
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
|
|
// On AArch64, scalar signed division by constants power-of-two are
|
|
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
|
|
// The OperandValue properties many not be same as that of previous
|
|
// operation; conservatively assume OP_None.
|
|
Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
|
|
Opd1Info, Opd2Info,
|
|
TargetTransformInfo::OP_None,
|
|
TargetTransformInfo::OP_None);
|
|
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
|
|
Opd1Info, Opd2Info,
|
|
TargetTransformInfo::OP_None,
|
|
TargetTransformInfo::OP_None);
|
|
Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
|
|
Opd1Info, Opd2Info,
|
|
TargetTransformInfo::OP_None,
|
|
TargetTransformInfo::OP_None);
|
|
Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
|
|
Opd1Info, Opd2Info,
|
|
TargetTransformInfo::OP_None,
|
|
TargetTransformInfo::OP_None);
|
|
return Cost;
|
|
}
|
|
LLVM_FALLTHROUGH;
|
|
case ISD::UDIV:
|
|
if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
|
|
auto VT = TLI->getValueType(DL, Ty);
|
|
if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
|
|
// Vector signed division by constant are expanded to the
|
|
// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
|
|
// to MULHS + SUB + SRL + ADD + SRL.
|
|
int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
|
|
Opd1Info, Opd2Info,
|
|
TargetTransformInfo::OP_None,
|
|
TargetTransformInfo::OP_None);
|
|
int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
|
|
Opd1Info, Opd2Info,
|
|
TargetTransformInfo::OP_None,
|
|
TargetTransformInfo::OP_None);
|
|
int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
|
|
Opd1Info, Opd2Info,
|
|
TargetTransformInfo::OP_None,
|
|
TargetTransformInfo::OP_None);
|
|
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
|
|
}
|
|
}
|
|
|
|
Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
|
|
Opd2Info,
|
|
Opd1PropInfo, Opd2PropInfo);
|
|
if (Ty->isVectorTy()) {
|
|
// On AArch64, vector divisions are not supported natively and are
|
|
// expanded into scalar divisions of each pair of elements.
|
|
Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
|
|
Opd1Info, Opd2Info, Opd1PropInfo,
|
|
Opd2PropInfo);
|
|
Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
|
|
Opd1Info, Opd2Info, Opd1PropInfo,
|
|
Opd2PropInfo);
|
|
// TODO: if one of the arguments is scalar, then it's not necessary to
|
|
// double the cost of handling the vector elements.
|
|
Cost += Cost;
|
|
}
|
|
return Cost;
|
|
|
|
case ISD::MUL:
|
|
if (LT.second != MVT::v2i64)
|
|
return (Cost + 1) * LT.first;
|
|
// Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
|
|
// as elements are extracted from the vectors and the muls scalarized.
|
|
// As getScalarizationOverhead is a bit too pessimistic, we estimate the
|
|
// cost for a i64 vector directly here, which is:
|
|
// - four i64 extracts,
|
|
// - two i64 inserts, and
|
|
// - two muls.
|
|
// So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with
|
|
// LT.first = 2 the cost is 16.
|
|
return LT.first * 8;
|
|
case ISD::ADD:
|
|
case ISD::XOR:
|
|
case ISD::OR:
|
|
case ISD::AND:
|
|
// These nodes are marked as 'custom' for combining purposes only.
|
|
// We know that they are legal. See LowerAdd in ISelLowering.
|
|
return (Cost + 1) * LT.first;
|
|
|
|
case ISD::FADD:
|
|
// These nodes are marked as 'custom' just to lower them to SVE.
|
|
// We know said lowering will incur no additional cost.
|
|
if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty())
|
|
return (Cost + 2) * LT.first;
|
|
|
|
return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
|
|
Opd2Info,
|
|
Opd1PropInfo, Opd2PropInfo);
|
|
}
|
|
}
|
|
|
|
/// Return the cost of computing the address \p Ptr for an access of type
/// \p Ty.
int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                              const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code, where
  // the computation can more often be merged into the index mode. The
  // resulting extra micro-ops can significantly decrease throughput.
  const unsigned NumVectorInstToHideOverhead = 10;
  const int MaxMergeDistance = 64;

  const bool CanAnalyzeVectorAccess = Ty->isVectorTy() && SE;
  if (CanAnalyzeVectorAccess &&
      !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}
|
|
|
|
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
|
|
Type *CondTy, CmpInst::Predicate VecPred,
|
|
TTI::TargetCostKind CostKind,
|
|
const Instruction *I) {
|
|
// TODO: Handle other cost kinds.
|
|
if (CostKind != TTI::TCK_RecipThroughput)
|
|
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
|
|
I);
|
|
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
|
// We don't lower some vector selects well that are wider than the register
|
|
// width.
|
|
if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
|
|
// We would need this many instructions to hide the scalarization happening.
|
|
const int AmortizationCost = 20;
|
|
|
|
// If VecPred is not set, check if we can get a predicate from the context
|
|
// instruction, if its type matches the requested ValTy.
|
|
if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
|
|
CmpInst::Predicate CurrentPred;
|
|
if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
|
|
m_Value())))
|
|
VecPred = CurrentPred;
|
|
}
|
|
// Check if we have a compare/select chain that can be lowered using CMxx &
|
|
// BFI pair.
|
|
if (CmpInst::isIntPredicate(VecPred)) {
|
|
static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
|
|
MVT::v8i16, MVT::v2i32, MVT::v4i32,
|
|
MVT::v2i64};
|
|
auto LT = TLI->getTypeLegalizationCost(DL, ValTy);
|
|
if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; }))
|
|
return LT.first;
|
|
}
|
|
|
|
static const TypeConversionCostTblEntry
|
|
VectorSelectTbl[] = {
|
|
{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
|
|
{ ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
|
|
{ ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
|
|
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
|
|
{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
|
|
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
|
|
};
|
|
|
|
EVT SelCondTy = TLI->getValueType(DL, CondTy);
|
|
EVT SelValTy = TLI->getValueType(DL, ValTy);
|
|
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
|
|
if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
|
|
SelCondTy.getSimpleVT(),
|
|
SelValTy.getSimpleVT()))
|
|
return Entry->Cost;
|
|
}
|
|
}
|
|
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
|
|
}
|
|
|
|
/// Build the memcmp expansion options. When the subtarget requires strict
/// alignment the options are returned default-constructed.
AArch64TTIImpl::TTI::MemCmpExpansionOptions
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Result;

  // TODO: Add cost modeling for strict align. Misaligned loads expand to
  // a bunch of instructions when strict align is enabled.
  const bool StrictAlign = ST->requiresStrictAlign();
  if (!StrictAlign) {
    Result.AllowOverlappingLoads = true;
    Result.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
    Result.NumLoadsPerBlock = Result.MaxNumLoads;
    // TODO: Though vector loads usually perform well on AArch64, in some
    // targets they may wake up the FP unit, which raises the power
    // consumption. Perhaps they could be used with no holds barred (-O3).
    Result.LoadSizes = {8, 4, 2, 1};
  }
  return Result;
}
|
|
|
|
bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
|
|
return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
|
|
}
|
|
|
|
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
|
|
MaybeAlign Alignment, unsigned AddressSpace,
|
|
TTI::TargetCostKind CostKind,
|
|
const Instruction *I) {
|
|
// TODO: Handle other cost kinds.
|
|
if (CostKind != TTI::TCK_RecipThroughput)
|
|
return 1;
|
|
|
|
// Type legalization can't handle structs
|
|
if (TLI->getValueType(DL, Ty, true) == MVT::Other)
|
|
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
|
|
CostKind);
|
|
|
|
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
|
|
|
|
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
|
|
LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
|
|
// Unaligned stores are extremely inefficient. We don't split all
|
|
// unaligned 128-bit stores because the negative impact that has shown in
|
|
// practice on inlined block copy code.
|
|
// We make such stores expensive so that we will only vectorize if there
|
|
// are 6 other instructions getting vectorized.
|
|
const int AmortizationCost = 6;
|
|
|
|
return LT.first * 2 * AmortizationCost;
|
|
}
|
|
|
|
if (useNeonVector(Ty) &&
|
|
cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
|
|
unsigned ProfitableNumElements;
|
|
if (Opcode == Instruction::Store)
|
|
// We use a custom trunc store lowering so v.4b should be profitable.
|
|
ProfitableNumElements = 4;
|
|
else
|
|
// We scalarize the loads because there is not v.4b register and we
|
|
// have to promote the elements to v.2.
|
|
ProfitableNumElements = 8;
|
|
|
|
if (cast<FixedVectorType>(Ty)->getNumElements() < ProfitableNumElements) {
|
|
unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
|
|
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
|
|
// We generate 2 instructions per vector element.
|
|
return NumVectorizableInstsToAmortize * NumVecElts * 2;
|
|
}
|
|
}
|
|
|
|
return LT.first;
|
|
}
|
|
|
|
int AArch64TTIImpl::getInterleavedMemoryOpCost(
|
|
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
|
|
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
|
|
bool UseMaskForCond, bool UseMaskForGaps) {
|
|
assert(Factor >= 2 && "Invalid interleave factor");
|
|
auto *VecVTy = cast<FixedVectorType>(VecTy);
|
|
|
|
if (!UseMaskForCond && !UseMaskForGaps &&
|
|
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
|
|
unsigned NumElts = VecVTy->getNumElements();
|
|
auto *SubVecTy =
|
|
FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
|
|
|
|
// ldN/stN only support legal vector types of size 64 or 128 in bits.
|
|
// Accesses having vector types that are a multiple of 128 bits can be
|
|
// matched to more than one ldN/stN instruction.
|
|
if (NumElts % Factor == 0 &&
|
|
TLI->isLegalInterleavedAccessType(SubVecTy, DL))
|
|
return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
|
|
}
|
|
|
|
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
|
|
Alignment, AddressSpace, CostKind,
|
|
UseMaskForCond, UseMaskForGaps);
|
|
}
|
|
|
|
/// Sum, over every 128-bit vector type in \p Tys, the cost of one store plus
/// one load of that type (both queried with 128-byte alignment, address space
/// 0 and reciprocal-throughput cost kind). Non-vector types and vectors of any
/// other total width contribute nothing.
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  int TotalCost = 0;

  for (Type *Ty : Tys) {
    if (!Ty->isVectorTy())
      continue;

    const unsigned TotalBits = Ty->getScalarSizeInBits() *
                               cast<FixedVectorType>(Ty)->getNumElements();
    if (TotalBits != 128)
      continue;

    const int StoreCost =
        getMemoryOpCost(Instruction::Store, Ty, Align(128), 0, CostKind);
    const int LoadCost =
        getMemoryOpCost(Instruction::Load, Ty, Align(128), 0, CostKind);
    TotalCost += StoreCost + LoadCost;
  }

  return TotalCost;
}
|
|
|
|
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
|
|
return ST->getMaxInterleaveFactor();
|
|
}
|
|
|
|
// For Falkor, we want to avoid having too many strided loads in a loop since
|
|
// that can exhaust the HW prefetcher resources. We adjust the unroller
|
|
// MaxCount preference below to attempt to ensure unrolling doesn't create too
|
|
// many strided loads.
|
|
static void
|
|
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
|
|
TargetTransformInfo::UnrollingPreferences &UP) {
|
|
enum { MaxStridedLoads = 7 };
|
|
auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
|
|
int StridedLoads = 0;
|
|
// FIXME? We could make this more precise by looking at the CFG and
|
|
// e.g. not counting loads in each side of an if-then-else diamond.
|
|
for (const auto BB : L->blocks()) {
|
|
for (auto &I : *BB) {
|
|
LoadInst *LMemI = dyn_cast<LoadInst>(&I);
|
|
if (!LMemI)
|
|
continue;
|
|
|
|
Value *PtrValue = LMemI->getPointerOperand();
|
|
if (L->isLoopInvariant(PtrValue))
|
|
continue;
|
|
|
|
const SCEV *LSCEV = SE.getSCEV(PtrValue);
|
|
const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
|
|
if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
|
|
continue;
|
|
|
|
// FIXME? We could take pairing of unrolled load copies into account
|
|
// by looking at the AddRec, but we would probably have to limit this
|
|
// to loops with no stores or other memory optimization barriers.
|
|
++StridedLoads;
|
|
// We've seen enough strided loads that seeing more won't make a
|
|
// difference.
|
|
if (StridedLoads > MaxStridedLoads / 2)
|
|
return StridedLoads;
|
|
}
|
|
}
|
|
return StridedLoads;
|
|
};
|
|
|
|
int StridedLoads = countStridedLoads(L, SE);
|
|
LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
|
|
<< " strided loads\n");
|
|
// Pick the largest power of 2 unroll count that won't result in too many
|
|
// strided loads.
|
|
if (StridedLoads) {
|
|
UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
|
|
LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
|
|
<< UP.MaxCount << '\n');
|
|
}
|
|
}
|
|
|
|
/// Fill \p UP with AArch64 unrolling preferences for loop \p L: start from the
/// base implementation, double the partial threshold for nested loops, zero
/// the opt-size partial threshold, and apply the Falkor strided-load limit
/// when targeting Falkor with the fix enabled.
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
  // Enable partial unrolling and runtime unrolling.
  BaseT::getUnrollingPreferences(L, SE, UP);

  // For inner loop, it is more likely to be a hot one, and the runtime check
  // can be promoted out from LICM pass, so the overhead is less, let's try
  // a larger threshold to unroll more loops.
  const bool IsNestedLoop = L->getLoopDepth() > 1;
  if (IsNestedLoop)
    UP.PartialThreshold *= 2;

  // Disable partial & runtime unrolling on -Os.
  UP.PartialOptSizeThreshold = 0;

  const bool ApplyFalkorFix =
      ST->getProcFamily() == AArch64Subtarget::Falkor &&
      EnableFalkorHWPFUnrollFix;
  if (!ApplyFalkorFix)
    return;

  getFalkorUnrollingPreferences(L, SE, UP);
}
|
|
|
|
/// Fill \p PP with peeling preferences for loop \p L. No AArch64-specific
/// adjustment is made; the base implementation's result is used as-is.
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  // Pure forwarder to the generic implementation.
  BaseT::getPeelingPreferences(L, SE, PP);
}
|
|
|
|
/// Produce a value of type \p ExpectedType from the NEON structured memory
/// intrinsic \p Inst, or return nullptr when that is not possible.
///
/// - ld2/ld3/ld4: returns \p Inst itself if its type equals \p ExpectedType.
/// - st2/st3/st4: if \p ExpectedType is a struct whose element types match the
///   intrinsic's operands (all but the last one) one-to-one, builds that
///   struct with insertvalue instructions placed before \p Inst.
/// - Any other intrinsic: nullptr.
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    return Inst->getType() == ExpectedType ? Inst : nullptr;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    // Handled below.
    break;
  default:
    return nullptr;
  }

  // Create a struct type
  auto *StructTy = dyn_cast<StructType>(ExpectedType);
  if (!StructTy)
    return nullptr;

  // The final operand is excluded from the values being aggregated.
  const unsigned NumValues = Inst->getNumArgOperands() - 1;
  if (StructTy->getNumElements() != NumValues)
    return nullptr;

  // Every operand must have exactly the type of the matching struct field.
  for (unsigned Idx = 0; Idx != NumValues; ++Idx)
    if (Inst->getArgOperand(Idx)->getType() != StructTy->getElementType(Idx))
      return nullptr;

  Value *Aggregate = UndefValue::get(ExpectedType);
  IRBuilder<> Builder(Inst);
  for (unsigned Idx = 0; Idx != NumValues; ++Idx)
    Aggregate =
        Builder.CreateInsertValue(Aggregate, Inst->getArgOperand(Idx), Idx);
  return Aggregate;
}
|
|
|
|
/// Describe the memory behaviour of the NEON ld2/ld3/ld4 and st2/st3/st4
/// intrinsics in \p Info.
///
/// For loads the pointer is operand 0; for stores it is the last operand.
/// MatchingId is set according to the element count (two, three or four) so
/// that a load and a store of the same arity share an id.
/// \returns true if \p Inst is one of those intrinsics; otherwise false, with
/// \p Info left untouched.
bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) {
  auto MarkAsLoad = [&] {
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.PtrVal = Inst->getArgOperand(0);
  };
  auto MarkAsStore = [&] {
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
  };

  switch (Inst->getIntrinsicID()) {
  case Intrinsic::aarch64_neon_ld2:
    MarkAsLoad();
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
    MarkAsLoad();
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
    MarkAsLoad();
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_st2:
    MarkAsStore();
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_st3:
    MarkAsStore();
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_st4:
    MarkAsStore();
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  default:
    return false;
  }

  return true;
}
|
|
|
|
/// See if \p I should be considered for address type promotion. We check if \p
|
|
/// I is a sext with right type and used in memory accesses. If it used in a
|
|
/// "complex" getelementptr, we allow it to be promoted without finding other
|
|
/// sext instructions that sign extended the same initial value. A getelementptr
|
|
/// is considered as "complex" if it has more than 2 operands.
|
|
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
|
|
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
|
|
bool Considerable = false;
|
|
AllowPromotionWithoutCommonHeader = false;
|
|
if (!isa<SExtInst>(&I))
|
|
return false;
|
|
Type *ConsideredSExtType =
|
|
Type::getInt64Ty(I.getParent()->getParent()->getContext());
|
|
if (I.getType() != ConsideredSExtType)
|
|
return false;
|
|
// See if the sext is the one with the right type and used in at least one
|
|
// GetElementPtrInst.
|
|
for (const User *U : I.users()) {
|
|
if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
|
|
Considerable = true;
|
|
// A getelementptr is considered as "complex" if it has more than 2
|
|
// operands. We will promote a SExt used in such complex GEP as we
|
|
// expect some computation to be merged if they are done on 64 bits.
|
|
if (GEPInst->getNumOperands() > 2) {
|
|
AllowPromotionWithoutCommonHeader = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return Considerable;
|
|
}
|
|
|
|
bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
|
|
TTI::ReductionFlags Flags) const {
|
|
auto *VTy = cast<VectorType>(Ty);
|
|
unsigned ScalarBits = Ty->getScalarSizeInBits();
|
|
switch (Opcode) {
|
|
case Instruction::FAdd:
|
|
case Instruction::FMul:
|
|
case Instruction::And:
|
|
case Instruction::Or:
|
|
case Instruction::Xor:
|
|
case Instruction::Mul:
|
|
return false;
|
|
case Instruction::Add:
|
|
return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;
|
|
case Instruction::ICmp:
|
|
return (ScalarBits < 64) &&
|
|
(ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128);
|
|
case Instruction::FCmp:
|
|
return Flags.NoNaN;
|
|
default:
|
|
llvm_unreachable("Unhandled reduction opcode");
|
|
}
|
|
return false;
|
|
}
|
|
|
|
int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode,
|
|
VectorType *ValTy,
|
|
bool IsPairwiseForm,
|
|
TTI::TargetCostKind CostKind) {
|
|
|
|
if (IsPairwiseForm)
|
|
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
|
|
CostKind);
|
|
|
|
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
|
|
MVT MTy = LT.second;
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
|
assert(ISD && "Invalid opcode");
|
|
|
|
// Horizontal adds can use the 'addv' instruction. We model the cost of these
|
|
// instructions as normal vector adds. This is the only arithmetic vector
|
|
// reduction operation for which we have an instruction.
|
|
static const CostTblEntry CostTblNoPairwise[]{
|
|
{ISD::ADD, MVT::v8i8, 1},
|
|
{ISD::ADD, MVT::v16i8, 1},
|
|
{ISD::ADD, MVT::v4i16, 1},
|
|
{ISD::ADD, MVT::v8i16, 1},
|
|
{ISD::ADD, MVT::v4i32, 1},
|
|
};
|
|
|
|
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
|
|
return LT.first * Entry->Cost;
|
|
|
|
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
|
|
CostKind);
|
|
}
|
|
|
|
/// Estimate the cost of a shuffle of kind \p Kind on vector type \p Tp.
///
/// Broadcast, transpose, select and single-source permute shuffles are looked
/// up in a local table keyed on the legalized type; a hit returns the
/// legalization split count multiplied by the table cost. All other kinds, and
/// table misses, are delegated to the base implementation.
int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                   int Index, VectorType *SubTp) {
  // Only these four kinds have table entries; everything else goes straight
  // to the generic cost model.
  switch (Kind) {
  case TTI::SK_Broadcast:
  case TTI::SK_Transpose:
  case TTI::SK_Select:
  case TTI::SK_PermuteSingleSrc:
    break;
  default:
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }

  static const CostTblEntry ShuffleTbl[] = {
      // Broadcast shuffle kinds can be performed with 'dup'.
      {TTI::SK_Broadcast, MVT::v8i8,  1},
      {TTI::SK_Broadcast, MVT::v16i8, 1},
      {TTI::SK_Broadcast, MVT::v4i16, 1},
      {TTI::SK_Broadcast, MVT::v8i16, 1},
      {TTI::SK_Broadcast, MVT::v2i32, 1},
      {TTI::SK_Broadcast, MVT::v4i32, 1},
      {TTI::SK_Broadcast, MVT::v2i64, 1},
      {TTI::SK_Broadcast, MVT::v2f32, 1},
      {TTI::SK_Broadcast, MVT::v4f32, 1},
      {TTI::SK_Broadcast, MVT::v2f64, 1},
      // Transpose shuffle kinds can be performed with 'trn1/trn2' and
      // 'zip1/zip2' instructions.
      {TTI::SK_Transpose, MVT::v8i8,  1},
      {TTI::SK_Transpose, MVT::v16i8, 1},
      {TTI::SK_Transpose, MVT::v4i16, 1},
      {TTI::SK_Transpose, MVT::v8i16, 1},
      {TTI::SK_Transpose, MVT::v2i32, 1},
      {TTI::SK_Transpose, MVT::v4i32, 1},
      {TTI::SK_Transpose, MVT::v2i64, 1},
      {TTI::SK_Transpose, MVT::v2f32, 1},
      {TTI::SK_Transpose, MVT::v4f32, 1},
      {TTI::SK_Transpose, MVT::v2f64, 1},
      // Select shuffle kinds.
      // TODO: handle vXi8/vXi16.
      {TTI::SK_Select, MVT::v2i32, 1}, // mov.
      {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
      {TTI::SK_Select, MVT::v2i64, 1}, // mov.
      {TTI::SK_Select, MVT::v2f32, 1}, // mov.
      {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
      {TTI::SK_Select, MVT::v2f64, 1}, // mov.
      // PermuteSingleSrc shuffle kinds.
      // TODO: handle vXi8/vXi16.
      {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
      {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
  };

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
  const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second);
  if (!Entry)
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);

  return LT.first * Entry->Cost;
}
|