2015-01-31 19:17:59 +08:00
|
|
|
//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
|
2014-03-29 18:18:08 +08:00
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2014-03-29 18:18:08 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2019-03-19 02:50:58 +08:00
|
|
|
#include "AArch64ExpandImm.h"
|
2015-01-31 19:17:59 +08:00
|
|
|
#include "AArch64TargetTransformInfo.h"
|
2014-05-24 20:50:23 +08:00
|
|
|
#include "MCTargetDesc/AArch64AddressingModes.h"
|
2015-03-09 14:14:28 +08:00
|
|
|
#include "llvm/Analysis/LoopInfo.h"
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
#include "llvm/CodeGen/BasicTTIImpl.h"
|
2017-11-17 09:07:10 +08:00
|
|
|
#include "llvm/CodeGen/CostTable.h"
|
|
|
|
#include "llvm/CodeGen/TargetLowering.h"
|
2017-09-08 07:27:44 +08:00
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
2019-12-11 23:55:26 +08:00
|
|
|
#include "llvm/IR/IntrinsicsAArch64.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2014-04-09 04:39:59 +08:00
|
|
|
#include <algorithm>
|
2014-03-29 18:18:08 +08:00
|
|
|
using namespace llvm;
|
|
|
|
|
2014-05-24 20:50:23 +08:00
|
|
|
#define DEBUG_TYPE "aarch64tti"
|
2014-04-22 10:41:26 +08:00
|
|
|
|
2017-06-29 02:53:09 +08:00
|
|
|
// Hidden command-line switch "enable-falkor-hwpf-unroll-fix"; on by default.
// NOTE(review): presumably gates a Falkor hardware-prefetcher unrolling
// workaround elsewhere in this file -- confirm against its use site.
static cl::opt<bool> EnableFalkorHWPFUnrollFix(
    "enable-falkor-hwpf-unroll-fix", cl::Hidden, cl::init(true));
|
|
|
|
|
2017-06-28 06:27:32 +08:00
|
|
|
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
|
|
|
|
const Function *Callee) const {
|
|
|
|
const TargetMachine &TM = getTLI()->getTargetMachine();
|
|
|
|
|
|
|
|
const FeatureBitset &CallerBits =
|
|
|
|
TM.getSubtargetImpl(*Caller)->getFeatureBits();
|
|
|
|
const FeatureBitset &CalleeBits =
|
|
|
|
TM.getSubtargetImpl(*Callee)->getFeatureBits();
|
|
|
|
|
|
|
|
// Inline a callee if its target-features are a subset of the callers
|
|
|
|
// target-features.
|
|
|
|
return (CallerBits & CalleeBits) == CalleeBits;
|
|
|
|
}
|
|
|
|
|
2018-05-01 23:54:18 +08:00
|
|
|
/// Calculate the cost of materializing a 64-bit value. This helper
|
2014-04-09 04:39:59 +08:00
|
|
|
/// method might only calculate a fraction of a larger immediate. Therefore it
|
|
|
|
/// is valid to return a cost of ZERO.
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
|
2014-04-09 04:39:59 +08:00
|
|
|
// Check if the immediate can be encoded within an instruction.
|
2014-05-24 20:50:23 +08:00
|
|
|
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
|
2014-04-09 04:39:59 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (Val < 0)
|
|
|
|
Val = ~Val;
|
|
|
|
|
|
|
|
// Calculate how many moves we will need to materialize this constant.
|
2019-03-19 02:50:58 +08:00
|
|
|
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
|
|
|
|
AArch64_IMM::expandMOVImm(Val, 64, Insn);
|
|
|
|
return Insn.size();
|
2014-04-09 04:39:59 +08:00
|
|
|
}
|
|
|
|
|
2018-05-01 23:54:18 +08:00
|
|
|
/// Calculate the cost of materializing the given constant.
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
|
2014-03-29 18:18:08 +08:00
|
|
|
assert(Ty->isIntegerTy());
|
|
|
|
|
|
|
|
unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
2014-04-12 10:36:28 +08:00
|
|
|
if (BitSize == 0)
|
2014-03-29 18:18:08 +08:00
|
|
|
return ~0U;
|
|
|
|
|
2014-04-09 04:39:59 +08:00
|
|
|
// Sign-extend all constants to a multiple of 64-bit.
|
|
|
|
APInt ImmVal = Imm;
|
|
|
|
if (BitSize & 0x3f)
|
|
|
|
ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
|
|
|
|
|
|
|
|
// Split the constant into 64-bit chunks and calculate the cost for each
|
|
|
|
// chunk.
|
2015-08-06 02:08:10 +08:00
|
|
|
int Cost = 0;
|
2014-04-09 04:39:59 +08:00
|
|
|
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
|
2014-04-10 09:36:59 +08:00
|
|
|
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
|
2014-04-09 04:39:59 +08:00
|
|
|
int64_t Val = Tmp.getSExtValue();
|
|
|
|
Cost += getIntImmCost(Val);
|
|
|
|
}
|
|
|
|
// We need at least one instruction to materialze the constant.
|
2015-08-06 02:08:10 +08:00
|
|
|
return std::max(1, Cost);
|
2014-04-09 04:39:59 +08:00
|
|
|
}
|
2014-03-29 18:18:08 +08:00
|
|
|
|
2019-12-12 03:54:58 +08:00
|
|
|
int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
|
|
|
|
const APInt &Imm, Type *Ty) {
|
2014-04-09 04:39:59 +08:00
|
|
|
assert(Ty->isIntegerTy());
|
2014-03-29 18:18:08 +08:00
|
|
|
|
2014-04-09 04:39:59 +08:00
|
|
|
unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
2014-04-12 10:36:28 +08:00
|
|
|
// There is no cost model for constants with a bit size of 0. Return TCC_Free
|
|
|
|
// here, so that constant hoisting will ignore this constant.
|
|
|
|
if (BitSize == 0)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
|
|
|
|
unsigned ImmIdx = ~0U;
|
|
|
|
switch (Opcode) {
|
|
|
|
default:
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
case Instruction::GetElementPtr:
|
|
|
|
// Always hoist the base address of a GetElementPtr.
|
|
|
|
if (Idx == 0)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return 2 * TTI::TCC_Basic;
|
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
case Instruction::Store:
|
|
|
|
ImmIdx = 0;
|
|
|
|
break;
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::UDiv:
|
|
|
|
case Instruction::SDiv:
|
|
|
|
case Instruction::URem:
|
|
|
|
case Instruction::SRem:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor:
|
|
|
|
case Instruction::ICmp:
|
|
|
|
ImmIdx = 1;
|
|
|
|
break;
|
2014-04-12 10:53:51 +08:00
|
|
|
// Always return TCC_Free for the shift value of a shift instruction.
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::LShr:
|
|
|
|
case Instruction::AShr:
|
|
|
|
if (Idx == 1)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-12 10:53:51 +08:00
|
|
|
break;
|
2014-04-09 04:39:59 +08:00
|
|
|
case Instruction::Trunc:
|
|
|
|
case Instruction::ZExt:
|
|
|
|
case Instruction::SExt:
|
|
|
|
case Instruction::IntToPtr:
|
|
|
|
case Instruction::PtrToInt:
|
|
|
|
case Instruction::BitCast:
|
|
|
|
case Instruction::PHI:
|
|
|
|
case Instruction::Call:
|
|
|
|
case Instruction::Select:
|
|
|
|
case Instruction::Ret:
|
|
|
|
case Instruction::Load:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Idx == ImmIdx) {
|
2015-08-06 02:08:10 +08:00
|
|
|
int NumConstants = (BitSize + 63) / 64;
|
|
|
|
int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return (Cost <= NumConstants * TTI::TCC_Basic)
|
2015-08-06 02:08:10 +08:00
|
|
|
? static_cast<int>(TTI::TCC_Free)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
: Cost;
|
2014-04-09 04:39:59 +08:00
|
|
|
}
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return AArch64TTIImpl::getIntImmCost(Imm, Ty);
|
2014-04-09 04:39:59 +08:00
|
|
|
}
|
|
|
|
|
2019-12-12 03:54:58 +08:00
|
|
|
int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
|
|
|
|
const APInt &Imm, Type *Ty) {
|
2014-04-09 04:39:59 +08:00
|
|
|
assert(Ty->isIntegerTy());
|
|
|
|
|
|
|
|
unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
2014-04-12 10:36:28 +08:00
|
|
|
// There is no cost model for constants with a bit size of 0. Return TCC_Free
|
|
|
|
// here, so that constant hoisting will ignore this constant.
|
|
|
|
if (BitSize == 0)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
|
2019-12-04 18:49:24 +08:00
|
|
|
// Most (all?) AArch64 intrinsics do not support folding immediates into the
|
|
|
|
// selected instruction, so we compute the materialization cost for the
|
|
|
|
// immediate directly.
|
|
|
|
if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
|
|
|
|
return AArch64TTIImpl::getIntImmCost(Imm, Ty);
|
|
|
|
|
2014-04-09 04:39:59 +08:00
|
|
|
switch (IID) {
|
|
|
|
default:
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
case Intrinsic::sadd_with_overflow:
|
|
|
|
case Intrinsic::uadd_with_overflow:
|
|
|
|
case Intrinsic::ssub_with_overflow:
|
|
|
|
case Intrinsic::usub_with_overflow:
|
|
|
|
case Intrinsic::smul_with_overflow:
|
|
|
|
case Intrinsic::umul_with_overflow:
|
|
|
|
if (Idx == 1) {
|
2015-08-06 02:08:10 +08:00
|
|
|
int NumConstants = (BitSize + 63) / 64;
|
|
|
|
int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return (Cost <= NumConstants * TTI::TCC_Basic)
|
2015-08-06 02:08:10 +08:00
|
|
|
? static_cast<int>(TTI::TCC_Free)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
: Cost;
|
2014-04-09 04:39:59 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case Intrinsic::experimental_stackmap:
|
|
|
|
if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
break;
|
|
|
|
case Intrinsic::experimental_patchpoint_void:
|
|
|
|
case Intrinsic::experimental_patchpoint_i64:
|
|
|
|
if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
break;
|
|
|
|
}
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return AArch64TTIImpl::getIntImmCost(Imm, Ty);
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
/// Report the kind of population-count support available for an integer of
/// \p TyWidth bits.
///
/// \param TyWidth Bit width of the integer type; asserted to be a power of 2.
/// \returns TTI::PSK_FastHardware for 32- and 64-bit widths, and
///          TTI::PSK_Software for every other width.
TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");

  switch (TyWidth) {
  case 32:
  case 64:
    return TTI::PSK_FastHardware;
  default:
    // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
    return TTI::PSK_Software;
  }
}
|
|
|
|
|
2017-05-10 04:18:12 +08:00
|
|
|
/// Decide whether an instruction with opcode \p Opcode, result type \p DstTy
/// and operands \p Args can be treated as a widening instruction, i.e. one of
/// the "long" (e.g., usubl) or "wide" (e.g., usubw) forms.
///
/// \param DstTy  Result type of the candidate instruction.
/// \param Opcode IR opcode of the candidate instruction.
/// \param Args   Operands of the candidate instruction.
/// \returns true when every check below passes, false otherwise.
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args) {
  // Only vector results whose elements are at least 16 bits wide qualify.
  const bool IsCandidateResult =
      DstTy->isVectorTy() && DstTy->getScalarSizeInBits() >= 16;
  if (!IsCandidateResult)
    return false;

  // Only add and sub are currently recognised:
  //   Add: UADDL(2), SADDL(2), UADDW(2), SADDW(2).
  //   Sub: USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
  //
  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
  //       verify that their extending operands are eliminated during code
  //       generation.
  if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
    return false;

  // For both the "wide" and "long" forms the second operand has to be a sign-
  // or zero-extend. The extend must have exactly one user, because otherwise
  // it may not be eliminated.
  if (Args.size() != 2)
    return false;
  const Value *SecondOp = Args[1];
  const bool IsExtend = isa<SExtInst>(SecondOp) || isa<ZExtInst>(SecondOp);
  if (!IsExtend || !SecondOp->hasOneUse())
    return false;
  const auto *Ext = cast<CastInst>(SecondOp);

  // The legalized destination must still be a vector with an unchanged
  // element width.
  const auto LegalDst = TLI->getTypeLegalizationCost(DL, DstTy);
  const unsigned DstEltBits = LegalDst.second.getScalarSizeInBits();
  if (!LegalDst.second.isVector() ||
      DstEltBits != DstTy->getScalarSizeInBits())
    return false;

  // Build the source vector type: the extend's scalar source type, with as
  // many elements as the destination. Its legalized form must likewise be a
  // vector with an unchanged element width.
  Type *SrcVecTy = VectorType::get(Ext->getSrcTy()->getScalarType(),
                                   DstTy->getVectorNumElements());
  const auto LegalSrc = TLI->getTypeLegalizationCost(DL, SrcVecTy);
  const unsigned SrcEltBits = LegalSrc.second.getScalarSizeInBits();
  if (!LegalSrc.second.isVector() ||
      SrcEltBits != SrcVecTy->getScalarSizeInBits())
    return false;

  // Total element counts across all legalized parts.
  const unsigned DstEltCount =
      LegalDst.first * LegalDst.second.getVectorNumElements();
  const unsigned SrcEltCount =
      LegalSrc.first * LegalSrc.second.getVectorNumElements();

  // Widening requires matching element counts and a destination element that
  // is exactly twice as wide as the source element.
  if (DstEltCount != SrcEltCount)
    return false;
  return 2 * SrcEltBits == DstEltBits;
}
|
|
|
|
|
2017-04-12 19:49:08 +08:00
|
|
|
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
|
|
|
const Instruction *I) {
|
2014-03-29 18:18:08 +08:00
|
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
|
|
|
assert(ISD && "Invalid opcode");
|
|
|
|
|
2017-05-10 04:18:12 +08:00
|
|
|
// If the cast is observable, and it is used by a widening instruction (e.g.,
|
|
|
|
// uaddl, saddw, etc.), it may be free.
|
|
|
|
if (I && I->hasOneUse()) {
|
|
|
|
auto *SingleUser = cast<Instruction>(*I->user_begin());
|
|
|
|
SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
|
|
|
|
if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
|
|
|
|
// If the cast is the second operand, it is free. We will generate either
|
|
|
|
// a "wide" or "long" version of the widening instruction.
|
|
|
|
if (I == SingleUser->getOperand(1))
|
|
|
|
return 0;
|
|
|
|
// If the cast is not the second operand, it will be free if it looks the
|
|
|
|
// same as the second operand. In this case, we will generate a "long"
|
|
|
|
// version of the widening instruction.
|
|
|
|
if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
|
2018-01-06 03:53:51 +08:00
|
|
|
if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
|
2017-05-10 04:18:12 +08:00
|
|
|
cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-09 10:09:04 +08:00
|
|
|
EVT SrcTy = TLI->getValueType(DL, Src);
|
|
|
|
EVT DstTy = TLI->getValueType(DL, Dst);
|
2014-03-29 18:18:08 +08:00
|
|
|
|
|
|
|
if (!SrcTy.isSimple() || !DstTy.isSimple())
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return BaseT::getCastInstrCost(Opcode, Dst, Src);
|
2014-03-29 18:18:08 +08:00
|
|
|
|
2015-10-28 12:02:12 +08:00
|
|
|
static const TypeConversionCostTblEntry
|
2015-10-25 08:27:14 +08:00
|
|
|
ConversionTbl[] = {
|
2015-11-19 02:03:06 +08:00
|
|
|
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
|
|
|
|
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
|
|
|
|
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
|
|
|
|
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
|
2015-08-18 00:05:09 +08:00
|
|
|
|
|
|
|
// The number of shll instructions for the extension.
|
2015-11-19 02:03:06 +08:00
|
|
|
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
|
2015-08-18 00:05:09 +08:00
|
|
|
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
|
|
|
|
|
2014-03-29 18:18:08 +08:00
|
|
|
// LowerVectorINT_TO_FP:
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
|
2014-03-29 18:18:08 +08:00
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
|
2014-03-29 18:18:08 +08:00
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
|
|
|
|
// Complex: to v2f32
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
|
2014-06-15 17:27:15 +08:00
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
|
2014-06-15 17:27:15 +08:00
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
|
2014-06-15 17:27:06 +08:00
|
|
|
|
|
|
|
// Complex: to v4f32
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
|
|
|
|
|
2015-08-18 00:05:09 +08:00
|
|
|
// Complex: to v8f32
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
|
|
|
|
|
|
|
|
// Complex: to v16f32
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
|
|
|
|
|
2014-06-15 17:27:06 +08:00
|
|
|
// Complex: to v2f64
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
|
|
|
|
|
|
|
|
|
2014-03-29 18:18:08 +08:00
|
|
|
// LowerVectorFP_TO_INT
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
|
2014-03-29 18:18:08 +08:00
|
|
|
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
|
2014-03-29 18:18:08 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
|
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
|
2014-06-15 17:27:15 +08:00
|
|
|
// Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
|
2014-06-15 17:27:15 +08:00
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
|
2014-06-15 17:27:15 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
|
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
|
|
|
|
|
|
|
|
// Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
|
2014-06-15 17:27:15 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
|
|
|
|
|
|
|
|
// Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
|
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
|
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
|
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
|
2014-03-29 18:18:08 +08:00
|
|
|
};
|
|
|
|
|
2015-10-27 12:14:24 +08:00
|
|
|
if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
|
|
|
|
DstTy.getSimpleVT(),
|
|
|
|
SrcTy.getSimpleVT()))
|
|
|
|
return Entry->Cost;
|
2014-03-29 18:18:08 +08:00
|
|
|
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return BaseT::getCastInstrCost(Opcode, Dst, Src);
|
2016-04-27 23:20:21 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
|
|
|
|
VectorType *VecTy,
|
|
|
|
unsigned Index) {
|
|
|
|
|
|
|
|
// Make sure we were given a valid extend opcode.
|
2016-04-28 00:25:04 +08:00
|
|
|
assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
|
|
|
|
"Invalid opcode");
|
2016-04-27 23:20:21 +08:00
|
|
|
|
|
|
|
// We are extending an element we extract from a vector, so the source type
|
|
|
|
// of the extend is the element type of the vector.
|
|
|
|
auto *Src = VecTy->getElementType();
|
|
|
|
|
|
|
|
// Sign- and zero-extends are for integer types only.
|
|
|
|
assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
|
|
|
|
|
|
|
|
// Get the cost for the extract. We compute the cost (if any) for the extend
|
|
|
|
// below.
|
|
|
|
auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
|
|
|
|
|
|
|
|
// Legalize the types.
|
|
|
|
auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
|
|
|
|
auto DstVT = TLI->getValueType(DL, Dst);
|
|
|
|
auto SrcVT = TLI->getValueType(DL, Src);
|
|
|
|
|
|
|
|
// If the resulting type is still a vector and the destination type is legal,
|
|
|
|
// we may get the extension for free. If not, get the default cost for the
|
|
|
|
// extend.
|
|
|
|
if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
|
|
|
|
return Cost + getCastInstrCost(Opcode, Dst, Src);
|
|
|
|
|
|
|
|
// The destination type should be larger than the element type. If not, get
|
|
|
|
// the default cost for the extend.
|
|
|
|
if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
|
|
|
|
return Cost + getCastInstrCost(Opcode, Dst, Src);
|
|
|
|
|
|
|
|
switch (Opcode) {
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Opcode should be either SExt or ZExt");
|
|
|
|
|
|
|
|
// For sign-extends, we only need a smov, which performs the extension
|
|
|
|
// automatically.
|
|
|
|
case Instruction::SExt:
|
|
|
|
return Cost;
|
|
|
|
|
|
|
|
// For zero-extends, the extend is performed automatically by a umov unless
|
|
|
|
// the destination type is i64 and the element type is i8 or i16.
|
|
|
|
case Instruction::ZExt:
|
|
|
|
if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
|
|
|
|
return Cost;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we are unable to perform the extend for free, get the default cost.
|
|
|
|
return Cost + getCastInstrCost(Opcode, Dst, Src);
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
|
|
|
|
unsigned Index) {
|
2014-03-29 18:18:08 +08:00
|
|
|
assert(Val->isVectorTy() && "This must be a vector type");
|
|
|
|
|
|
|
|
if (Index != -1U) {
|
|
|
|
// Legalize the type.
|
2015-08-06 02:08:10 +08:00
|
|
|
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
|
2014-03-29 18:18:08 +08:00
|
|
|
|
|
|
|
// This type is legalized to a scalar type.
|
|
|
|
if (!LT.second.isVector())
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
// The type may be split. Normalize the index to the new type.
|
|
|
|
unsigned Width = LT.second.getVectorNumElements();
|
|
|
|
Index = Index % Width;
|
|
|
|
|
|
|
|
// The element at index zero is already inside the vector.
|
|
|
|
if (Index == 0)
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// All other insert/extracts cost this much.
|
2016-06-03 02:03:53 +08:00
|
|
|
return ST->getVectorInsertExtractBaseCost();
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getArithmeticInstrCost(
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
|
|
|
|
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
|
[ARM] Teach the Arm cost model that a Shift can be folded into other instructions
This attempts to teach the cost model in Arm that code such as:
%s = shl i32 %a, 3
%a = and i32 %s, %b
Can under Arm or Thumb2 become:
and r0, r1, r2, lsl #3
So the cost of the shift can essentially be free. To do this without
trying to artificially adjust the cost of the "and" instruction, it
needs to get the users of the shl and check if they are a type of
instruction that the shift can be folded into. And so it needs to have
access to the actual instruction in getArithmeticInstrCost, which if
available is added as an extra parameter much like getCastInstrCost.
We otherwise limit it to shifts with a single user, which should
hopefully handle most of the cases. The list of instruction that the
shift can be folded into include ADC, ADD, AND, BIC, CMP, EOR, MVN, ORR,
ORN, RSB, SBC and SUB. This translates to Add, Sub, And, Or, Xor and
ICmp.
Differential Revision: https://reviews.llvm.org/D70966
2019-12-08 23:33:24 +08:00
|
|
|
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
|
|
|
|
const Instruction *CxtI) {
|
2014-03-29 18:18:08 +08:00
|
|
|
// Legalize the type.
|
2015-08-06 02:08:10 +08:00
|
|
|
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
|
2014-03-29 18:18:08 +08:00
|
|
|
|
2017-05-10 04:18:12 +08:00
|
|
|
// If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
|
|
|
|
// add in the widening overhead specified by the sub-target. Since the
|
|
|
|
// extends feeding widening instructions are performed automatically, they
|
|
|
|
// aren't present in the generated code and have a zero cost. By adding a
|
|
|
|
// widening overhead here, we attach the total cost of the combined operation
|
|
|
|
// to the widening instruction.
|
|
|
|
int Cost = 0;
|
|
|
|
if (isWideningInstruction(Ty, Opcode, Args))
|
|
|
|
Cost += ST->getWideningBaseCost();
|
|
|
|
|
2014-03-29 18:18:08 +08:00
|
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
|
|
|
|
|
|
|
switch (ISD) {
|
|
|
|
default:
|
2017-05-10 04:18:12 +08:00
|
|
|
return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
|
|
|
|
Opd1PropInfo, Opd2PropInfo);
|
2018-03-08 06:35:32 +08:00
|
|
|
case ISD::SDIV:
|
|
|
|
if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
|
|
|
|
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
|
|
|
|
// On AArch64, scalar signed division by constants power-of-two are
|
|
|
|
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
|
|
|
|
// The OperandValue properties many not be same as that of previous
|
|
|
|
// operation; conservatively assume OP_None.
|
|
|
|
Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
|
|
|
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
|
|
|
Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
|
|
|
Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
|
|
|
return Cost;
|
|
|
|
}
|
|
|
|
LLVM_FALLTHROUGH;
|
|
|
|
case ISD::UDIV:
|
2018-05-09 20:48:22 +08:00
|
|
|
if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
|
|
|
|
auto VT = TLI->getValueType(DL, Ty);
|
|
|
|
if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
|
|
|
|
// Vector signed division by constant are expanded to the
|
|
|
|
// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
|
|
|
|
// to MULHS + SUB + SRL + ADD + SRL.
|
|
|
|
int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
|
|
|
|
Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
|
|
|
int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
|
|
|
|
Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
|
|
|
int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
|
|
|
|
Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
|
|
|
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-03-08 06:35:32 +08:00
|
|
|
Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
|
|
|
|
Opd1PropInfo, Opd2PropInfo);
|
|
|
|
if (Ty->isVectorTy()) {
|
|
|
|
// On AArch64, vector divisions are not supported natively and are
|
|
|
|
// expanded into scalar divisions of each pair of elements.
|
|
|
|
Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
|
|
|
|
Opd2Info, Opd1PropInfo, Opd2PropInfo);
|
|
|
|
Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
|
|
|
|
Opd2Info, Opd1PropInfo, Opd2PropInfo);
|
|
|
|
// TODO: if one of the arguments is scalar, then it's not necessary to
|
|
|
|
// double the cost of handling the vector elements.
|
|
|
|
Cost += Cost;
|
|
|
|
}
|
|
|
|
return Cost;
|
|
|
|
|
2014-03-29 18:18:08 +08:00
|
|
|
case ISD::ADD:
|
|
|
|
case ISD::MUL:
|
|
|
|
case ISD::XOR:
|
|
|
|
case ISD::OR:
|
|
|
|
case ISD::AND:
|
|
|
|
// These nodes are marked as 'custom' for combining purposes only.
|
|
|
|
// We know that they are legal. See LowerAdd in ISelLowering.
|
2017-05-10 04:18:12 +08:00
|
|
|
return (Cost + 1) * LT.first;
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-01-05 22:03:41 +08:00
|
|
|
// Cost of computing the address for a memory access of type `Ty`.
// Returns 10 for a vector access whose pointer is not a constant-strided
// access within the merge distance (only checked when `SE` is available),
// and 1 in every other case.
int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                              const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely need more instructions than scalar code, where the
  // computation can more often be folded into the addressing mode. The extra
  // micro-ops can significantly decrease throughput, so such accesses are
  // priced at this many vector instructions.
  const unsigned NumVectorInstToHideOverhead = 10;
  const int MaxMergeDistance = 64;

  // In many cases the address computation is not merged into the
  // instruction's addressing mode, hence the baseline cost of 1.
  if (!Ty->isVectorTy() || !SE)
    return 1;

  if (BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
    return 1;

  return NumVectorInstToHideOverhead;
}
|
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
|
2017-04-12 19:49:08 +08:00
|
|
|
Type *CondTy, const Instruction *I) {
|
2014-03-29 18:18:08 +08:00
|
|
|
|
|
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
2015-09-09 23:35:02 +08:00
|
|
|
// We don't lower some vector selects well that are wider than the register
|
|
|
|
// width.
|
2014-03-29 18:18:08 +08:00
|
|
|
if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
|
|
|
|
// We would need this many instructions to hide the scalarization happening.
|
2015-08-06 02:08:10 +08:00
|
|
|
const int AmortizationCost = 20;
|
2015-10-28 12:02:12 +08:00
|
|
|
static const TypeConversionCostTblEntry
|
2014-03-29 18:18:08 +08:00
|
|
|
VectorSelectTbl[] = {
|
2015-09-09 23:35:02 +08:00
|
|
|
{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
|
|
|
|
{ ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
|
|
|
|
{ ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
|
2014-03-29 18:18:08 +08:00
|
|
|
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
|
|
|
|
{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
|
|
|
|
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
|
|
|
|
};
|
|
|
|
|
2015-07-09 10:09:04 +08:00
|
|
|
EVT SelCondTy = TLI->getValueType(DL, CondTy);
|
|
|
|
EVT SelValTy = TLI->getValueType(DL, ValTy);
|
2014-03-29 18:18:08 +08:00
|
|
|
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
|
2015-10-27 12:14:24 +08:00
|
|
|
if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
|
|
|
|
SelCondTy.getSimpleVT(),
|
|
|
|
SelValTy.getSimpleVT()))
|
|
|
|
return Entry->Cost;
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
}
|
2017-04-12 19:49:08 +08:00
|
|
|
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
|
2019-08-06 02:09:14 +08:00
|
|
|
// Builds the memcmp expansion options for AArch64: scalar load sizes of
// 8/4/2/1 bytes, overlapping loads unless the subtarget requires strict
// alignment, and a load budget taken from the target lowering for `OptSize`.
AArch64TTIImpl::TTI::MemCmpExpansionOptions
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions ExpansionOpts;

  // TODO: Though vector loads usually perform well on AArch64, in some targets
  // they may wake up the FP unit, which raises the power consumption. Perhaps
  // they could be used with no holds barred (-O3).
  ExpansionOpts.LoadSizes = {8, 4, 2, 1};

  // The per-block load count is set to the overall load budget.
  ExpansionOpts.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  ExpansionOpts.NumLoadsPerBlock = ExpansionOpts.MaxNumLoads;

  // Overlapping loads are allowed only when strict alignment is not required.
  ExpansionOpts.AllowOverlappingLoads = !ST->requiresStrictAlign();

  return ExpansionOpts;
}
|
|
|
|
|
2017-01-11 07:42:21 +08:00
|
|
|
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
|
2019-10-22 23:16:52 +08:00
|
|
|
MaybeAlign Alignment, unsigned AddressSpace,
|
2017-04-12 19:49:08 +08:00
|
|
|
const Instruction *I) {
|
2017-01-11 07:42:21 +08:00
|
|
|
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
|
2014-03-29 18:18:08 +08:00
|
|
|
|
2016-12-16 02:36:59 +08:00
|
|
|
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
|
2019-10-22 23:16:52 +08:00
|
|
|
LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
|
2017-01-11 07:42:21 +08:00
|
|
|
// Unaligned stores are extremely inefficient. We don't split all
|
|
|
|
// unaligned 128-bit stores because the negative impact that has shown in
|
|
|
|
// practice on inlined block copy code.
|
|
|
|
// We make such stores expensive so that we will only vectorize if there
|
2014-03-29 18:18:08 +08:00
|
|
|
// are 6 other instructions getting vectorized.
|
2017-01-11 07:42:21 +08:00
|
|
|
const int AmortizationCost = 6;
|
2014-03-29 18:18:08 +08:00
|
|
|
|
|
|
|
return LT.first * 2 * AmortizationCost;
|
|
|
|
}
|
|
|
|
|
[AArch64] Add custom lowering for v4i8 trunc store
This patch adds a custom trunc store lowering for v4i8 vector types.
Since there is not v.4b register, the v4i8 is promoted to v4i16 (v.4h)
and default action for v4i8 is to extract each element and issue 4
byte stores.
A better strategy would be to extended the promoted v4i16 to v8i16
(with undef elements) and extract and store the word lane which
represents the v4i8 subvectores. The construction:
define void @foo(<4 x i16> %x, i8* nocapture %p) {
%0 = trunc <4 x i16> %x to <4 x i8>
%1 = bitcast i8* %p to <4 x i8>*
store <4 x i8> %0, <4 x i8>* %1, align 4, !tbaa !2
ret void
}
Can be optimized from:
umov w8, v0.h[3]
umov w9, v0.h[2]
umov w10, v0.h[1]
umov w11, v0.h[0]
strb w8, [x0, #3]
strb w9, [x0, #2]
strb w10, [x0, #1]
strb w11, [x0]
ret
To:
xtn v0.8b, v0.8h
str s0, [x0]
ret
The patch also adjust the memory cost for autovectorization, so the C
code:
void foo (const int *src, int width, unsigned char *dst)
{
for (int i = 0; i < width; i++)
*dst++ = *src++;
}
can be vectorized to:
.LBB0_4: // %vector.body
// =>This Inner Loop Header: Depth=1
ldr q0, [x0], #16
subs x12, x12, #4 // =4
xtn v0.4h, v0.4s
xtn v0.8b, v0.8h
st1 { v0.s }[0], [x2], #4
b.ne .LBB0_4
Instead of byte operations.
llvm-svn: 335735
2018-06-27 21:58:46 +08:00
|
|
|
if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
|
|
|
|
unsigned ProfitableNumElements;
|
|
|
|
if (Opcode == Instruction::Store)
|
|
|
|
// We use a custom trunc store lowering so v.4b should be profitable.
|
|
|
|
ProfitableNumElements = 4;
|
|
|
|
else
|
|
|
|
// We scalarize the loads because there is not v.4b register and we
|
|
|
|
// have to promote the elements to v.2.
|
|
|
|
ProfitableNumElements = 8;
|
|
|
|
|
|
|
|
if (Ty->getVectorNumElements() < ProfitableNumElements) {
|
|
|
|
unsigned NumVecElts = Ty->getVectorNumElements();
|
|
|
|
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
|
|
|
|
// We generate 2 instructions per vector element.
|
|
|
|
return NumVectorizableInstsToAmortize * NumVecElts * 2;
|
|
|
|
}
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return LT.first;
|
|
|
|
}
|
2014-08-05 20:30:34 +08:00
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
|
|
|
unsigned Factor,
|
|
|
|
ArrayRef<unsigned> Indices,
|
|
|
|
unsigned Alignment,
|
2018-10-14 16:50:06 +08:00
|
|
|
unsigned AddressSpace,
|
2018-10-31 17:57:56 +08:00
|
|
|
bool UseMaskForCond,
|
|
|
|
bool UseMaskForGaps) {
|
[AArch64] Lower interleaved memory accesses to ldN/stN intrinsics. This patch also adds a function to calculate the cost of interleaved memory accesses.
E.g. Lower an interleaved load:
%wide.vec = load <8 x i32>, <8 x i32>* %ptr
%v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>
%v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>
into:
%ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
%vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
%vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
E.g. Lower an interleaved store:
%i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
store <12 x i32> %i.vec, <12 x i32>* %ptr
into:
%sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
%sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
%sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
Differential Revision: http://reviews.llvm.org/D10533
llvm-svn: 240754
2015-06-26 10:32:07 +08:00
|
|
|
assert(Factor >= 2 && "Invalid interleave factor");
|
|
|
|
assert(isa<VectorType>(VecTy) && "Expect a vector type");
|
|
|
|
|
2019-05-28 20:36:39 +08:00
|
|
|
if (!UseMaskForCond && !UseMaskForGaps &&
|
2018-10-31 17:57:56 +08:00
|
|
|
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
|
[AArch64] Lower interleaved memory accesses to ldN/stN intrinsics. This patch also adds a function to calculate the cost of interleaved memory accesses.
E.g. Lower an interleaved load:
%wide.vec = load <8 x i32>, <8 x i32>* %ptr
%v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>
%v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>
into:
%ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
%vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
%vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
E.g. Lower an interleaved store:
%i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
store <12 x i32> %i.vec, <12 x i32>* %ptr
into:
%sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
%sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
%sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
Differential Revision: http://reviews.llvm.org/D10533
llvm-svn: 240754
2015-06-26 10:32:07 +08:00
|
|
|
unsigned NumElts = VecTy->getVectorNumElements();
|
2017-04-11 02:34:37 +08:00
|
|
|
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
|
[AArch64] Lower interleaved memory accesses to ldN/stN intrinsics. This patch also adds a function to calculate the cost of interleaved memory accesses.
E.g. Lower an interleaved load:
%wide.vec = load <8 x i32>, <8 x i32>* %ptr
%v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>
%v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>
into:
%ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
%vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
%vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
E.g. Lower an interleaved store:
%i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
store <12 x i32> %i.vec, <12 x i32>* %ptr
into:
%sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
%sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
%sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
Differential Revision: http://reviews.llvm.org/D10533
llvm-svn: 240754
2015-06-26 10:32:07 +08:00
|
|
|
|
|
|
|
// ldN/stN only support legal vector types of size 64 or 128 in bits.
|
2017-03-02 23:15:35 +08:00
|
|
|
// Accesses having vector types that are a multiple of 128 bits can be
|
|
|
|
// matched to more than one ldN/stN instruction.
|
2017-04-11 02:34:37 +08:00
|
|
|
if (NumElts % Factor == 0 &&
|
|
|
|
TLI->isLegalInterleavedAccessType(SubVecTy, DL))
|
|
|
|
return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
|
[AArch64] Lower interleaved memory accesses to ldN/stN intrinsics. This patch also adds a function to calculate the cost of interleaved memory accesses.
E.g. Lower an interleaved load:
%wide.vec = load <8 x i32>, <8 x i32>* %ptr
%v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>
%v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>
into:
%ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
%vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
%vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
E.g. Lower an interleaved store:
%i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
store <12 x i32> %i.vec, <12 x i32>* %ptr
into:
%sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
%sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
%sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
Differential Revision: http://reviews.llvm.org/D10533
llvm-svn: 240754
2015-06-26 10:32:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
|
2018-10-31 17:57:56 +08:00
|
|
|
Alignment, AddressSpace,
|
|
|
|
UseMaskForCond, UseMaskForGaps);
|
[AArch64] Lower interleaved memory accesses to ldN/stN intrinsics. This patch also adds a function to calculate the cost of interleaved memory accesses.
E.g. Lower an interleaved load:
%wide.vec = load <8 x i32>, <8 x i32>* %ptr
%v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>
%v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>
into:
%ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
%vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
%vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
E.g. Lower an interleaved store:
%i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
store <12 x i32> %i.vec, <12 x i32>* %ptr
into:
%sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
%sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
%sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
Differential Revision: http://reviews.llvm.org/D10533
llvm-svn: 240754
2015-06-26 10:32:07 +08:00
|
|
|
}
|
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
/// Accumulate a cost over the types in \p Tys.
///
/// Only vector types whose total width (scalar size in bits times element
/// count) is exactly 128 bits contribute; each such type adds the cost of one
/// store plus one load as reported by getMemoryOpCost. Every other type is
/// ignored.
///
/// \returns the summed cost, or 0 if no type in \p Tys qualifies.
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  int TotalCost = 0;
  for (Type *Ty : Tys) {
    // Non-vector types, and vectors that are not exactly 128 bits wide, do
    // not contribute to the total.
    const bool Is128BitVector =
        Ty->isVectorTy() &&
        Ty->getScalarSizeInBits() * Ty->getVectorNumElements() == 128;
    if (!Is128BitVector)
      continue;

    TotalCost += getMemoryOpCost(Instruction::Store, Ty, Align(128), 0) +
                 getMemoryOpCost(Instruction::Load, Ty, Align(128), 0);
  }
  return TotalCost;
}
|
2014-08-21 08:02:51 +08:00
|
|
|
|
2015-05-07 01:12:25 +08:00
|
|
|
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
|
2016-06-03 02:03:53 +08:00
|
|
|
return ST->getMaxInterleaveFactor();
|
2014-08-21 08:02:51 +08:00
|
|
|
}
|
2014-10-09 18:13:27 +08:00
|
|
|
|
2017-06-29 02:53:09 +08:00
|
|
|
// For Falkor, we want to avoid having too many strided loads in a loop since
|
|
|
|
// that can exhaust the HW prefetcher resources. We adjust the unroller
|
|
|
|
// MaxCount preference below to attempt to ensure unrolling doesn't create too
|
|
|
|
// many strided loads.
|
|
|
|
static void
|
|
|
|
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
|
|
|
|
TargetTransformInfo::UnrollingPreferences &UP) {
|
2017-06-29 03:36:10 +08:00
|
|
|
enum { MaxStridedLoads = 7 };
|
2017-06-29 02:53:09 +08:00
|
|
|
auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
|
|
|
|
int StridedLoads = 0;
|
|
|
|
// FIXME? We could make this more precise by looking at the CFG and
|
|
|
|
// e.g. not counting loads in each side of an if-then-else diamond.
|
|
|
|
for (const auto BB : L->blocks()) {
|
|
|
|
for (auto &I : *BB) {
|
|
|
|
LoadInst *LMemI = dyn_cast<LoadInst>(&I);
|
|
|
|
if (!LMemI)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
Value *PtrValue = LMemI->getPointerOperand();
|
|
|
|
if (L->isLoopInvariant(PtrValue))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
const SCEV *LSCEV = SE.getSCEV(PtrValue);
|
|
|
|
const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
|
|
|
|
if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// FIXME? We could take pairing of unrolled load copies into account
|
|
|
|
// by looking at the AddRec, but we would probably have to limit this
|
|
|
|
// to loops with no stores or other memory optimization barriers.
|
|
|
|
++StridedLoads;
|
|
|
|
// We've seen enough strided loads that seeing more won't make a
|
|
|
|
// difference.
|
|
|
|
if (StridedLoads > MaxStridedLoads / 2)
|
|
|
|
return StridedLoads;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return StridedLoads;
|
|
|
|
};
|
|
|
|
|
|
|
|
int StridedLoads = countStridedLoads(L, SE);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
|
|
|
|
<< " strided loads\n");
|
2017-06-29 02:53:09 +08:00
|
|
|
// Pick the largest power of 2 unroll count that won't result in too many
|
|
|
|
// strided loads.
|
|
|
|
if (StridedLoads) {
|
|
|
|
UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
|
|
|
|
<< UP.MaxCount << '\n');
|
2017-06-29 02:53:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
[LoopUnroll] Pass SCEV to getUnrollingPreferences hook. NFCI.
Reviewers: sanjoy, anna, reames, apilipenko, igor-laevsky, mkuper
Subscribers: jholewinski, arsenm, mzolotukhin, nemanjai, nhaehnle, javed.absar, mcrosier, llvm-commits
Differential Revision: https://reviews.llvm.org/D34531
llvm-svn: 306554
2017-06-28 23:53:17 +08:00
|
|
|
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
TTI::UnrollingPreferences &UP) {
|
2015-03-09 14:14:28 +08:00
|
|
|
// Enable partial unrolling and runtime unrolling.
|
[LoopUnroll] Pass SCEV to getUnrollingPreferences hook. NFCI.
Reviewers: sanjoy, anna, reames, apilipenko, igor-laevsky, mkuper
Subscribers: jholewinski, arsenm, mzolotukhin, nemanjai, nhaehnle, javed.absar, mcrosier, llvm-commits
Differential Revision: https://reviews.llvm.org/D34531
llvm-svn: 306554
2017-06-28 23:53:17 +08:00
|
|
|
BaseT::getUnrollingPreferences(L, SE, UP);
|
2015-03-09 14:14:28 +08:00
|
|
|
|
|
|
|
// For inner loop, it is more likely to be a hot one, and the runtime check
|
|
|
|
// can be promoted out from LICM pass, so the overhead is less, let's try
|
|
|
|
// a larger threshold to unroll more loops.
|
|
|
|
if (L->getLoopDepth() > 1)
|
|
|
|
UP.PartialThreshold *= 2;
|
|
|
|
|
2014-10-09 18:13:27 +08:00
|
|
|
// Disable partial & runtime unrolling on -Os.
|
|
|
|
UP.PartialOptSizeThreshold = 0;
|
2017-06-29 02:53:09 +08:00
|
|
|
|
|
|
|
if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
|
|
|
|
EnableFalkorHWPFUnrollFix)
|
|
|
|
getFalkorUnrollingPreferences(L, SE, UP);
|
2014-10-09 18:13:27 +08:00
|
|
|
}
|
2015-01-27 06:51:15 +08:00
|
|
|
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
/// Obtain a value of type \p ExpectedType from the NEON structured
/// load/store intrinsic \p Inst.
///
/// - ld2/ld3/ld4: \p Inst itself is returned when its type already equals
///   \p ExpectedType.
/// - st2/st3/st4: when \p ExpectedType is a struct whose element count and
///   element types match the intrinsic's arguments (all but the last one),
///   those arguments are packed into a new aggregate by insertvalue
///   instructions emitted immediately before \p Inst.
///
/// \returns the value, or nullptr for any other intrinsic or on any mismatch.
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    return Inst->getType() == ExpectedType ? Inst : nullptr;

  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // The result can only be modelled as a struct.
    auto *StructTy = dyn_cast<StructType>(ExpectedType);
    if (!StructTy)
      return nullptr;

    // Every argument except the last one is a candidate struct member.
    const unsigned NumStored = Inst->getNumArgOperands() - 1;
    if (StructTy->getNumElements() != NumStored)
      return nullptr;

    // Validate all member types up front so that no instruction is created
    // unless the whole aggregate can be built.
    for (unsigned Idx = 0; Idx < NumStored; ++Idx)
      if (Inst->getArgOperand(Idx)->getType() != StructTy->getElementType(Idx))
        return nullptr;

    IRBuilder<> Builder(Inst);
    Value *Aggregate = UndefValue::get(ExpectedType);
    for (unsigned Idx = 0; Idx < NumStored; ++Idx)
      Aggregate =
          Builder.CreateInsertValue(Aggregate, Inst->getArgOperand(Idx), Idx);
    return Aggregate;
  }

  default:
    return nullptr;
  }
}
|
|
|
|
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
/// Describe the memory behaviour of the NEON ldN/stN intrinsic \p Inst.
///
/// For ld2/ld3/ld4 \p Info is marked as reading memory through argument 0;
/// for st2/st3/st4 it is marked as writing memory through the last argument.
/// MatchingId is set from the element count (two, three or four), so an ldN
/// and an stN with the same N share an id.
///
/// \returns true if \p Inst is one of those six intrinsics; otherwise false,
/// leaving \p Info unmodified.
bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) {
  const auto IID = Inst->getIntrinsicID();

  // Classify the intrinsic first; anything unrecognised is rejected before
  // Info is touched.
  bool IsLoad = false;
  switch (IID) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    IsLoad = true;
    break;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    IsLoad = false;
    break;
  default:
    return false;
  }

  Info.ReadMem = IsLoad;
  Info.WriteMem = !IsLoad;
  // Loads take the pointer as their first argument, stores as their last.
  const unsigned PtrArgIdx = IsLoad ? 0 : Inst->getNumArgOperands() - 1;
  Info.PtrVal = Inst->getArgOperand(PtrArgIdx);

  // Pair each load with the store of the same element count.
  switch (IID) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  default:
    // Not reachable: every other id returned false above.
    break;
  }
  return true;
}
|
2016-03-18 08:27:29 +08:00
|
|
|
|
2017-04-04 03:20:07 +08:00
|
|
|
/// See if \p I should be considered for address type promotion. We check if \p
|
|
|
|
/// I is a sext with right type and used in memory accesses. If it used in a
|
|
|
|
/// "complex" getelementptr, we allow it to be promoted without finding other
|
|
|
|
/// sext instructions that sign extended the same initial value. A getelementptr
|
|
|
|
/// is considered as "complex" if it has more than 2 operands.
|
|
|
|
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
|
|
|
|
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
|
|
|
|
bool Considerable = false;
|
|
|
|
AllowPromotionWithoutCommonHeader = false;
|
|
|
|
if (!isa<SExtInst>(&I))
|
|
|
|
return false;
|
|
|
|
Type *ConsideredSExtType =
|
|
|
|
Type::getInt64Ty(I.getParent()->getParent()->getContext());
|
|
|
|
if (I.getType() != ConsideredSExtType)
|
|
|
|
return false;
|
|
|
|
// See if the sext is the one with the right type and used in at least one
|
|
|
|
// GetElementPtrInst.
|
|
|
|
for (const User *U : I.users()) {
|
|
|
|
if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
|
|
|
|
Considerable = true;
|
|
|
|
// A getelementptr is considered as "complex" if it has more than 2
|
|
|
|
// operands. We will promote a SExt used in such complex GEP as we
|
|
|
|
// expect some computation to be merged if they are done on 64 bits.
|
|
|
|
if (GEPInst->getNumOperands() > 2) {
|
|
|
|
AllowPromotionWithoutCommonHeader = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Considerable;
|
|
|
|
}
|
|
|
|
|
2017-05-17 05:29:22 +08:00
|
|
|
bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
|
|
|
|
TTI::ReductionFlags Flags) const {
|
|
|
|
assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
|
|
|
|
unsigned ScalarBits = Ty->getScalarSizeInBits();
|
|
|
|
switch (Opcode) {
|
|
|
|
case Instruction::FAdd:
|
|
|
|
case Instruction::FMul:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor:
|
|
|
|
case Instruction::Mul:
|
|
|
|
return false;
|
|
|
|
case Instruction::Add:
|
|
|
|
return ScalarBits * Ty->getVectorNumElements() >= 128;
|
|
|
|
case Instruction::ICmp:
|
|
|
|
return (ScalarBits < 64) &&
|
|
|
|
(ScalarBits * Ty->getVectorNumElements() >= 128);
|
|
|
|
case Instruction::FCmp:
|
|
|
|
return Flags.NoNaN;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Unhandled reduction opcode");
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
2018-03-16 19:34:15 +08:00
|
|
|
|
|
|
|
int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
|
|
|
|
bool IsPairwiseForm) {
|
|
|
|
|
|
|
|
if (IsPairwiseForm)
|
|
|
|
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
|
|
|
|
|
|
|
|
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
|
|
|
|
MVT MTy = LT.second;
|
|
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
|
|
|
assert(ISD && "Invalid opcode");
|
|
|
|
|
|
|
|
// Horizontal adds can use the 'addv' instruction. We model the cost of these
|
|
|
|
// instructions as normal vector adds. This is the only arithmetic vector
|
|
|
|
// reduction operation for which we have an instruction.
|
|
|
|
static const CostTblEntry CostTblNoPairwise[]{
|
|
|
|
{ISD::ADD, MVT::v8i8, 1},
|
|
|
|
{ISD::ADD, MVT::v16i8, 1},
|
|
|
|
{ISD::ADD, MVT::v4i16, 1},
|
|
|
|
{ISD::ADD, MVT::v8i16, 1},
|
|
|
|
{ISD::ADD, MVT::v4i32, 1},
|
|
|
|
};
|
|
|
|
|
|
|
|
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
|
|
|
|
return LT.first * Entry->Cost;
|
|
|
|
|
|
|
|
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
|
|
|
|
}
|
2018-04-26 21:48:33 +08:00
|
|
|
|
|
|
|
/// Cost of a shuffle of kind \p Kind on type \p Tp.
///
/// Broadcast, transpose, select and single-source permute shuffles are looked
/// up in a per-type table after legalizing \p Tp; the table cost is scaled by
/// the number of legalized parts. Every other kind, and any table miss, is
/// delegated to the base implementation together with \p Index and \p SubTp.
int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) {
  const bool HasTableCost =
      Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc;
  if (!HasTableCost)
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);

  static const CostTblEntry ShuffleCostTbl[] = {
      // Broadcast: a single 'dup'.
      {TTI::SK_Broadcast, MVT::v8i8, 1},
      {TTI::SK_Broadcast, MVT::v16i8, 1},
      {TTI::SK_Broadcast, MVT::v4i16, 1},
      {TTI::SK_Broadcast, MVT::v8i16, 1},
      {TTI::SK_Broadcast, MVT::v2i32, 1},
      {TTI::SK_Broadcast, MVT::v4i32, 1},
      {TTI::SK_Broadcast, MVT::v2i64, 1},
      {TTI::SK_Broadcast, MVT::v2f32, 1},
      {TTI::SK_Broadcast, MVT::v4f32, 1},
      {TTI::SK_Broadcast, MVT::v2f64, 1},
      // Transpose: 'trn1/trn2' and 'zip1/zip2' instructions.
      {TTI::SK_Transpose, MVT::v8i8, 1},
      {TTI::SK_Transpose, MVT::v16i8, 1},
      {TTI::SK_Transpose, MVT::v4i16, 1},
      {TTI::SK_Transpose, MVT::v8i16, 1},
      {TTI::SK_Transpose, MVT::v2i32, 1},
      {TTI::SK_Transpose, MVT::v4i32, 1},
      {TTI::SK_Transpose, MVT::v2i64, 1},
      {TTI::SK_Transpose, MVT::v2f32, 1},
      {TTI::SK_Transpose, MVT::v4f32, 1},
      {TTI::SK_Transpose, MVT::v2f64, 1},
      // Select.
      // TODO: handle vXi8/vXi16.
      {TTI::SK_Select, MVT::v2i32, 1}, // mov.
      {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
      {TTI::SK_Select, MVT::v2i64, 1}, // mov.
      {TTI::SK_Select, MVT::v2f32, 1}, // mov.
      {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
      {TTI::SK_Select, MVT::v2f64, 1}, // mov.
      // PermuteSingleSrc.
      // TODO: handle vXi8/vXi16.
      {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
      {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
      {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
      {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
      {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
      {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
  };

  const std::pair<int, MVT> Legalized = TLI->getTypeLegalizationCost(DL, Tp);
  const auto *Match = CostTableLookup(ShuffleCostTbl, Kind, Legalized.second);
  if (Match)
    return Legalized.first * Match->Cost;

  // No table entry for this kind/type combination.
  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
|