2015-01-31 19:17:59 +08:00
|
|
|
//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
|
2014-03-29 18:18:08 +08:00
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2015-01-31 19:17:59 +08:00
|
|
|
#include "AArch64TargetTransformInfo.h"
|
2014-05-24 20:50:23 +08:00
|
|
|
#include "MCTargetDesc/AArch64AddressingModes.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
2015-03-09 14:14:28 +08:00
|
|
|
#include "llvm/Analysis/LoopInfo.h"
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
#include "llvm/CodeGen/BasicTTIImpl.h"
|
2014-03-29 18:18:08 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
|
|
|
#include "llvm/Target/CostTable.h"
|
|
|
|
#include "llvm/Target/TargetLowering.h"
|
2014-04-09 04:39:59 +08:00
|
|
|
#include <algorithm>
|
2014-03-29 18:18:08 +08:00
|
|
|
using namespace llvm;
|
|
|
|
|
2014-05-24 20:50:23 +08:00
|
|
|
#define DEBUG_TYPE "aarch64tti"
|
2014-04-22 10:41:26 +08:00
|
|
|
|
2014-04-09 04:39:59 +08:00
|
|
|
/// \brief Calculate the cost of materializing a 64-bit value. This helper
|
|
|
|
/// method might only calculate a fraction of a larger immediate. Therefore it
|
|
|
|
/// is valid to return a cost of ZERO.
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
|
2014-04-09 04:39:59 +08:00
|
|
|
// Check if the immediate can be encoded within an instruction.
|
2014-05-24 20:50:23 +08:00
|
|
|
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
|
2014-04-09 04:39:59 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (Val < 0)
|
|
|
|
Val = ~Val;
|
|
|
|
|
|
|
|
// Calculate how many moves we will need to materialize this constant.
|
|
|
|
unsigned LZ = countLeadingZeros((uint64_t)Val);
|
|
|
|
return (64 - LZ + 15) / 16;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// \brief Calculate the cost of materializing the given constant.
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
|
2014-03-29 18:18:08 +08:00
|
|
|
assert(Ty->isIntegerTy());
|
|
|
|
|
|
|
|
unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
2014-04-12 10:36:28 +08:00
|
|
|
if (BitSize == 0)
|
2014-03-29 18:18:08 +08:00
|
|
|
return ~0U;
|
|
|
|
|
2014-04-09 04:39:59 +08:00
|
|
|
// Sign-extend all constants to a multiple of 64-bit.
|
|
|
|
APInt ImmVal = Imm;
|
|
|
|
if (BitSize & 0x3f)
|
|
|
|
ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
|
|
|
|
|
|
|
|
// Split the constant into 64-bit chunks and calculate the cost for each
|
|
|
|
// chunk.
|
2015-08-06 02:08:10 +08:00
|
|
|
int Cost = 0;
|
2014-04-09 04:39:59 +08:00
|
|
|
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
|
2014-04-10 09:36:59 +08:00
|
|
|
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
|
2014-04-09 04:39:59 +08:00
|
|
|
int64_t Val = Tmp.getSExtValue();
|
|
|
|
Cost += getIntImmCost(Val);
|
|
|
|
}
|
|
|
|
// We need at least one instruction to materialze the constant.
|
2015-08-06 02:08:10 +08:00
|
|
|
return std::max(1, Cost);
|
2014-04-09 04:39:59 +08:00
|
|
|
}
|
2014-03-29 18:18:08 +08:00
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
|
|
|
|
const APInt &Imm, Type *Ty) {
|
2014-04-09 04:39:59 +08:00
|
|
|
assert(Ty->isIntegerTy());
|
2014-03-29 18:18:08 +08:00
|
|
|
|
2014-04-09 04:39:59 +08:00
|
|
|
unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
2014-04-12 10:36:28 +08:00
|
|
|
// There is no cost model for constants with a bit size of 0. Return TCC_Free
|
|
|
|
// here, so that constant hoisting will ignore this constant.
|
|
|
|
if (BitSize == 0)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
|
|
|
|
unsigned ImmIdx = ~0U;
|
|
|
|
switch (Opcode) {
|
|
|
|
default:
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
case Instruction::GetElementPtr:
|
|
|
|
// Always hoist the base address of a GetElementPtr.
|
|
|
|
if (Idx == 0)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return 2 * TTI::TCC_Basic;
|
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
case Instruction::Store:
|
|
|
|
ImmIdx = 0;
|
|
|
|
break;
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::UDiv:
|
|
|
|
case Instruction::SDiv:
|
|
|
|
case Instruction::URem:
|
|
|
|
case Instruction::SRem:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor:
|
|
|
|
case Instruction::ICmp:
|
|
|
|
ImmIdx = 1;
|
|
|
|
break;
|
2014-04-12 10:53:51 +08:00
|
|
|
// Always return TCC_Free for the shift value of a shift instruction.
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::LShr:
|
|
|
|
case Instruction::AShr:
|
|
|
|
if (Idx == 1)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-12 10:53:51 +08:00
|
|
|
break;
|
2014-04-09 04:39:59 +08:00
|
|
|
case Instruction::Trunc:
|
|
|
|
case Instruction::ZExt:
|
|
|
|
case Instruction::SExt:
|
|
|
|
case Instruction::IntToPtr:
|
|
|
|
case Instruction::PtrToInt:
|
|
|
|
case Instruction::BitCast:
|
|
|
|
case Instruction::PHI:
|
|
|
|
case Instruction::Call:
|
|
|
|
case Instruction::Select:
|
|
|
|
case Instruction::Ret:
|
|
|
|
case Instruction::Load:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Idx == ImmIdx) {
|
2015-08-06 02:08:10 +08:00
|
|
|
int NumConstants = (BitSize + 63) / 64;
|
|
|
|
int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return (Cost <= NumConstants * TTI::TCC_Basic)
|
2015-08-06 02:08:10 +08:00
|
|
|
? static_cast<int>(TTI::TCC_Free)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
: Cost;
|
2014-04-09 04:39:59 +08:00
|
|
|
}
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return AArch64TTIImpl::getIntImmCost(Imm, Ty);
|
2014-04-09 04:39:59 +08:00
|
|
|
}
|
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
|
|
|
|
const APInt &Imm, Type *Ty) {
|
2014-04-09 04:39:59 +08:00
|
|
|
assert(Ty->isIntegerTy());
|
|
|
|
|
|
|
|
unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
2014-04-12 10:36:28 +08:00
|
|
|
// There is no cost model for constants with a bit size of 0. Return TCC_Free
|
|
|
|
// here, so that constant hoisting will ignore this constant.
|
|
|
|
if (BitSize == 0)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
|
|
|
|
switch (IID) {
|
|
|
|
default:
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
case Intrinsic::sadd_with_overflow:
|
|
|
|
case Intrinsic::uadd_with_overflow:
|
|
|
|
case Intrinsic::ssub_with_overflow:
|
|
|
|
case Intrinsic::usub_with_overflow:
|
|
|
|
case Intrinsic::smul_with_overflow:
|
|
|
|
case Intrinsic::umul_with_overflow:
|
|
|
|
if (Idx == 1) {
|
2015-08-06 02:08:10 +08:00
|
|
|
int NumConstants = (BitSize + 63) / 64;
|
|
|
|
int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return (Cost <= NumConstants * TTI::TCC_Basic)
|
2015-08-06 02:08:10 +08:00
|
|
|
? static_cast<int>(TTI::TCC_Free)
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
: Cost;
|
2014-04-09 04:39:59 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case Intrinsic::experimental_stackmap:
|
|
|
|
if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the groundwork for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
break;
|
|
|
|
case Intrinsic::experimental_patchpoint_void:
|
|
|
|
case Intrinsic::experimental_patchpoint_i64:
|
|
|
|
if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return TTI::TCC_Free;
|
2014-04-09 04:39:59 +08:00
|
|
|
break;
|
|
|
|
}
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return AArch64TTIImpl::getIntImmCost(Imm, Ty);
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
/// Classify the population-count support available for an integer that is
/// \p TyWidth bits wide.
///
/// \param TyWidth bit width of the type; asserted to be a power of two.
/// \returns TTI::PSK_FastHardware for 32- and 64-bit widths, and
///          TTI::PSK_Software for every other width.
TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");

  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  switch (TyWidth) {
  case 32:
  case 64:
    return TTI::PSK_FastHardware;
  default:
    return TTI::PSK_Software;
  }
}
|
|
|
|
|
2017-05-10 04:18:12 +08:00
|
|
|
/// Decide whether an operation can be treated as a widening instruction,
/// i.e. one of the "long" (e.g., usubl) or "wide" (e.g., usubw) variants.
///
/// \param DstTy  result type of the operation.
/// \param Opcode IR opcode of the operation.
/// \param Args   operands of the operation.
/// \returns true when the opcode is an add or sub on a vector with elements of
///          at least 16 bits, its second operand is a single-use sign- or
///          zero-extend, and after type legalization the source and
///          destination have the same total element count with destination
///          elements exactly twice as wide as the source elements.
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args) {
  // Bail out unless the result is a vector whose elements are at least
  // 16-bits wide.
  if (!DstTy->isVectorTy())
    return false;
  if (DstTy->getScalarSizeInBits() < 16)
    return false;

  // Only operations with a widening variant are of interest:
  //   add -> UADDL(2), SADDL(2), UADDW(2), SADDW(2)
  //   sub -> USUBL(2), SSUBL(2), USUBW(2), SSUBW(2)
  //
  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
  //       verify that their extending operands are eliminated during code
  //       generation.
  const bool HasWideningVariant =
      Opcode == Instruction::Add || Opcode == Instruction::Sub;
  if (!HasWideningVariant)
    return false;

  // Both the "wide" and "long" forms require the second operand to be a sign-
  // or zero-extend. The extend must have exactly one user, since it may
  // otherwise not be eliminated.
  if (Args.size() != 2)
    return false;
  const Value *SecondOp = Args[1];
  if (!(isa<SExtInst>(SecondOp) || isa<ZExtInst>(SecondOp)))
    return false;
  if (!SecondOp->hasOneUse())
    return false;
  const auto *ExtOp = cast<CastInst>(SecondOp);

  // The legalized destination must still be a vector with an unchanged
  // element width.
  auto LegalDst = TLI->getTypeLegalizationCost(DL, DstTy);
  unsigned DstEltBits = LegalDst.second.getScalarSizeInBits();
  if (!LegalDst.second.isVector())
    return false;
  if (DstEltBits != DstTy->getScalarSizeInBits())
    return false;

  // Build a vector of the extend's scalar source type with as many elements
  // as the destination, then apply the same legalization check to it.
  Type *SrcVecTy = VectorType::get(ExtOp->getSrcTy()->getScalarType(),
                                   DstTy->getVectorNumElements());
  auto LegalSrc = TLI->getTypeLegalizationCost(DL, SrcVecTy);
  unsigned SrcEltBits = LegalSrc.second.getScalarSizeInBits();
  if (!LegalSrc.second.isVector())
    return false;
  if (SrcEltBits != SrcVecTy->getScalarSizeInBits())
    return false;

  // Total element counts across all legalized parts.
  unsigned TotalDstElts =
      LegalDst.first * LegalDst.second.getVectorNumElements();
  unsigned TotalSrcElts =
      LegalSrc.first * LegalSrc.second.getVectorNumElements();

  // Widening requires matching element counts and a destination element that
  // is exactly twice the width of the source element.
  if (TotalDstElts != TotalSrcElts)
    return false;
  return DstEltBits == 2 * SrcEltBits;
}
|
|
|
|
|
2017-04-12 19:49:08 +08:00
|
|
|
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
|
|
|
const Instruction *I) {
|
2014-03-29 18:18:08 +08:00
|
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
|
|
|
assert(ISD && "Invalid opcode");
|
|
|
|
|
2017-05-10 04:18:12 +08:00
|
|
|
// If the cast is observable, and it is used by a widening instruction (e.g.,
|
|
|
|
// uaddl, saddw, etc.), it may be free.
|
|
|
|
if (I && I->hasOneUse()) {
|
|
|
|
auto *SingleUser = cast<Instruction>(*I->user_begin());
|
|
|
|
SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
|
|
|
|
if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
|
|
|
|
// If the cast is the second operand, it is free. We will generate either
|
|
|
|
// a "wide" or "long" version of the widening instruction.
|
|
|
|
if (I == SingleUser->getOperand(1))
|
|
|
|
return 0;
|
|
|
|
// If the cast is not the second operand, it will be free if it looks the
|
|
|
|
// same as the second operand. In this case, we will generate a "long"
|
|
|
|
// version of the widening instruction.
|
|
|
|
if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
|
|
|
|
if (I->getOpcode() == Cast->getOpcode() &&
|
|
|
|
cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-09 10:09:04 +08:00
|
|
|
EVT SrcTy = TLI->getValueType(DL, Src);
|
|
|
|
EVT DstTy = TLI->getValueType(DL, Dst);
|
2014-03-29 18:18:08 +08:00
|
|
|
|
|
|
|
if (!SrcTy.isSimple() || !DstTy.isSimple())
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return BaseT::getCastInstrCost(Opcode, Dst, Src);
|
2014-03-29 18:18:08 +08:00
|
|
|
|
2015-10-28 12:02:12 +08:00
|
|
|
static const TypeConversionCostTblEntry
|
2015-10-25 08:27:14 +08:00
|
|
|
ConversionTbl[] = {
|
2015-11-19 02:03:06 +08:00
|
|
|
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
|
|
|
|
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
|
|
|
|
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
|
|
|
|
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
|
2015-08-18 00:05:09 +08:00
|
|
|
|
|
|
|
// The number of shll instructions for the extension.
|
2015-11-19 02:03:06 +08:00
|
|
|
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
|
|
|
|
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
|
2015-08-18 00:05:09 +08:00
|
|
|
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
|
|
|
|
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
|
|
|
|
|
2014-03-29 18:18:08 +08:00
|
|
|
// LowerVectorINT_TO_FP:
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
|
2014-03-29 18:18:08 +08:00
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
|
2014-03-29 18:18:08 +08:00
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
|
|
|
|
// Complex: to v2f32
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
|
2014-06-15 17:27:15 +08:00
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
|
2014-06-15 17:27:15 +08:00
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
|
2014-06-15 17:27:06 +08:00
|
|
|
|
|
|
|
// Complex: to v4f32
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
|
|
|
|
|
2015-08-18 00:05:09 +08:00
|
|
|
// Complex: to v8f32
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
|
|
|
|
|
|
|
|
// Complex: to v16f32
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
|
|
|
|
|
2014-06-15 17:27:06 +08:00
|
|
|
// Complex: to v2f64
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
|
|
|
|
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
|
|
|
|
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
|
|
|
|
|
|
|
|
|
2014-03-29 18:18:08 +08:00
|
|
|
// LowerVectorFP_TO_INT
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
|
2014-03-29 18:18:08 +08:00
|
|
|
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
|
2014-03-29 18:18:08 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
|
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
|
2014-06-15 17:27:15 +08:00
|
|
|
// Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
|
2014-06-15 17:27:15 +08:00
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
|
2014-06-15 17:27:15 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
|
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
|
|
|
|
|
|
|
|
// Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
|
2014-06-15 17:27:06 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
|
2014-06-15 17:27:15 +08:00
|
|
|
{ ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
|
|
|
|
|
|
|
|
// Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
|
|
|
|
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
|
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
|
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
|
|
|
|
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
|
2014-03-29 18:18:08 +08:00
|
|
|
};
|
|
|
|
|
2015-10-27 12:14:24 +08:00
|
|
|
if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
|
|
|
|
DstTy.getSimpleVT(),
|
|
|
|
SrcTy.getSimpleVT()))
|
|
|
|
return Entry->Cost;
|
2014-03-29 18:18:08 +08:00
|
|
|
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
return BaseT::getCastInstrCost(Opcode, Dst, Src);
|
2016-04-27 23:20:21 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
|
|
|
|
VectorType *VecTy,
|
|
|
|
unsigned Index) {
|
|
|
|
|
|
|
|
// Make sure we were given a valid extend opcode.
|
2016-04-28 00:25:04 +08:00
|
|
|
assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
|
|
|
|
"Invalid opcode");
|
2016-04-27 23:20:21 +08:00
|
|
|
|
|
|
|
// We are extending an element we extract from a vector, so the source type
|
|
|
|
// of the extend is the element type of the vector.
|
|
|
|
auto *Src = VecTy->getElementType();
|
|
|
|
|
|
|
|
// Sign- and zero-extends are for integer types only.
|
|
|
|
assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
|
|
|
|
|
|
|
|
// Get the cost for the extract. We compute the cost (if any) for the extend
|
|
|
|
// below.
|
|
|
|
auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
|
|
|
|
|
|
|
|
// Legalize the types.
|
|
|
|
auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
|
|
|
|
auto DstVT = TLI->getValueType(DL, Dst);
|
|
|
|
auto SrcVT = TLI->getValueType(DL, Src);
|
|
|
|
|
|
|
|
// If the resulting type is still a vector and the destination type is legal,
|
|
|
|
// we may get the extension for free. If not, get the default cost for the
|
|
|
|
// extend.
|
|
|
|
if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
|
|
|
|
return Cost + getCastInstrCost(Opcode, Dst, Src);
|
|
|
|
|
|
|
|
// The destination type should be larger than the element type. If not, get
|
|
|
|
// the default cost for the extend.
|
|
|
|
if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
|
|
|
|
return Cost + getCastInstrCost(Opcode, Dst, Src);
|
|
|
|
|
|
|
|
switch (Opcode) {
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Opcode should be either SExt or ZExt");
|
|
|
|
|
|
|
|
// For sign-extends, we only need a smov, which performs the extension
|
|
|
|
// automatically.
|
|
|
|
case Instruction::SExt:
|
|
|
|
return Cost;
|
|
|
|
|
|
|
|
// For zero-extends, the extend is performed automatically by a umov unless
|
|
|
|
// the destination type is i64 and the element type is i8 or i16.
|
|
|
|
case Instruction::ZExt:
|
|
|
|
if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
|
|
|
|
return Cost;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we are unable to perform the extend for free, get the default cost.
|
|
|
|
return Cost + getCastInstrCost(Opcode, Dst, Src);
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
|
|
|
|
unsigned Index) {
|
2014-03-29 18:18:08 +08:00
|
|
|
assert(Val->isVectorTy() && "This must be a vector type");
|
|
|
|
|
|
|
|
if (Index != -1U) {
|
|
|
|
// Legalize the type.
|
2015-08-06 02:08:10 +08:00
|
|
|
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
|
2014-03-29 18:18:08 +08:00
|
|
|
|
|
|
|
// This type is legalized to a scalar type.
|
|
|
|
if (!LT.second.isVector())
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
// The type may be split. Normalize the index to the new type.
|
|
|
|
unsigned Width = LT.second.getVectorNumElements();
|
|
|
|
Index = Index % Width;
|
|
|
|
|
|
|
|
// The element at index zero is already inside the vector.
|
|
|
|
if (Index == 0)
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// All other insert/extracts cost this much.
|
2016-06-03 02:03:53 +08:00
|
|
|
return ST->getVectorInsertExtractBaseCost();
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getArithmeticInstrCost(
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
|
|
|
|
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
|
[X86] updating TTI costs for arithmetic instructions on X86\SLM arch.
updated instructions:
pmulld, pmullw, pmulhw, mulsd, mulps, mulpd, divss, divps, divsd, divpd, addpd and subpd.
special optimization case which replaces pmulld with pmullw\pmulhw\pshuf seq.
In case if the real operands bitwidth <= 16.
Differential Revision: https://reviews.llvm.org/D28104
llvm-svn: 291657
2017-01-11 16:23:37 +08:00
|
|
|
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
|
2014-03-29 18:18:08 +08:00
|
|
|
// Legalize the type.
|
2015-08-06 02:08:10 +08:00
|
|
|
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
|
2014-03-29 18:18:08 +08:00
|
|
|
|
2017-05-10 04:18:12 +08:00
|
|
|
// If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
|
|
|
|
// add in the widening overhead specified by the sub-target. Since the
|
|
|
|
// extends feeding widening instructions are performed automatically, they
|
|
|
|
// aren't present in the generated code and have a zero cost. By adding a
|
|
|
|
// widening overhead here, we attach the total cost of the combined operation
|
|
|
|
// to the widening instruction.
|
|
|
|
int Cost = 0;
|
|
|
|
if (isWideningInstruction(Ty, Opcode, Args))
|
|
|
|
Cost += ST->getWideningBaseCost();
|
|
|
|
|
2014-03-29 18:18:08 +08:00
|
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
|
|
|
|
2014-09-29 21:59:31 +08:00
|
|
|
if (ISD == ISD::SDIV &&
|
|
|
|
Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
|
|
|
|
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
|
|
|
|
// On AArch64, scalar signed division by constants power-of-two are
|
|
|
|
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
|
|
|
|
// The OperandValue properties many not be same as that of previous
|
|
|
|
// operation; conservatively assume OP_None.
|
2017-05-10 04:18:12 +08:00
|
|
|
Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
2014-09-29 21:59:31 +08:00
|
|
|
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
|
|
|
Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
|
|
|
Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
|
|
|
|
TargetTransformInfo::OP_None,
|
|
|
|
TargetTransformInfo::OP_None);
|
|
|
|
return Cost;
|
|
|
|
}
|
|
|
|
|
2014-03-29 18:18:08 +08:00
|
|
|
switch (ISD) {
|
|
|
|
default:
|
2017-05-10 04:18:12 +08:00
|
|
|
return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
|
|
|
|
Opd1PropInfo, Opd2PropInfo);
|
2014-03-29 18:18:08 +08:00
|
|
|
case ISD::ADD:
|
|
|
|
case ISD::MUL:
|
|
|
|
case ISD::XOR:
|
|
|
|
case ISD::OR:
|
|
|
|
case ISD::AND:
|
|
|
|
// These nodes are marked as 'custom' for combining purposes only.
|
|
|
|
// We know that they are legal. See LowerAdd in ISelLowering.
|
2017-05-10 04:18:12 +08:00
|
|
|
return (Cost + 1) * LT.first;
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-01-05 22:03:41 +08:00
|
|
|
// Returns the cost of computing the address `Ptr` for an access of type `Ty`.
//
// Address computations in vectorized code with non-consecutive addresses will
// likely need more instructions than scalar code, where the computation can
// more often be folded into the addressing mode. The resulting extra
// micro-ops can significantly decrease throughput, so such accesses are
// charged a large cost.
int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                              const SCEV *Ptr) {
  const unsigned VectorOverheadCost = 10;
  const int MaxMergeDistance = 64;

  // Scalar accesses, or queries without ScalarEvolution, get the default
  // cost: in many cases the address computation is not merged into the
  // instruction's addressing mode.
  if (!Ty->isVectorTy() || !SE)
    return 1;

  // A constant stride no larger than MaxMergeDistance also gets the default
  // cost.
  if (BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
    return 1;

  return VectorOverheadCost;
}
|
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
|
2017-04-12 19:49:08 +08:00
|
|
|
Type *CondTy, const Instruction *I) {
|
2014-03-29 18:18:08 +08:00
|
|
|
|
|
|
|
int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
2015-09-09 23:35:02 +08:00
|
|
|
// We don't lower some vector selects well that are wider than the register
|
|
|
|
// width.
|
2014-03-29 18:18:08 +08:00
|
|
|
if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
|
|
|
|
// We would need this many instructions to hide the scalarization happening.
|
2015-08-06 02:08:10 +08:00
|
|
|
const int AmortizationCost = 20;
|
2015-10-28 12:02:12 +08:00
|
|
|
static const TypeConversionCostTblEntry
|
2014-03-29 18:18:08 +08:00
|
|
|
VectorSelectTbl[] = {
|
2015-09-09 23:35:02 +08:00
|
|
|
{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
|
|
|
|
{ ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
|
|
|
|
{ ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
|
2014-03-29 18:18:08 +08:00
|
|
|
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
|
|
|
|
{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
|
|
|
|
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
|
|
|
|
};
|
|
|
|
|
2015-07-09 10:09:04 +08:00
|
|
|
EVT SelCondTy = TLI->getValueType(DL, CondTy);
|
|
|
|
EVT SelValTy = TLI->getValueType(DL, ValTy);
|
2014-03-29 18:18:08 +08:00
|
|
|
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
|
2015-10-27 12:14:24 +08:00
|
|
|
if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
|
|
|
|
SelCondTy.getSimpleVT(),
|
|
|
|
SelValTy.getSimpleVT()))
|
|
|
|
return Entry->Cost;
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
}
|
2017-04-12 19:49:08 +08:00
|
|
|
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
|
2014-03-29 18:18:08 +08:00
|
|
|
}
|
|
|
|
|
2017-01-11 07:42:21 +08:00
|
|
|
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
|
2017-04-12 19:49:08 +08:00
|
|
|
unsigned Alignment, unsigned AddressSpace,
|
|
|
|
const Instruction *I) {
|
2017-01-11 07:42:21 +08:00
|
|
|
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
|
2014-03-29 18:18:08 +08:00
|
|
|
|
2016-12-16 02:36:59 +08:00
|
|
|
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
|
2017-01-11 07:42:21 +08:00
|
|
|
LT.second.is128BitVector() && Alignment < 16) {
|
|
|
|
// Unaligned stores are extremely inefficient. We don't split all
|
|
|
|
// unaligned 128-bit stores because the negative impact that has shown in
|
|
|
|
// practice on inlined block copy code.
|
|
|
|
// We make such stores expensive so that we will only vectorize if there
|
2014-03-29 18:18:08 +08:00
|
|
|
// are 6 other instructions getting vectorized.
|
2017-01-11 07:42:21 +08:00
|
|
|
const int AmortizationCost = 6;
|
2014-03-29 18:18:08 +08:00
|
|
|
|
|
|
|
return LT.first * 2 * AmortizationCost;
|
|
|
|
}
|
|
|
|
|
2017-01-11 07:42:21 +08:00
|
|
|
if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
|
|
|
|
Ty->getVectorNumElements() < 8) {
|
2014-03-29 18:18:08 +08:00
|
|
|
// We scalarize the loads/stores because there is not v.4b register and we
|
|
|
|
// have to promote the elements to v.4h.
|
2017-01-11 07:42:21 +08:00
|
|
|
unsigned NumVecElts = Ty->getVectorNumElements();
|
2014-03-29 18:18:08 +08:00
|
|
|
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
|
|
|
|
// We generate 2 instructions per vector element.
|
|
|
|
return NumVectorizableInstsToAmortize * NumVecElts * 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
return LT.first;
|
|
|
|
}
|
2014-08-05 20:30:34 +08:00
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
|
|
|
unsigned Factor,
|
|
|
|
ArrayRef<unsigned> Indices,
|
|
|
|
unsigned Alignment,
|
|
|
|
unsigned AddressSpace) {
|
[AArch64] Lower interleaved memory accesses to ldN/stN intrinsics. This patch also adds a function to calculate the cost of interleaved memory accesses.
E.g. Lower an interleaved load:
%wide.vec = load <8 x i32>, <8 x i32>* %ptr
%v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>
%v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>
into:
%ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
%vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
%vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
E.g. Lower an interleaved store:
%i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
store <12 x i32> %i.vec, <12 x i32>* %ptr
into:
%sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
%sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
%sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
Differential Revision: http://reviews.llvm.org/D10533
llvm-svn: 240754
2015-06-26 10:32:07 +08:00
|
|
|
assert(Factor >= 2 && "Invalid interleave factor");
|
|
|
|
assert(isa<VectorType>(VecTy) && "Expect a vector type");
|
|
|
|
|
|
|
|
if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
|
|
|
|
unsigned NumElts = VecTy->getVectorNumElements();
|
2017-04-11 02:34:37 +08:00
|
|
|
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
|
[AArch64] Lower interleaved memory accesses to ldN/stN intrinsics. This patch also adds a function to calculate the cost of interleaved memory accesses.
E.g. Lower an interleaved load:
%wide.vec = load <8 x i32>, <8 x i32>* %ptr
%v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>
%v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>
into:
%ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
%vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
%vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
E.g. Lower an interleaved store:
%i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
store <12 x i32> %i.vec, <12 x i32>* %ptr
into:
%sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
%sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
%sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
Differential Revision: http://reviews.llvm.org/D10533
llvm-svn: 240754
2015-06-26 10:32:07 +08:00
|
|
|
|
|
|
|
// ldN/stN only support legal vector types of size 64 or 128 in bits.
|
2017-03-02 23:15:35 +08:00
|
|
|
// Accesses having vector types that are a multiple of 128 bits can be
|
|
|
|
// matched to more than one ldN/stN instruction.
|
2017-04-11 02:34:37 +08:00
|
|
|
if (NumElts % Factor == 0 &&
|
|
|
|
TLI->isLegalInterleavedAccessType(SubVecTy, DL))
|
|
|
|
return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
|
[AArch64] Lower interleaved memory accesses to ldN/stN intrinsics. This patch also adds a function to calculate the cost of interleaved memory accesses.
E.g. Lower an interleaved load:
%wide.vec = load <8 x i32>, <8 x i32>* %ptr
%v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>
%v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>
into:
%ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
%vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
%vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
E.g. Lower an interleaved store:
%i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
store <12 x i32> %i.vec, <12 x i32>* %ptr
into:
%sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
%sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
%sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
Differential Revision: http://reviews.llvm.org/D10533
llvm-svn: 240754
2015-06-26 10:32:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
|
|
|
|
Alignment, AddressSpace);
|
|
|
|
}
|
|
|
|
|
2015-08-06 02:08:10 +08:00
|
|
|
// Returns the cost of keeping values of the given types live across a call.
//
// Only vector types whose total size is exactly 128 bits contribute; each
// one is charged the cost of one store plus one load of that type.
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  int SpillReloadCost = 0;
  for (auto *Ty : Tys) {
    const bool Is128BitVector =
        Ty->isVectorTy() &&
        Ty->getScalarSizeInBits() * Ty->getVectorNumElements() == 128;
    if (!Is128BitVector)
      continue;

    SpillReloadCost += getMemoryOpCost(Instruction::Store, Ty, 128, 0) +
                       getMemoryOpCost(Instruction::Load, Ty, 128, 0);
  }
  return SpillReloadCost;
}
|
2014-08-21 08:02:51 +08:00
|
|
|
|
2015-05-07 01:12:25 +08:00
|
|
|
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
|
2016-06-03 02:03:53 +08:00
|
|
|
return ST->getMaxInterleaveFactor();
|
2014-08-21 08:02:51 +08:00
|
|
|
}
|
2014-10-09 18:13:27 +08:00
|
|
|
|
2015-02-01 22:31:23 +08:00
|
|
|
void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
TTI::UnrollingPreferences &UP) {
|
2015-03-09 14:14:28 +08:00
|
|
|
// Enable partial unrolling and runtime unrolling.
|
|
|
|
BaseT::getUnrollingPreferences(L, UP);
|
|
|
|
|
|
|
|
// For inner loop, it is more likely to be a hot one, and the runtime check
|
|
|
|
// can be promoted out from LICM pass, so the overhead is less, let's try
|
|
|
|
// a larger threshold to unroll more loops.
|
|
|
|
if (L->getLoopDepth() > 1)
|
|
|
|
UP.PartialThreshold *= 2;
|
|
|
|
|
2014-10-09 18:13:27 +08:00
|
|
|
// Disable partial & runtime unrolling on -Os.
|
|
|
|
UP.PartialOptSizeThreshold = 0;
|
|
|
|
}
|
2015-01-27 06:51:15 +08:00
|
|
|
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
/// Try to obtain a value of type \p ExpectedType that represents the data
/// transferred by the NEON structured memory intrinsic \p Inst.
///
/// - ld2/ld3/ld4: the intrinsic call itself is returned, but only when its
///   result type is exactly \p ExpectedType.
/// - st2/st3/st4: every argument except the last one is packed into a new
///   aggregate of type \p ExpectedType via insertvalue instructions created
///   by a builder positioned at \p Inst. This requires \p ExpectedType to be
///   a struct whose element count and element types match those arguments.
///
/// \returns the value described above, or nullptr for any other intrinsic or
/// when the types do not line up.
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    // A structured load can be reused directly only on an exact type match.
    return Inst->getType() == ExpectedType ? Inst : nullptr;

  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // The stored data must be expressible as the expected struct type.
    StructType *StructTy = dyn_cast<StructType>(ExpectedType);
    if (!StructTy)
      return nullptr;

    // All arguments but the trailing one are the stored values.
    const unsigned NumStored = Inst->getNumArgOperands() - 1;
    if (StructTy->getNumElements() != NumStored)
      return nullptr;

    // Bail out before emitting anything if any element type disagrees.
    for (unsigned Idx = 0; Idx < NumStored; ++Idx)
      if (StructTy->getElementType(Idx) != Inst->getArgOperand(Idx)->getType())
        return nullptr;

    // Assemble the aggregate one stored value at a time.
    IRBuilder<> Builder(Inst);
    Value *Aggregate = UndefValue::get(ExpectedType);
    for (unsigned Idx = 0; Idx < NumStored; ++Idx)
      Aggregate =
          Builder.CreateInsertValue(Aggregate, Inst->getArgOperand(Idx), Idx);
    return Aggregate;
  }

  default:
    return nullptr;
  }
}
|
|
|
|
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason of course is that this is a *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
llvm-svn: 227669
2015-01-31 11:43:40 +08:00
|
|
|
/// Describe the memory behaviour of the NEON structured load/store intrinsic
/// \p Inst in \p Info.
///
/// For ld2/ld3/ld4 the intrinsic is recorded as reading (not writing) memory
/// through its first argument; for st2/st3/st4 it is recorded as writing (not
/// reading) memory through its last argument. Info.MatchingId is set to the
/// two-, three- or four-element identifier so that a load and a store of the
/// same width share an id.
///
/// \returns true if \p Inst is one of the intrinsics above; otherwise false,
/// in which case \p Info is left untouched.
bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) {
  bool IsLoad = false;

  // Classify the intrinsic: direction (load vs. store) and element count.
  switch (Inst->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::aarch64_neon_ld2:
    IsLoad = true;
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
    IsLoad = true;
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
    IsLoad = true;
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
  }

  Info.ReadMem = IsLoad;
  Info.WriteMem = !IsLoad;
  // Loads take the pointer as the first argument, stores as the last one.
  Info.PtrVal =
      Inst->getArgOperand(IsLoad ? 0 : Inst->getNumArgOperands() - 1);
  return true;
}
|
2016-03-18 08:27:29 +08:00
|
|
|
|
2017-04-04 03:20:07 +08:00
|
|
|
/// See if \p I should be considered for address type promotion. We check if \p
|
|
|
|
/// I is a sext with right type and used in memory accesses. If it used in a
|
|
|
|
/// "complex" getelementptr, we allow it to be promoted without finding other
|
|
|
|
/// sext instructions that sign extended the same initial value. A getelementptr
|
|
|
|
/// is considered as "complex" if it has more than 2 operands.
|
|
|
|
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
|
|
|
|
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
|
|
|
|
bool Considerable = false;
|
|
|
|
AllowPromotionWithoutCommonHeader = false;
|
|
|
|
if (!isa<SExtInst>(&I))
|
|
|
|
return false;
|
|
|
|
Type *ConsideredSExtType =
|
|
|
|
Type::getInt64Ty(I.getParent()->getParent()->getContext());
|
|
|
|
if (I.getType() != ConsideredSExtType)
|
|
|
|
return false;
|
|
|
|
// See if the sext is the one with the right type and used in at least one
|
|
|
|
// GetElementPtrInst.
|
|
|
|
for (const User *U : I.users()) {
|
|
|
|
if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
|
|
|
|
Considerable = true;
|
|
|
|
// A getelementptr is considered as "complex" if it has more than 2
|
|
|
|
// operands. We will promote a SExt used in such complex GEP as we
|
|
|
|
// expect some computation to be merged if they are done on 64 bits.
|
|
|
|
if (GEPInst->getNumOperands() > 2) {
|
|
|
|
AllowPromotionWithoutCommonHeader = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Considerable;
|
|
|
|
}
|
|
|
|
|
2016-03-18 08:27:29 +08:00
|
|
|
unsigned AArch64TTIImpl::getCacheLineSize() {
|
2016-06-03 02:03:53 +08:00
|
|
|
return ST->getCacheLineSize();
|
2016-03-18 08:27:29 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
unsigned AArch64TTIImpl::getPrefetchDistance() {
|
2016-06-03 02:03:53 +08:00
|
|
|
return ST->getPrefetchDistance();
|
2016-03-18 08:27:29 +08:00
|
|
|
}
|
2016-03-18 08:27:38 +08:00
|
|
|
|
|
|
|
unsigned AArch64TTIImpl::getMinPrefetchStride() {
|
2016-06-03 02:03:53 +08:00
|
|
|
return ST->getMinPrefetchStride();
|
2016-03-18 08:27:38 +08:00
|
|
|
}
|
2016-03-18 08:27:43 +08:00
|
|
|
|
|
|
|
unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
|
2016-06-03 02:03:53 +08:00
|
|
|
return ST->getMaxPrefetchIterationsAhead();
|
2016-03-18 08:27:43 +08:00
|
|
|
}
|