//===- AggressiveInstCombine.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the aggressive expression pattern combiner classes.
// Currently, it handles expression patterns for:
//  * Truncate instruction
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "AggressiveInstCombineInternal.h"
#include "llvm-c/Initialization.h"
#include "llvm-c/Transforms/AggressiveInstCombine.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "aggressive-instcombine"

namespace {
/// Contains expression pattern combiner logic.
/// This class provides both the logic to match expression patterns and
/// combine them. It differs from the InstCombiner class in that each pattern
/// combiner runs only once, as opposed to InstCombine's multi-iteration
/// scheme, which allows a pattern combiner to have higher complexity than the
/// O(1) required by the instruction combiner.
class AggressiveInstCombinerLegacyPass : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  AggressiveInstCombinerLegacyPass() : FunctionPass(ID) {
    initializeAggressiveInstCombinerLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Run all expression pattern optimizations on the given \p F function.
  ///
  /// \param F function to optimize.
  /// \returns true if the IR is changed.
  bool runOnFunction(Function &F) override;
};
} // namespace

/// Match a pattern for a bitwise rotate operation that partially guards
/// against undefined behavior by branching around the rotation when the shift
/// amount is 0.
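///
/// As an illustrative sketch (this specific source form is an assumption for
/// exposition, not taken verbatim from a test case), this matches the IR
/// emitted for guarded-rotate C code such as:
///
///   unsigned rot = (Amt == 0) ? X : ((X << Amt) | (X >> (32 - Amt)));
///
/// where the branch guards against the undefined shift-by-32, and the whole
/// diamond can be collapsed into a single funnel-shift intrinsic call.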
static bool foldGuardedRotateToFunnelShift(Instruction &I) {
  if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
    return false;

  // As with the one-use checks below, this is not strictly necessary, but we
  // are being cautious to avoid potential perf regressions on targets that
  // do not actually have a rotate instruction (where the funnel shift would be
  // expanded back into math/shift/logic ops).
  if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
    return false;

  // Match V to funnel shift left/right and capture the source operand and
  // shift amount in X and Y.
  auto matchRotate = [](Value *V, Value *&X, Value *&Y) {
    Value *L0, *L1, *R0, *R1;
    unsigned Width = V->getType()->getScalarSizeInBits();
    auto Sub = m_Sub(m_SpecificInt(Width), m_Value(R1));

    // rotate_left(X, Y) == (X << Y) | (X >> (Width - Y))
    auto RotL = m_OneUse(
        m_c_Or(m_Shl(m_Value(L0), m_Value(L1)), m_LShr(m_Value(R0), Sub)));
    if (RotL.match(V) && L0 == R0 && L1 == R1) {
      X = L0;
      Y = L1;
      return Intrinsic::fshl;
    }

    // rotate_right(X, Y) == (X >> Y) | (X << (Width - Y))
    auto RotR = m_OneUse(
        m_c_Or(m_LShr(m_Value(L0), m_Value(L1)), m_Shl(m_Value(R0), Sub)));
    if (RotR.match(V) && L0 == R0 && L1 == R1) {
      X = L0;
      Y = L1;
      return Intrinsic::fshr;
    }

    return Intrinsic::not_intrinsic;
  };

  // One phi operand must be a rotate operation, and the other phi operand must
  // be the source value of that rotate operation:
  // phi [ rotate(RotSrc, RotAmt), RotBB ], [ RotSrc, GuardBB ]
  PHINode &Phi = cast<PHINode>(I);
  Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
  Value *RotSrc, *RotAmt;
  Intrinsic::ID IID = matchRotate(P0, RotSrc, RotAmt);
  if (IID == Intrinsic::not_intrinsic || RotSrc != P1) {
    IID = matchRotate(P1, RotSrc, RotAmt);
    if (IID == Intrinsic::not_intrinsic || RotSrc != P0)
      return false;
    assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
           "Pattern must match funnel shift left or right");
  }

  // The incoming block with our source operand must be the "guard" block.
  // That must contain a cmp+branch to avoid the rotate when the shift amount
  // is equal to 0. The other incoming block is the block with the rotate.
  BasicBlock *GuardBB = Phi.getIncomingBlock(RotSrc == P1);
  BasicBlock *RotBB = Phi.getIncomingBlock(RotSrc != P1);
  Instruction *TermI = GuardBB->getTerminator();
  ICmpInst::Predicate Pred;
  BasicBlock *PhiBB = Phi.getParent();
  if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(RotAmt), m_ZeroInt()),
                         m_SpecificBB(PhiBB), m_SpecificBB(RotBB))))
    return false;

  if (Pred != CmpInst::ICMP_EQ)
    return false;

  // We matched a variation of this IR pattern:
  // GuardBB:
  //   %cmp = icmp eq i32 %RotAmt, 0
  //   br i1 %cmp, label %PhiBB, label %RotBB
  // RotBB:
  //   %sub = sub i32 32, %RotAmt
  //   %shr = lshr i32 %X, %sub
  //   %shl = shl i32 %X, %RotAmt
  //   %rot = or i32 %shr, %shl
  //   br label %PhiBB
  // PhiBB:
  //   %cond = phi i32 [ %rot, %RotBB ], [ %X, %GuardBB ]
  // -->
  //   llvm.fshl.i32(i32 %X, i32 %X, i32 %RotAmt)
  IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
  Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
  Phi.replaceAllUsesWith(Builder.CreateCall(F, {RotSrc, RotSrc, RotAmt}));
  return true;
}

/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
/// the bit indexes (Mask) needed by a masked compare. If we're matching a
/// chain of 'and' ops, then we also need to capture the fact that we saw an
/// "and X, 1", so that's an extra return value for that case.
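///
/// For example (illustrative, derived from the matchAndOrChain() examples
/// below): matching "and (and (X >> 1), 1), (X >> 4)" ends with Root == X,
/// Mask == 0x12, MatchAndChain == true, and FoundAnd1 == true.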
struct MaskOps {
  Value *Root;
  APInt Mask;
  bool MatchAndChain;
  bool FoundAnd1;

  MaskOps(unsigned BitWidth, bool MatchAnds)
      : Root(nullptr), Mask(APInt::getNullValue(BitWidth)),
        MatchAndChain(MatchAnds), FoundAnd1(false) {}
};

/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
/// chain of 'and' or 'or' instructions looking for shift ops of a common
/// source value. Examples:
///   or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
/// returns { X, 0x129 }
///   and (and (X >> 1), 1), (X >> 4)
/// returns { X, 0x12 }
static bool matchAndOrChain(Value *V, MaskOps &MOps) {
  Value *Op0, *Op1;
  if (MOps.MatchAndChain) {
    // Recurse through a chain of 'and' operands. This requires an extra check
    // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
    // in the chain to know that all of the high bits are cleared.
    if (match(V, m_And(m_Value(Op0), m_One()))) {
      MOps.FoundAnd1 = true;
      return matchAndOrChain(Op0, MOps);
    }
    if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
      return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
  } else {
    // Recurse through a chain of 'or' operands.
    if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
      return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
  }

  // We need a shift-right or a bare value representing a compare of bit 0 of
  // the original source operand.
  Value *Candidate;
  uint64_t BitIndex = 0;
  if (!match(V, m_LShr(m_Value(Candidate), m_ConstantInt(BitIndex))))
    Candidate = V;

  // Initialize the result source operand.
  if (!MOps.Root)
    MOps.Root = Candidate;

  // If the shift constant is out-of-range, this code hasn't been simplified,
  // so bail out.
  if (BitIndex >= MOps.Mask.getBitWidth())
    return false;

  // Fill in the mask bit derived from the shift constant.
  MOps.Mask.setBit(BitIndex);
  return MOps.Root == Candidate;
}

/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
/// These will include a chain of 'or' or 'and'-shifted bits from a
/// common source value:
///   and (or  (lshr X, C), ...), 1 --> (X & CMask) != 0
///   and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
/// Note: "any-bits-clear" and "all-bits-clear" are variations of these
/// patterns that differ only with a final 'not' of the result. We expect that
/// final 'not' to be folded with the compare that we create here (invert
/// predicate).
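///
/// A sketch at the source level (illustrative expressions assumed here, not
/// taken from a test case):
///   ((x >> 1) | (x >> 4)) & 1    // any-bits-set --> (x & 0x12) != 0
///   (x >> 1) & (x >> 4) & 1      // all-bits-set --> (x & 0x12) == 0x12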
static bool foldAnyOrAllBitsSet(Instruction &I) {
  // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
  // "and X, 1" instruction must be the final op in the sequence.
  bool MatchAllBitsSet;
  if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value())))
    MatchAllBitsSet = true;
  else if (match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
    MatchAllBitsSet = false;
  else
    return false;

  MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
  if (MatchAllBitsSet) {
    if (!matchAndOrChain(cast<BinaryOperator>(&I), MOps) || !MOps.FoundAnd1)
      return false;
  } else {
    if (!matchAndOrChain(cast<BinaryOperator>(&I)->getOperand(0), MOps))
      return false;
  }

  // The pattern was found. Create a masked compare that replaces all of the
  // shift and logic ops.
  IRBuilder<> Builder(&I);
  Constant *Mask = ConstantInt::get(I.getType(), MOps.Mask);
  Value *And = Builder.CreateAnd(MOps.Root, Mask);
  Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask)
                               : Builder.CreateIsNotNull(And);
  Value *Zext = Builder.CreateZExt(Cmp, I.getType());
  I.replaceAllUsesWith(Zext);
  return true;
}

// Try to recognize the function below as a popcount intrinsic.
// This is the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
// Also used in TargetLowering::expandCTPOP().
//
// int popcount(unsigned int i) {
//   i = i - ((i >> 1) & 0x55555555);
//   i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
//   i = ((i + (i >> 4)) & 0x0F0F0F0F);
//   return (i * 0x01010101) >> 24;
// }
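//
// When the whole chain is matched, the lshr at the root of the expression is
// replaced with a single intrinsic call; e.g. for a 32-bit input this becomes
// (illustrative):
//   %res = call i32 @llvm.ctpop.i32(i32 %i)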
static bool tryToRecognizePopCount(Instruction &I) {
  if (I.getOpcode() != Instruction::LShr)
    return false;

  Type *Ty = I.getType();
  if (!Ty->isIntOrIntVectorTy())
    return false;

  unsigned Len = Ty->getScalarSizeInBits();
  // FIXME: fix Len == 8 and other irregular type lengths.
  if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
    return false;

  APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
  APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
  APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
  APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
  APInt MaskShift = APInt(Len, Len - 8);
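  // For example, with Len == 32 these are the familiar constants from the
  // reference C code above: Mask55 == 0x55555555, Mask33 == 0x33333333,
  // Mask0F == 0x0F0F0F0F, Mask01 == 0x01010101, and MaskShift == 24.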

  Value *Op0 = I.getOperand(0);
  Value *Op1 = I.getOperand(1);
  Value *MulOp0;
  // Matching "(i * 0x01010101...) >> 24".
  if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
      match(Op1, m_SpecificInt(MaskShift))) {
    Value *ShiftOp0;
    // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
    if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
                                    m_Deferred(ShiftOp0)),
                            m_SpecificInt(Mask0F)))) {
      Value *AndOp0;
      // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
      if (match(ShiftOp0,
                m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
                        m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)),
                              m_SpecificInt(Mask33))))) {
        Value *Root, *SubOp1;
        // Matching "i - ((i >> 1) & 0x55555555...)".
        if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
            match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
                                m_SpecificInt(Mask55)))) {
          LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
          IRBuilder<> Builder(&I);
          Function *Func = Intrinsic::getDeclaration(
              I.getModule(), Intrinsic::ctpop, I.getType());
          I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
          return true;
        }
      }
    }
  }

  return false;
}

/// This is the entry point for folds that could be implemented in regular
/// InstCombine, but they are separated because they are not expected to
/// occur frequently and/or have more than a constant-length pattern match.
static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
  bool MadeChange = false;
  for (BasicBlock &BB : F) {
    // Ignore unreachable basic blocks.
    if (!DT.isReachableFromEntry(&BB))
      continue;
    // Do not delete instructions under here; that would invalidate the
    // iterator.
    // Walk the block backwards for efficiency. We're matching a chain of
    // use->defs, so we're more likely to succeed by starting from the bottom.
    // Also, we want to avoid matching partial patterns.
    // TODO: It would be more efficient if we removed dead instructions
    // iteratively in this loop rather than waiting until the end.
    for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
      MadeChange |= foldAnyOrAllBitsSet(I);
      MadeChange |= foldGuardedRotateToFunnelShift(I);
      MadeChange |= tryToRecognizePopCount(I);
    }
  }

  // We're done with transforms, so remove dead instructions.
  if (MadeChange)
    for (BasicBlock &BB : F)
      SimplifyInstructionsInBlock(&BB);

  return MadeChange;
}

/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT) {
  bool MadeChange = false;
  const DataLayout &DL = F.getParent()->getDataLayout();
  TruncInstCombine TIC(TLI, DL, DT);
  MadeChange |= TIC.run(F);
  MadeChange |= foldUnusualPatterns(F, DT);
  return MadeChange;
}

void AggressiveInstCombinerLegacyPass::getAnalysisUsage(
    AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<TargetLibraryInfoWrapperPass>();
  AU.addPreserved<AAResultsWrapperPass>();
  AU.addPreserved<BasicAAWrapperPass>();
  AU.addPreserved<DominatorTreeWrapperPass>();
  AU.addPreserved<GlobalsAAWrapperPass>();
}

bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) {
  auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  return runImpl(F, TLI, DT);
}

PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  if (!runImpl(F, TLI, DT)) {
    // No changes, all analyses are preserved.
    return PreservedAnalyses::all();
  }
  // Mark all the analyses that instcombine updates as preserved.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  PA.preserve<AAManager>();
  PA.preserve<GlobalsAA>();
  return PA;
}

char AggressiveInstCombinerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(AggressiveInstCombinerLegacyPass,
                      "aggressive-instcombine",
                      "Combine pattern based expressions", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine",
                    "Combine pattern based expressions", false, false)

// Initialization Routines
void llvm::initializeAggressiveInstCombine(PassRegistry &Registry) {
  initializeAggressiveInstCombinerLegacyPassPass(Registry);
}

void LLVMInitializeAggressiveInstCombiner(LLVMPassRegistryRef R) {
  initializeAggressiveInstCombinerLegacyPassPass(*unwrap(R));
}

FunctionPass *llvm::createAggressiveInstCombinerPass() {
  return new AggressiveInstCombinerLegacyPass();
}

void LLVMAddAggressiveInstCombinerPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createAggressiveInstCombinerPass());
}