2004-10-19 05:08:22 +08:00
|
|
|
//===- LoopStrengthReduce.cpp - Strength Reduce GEPs in Loops -------------===//
|
2005-04-22 07:48:37 +08:00
|
|
|
//
|
2004-10-19 05:08:22 +08:00
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 04:36:04 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2005-04-22 07:48:37 +08:00
|
|
|
//
|
2004-10-19 05:08:22 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This pass performs a strength reduction on array references inside loops that
|
|
|
|
// have as one or more of their components the loop induction variable. This is
|
|
|
|
// accomplished by creating a new Value to hold the initial value of the array
|
|
|
|
// access for the first iteration, and then creating a new GEP instruction in
|
|
|
|
// the loop to increment the value by the appropriate amount.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2005-08-04 07:30:08 +08:00
|
|
|
#define DEBUG_TYPE "loop-reduce"
|
2004-10-19 05:08:22 +08:00
|
|
|
#include "llvm/Transforms/Scalar.h"
|
|
|
|
#include "llvm/Constants.h"
|
|
|
|
#include "llvm/Instructions.h"
|
2007-05-04 22:59:09 +08:00
|
|
|
#include "llvm/IntrinsicInst.h"
|
2004-10-19 05:08:22 +08:00
|
|
|
#include "llvm/Type.h"
|
2005-03-04 12:04:26 +08:00
|
|
|
#include "llvm/DerivedTypes.h"
|
2004-10-19 05:08:22 +08:00
|
|
|
#include "llvm/Analysis/Dominators.h"
|
|
|
|
#include "llvm/Analysis/LoopInfo.h"
|
2007-03-07 05:14:09 +08:00
|
|
|
#include "llvm/Analysis/LoopPass.h"
|
2005-07-30 08:15:07 +08:00
|
|
|
#include "llvm/Analysis/ScalarEvolutionExpander.h"
|
2004-10-19 05:08:22 +08:00
|
|
|
#include "llvm/Support/CFG.h"
|
2005-07-30 08:15:07 +08:00
|
|
|
#include "llvm/Support/GetElementPtrTypeIterator.h"
|
Fix a FIXME: if we are inserting code for a PHI argument, split the critical
edge so that the code is not always executed for both operands. This
prevents LSR from inserting code into loops whose exit blocks contain
PHI uses of IV expressions (which are outside of loops). On gzip, for
example, we turn this ugly code:
.LBB_test_1: ; loopentry
add r27, r3, r28
lhz r27, 3(r27)
add r26, r4, r28
lhz r26, 3(r26)
add r25, r30, r28 ;; Only live if exiting the loop
add r24, r29, r28 ;; Only live if exiting the loop
cmpw cr0, r27, r26
bne .LBB_test_5 ; loopexit
into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_test_9 ; loopexit
.LBB_test_2: ; shortcirc_next.0
...
blt .LBB_test_1
into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_t_3: ; shortcirc_next.0
.LBB_test_3: ; shortcirc_next.0
...
blt .LBB_test_1
Next step: get the block out of the loop so that the loop is all
fall-throughs again.
llvm-svn: 22766
2005-08-13 06:06:11 +08:00
|
|
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
2004-10-19 05:08:22 +08:00
|
|
|
#include "llvm/Transforms/Utils/Local.h"
|
2005-03-04 12:04:26 +08:00
|
|
|
#include "llvm/Target/TargetData.h"
|
2007-10-27 07:08:19 +08:00
|
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
2004-10-19 05:08:22 +08:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
2005-07-30 08:15:07 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2006-08-27 20:54:02 +08:00
|
|
|
#include "llvm/Support/Compiler.h"
|
2006-03-14 07:14:23 +08:00
|
|
|
#include "llvm/Target/TargetLowering.h"
|
2005-07-31 02:22:27 +08:00
|
|
|
#include <algorithm>
|
2004-10-19 05:08:22 +08:00
|
|
|
#include <set>
|
|
|
|
using namespace llvm;
|
|
|
|
|
2007-10-25 17:11:16 +08:00
|
|
|
STATISTIC(NumReduced , "Number of GEPs strength reduced");
|
|
|
|
STATISTIC(NumInserted, "Number of PHIs inserted");
|
|
|
|
STATISTIC(NumVariable, "Number of PHIs with variable strides");
|
2008-08-28 01:50:18 +08:00
|
|
|
STATISTIC(NumEliminated, "Number of strides eliminated");
|
|
|
|
STATISTIC(NumShadow, "Number of Shadow IVs optimized");
|
2004-10-19 05:08:22 +08:00
|
|
|
|
2006-12-20 05:40:18 +08:00
|
|
|
namespace {
|
2007-03-20 08:47:50 +08:00
|
|
|
|
2007-03-21 04:43:18 +08:00
|
|
|
struct BasedUser;
|
2007-03-20 08:47:50 +08:00
|
|
|
|
2005-08-04 06:21:05 +08:00
|
|
|
/// IVStrideUse - Keep track of one use of a strided induction variable, where
|
|
|
|
/// the stride is stored externally. The Offset member keeps track of the
|
2007-10-30 03:32:39 +08:00
|
|
|
/// offset from the IV, User is the actual user of the operand, and
|
|
|
|
/// 'OperandValToReplace' is the operand of the User that is the use.
|
2007-02-06 07:32:05 +08:00
|
|
|
struct VISIBILITY_HIDDEN IVStrideUse {
|
2005-08-04 06:21:05 +08:00
|
|
|
SCEVHandle Offset;
|
|
|
|
Instruction *User;
|
|
|
|
Value *OperandValToReplace;
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
|
|
|
|
// isUseOfPostIncrementedValue - True if this should use the
|
|
|
|
// post-incremented version of this IV, not the preincremented version.
|
|
|
|
// This can only be set in special cases, such as the terminating setcc
|
_test:
li r2, 0
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r2, 1
stw r2, 0(r4)
blr
[zion ~/llvm]$ cat > ~/xx
Uses of IV's outside of the loop should use hte post-incremented version
of the IV, not the preincremented version. This helps many loops (e.g. in sixtrack)
which used to generate code like this (this is the code from the
dont-hoist-simple-loop-constants.ll testcase):
_test:
li r2, 0 **** IV starts at 0
LBB_test_1: ; no_exit.2
or r5, r2, r2 **** Copy for loop exit
li r2, 0
stw r2, 0(r3)
addi r3, r3, 4
addi r2, r5, 1
addi r6, r5, 2 **** IV+2
cmpwi cr0, r6, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r5, 2 **** IV+2
stw r2, 0(r4)
blr
And now generated code like this:
_test:
li r2, 1 *** IV starts at 1
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701 *** IV.postinc + 0
blt cr0, LBB_test_1
LBB_test_2: ; loopexit.2.loopexit
stw r2, 0(r4) *** IV.postinc + 0
blr
llvm-svn: 23313
2005-09-12 14:04:47 +08:00
|
|
|
// instruction for a loop or uses dominated by the loop.
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
bool isUseOfPostIncrementedValue;
|
2005-08-04 06:21:05 +08:00
|
|
|
|
|
|
|
IVStrideUse(const SCEVHandle &Offs, Instruction *U, Value *O)
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
: Offset(Offs), User(U), OperandValToReplace(O),
|
|
|
|
isUseOfPostIncrementedValue(false) {}
|
2005-08-04 06:21:05 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/// IVUsersOfOneStride - This structure keeps track of all instructions that
|
|
|
|
/// have an operand that is based on the trip count multiplied by some stride.
|
|
|
|
/// The stride for all of these users is common and kept external to this
|
|
|
|
/// structure.
|
2007-02-06 07:32:05 +08:00
|
|
|
struct VISIBILITY_HIDDEN IVUsersOfOneStride {
|
2005-07-30 08:15:07 +08:00
|
|
|
/// Users - Keep track of all of the users of this stride as well as the
|
2005-08-04 06:21:05 +08:00
|
|
|
/// initial value and the operand that uses the IV.
|
|
|
|
std::vector<IVStrideUse> Users;
|
|
|
|
|
|
|
|
void addUser(const SCEVHandle &Offset,Instruction *User, Value *Operand) {
|
|
|
|
Users.push_back(IVStrideUse(Offset, User, Operand));
|
2005-07-30 08:15:07 +08:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2006-03-17 05:53:05 +08:00
|
|
|
/// IVInfo - This structure keeps track of one IV expression inserted during
|
2006-03-18 16:03:12 +08:00
|
|
|
/// StrengthReduceStridedIVUsers. It contains the stride, the common base, as
|
|
|
|
/// well as the PHI node and increment value created for rewrite.
|
2007-02-06 07:32:05 +08:00
|
|
|
struct VISIBILITY_HIDDEN IVExpr {
|
2006-03-18 16:03:12 +08:00
|
|
|
SCEVHandle Stride;
|
2006-03-17 05:53:05 +08:00
|
|
|
SCEVHandle Base;
|
|
|
|
PHINode *PHI;
|
|
|
|
Value *IncV;
|
|
|
|
|
2006-03-18 16:03:12 +08:00
|
|
|
IVExpr(const SCEVHandle &stride, const SCEVHandle &base, PHINode *phi,
|
|
|
|
Value *incv)
|
|
|
|
: Stride(stride), Base(base), PHI(phi), IncV(incv) {}
|
2006-03-17 05:53:05 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/// IVsOfOneStride - This structure keeps track of all IV expression inserted
|
|
|
|
/// during StrengthReduceStridedIVUsers for a particular stride of the IV.
|
2007-02-06 07:32:05 +08:00
|
|
|
struct VISIBILITY_HIDDEN IVsOfOneStride {
|
2006-03-17 05:53:05 +08:00
|
|
|
std::vector<IVExpr> IVs;
|
|
|
|
|
2006-03-18 16:03:12 +08:00
|
|
|
void addIV(const SCEVHandle &Stride, const SCEVHandle &Base, PHINode *PHI,
|
|
|
|
Value *IncV) {
|
|
|
|
IVs.push_back(IVExpr(Stride, Base, PHI, IncV));
|
2006-03-17 05:53:05 +08:00
|
|
|
}
|
|
|
|
};
|
2005-07-30 08:15:07 +08:00
|
|
|
|
2007-03-07 05:14:09 +08:00
|
|
|
class VISIBILITY_HIDDEN LoopStrengthReduce : public LoopPass {
|
2004-10-19 05:08:22 +08:00
|
|
|
LoopInfo *LI;
|
2007-06-08 05:42:15 +08:00
|
|
|
DominatorTree *DT;
|
2005-07-30 08:15:07 +08:00
|
|
|
ScalarEvolution *SE;
|
|
|
|
const TargetData *TD;
|
|
|
|
const Type *UIntPtrTy;
|
2004-10-19 05:08:22 +08:00
|
|
|
bool Changed;
|
2005-08-02 10:52:02 +08:00
|
|
|
|
2005-07-30 08:15:07 +08:00
|
|
|
/// IVUsesByStride - Keep track of all uses of induction variables that we
|
|
|
|
/// are interested in. The key of the map is the stride of the access.
|
Teach LSR to strength reduce IVs that have a loop-invariant but non-constant stride.
For code like this:
void foo(float *a, float *b, int n, int stride_a, int stride_b) {
int i;
for (i=0; i<n; i++)
a[i*stride_a] = b[i*stride_b];
}
we now emit:
.LBB_foo2_2: ; no_exit
lfs f0, 0(r4)
stfs f0, 0(r3)
addi r7, r7, 1
add r4, r2, r4
add r3, r6, r3
cmpw cr0, r7, r5
blt .LBB_foo2_2 ; no_exit
instead of:
.LBB_foo_2: ; no_exit
mullw r8, r2, r7 ;; multiply!
slwi r8, r8, 2
lfsx f0, r4, r8
mullw r8, r2, r6 ;; multiply!
slwi r8, r8, 2
stfsx f0, r3, r8
addi r2, r2, 1
cmpw cr0, r2, r5
blt .LBB_foo_2 ; no_exit
loops with variable strides occur pretty often. For example, in SPECFP2K
there are 317 variable strides in 177.mesa, 3 in 179.art, 14 in 188.ammp,
56 in 168.wupwise, 36 in 172.mgrid.
Now we can allow indvars to turn functions written like this:
void foo2(float *a, float *b, int n, int stride_a, int stride_b) {
int i, ai = 0, bi = 0;
for (i=0; i<n; i++)
{
a[ai] = b[bi];
ai += stride_a;
bi += stride_b;
}
}
into code like the above for better analysis. With this patch, they generate
identical code.
llvm-svn: 22740
2005-08-10 08:45:21 +08:00
|
|
|
std::map<SCEVHandle, IVUsersOfOneStride> IVUsesByStride;
|
2005-07-30 08:15:07 +08:00
|
|
|
|
2006-03-17 05:53:05 +08:00
|
|
|
/// IVsByStride - Keep track of all IVs that have been inserted for a
|
|
|
|
/// particular stride.
|
|
|
|
std::map<SCEVHandle, IVsOfOneStride> IVsByStride;
|
|
|
|
|
2005-10-09 14:20:55 +08:00
|
|
|
/// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
|
|
|
|
/// We use this to iterate over the IVUsesByStride collection without being
|
|
|
|
/// dependent on random ordering of pointers in the process.
|
2007-10-31 06:27:26 +08:00
|
|
|
SmallVector<SCEVHandle, 16> StrideOrder;
|
2005-10-09 14:20:55 +08:00
|
|
|
|
2005-08-04 09:19:13 +08:00
|
|
|
/// CastedValues - As we need to cast values to uintptr_t, this keeps track
|
|
|
|
/// of the casted version of each value. This is accessed by
|
|
|
|
/// getCastedVersionOf.
|
2007-10-31 06:27:26 +08:00
|
|
|
DenseMap<Value*, Value*> CastedPointers;
|
2005-07-30 08:15:07 +08:00
|
|
|
|
|
|
|
/// DeadInsts - Keep track of instructions we may have made dead, so that
|
|
|
|
/// we can remove them after we are done working.
|
2008-12-01 14:27:41 +08:00
|
|
|
SmallVector<Instruction*, 16> DeadInsts;
|
2006-03-14 07:14:23 +08:00
|
|
|
|
|
|
|
/// TLI - Keep a pointer of a TargetLowering to consult for determining
|
|
|
|
/// transformation profitability.
|
|
|
|
const TargetLowering *TLI;
|
|
|
|
|
2004-10-19 05:08:22 +08:00
|
|
|
public:
|
2007-05-03 09:11:54 +08:00
|
|
|
static char ID; // Pass ID, replacement for typeid
|
2007-08-01 23:32:29 +08:00
|
|
|
explicit LoopStrengthReduce(const TargetLowering *tli = NULL) :
|
2008-09-05 01:05:41 +08:00
|
|
|
LoopPass(&ID), TLI(tli) {
|
2005-03-04 12:04:26 +08:00
|
|
|
}
|
|
|
|
|
2007-03-07 05:14:09 +08:00
|
|
|
bool runOnLoop(Loop *L, LPPassManager &LPM);
|
2004-10-19 05:08:22 +08:00
|
|
|
|
|
|
|
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
|
2005-08-17 14:35:16 +08:00
|
|
|
// We split critical edges, so we change the CFG. However, we do update
|
|
|
|
// many analyses if they are around.
|
|
|
|
AU.addPreservedID(LoopSimplifyID);
|
|
|
|
AU.addPreserved<LoopInfo>();
|
|
|
|
AU.addPreserved<DominanceFrontier>();
|
|
|
|
AU.addPreserved<DominatorTree>();
|
|
|
|
|
2005-02-28 03:37:07 +08:00
|
|
|
AU.addRequiredID(LoopSimplifyID);
|
2004-10-19 05:08:22 +08:00
|
|
|
AU.addRequired<LoopInfo>();
|
2007-06-08 05:42:15 +08:00
|
|
|
AU.addRequired<DominatorTree>();
|
2005-03-04 12:04:26 +08:00
|
|
|
AU.addRequired<TargetData>();
|
2005-07-30 08:15:07 +08:00
|
|
|
AU.addRequired<ScalarEvolution>();
|
2008-08-27 01:57:54 +08:00
|
|
|
AU.addPreserved<ScalarEvolution>();
|
2004-10-19 05:08:22 +08:00
|
|
|
}
|
2005-08-04 09:19:13 +08:00
|
|
|
|
|
|
|
/// getCastedVersionOf - Return the specified value casted to uintptr_t.
|
|
|
|
///
|
2006-12-13 16:06:42 +08:00
|
|
|
Value *getCastedVersionOf(Instruction::CastOps opcode, Value *V);
|
2005-08-04 09:19:13 +08:00
|
|
|
private:
|
2005-08-05 01:40:30 +08:00
|
|
|
bool AddUsersIfInteresting(Instruction *I, Loop *L,
|
2007-10-27 07:08:19 +08:00
|
|
|
SmallPtrSet<Instruction*,16> &Processed);
|
2007-10-30 03:31:25 +08:00
|
|
|
SCEVHandle GetExpressionSCEV(Instruction *E);
|
2007-10-25 17:11:16 +08:00
|
|
|
ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond,
|
|
|
|
IVStrideUse* &CondUse,
|
|
|
|
const SCEVHandle* &CondStride);
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
void OptimizeIndvars(Loop *L);
|
2008-08-27 01:57:54 +08:00
|
|
|
|
|
|
|
/// OptimizeShadowIV - If IV is used in a int-to-float cast
|
|
|
|
/// inside the loop then try to eliminate the cast opeation.
|
|
|
|
void OptimizeShadowIV(Loop *L);
|
|
|
|
|
2008-09-16 05:22:06 +08:00
|
|
|
/// OptimizeSMax - Rewrite the loop's terminating condition
|
|
|
|
/// if it uses an smax computation.
|
|
|
|
ICmpInst *OptimizeSMax(Loop *L, ICmpInst *Cond,
|
|
|
|
IVStrideUse* &CondUse);
|
|
|
|
|
2008-08-14 04:31:11 +08:00
|
|
|
bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
|
2008-08-27 01:57:54 +08:00
|
|
|
const SCEVHandle *&CondStride);
|
2007-10-26 06:45:20 +08:00
|
|
|
bool RequiresTypeConversion(const Type *Ty, const Type *NewTy);
|
Loosen up iv reuse to allow reuse of the same stride but a larger type when truncating from the larger type to smaller type is free.
e.g.
Turns this loop:
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
movw %dx, %si
LBB1_2: # bb
movl L_X$non_lazy_ptr, %edi
movw %si, (%edi)
movl L_Y$non_lazy_ptr, %edi
movw %dx, (%edi)
addw $4, %dx
incw %si
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
into
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
LBB1_2: # bb
movl L_X$non_lazy_ptr, %esi
movw %cx, (%esi)
movl L_Y$non_lazy_ptr, %esi
movw %dx, (%esi)
addw $4, %dx
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
llvm-svn: 43375
2007-10-26 09:56:11 +08:00
|
|
|
unsigned CheckForIVReuse(bool, bool, const SCEVHandle&,
|
2007-10-23 04:40:42 +08:00
|
|
|
IVExpr&, const Type*,
|
2007-03-20 08:47:50 +08:00
|
|
|
const std::vector<BasedUser>& UsersToProcess);
|
2007-10-23 04:40:42 +08:00
|
|
|
bool ValidStride(bool, int64_t,
|
|
|
|
const std::vector<BasedUser>& UsersToProcess);
|
2007-10-26 06:45:20 +08:00
|
|
|
SCEVHandle CollectIVUsers(const SCEVHandle &Stride,
|
|
|
|
IVUsersOfOneStride &Uses,
|
|
|
|
Loop *L,
|
|
|
|
bool &AllUsesAreAddresses,
|
|
|
|
std::vector<BasedUser> &UsersToProcess);
|
Teach LSR to strength reduce IVs that have a loop-invariant but non-constant stride.
For code like this:
void foo(float *a, float *b, int n, int stride_a, int stride_b) {
int i;
for (i=0; i<n; i++)
a[i*stride_a] = b[i*stride_b];
}
we now emit:
.LBB_foo2_2: ; no_exit
lfs f0, 0(r4)
stfs f0, 0(r3)
addi r7, r7, 1
add r4, r2, r4
add r3, r6, r3
cmpw cr0, r7, r5
blt .LBB_foo2_2 ; no_exit
instead of:
.LBB_foo_2: ; no_exit
mullw r8, r2, r7 ;; multiply!
slwi r8, r8, 2
lfsx f0, r4, r8
mullw r8, r2, r6 ;; multiply!
slwi r8, r8, 2
stfsx f0, r3, r8
addi r2, r2, 1
cmpw cr0, r2, r5
blt .LBB_foo_2 ; no_exit
loops with variable strides occur pretty often. For example, in SPECFP2K
there are 317 variable strides in 177.mesa, 3 in 179.art, 14 in 188.ammp,
56 in 168.wupwise, 36 in 172.mgrid.
Now we can allow indvars to turn functions written like this:
void foo2(float *a, float *b, int n, int stride_a, int stride_b) {
int i, ai = 0, bi = 0;
for (i=0; i<n; i++)
{
a[ai] = b[bi];
ai += stride_a;
bi += stride_b;
}
}
into code like the above for better analysis. With this patch, they generate
identical code.
llvm-svn: 22740
2005-08-10 08:45:21 +08:00
|
|
|
void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
|
|
|
|
IVUsersOfOneStride &Uses,
|
2005-08-04 06:21:05 +08:00
|
|
|
Loop *L, bool isOnlyStride);
|
2008-12-01 14:14:28 +08:00
|
|
|
void DeleteTriviallyDeadInstructions();
|
2004-10-19 05:08:22 +08:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2008-05-13 08:00:25 +08:00
|
|
|
char LoopStrengthReduce::ID = 0;
|
|
|
|
static RegisterPass<LoopStrengthReduce>
|
|
|
|
X("loop-reduce", "Loop Strength Reduction");
|
|
|
|
|
2008-10-23 07:32:42 +08:00
|
|
|
/// createLoopStrengthReducePass - Allocate a new LoopStrengthReduce pass that
/// consults the given TargetLowering and return it to the caller.
Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
  LoopStrengthReduce *NewPass = new LoopStrengthReduce(TLI);
  return NewPass;
}
|
|
|
|
|
2006-12-12 13:05:00 +08:00
|
|
|
/// getCastedVersionOf - Return the specified value casted to uintptr_t. This
|
|
|
|
/// assumes that the Value* V is of integer or pointer type only.
|
2005-08-04 09:19:13 +08:00
|
|
|
///
|
2006-12-13 16:06:42 +08:00
|
|
|
Value *LoopStrengthReduce::getCastedVersionOf(Instruction::CastOps opcode,
|
|
|
|
Value *V) {
|
2005-08-04 09:19:13 +08:00
|
|
|
if (V->getType() == UIntPtrTy) return V;
|
|
|
|
if (Constant *CB = dyn_cast<Constant>(V))
|
2006-12-13 16:06:42 +08:00
|
|
|
return ConstantExpr::getCast(opcode, CB, UIntPtrTy);
|
2005-08-04 09:19:13 +08:00
|
|
|
|
|
|
|
Value *&New = CastedPointers[V];
|
|
|
|
if (New) return New;
|
|
|
|
|
2006-12-13 16:06:42 +08:00
|
|
|
New = SCEVExpander::InsertCastOfTo(opcode, V, UIntPtrTy);
|
2008-12-01 14:27:41 +08:00
|
|
|
DeadInsts.push_back(cast<Instruction>(New));
|
2005-08-05 03:08:16 +08:00
|
|
|
return New;
|
2005-08-04 09:19:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-10-19 05:08:22 +08:00
|
|
|
/// DeleteTriviallyDeadInstructions - If any of the instructions is the
|
|
|
|
/// specified set are trivially dead, delete them and see if this makes any of
|
|
|
|
/// their operands subsequently dead.
|
2008-12-01 14:14:28 +08:00
|
|
|
void LoopStrengthReduce::DeleteTriviallyDeadInstructions() {
|
2008-12-01 14:27:41 +08:00
|
|
|
if (DeadInsts.empty()) return;
|
|
|
|
|
|
|
|
// Sort the deadinsts list so that we can trivially eliminate duplicates as we
|
|
|
|
// go. The code below never adds a non-dead instruction to the worklist, but
|
|
|
|
// callers may not be so careful.
|
2008-12-01 14:49:59 +08:00
|
|
|
array_pod_sort(DeadInsts.begin(), DeadInsts.end());
|
2008-12-01 14:27:41 +08:00
|
|
|
|
|
|
|
// Drop duplicate instructions and those with uses.
|
|
|
|
for (unsigned i = 0, e = DeadInsts.size()-1; i < e; ++i) {
|
|
|
|
Instruction *I = DeadInsts[i];
|
|
|
|
if (!I->use_empty()) DeadInsts[i] = 0;
|
|
|
|
while (DeadInsts[i+1] == I && i != e)
|
|
|
|
DeadInsts[++i] = 0;
|
|
|
|
}
|
|
|
|
|
2008-12-01 14:14:28 +08:00
|
|
|
while (!DeadInsts.empty()) {
|
|
|
|
Instruction *I = DeadInsts.back();
|
|
|
|
DeadInsts.pop_back();
|
2008-12-01 14:27:41 +08:00
|
|
|
|
|
|
|
if (I == 0 || !isInstructionTriviallyDead(I))
|
2008-12-01 14:11:32 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
SE->deleteValueFromRecords(I);
|
2008-11-29 11:43:04 +08:00
|
|
|
|
2008-12-01 14:27:41 +08:00
|
|
|
for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
|
|
|
|
if (Instruction *U = dyn_cast<Instruction>(*OI)) {
|
|
|
|
*OI = 0;
|
2008-12-01 14:11:32 +08:00
|
|
|
if (U->use_empty())
|
2008-12-01 14:27:41 +08:00
|
|
|
DeadInsts.push_back(U);
|
2008-12-01 14:11:32 +08:00
|
|
|
}
|
2004-10-19 05:08:22 +08:00
|
|
|
}
|
2008-12-01 14:11:32 +08:00
|
|
|
|
|
|
|
I->eraseFromParent();
|
|
|
|
Changed = true;
|
2004-10-19 05:08:22 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-07-30 08:15:07 +08:00
|
|
|
|
2005-08-05 01:40:30 +08:00
|
|
|
/// GetExpressionSCEV - Compute and return the SCEV for the specified
|
|
|
|
/// instruction.
|
2007-10-30 03:31:25 +08:00
|
|
|
SCEVHandle LoopStrengthReduce::GetExpressionSCEV(Instruction *Exp) {
|
2007-03-26 11:01:27 +08:00
|
|
|
// Pointer to pointer bitcast instructions return the same value as their
|
|
|
|
// operand.
|
|
|
|
if (BitCastInst *BCI = dyn_cast<BitCastInst>(Exp)) {
|
|
|
|
if (SE->hasSCEV(BCI) || !isa<Instruction>(BCI->getOperand(0)))
|
|
|
|
return SE->getSCEV(BCI);
|
2007-10-30 03:31:25 +08:00
|
|
|
SCEVHandle R = GetExpressionSCEV(cast<Instruction>(BCI->getOperand(0)));
|
2007-03-26 11:01:27 +08:00
|
|
|
SE->setSCEV(BCI, R);
|
|
|
|
return R;
|
|
|
|
}
|
|
|
|
|
Fix some 80 column violations.
Once we compute the evolution for a GEP, tell SE about it. This allows users
of the GEP to know it, if the users are not direct. This allows us to compile
this testcase:
void fbSolidFillmmx(int w, unsigned char *d) {
while (w >= 64) {
*(unsigned long long *) (d + 0) = 0;
*(unsigned long long *) (d + 8) = 0;
*(unsigned long long *) (d + 16) = 0;
*(unsigned long long *) (d + 24) = 0;
*(unsigned long long *) (d + 32) = 0;
*(unsigned long long *) (d + 40) = 0;
*(unsigned long long *) (d + 48) = 0;
*(unsigned long long *) (d + 56) = 0;
w -= 64;
d += 64;
}
}
into:
.LBB_fbSolidFillmmx_2: ; no_exit
li r2, 0
stw r2, 0(r4)
stw r2, 4(r4)
stw r2, 8(r4)
stw r2, 12(r4)
stw r2, 16(r4)
stw r2, 20(r4)
stw r2, 24(r4)
stw r2, 28(r4)
stw r2, 32(r4)
stw r2, 36(r4)
stw r2, 40(r4)
stw r2, 44(r4)
stw r2, 48(r4)
stw r2, 52(r4)
stw r2, 56(r4)
stw r2, 60(r4)
addi r4, r4, 64
addi r3, r3, -64
cmpwi cr0, r3, 63
bgt .LBB_fbSolidFillmmx_2 ; no_exit
instead of:
.LBB_fbSolidFillmmx_2: ; no_exit
li r11, 0
stw r11, 0(r4)
stw r11, 4(r4)
stwx r11, r10, r4
add r12, r10, r4
stw r11, 4(r12)
stwx r11, r9, r4
add r12, r9, r4
stw r11, 4(r12)
stwx r11, r8, r4
add r12, r8, r4
stw r11, 4(r12)
stwx r11, r7, r4
add r12, r7, r4
stw r11, 4(r12)
stwx r11, r6, r4
add r12, r6, r4
stw r11, 4(r12)
stwx r11, r5, r4
add r12, r5, r4
stw r11, 4(r12)
stwx r11, r2, r4
add r12, r2, r4
stw r11, 4(r12)
addi r4, r4, 64
addi r3, r3, -64
cmpwi cr0, r3, 63
bgt .LBB_fbSolidFillmmx_2 ; no_exit
llvm-svn: 22737
2005-08-10 07:39:36 +08:00
|
|
|
// Scalar Evolutions doesn't know how to compute SCEV's for GEP instructions.
|
|
|
|
// If this is a GEP that SE doesn't know about, compute it now and insert it.
|
|
|
|
// If this is not a GEP, or if we have already done this computation, just let
|
|
|
|
// SE figure it out.
|
2005-08-05 01:40:30 +08:00
|
|
|
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Exp);
|
Fix some 80 column violations.
Once we compute the evolution for a GEP, tell SE about it. This allows users
of the GEP to know it, if the users are not direct. This allows us to compile
this testcase:
void fbSolidFillmmx(int w, unsigned char *d) {
while (w >= 64) {
*(unsigned long long *) (d + 0) = 0;
*(unsigned long long *) (d + 8) = 0;
*(unsigned long long *) (d + 16) = 0;
*(unsigned long long *) (d + 24) = 0;
*(unsigned long long *) (d + 32) = 0;
*(unsigned long long *) (d + 40) = 0;
*(unsigned long long *) (d + 48) = 0;
*(unsigned long long *) (d + 56) = 0;
w -= 64;
d += 64;
}
}
into:
.LBB_fbSolidFillmmx_2: ; no_exit
li r2, 0
stw r2, 0(r4)
stw r2, 4(r4)
stw r2, 8(r4)
stw r2, 12(r4)
stw r2, 16(r4)
stw r2, 20(r4)
stw r2, 24(r4)
stw r2, 28(r4)
stw r2, 32(r4)
stw r2, 36(r4)
stw r2, 40(r4)
stw r2, 44(r4)
stw r2, 48(r4)
stw r2, 52(r4)
stw r2, 56(r4)
stw r2, 60(r4)
addi r4, r4, 64
addi r3, r3, -64
cmpwi cr0, r3, 63
bgt .LBB_fbSolidFillmmx_2 ; no_exit
instead of:
.LBB_fbSolidFillmmx_2: ; no_exit
li r11, 0
stw r11, 0(r4)
stw r11, 4(r4)
stwx r11, r10, r4
add r12, r10, r4
stw r11, 4(r12)
stwx r11, r9, r4
add r12, r9, r4
stw r11, 4(r12)
stwx r11, r8, r4
add r12, r8, r4
stw r11, 4(r12)
stwx r11, r7, r4
add r12, r7, r4
stw r11, 4(r12)
stwx r11, r6, r4
add r12, r6, r4
stw r11, 4(r12)
stwx r11, r5, r4
add r12, r5, r4
stw r11, 4(r12)
stwx r11, r2, r4
add r12, r2, r4
stw r11, 4(r12)
addi r4, r4, 64
addi r3, r3, -64
cmpwi cr0, r3, 63
bgt .LBB_fbSolidFillmmx_2 ; no_exit
llvm-svn: 22737
2005-08-10 07:39:36 +08:00
|
|
|
if (!GEP || SE->hasSCEV(GEP))
|
2005-08-05 01:40:30 +08:00
|
|
|
return SE->getSCEV(Exp);
|
|
|
|
|
2005-07-30 08:15:07 +08:00
|
|
|
// Analyze all of the subscripts of this getelementptr instruction, looking
|
2007-10-30 03:31:25 +08:00
|
|
|
// for uses that are determined by the trip count of the loop. First, skip
|
|
|
|
// all operands the are not dependent on the IV.
|
2005-07-30 08:15:07 +08:00
|
|
|
|
|
|
|
// Build up the base expression. Insert an LLVM cast of the pointer to
|
|
|
|
// uintptr_t first.
|
2007-10-23 02:31:58 +08:00
|
|
|
SCEVHandle GEPVal = SE->getUnknown(
|
2006-12-13 16:06:42 +08:00
|
|
|
getCastedVersionOf(Instruction::PtrToInt, GEP->getOperand(0)));
|
2005-07-30 08:15:07 +08:00
|
|
|
|
|
|
|
gep_type_iterator GTI = gep_type_begin(GEP);
|
2005-08-05 01:40:30 +08:00
|
|
|
|
2008-06-12 05:38:51 +08:00
|
|
|
for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
|
|
|
|
i != e; ++i, ++GTI) {
|
2005-07-30 08:15:07 +08:00
|
|
|
// If this is a use of a recurrence that we can analyze, and it comes before
|
|
|
|
// Op does in the GEP operand list, we will handle this when we process this
|
|
|
|
// operand.
|
|
|
|
if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
|
|
|
|
const StructLayout *SL = TD->getStructLayout(STy);
|
2008-06-12 05:38:51 +08:00
|
|
|
unsigned Idx = cast<ConstantInt>(*i)->getZExtValue();
|
2007-02-11 03:55:17 +08:00
|
|
|
uint64_t Offset = SL->getElementOffset(Idx);
|
2007-10-23 02:31:58 +08:00
|
|
|
GEPVal = SE->getAddExpr(GEPVal,
|
|
|
|
SE->getIntegerSCEV(Offset, UIntPtrTy));
|
2005-03-07 06:52:29 +08:00
|
|
|
} else {
|
2006-12-13 16:06:42 +08:00
|
|
|
unsigned GEPOpiBits =
|
2008-06-12 05:38:51 +08:00
|
|
|
(*i)->getType()->getPrimitiveSizeInBits();
|
2006-12-13 16:06:42 +08:00
|
|
|
unsigned IntPtrBits = UIntPtrTy->getPrimitiveSizeInBits();
|
|
|
|
Instruction::CastOps opcode = (GEPOpiBits < IntPtrBits ?
|
|
|
|
Instruction::SExt : (GEPOpiBits > IntPtrBits ? Instruction::Trunc :
|
|
|
|
Instruction::BitCast));
|
2008-06-12 05:38:51 +08:00
|
|
|
Value *OpVal = getCastedVersionOf(opcode, *i);
|
2005-08-05 03:08:16 +08:00
|
|
|
SCEVHandle Idx = SE->getSCEV(OpVal);
|
|
|
|
|
2007-10-02 07:08:35 +08:00
|
|
|
uint64_t TypeSize = TD->getABITypeSize(GTI.getIndexedType());
|
2005-08-05 01:40:30 +08:00
|
|
|
if (TypeSize != 1)
|
2007-10-23 02:31:58 +08:00
|
|
|
Idx = SE->getMulExpr(Idx,
|
|
|
|
SE->getConstant(ConstantInt::get(UIntPtrTy,
|
|
|
|
TypeSize)));
|
|
|
|
GEPVal = SE->getAddExpr(GEPVal, Idx);
|
2005-03-07 06:52:29 +08:00
|
|
|
}
|
2004-10-19 05:08:22 +08:00
|
|
|
}
|
2005-07-30 08:15:07 +08:00
|
|
|
|
Fix some 80 column violations.
Once we compute the evolution for a GEP, tell SE about it. This allows users
of the GEP to know it, if the users are not direct. This allows us to compile
this testcase:
void fbSolidFillmmx(int w, unsigned char *d) {
while (w >= 64) {
*(unsigned long long *) (d + 0) = 0;
*(unsigned long long *) (d + 8) = 0;
*(unsigned long long *) (d + 16) = 0;
*(unsigned long long *) (d + 24) = 0;
*(unsigned long long *) (d + 32) = 0;
*(unsigned long long *) (d + 40) = 0;
*(unsigned long long *) (d + 48) = 0;
*(unsigned long long *) (d + 56) = 0;
w -= 64;
d += 64;
}
}
into:
.LBB_fbSolidFillmmx_2: ; no_exit
li r2, 0
stw r2, 0(r4)
stw r2, 4(r4)
stw r2, 8(r4)
stw r2, 12(r4)
stw r2, 16(r4)
stw r2, 20(r4)
stw r2, 24(r4)
stw r2, 28(r4)
stw r2, 32(r4)
stw r2, 36(r4)
stw r2, 40(r4)
stw r2, 44(r4)
stw r2, 48(r4)
stw r2, 52(r4)
stw r2, 56(r4)
stw r2, 60(r4)
addi r4, r4, 64
addi r3, r3, -64
cmpwi cr0, r3, 63
bgt .LBB_fbSolidFillmmx_2 ; no_exit
instead of:
.LBB_fbSolidFillmmx_2: ; no_exit
li r11, 0
stw r11, 0(r4)
stw r11, 4(r4)
stwx r11, r10, r4
add r12, r10, r4
stw r11, 4(r12)
stwx r11, r9, r4
add r12, r9, r4
stw r11, 4(r12)
stwx r11, r8, r4
add r12, r8, r4
stw r11, 4(r12)
stwx r11, r7, r4
add r12, r7, r4
stw r11, 4(r12)
stwx r11, r6, r4
add r12, r6, r4
stw r11, 4(r12)
stwx r11, r5, r4
add r12, r5, r4
stw r11, 4(r12)
stwx r11, r2, r4
add r12, r2, r4
stw r11, 4(r12)
addi r4, r4, 64
addi r3, r3, -64
cmpwi cr0, r3, 63
bgt .LBB_fbSolidFillmmx_2 ; no_exit
llvm-svn: 22737
2005-08-10 07:39:36 +08:00
|
|
|
SE->setSCEV(GEP, GEPVal);
|
2005-08-05 01:40:30 +08:00
|
|
|
return GEPVal;
|
2005-07-30 08:15:07 +08:00
|
|
|
}
|
|
|
|
|
2005-08-05 03:08:16 +08:00
|
|
|
/// getSCEVStartAndStride - Compute the start and stride of this expression,
|
|
|
|
/// returning false if the expression is not a start/stride pair, or true if it
|
|
|
|
/// is. The stride must be a loop invariant expression, but the start may be
|
|
|
|
/// a mix of loop invariant and loop variant expressions.
|
|
|
|
static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L,
|
2007-10-23 02:31:58 +08:00
|
|
|
SCEVHandle &Start, SCEVHandle &Stride,
|
|
|
|
ScalarEvolution *SE) {
|
2005-08-05 03:08:16 +08:00
|
|
|
SCEVHandle TheAddRec = Start; // Initialize to zero.
|
|
|
|
|
|
|
|
// If the outer level is an AddExpr, the operands are all start values except
|
|
|
|
// for a nested AddRecExpr.
|
|
|
|
if (SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(SH)) {
|
|
|
|
for (unsigned i = 0, e = AE->getNumOperands(); i != e; ++i)
|
|
|
|
if (SCEVAddRecExpr *AddRec =
|
|
|
|
dyn_cast<SCEVAddRecExpr>(AE->getOperand(i))) {
|
|
|
|
if (AddRec->getLoop() == L)
|
2007-10-23 02:31:58 +08:00
|
|
|
TheAddRec = SE->getAddExpr(AddRec, TheAddRec);
|
2005-08-05 03:08:16 +08:00
|
|
|
else
|
|
|
|
return false; // Nested IV of some sort?
|
|
|
|
} else {
|
2007-10-23 02:31:58 +08:00
|
|
|
Start = SE->getAddExpr(Start, AE->getOperand(i));
|
2005-08-05 03:08:16 +08:00
|
|
|
}
|
|
|
|
|
2006-11-03 04:25:50 +08:00
|
|
|
} else if (isa<SCEVAddRecExpr>(SH)) {
|
2005-08-05 03:08:16 +08:00
|
|
|
TheAddRec = SH;
|
|
|
|
} else {
|
|
|
|
return false; // not analyzable.
|
|
|
|
}
|
|
|
|
|
|
|
|
SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(TheAddRec);
|
|
|
|
if (!AddRec || AddRec->getLoop() != L) return false;
|
|
|
|
|
|
|
|
// FIXME: Generalize to non-affine IV's.
|
|
|
|
if (!AddRec->isAffine()) return false;
|
|
|
|
|
2007-10-23 02:31:58 +08:00
|
|
|
Start = SE->getAddExpr(Start, AddRec->getOperand(0));
|
2005-08-05 03:08:16 +08:00
|
|
|
|
|
|
|
if (!isa<SCEVConstant>(AddRec->getOperand(1)))
|
2006-11-26 17:46:52 +08:00
|
|
|
DOUT << "[" << L->getHeader()->getName()
|
|
|
|
<< "] Variable stride: " << *AddRec << "\n";
|
Teach LSR to strength reduce IVs that have a loop-invariant but non-constant stride.
For code like this:
void foo(float *a, float *b, int n, int stride_a, int stride_b) {
int i;
for (i=0; i<n; i++)
a[i*stride_a] = b[i*stride_b];
}
we now emit:
.LBB_foo2_2: ; no_exit
lfs f0, 0(r4)
stfs f0, 0(r3)
addi r7, r7, 1
add r4, r2, r4
add r3, r6, r3
cmpw cr0, r7, r5
blt .LBB_foo2_2 ; no_exit
instead of:
.LBB_foo_2: ; no_exit
mullw r8, r2, r7 ;; multiply!
slwi r8, r8, 2
lfsx f0, r4, r8
mullw r8, r2, r6 ;; multiply!
slwi r8, r8, 2
stfsx f0, r3, r8
addi r2, r2, 1
cmpw cr0, r2, r5
blt .LBB_foo_2 ; no_exit
loops with variable strides occur pretty often. For example, in SPECFP2K
there are 317 variable strides in 177.mesa, 3 in 179.art, 14 in 188.ammp,
56 in 168.wupwise, 36 in 172.mgrid.
Now we can allow indvars to turn functions written like this:
void foo2(float *a, float *b, int n, int stride_a, int stride_b) {
int i, ai = 0, bi = 0;
for (i=0; i<n; i++)
{
a[ai] = b[bi];
ai += stride_a;
bi += stride_b;
}
}
into code like the above for better analysis. With this patch, they generate
identical code.
llvm-svn: 22740
2005-08-10 08:45:21 +08:00
|
|
|
|
|
|
|
Stride = AddRec->getOperand(1);
|
2005-08-05 03:08:16 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2005-10-03 09:04:44 +08:00
|
|
|
/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
|
|
|
|
/// and now we need to decide whether the user should use the preinc or post-inc
|
|
|
|
/// value. If this user should use the post-inc version of the IV, return true.
|
|
|
|
///
|
|
|
|
/// Choosing wrong here can break dominance properties (if we choose to use the
|
|
|
|
/// post-inc value when we cannot) or it can end up adding extra live-ranges to
|
|
|
|
/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we
|
|
|
|
/// should use the post-inc value).
|
|
|
|
static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV,
|
2007-10-31 07:45:15 +08:00
|
|
|
Loop *L, DominatorTree *DT, Pass *P,
|
2008-12-01 14:27:41 +08:00
|
|
|
SmallVectorImpl<Instruction*> &DeadInsts){
|
2005-10-03 09:04:44 +08:00
|
|
|
// If the user is in the loop, use the preinc value.
|
|
|
|
if (L->contains(User->getParent())) return false;
|
|
|
|
|
Make IVUseShouldUsePostIncValue more aggressive when the use is a PHI. In
particular, it should realize that phi's use their values in the pred block
not the phi block itself. This change turns our em3d loop from this:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r2, 0
b LBB_test_6 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
or r2, r6, r6
lwz r6, 0(r3)
cmpw cr0, r6, r5
beq cr0, LBB_test_6 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r2, 1
cmpw cr0, r6, r4
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; endif.loopexit.loopexit_crit_edge
addi r3, r2, 1
blr
LBB_test_6: ; loopexit
or r3, r2, r2
blr
into:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r2, 0
b LBB_test_5 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
lwz r2, 0(r3)
cmpw cr0, r2, r5
or r2, r6, r6
beq cr0, LBB_test_5 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r6, 1
cmpw cr0, r6, r4
or r2, r6, r6
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; loopexit
or r3, r2, r2
blr
Unfortunately, this is actually worse code, because the register coallescer
is getting confused somehow. If it were doing its job right, it could turn the
code into this:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r6, 0
b LBB_test_5 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
lwz r2, 0(r3)
cmpw cr0, r2, r5
beq cr0, LBB_test_5 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r6, 1
cmpw cr0, r6, r4
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; loopexit
or r3, r6, r6
blr
... which I'll work on next. :)
llvm-svn: 23604
2005-10-03 10:50:05 +08:00
|
|
|
BasicBlock *LatchBlock = L->getLoopLatch();
|
|
|
|
|
|
|
|
// Ok, the user is outside of the loop. If it is dominated by the latch
|
|
|
|
// block, use the post-inc value.
|
2007-06-08 05:42:15 +08:00
|
|
|
if (DT->dominates(LatchBlock, User->getParent()))
|
Make IVUseShouldUsePostIncValue more aggressive when the use is a PHI. In
particular, it should realize that phi's use their values in the pred block
not the phi block itself. This change turns our em3d loop from this:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r2, 0
b LBB_test_6 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
or r2, r6, r6
lwz r6, 0(r3)
cmpw cr0, r6, r5
beq cr0, LBB_test_6 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r2, 1
cmpw cr0, r6, r4
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; endif.loopexit.loopexit_crit_edge
addi r3, r2, 1
blr
LBB_test_6: ; loopexit
or r3, r2, r2
blr
into:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r2, 0
b LBB_test_5 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
lwz r2, 0(r3)
cmpw cr0, r2, r5
or r2, r6, r6
beq cr0, LBB_test_5 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r6, 1
cmpw cr0, r6, r4
or r2, r6, r6
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; loopexit
or r3, r2, r2
blr
Unfortunately, this is actually worse code, because the register coallescer
is getting confused somehow. If it were doing its job right, it could turn the
code into this:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r6, 0
b LBB_test_5 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
lwz r2, 0(r3)
cmpw cr0, r2, r5
beq cr0, LBB_test_5 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r6, 1
cmpw cr0, r6, r4
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; loopexit
or r3, r6, r6
blr
... which I'll work on next. :)
llvm-svn: 23604
2005-10-03 10:50:05 +08:00
|
|
|
return true;
|
|
|
|
|
|
|
|
// There is one case we have to be careful of: PHI nodes. These little guys
|
|
|
|
// can live in blocks that do not dominate the latch block, but (since their
|
|
|
|
// uses occur in the predecessor block, not the block the PHI lives in) should
|
|
|
|
// still use the post-inc value. Check for this case now.
|
|
|
|
PHINode *PN = dyn_cast<PHINode>(User);
|
|
|
|
if (!PN) return false; // not a phi, not dominated by latch block.
|
|
|
|
|
|
|
|
// Look at all of the uses of IV by the PHI node. If any use corresponds to
|
|
|
|
// a block that is not dominated by the latch block, give up and use the
|
|
|
|
// preincremented value.
|
|
|
|
unsigned NumUses = 0;
|
|
|
|
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
|
|
|
|
if (PN->getIncomingValue(i) == IV) {
|
|
|
|
++NumUses;
|
2007-06-08 05:42:15 +08:00
|
|
|
if (!DT->dominates(LatchBlock, PN->getIncomingBlock(i)))
|
Make IVUseShouldUsePostIncValue more aggressive when the use is a PHI. In
particular, it should realize that phi's use their values in the pred block
not the phi block itself. This change turns our em3d loop from this:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r2, 0
b LBB_test_6 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
or r2, r6, r6
lwz r6, 0(r3)
cmpw cr0, r6, r5
beq cr0, LBB_test_6 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r2, 1
cmpw cr0, r6, r4
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; endif.loopexit.loopexit_crit_edge
addi r3, r2, 1
blr
LBB_test_6: ; loopexit
or r3, r2, r2
blr
into:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r2, 0
b LBB_test_5 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
lwz r2, 0(r3)
cmpw cr0, r2, r5
or r2, r6, r6
beq cr0, LBB_test_5 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r6, 1
cmpw cr0, r6, r4
or r2, r6, r6
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; loopexit
or r3, r2, r2
blr
Unfortunately, this is actually worse code, because the register coallescer
is getting confused somehow. If it were doing its job right, it could turn the
code into this:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r6, 0
b LBB_test_5 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
lwz r2, 0(r3)
cmpw cr0, r2, r5
beq cr0, LBB_test_5 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r6, 1
cmpw cr0, r6, r4
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; loopexit
or r3, r6, r6
blr
... which I'll work on next. :)
llvm-svn: 23604
2005-10-03 10:50:05 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Okay, all uses of IV by PN are in predecessor blocks that really are
|
|
|
|
// dominated by the latch block. Split the critical edges and use the
|
|
|
|
// post-incremented value.
|
|
|
|
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
|
|
|
|
if (PN->getIncomingValue(i) == IV) {
|
2007-10-31 06:27:26 +08:00
|
|
|
SplitCriticalEdge(PN->getIncomingBlock(i), PN->getParent(), P, false);
|
2006-10-28 08:59:20 +08:00
|
|
|
// Splitting the critical edge can reduce the number of entries in this
|
|
|
|
// PHI.
|
|
|
|
e = PN->getNumIncomingValues();
|
Make IVUseShouldUsePostIncValue more aggressive when the use is a PHI. In
particular, it should realize that phi's use their values in the pred block
not the phi block itself. This change turns our em3d loop from this:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r2, 0
b LBB_test_6 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
or r2, r6, r6
lwz r6, 0(r3)
cmpw cr0, r6, r5
beq cr0, LBB_test_6 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r2, 1
cmpw cr0, r6, r4
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; endif.loopexit.loopexit_crit_edge
addi r3, r2, 1
blr
LBB_test_6: ; loopexit
or r3, r2, r2
blr
into:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r2, 0
b LBB_test_5 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
lwz r2, 0(r3)
cmpw cr0, r2, r5
or r2, r6, r6
beq cr0, LBB_test_5 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r6, 1
cmpw cr0, r6, r4
or r2, r6, r6
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; loopexit
or r3, r2, r2
blr
Unfortunately, this is actually worse code, because the register coallescer
is getting confused somehow. If it were doing its job right, it could turn the
code into this:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r6, 0
b LBB_test_5 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
lwz r2, 0(r3)
cmpw cr0, r2, r5
beq cr0, LBB_test_5 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r6, 1
cmpw cr0, r6, r4
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; loopexit
or r3, r6, r6
blr
... which I'll work on next. :)
llvm-svn: 23604
2005-10-03 10:50:05 +08:00
|
|
|
if (--NumUses == 0) break;
|
|
|
|
}
|
2007-10-31 07:45:15 +08:00
|
|
|
|
|
|
|
// PHI node might have become a constant value after SplitCriticalEdge.
|
2008-12-01 14:27:41 +08:00
|
|
|
DeadInsts.push_back(User);
|
Make IVUseShouldUsePostIncValue more aggressive when the use is a PHI. In
particular, it should realize that phi's use their values in the pred block
not the phi block itself. This change turns our em3d loop from this:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r2, 0
b LBB_test_6 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
or r2, r6, r6
lwz r6, 0(r3)
cmpw cr0, r6, r5
beq cr0, LBB_test_6 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r2, 1
cmpw cr0, r6, r4
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; endif.loopexit.loopexit_crit_edge
addi r3, r2, 1
blr
LBB_test_6: ; loopexit
or r3, r2, r2
blr
into:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r2, 0
b LBB_test_5 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
lwz r2, 0(r3)
cmpw cr0, r2, r5
or r2, r6, r6
beq cr0, LBB_test_5 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r6, 1
cmpw cr0, r6, r4
or r2, r6, r6
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; loopexit
or r3, r2, r2
blr
Unfortunately, this is actually worse code, because the register coallescer
is getting confused somehow. If it were doing its job right, it could turn the
code into this:
_test:
cmpwi cr0, r4, 0
bgt cr0, LBB_test_2 ; entry.no_exit_crit_edge
LBB_test_1: ; entry.loopexit_crit_edge
li r6, 0
b LBB_test_5 ; loopexit
LBB_test_2: ; entry.no_exit_crit_edge
li r6, 0
LBB_test_3: ; no_exit
lwz r2, 0(r3)
cmpw cr0, r2, r5
beq cr0, LBB_test_5 ; loopexit
LBB_test_4: ; endif
addi r3, r3, 4
addi r6, r6, 1
cmpw cr0, r6, r4
blt cr0, LBB_test_3 ; no_exit
LBB_test_5: ; loopexit
or r3, r6, r6
blr
... which I'll work on next. :)
llvm-svn: 23604
2005-10-03 10:50:05 +08:00
|
|
|
|
|
|
|
return true;
|
2005-10-03 09:04:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2005-07-30 08:15:07 +08:00
|
|
|
/// AddUsersIfInteresting - Inspect the specified instruction. If it is a
|
|
|
|
/// reducible SCEV, recursively add its users to the IVUsesByStride set and
|
|
|
|
/// return true. Otherwise, return false.
|
2005-08-05 01:40:30 +08:00
|
|
|
bool LoopStrengthReduce::AddUsersIfInteresting(Instruction *I, Loop *L,
|
2007-10-27 07:08:19 +08:00
|
|
|
SmallPtrSet<Instruction*,16> &Processed) {
|
2007-01-15 10:27:26 +08:00
|
|
|
if (!I->getType()->isInteger() && !isa<PointerType>(I->getType()))
|
2008-04-15 02:26:16 +08:00
|
|
|
return false; // Void and FP expressions cannot be reduced.
|
2007-10-27 07:08:19 +08:00
|
|
|
if (!Processed.insert(I))
|
2005-08-05 01:40:30 +08:00
|
|
|
return true; // Instruction already handled.
|
|
|
|
|
2005-08-05 03:08:16 +08:00
|
|
|
// Get the symbolic expression for this instruction.
|
2007-10-30 03:31:25 +08:00
|
|
|
SCEVHandle ISE = GetExpressionSCEV(I);
|
2005-08-05 03:08:16 +08:00
|
|
|
if (isa<SCEVCouldNotCompute>(ISE)) return false;
|
|
|
|
|
|
|
|
// Get the start and stride for this expression.
|
2007-10-23 02:31:58 +08:00
|
|
|
SCEVHandle Start = SE->getIntegerSCEV(0, ISE->getType());
|
Teach LSR to strength reduce IVs that have a loop-invariant but non-constant stride.
For code like this:
void foo(float *a, float *b, int n, int stride_a, int stride_b) {
int i;
for (i=0; i<n; i++)
a[i*stride_a] = b[i*stride_b];
}
we now emit:
.LBB_foo2_2: ; no_exit
lfs f0, 0(r4)
stfs f0, 0(r3)
addi r7, r7, 1
add r4, r2, r4
add r3, r6, r3
cmpw cr0, r7, r5
blt .LBB_foo2_2 ; no_exit
instead of:
.LBB_foo_2: ; no_exit
mullw r8, r2, r7 ;; multiply!
slwi r8, r8, 2
lfsx f0, r4, r8
mullw r8, r2, r6 ;; multiply!
slwi r8, r8, 2
stfsx f0, r3, r8
addi r2, r2, 1
cmpw cr0, r2, r5
blt .LBB_foo_2 ; no_exit
loops with variable strides occur pretty often. For example, in SPECFP2K
there are 317 variable strides in 177.mesa, 3 in 179.art, 14 in 188.ammp,
56 in 168.wupwise, 36 in 172.mgrid.
Now we can allow indvars to turn functions written like this:
void foo2(float *a, float *b, int n, int stride_a, int stride_b) {
int i, ai = 0, bi = 0;
for (i=0; i<n; i++)
{
a[ai] = b[bi];
ai += stride_a;
bi += stride_b;
}
}
into code like the above for better analysis. With this patch, they generate
identical code.
llvm-svn: 22740
2005-08-10 08:45:21 +08:00
|
|
|
SCEVHandle Stride = Start;
|
2007-10-23 02:31:58 +08:00
|
|
|
if (!getSCEVStartAndStride(ISE, L, Start, Stride, SE))
|
2005-08-05 03:08:16 +08:00
|
|
|
return false; // Non-reducible symbolic expression, bail out.
|
2007-03-10 05:19:53 +08:00
|
|
|
|
2007-04-24 06:42:03 +08:00
|
|
|
std::vector<Instruction *> IUsers;
|
|
|
|
// Collect all I uses now because IVUseShouldUsePostIncValue may
|
|
|
|
// invalidate use_iterator.
|
|
|
|
for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; ++UI)
|
|
|
|
IUsers.push_back(cast<Instruction>(*UI));
|
2005-07-30 08:15:07 +08:00
|
|
|
|
2007-04-24 06:42:03 +08:00
|
|
|
for (unsigned iused_index = 0, iused_size = IUsers.size();
|
|
|
|
iused_index != iused_size; ++iused_index) {
|
|
|
|
|
|
|
|
Instruction *User = IUsers[iused_index];
|
2007-03-10 05:19:53 +08:00
|
|
|
|
2005-07-30 08:15:07 +08:00
|
|
|
// Do not infinitely recurse on PHI nodes.
|
2005-09-13 10:09:55 +08:00
|
|
|
if (isa<PHINode>(User) && Processed.count(User))
|
2005-07-30 08:15:07 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// If this is an instruction defined in a nested loop, or outside this loop,
|
When processing outer loops and we find uses of an IV in inner loops, make
sure to handle the use, just don't recurse into it.
This permits us to generate this code for a simple nested loop case:
.LBB_foo_0: ; entry
stwu r1, -48(r1)
stw r29, 44(r1)
stw r30, 40(r1)
mflr r11
stw r11, 56(r1)
lis r2, ha16(L_A$non_lazy_ptr)
lwz r30, lo16(L_A$non_lazy_ptr)(r2)
li r29, 1
.LBB_foo_1: ; no_exit.0
bl L_bar$stub
li r2, 1
or r3, r30, r30
.LBB_foo_2: ; no_exit.1
lfd f0, 8(r3)
stfd f0, 0(r3)
addi r4, r2, 1
addi r3, r3, 8
cmpwi cr0, r2, 100
or r2, r4, r4
bne .LBB_foo_2 ; no_exit.1
.LBB_foo_3: ; loopexit.1
addi r30, r30, 800
addi r2, r29, 1
cmpwi cr0, r29, 100
or r29, r2, r2
bne .LBB_foo_1 ; no_exit.0
.LBB_foo_4: ; return
lwz r11, 56(r1)
mtlr r11
lwz r30, 40(r1)
lwz r29, 44(r1)
lwz r1, 0(r1)
blr
instead of this:
_foo:
.LBB_foo_0: ; entry
stwu r1, -48(r1)
stw r28, 44(r1) ;; uses an extra register.
stw r29, 40(r1)
stw r30, 36(r1)
mflr r11
stw r11, 56(r1)
li r30, 1
li r29, 0
or r28, r29, r29
.LBB_foo_1: ; no_exit.0
bl L_bar$stub
mulli r2, r28, 800 ;; unstrength-reduced multiply
lis r3, ha16(L_A$non_lazy_ptr) ;; loop invariant address computation
lwz r3, lo16(L_A$non_lazy_ptr)(r3)
add r2, r2, r3
mulli r4, r29, 800 ;; unstrength-reduced multiply
addi r3, r3, 8
add r3, r4, r3
li r4, 1
.LBB_foo_2: ; no_exit.1
lfd f0, 0(r3)
stfd f0, 0(r2)
addi r5, r4, 1
addi r2, r2, 8 ;; multiple stride 8 IV's
addi r3, r3, 8
cmpwi cr0, r4, 100
or r4, r5, r5
bne .LBB_foo_2 ; no_exit.1
.LBB_foo_3: ; loopexit.1
addi r28, r28, 1 ;;; Many IV's with stride 1
addi r29, r29, 1
addi r2, r30, 1
cmpwi cr0, r30, 100
or r30, r2, r2
bne .LBB_foo_1 ; no_exit.0
.LBB_foo_4: ; return
lwz r11, 56(r1)
mtlr r11
lwz r30, 36(r1)
lwz r29, 40(r1)
lwz r28, 44(r1)
lwz r1, 0(r1)
blr
llvm-svn: 22640
2005-08-04 08:14:11 +08:00
|
|
|
// don't recurse into it.
|
2005-08-05 03:08:16 +08:00
|
|
|
bool AddUserToIVUsers = false;
|
When processing outer loops and we find uses of an IV in inner loops, make
sure to handle the use, just don't recurse into it.
This permits us to generate this code for a simple nested loop case:
.LBB_foo_0: ; entry
stwu r1, -48(r1)
stw r29, 44(r1)
stw r30, 40(r1)
mflr r11
stw r11, 56(r1)
lis r2, ha16(L_A$non_lazy_ptr)
lwz r30, lo16(L_A$non_lazy_ptr)(r2)
li r29, 1
.LBB_foo_1: ; no_exit.0
bl L_bar$stub
li r2, 1
or r3, r30, r30
.LBB_foo_2: ; no_exit.1
lfd f0, 8(r3)
stfd f0, 0(r3)
addi r4, r2, 1
addi r3, r3, 8
cmpwi cr0, r2, 100
or r2, r4, r4
bne .LBB_foo_2 ; no_exit.1
.LBB_foo_3: ; loopexit.1
addi r30, r30, 800
addi r2, r29, 1
cmpwi cr0, r29, 100
or r29, r2, r2
bne .LBB_foo_1 ; no_exit.0
.LBB_foo_4: ; return
lwz r11, 56(r1)
mtlr r11
lwz r30, 40(r1)
lwz r29, 44(r1)
lwz r1, 0(r1)
blr
instead of this:
_foo:
.LBB_foo_0: ; entry
stwu r1, -48(r1)
stw r28, 44(r1) ;; uses an extra register.
stw r29, 40(r1)
stw r30, 36(r1)
mflr r11
stw r11, 56(r1)
li r30, 1
li r29, 0
or r28, r29, r29
.LBB_foo_1: ; no_exit.0
bl L_bar$stub
mulli r2, r28, 800 ;; unstrength-reduced multiply
lis r3, ha16(L_A$non_lazy_ptr) ;; loop invariant address computation
lwz r3, lo16(L_A$non_lazy_ptr)(r3)
add r2, r2, r3
mulli r4, r29, 800 ;; unstrength-reduced multiply
addi r3, r3, 8
add r3, r4, r3
li r4, 1
.LBB_foo_2: ; no_exit.1
lfd f0, 0(r3)
stfd f0, 0(r2)
addi r5, r4, 1
addi r2, r2, 8 ;; multiple stride 8 IV's
addi r3, r3, 8
cmpwi cr0, r4, 100
or r4, r5, r5
bne .LBB_foo_2 ; no_exit.1
.LBB_foo_3: ; loopexit.1
addi r28, r28, 1 ;;; Many IV's with stride 1
addi r29, r29, 1
addi r2, r30, 1
cmpwi cr0, r30, 100
or r30, r2, r2
bne .LBB_foo_1 ; no_exit.0
.LBB_foo_4: ; return
lwz r11, 56(r1)
mtlr r11
lwz r30, 36(r1)
lwz r29, 40(r1)
lwz r28, 44(r1)
lwz r1, 0(r1)
blr
llvm-svn: 22640
2005-08-04 08:14:11 +08:00
|
|
|
if (LI->getLoopFor(User->getParent()) != L) {
|
2006-11-26 17:46:52 +08:00
|
|
|
DOUT << "FOUND USER in other loop: " << *User
|
|
|
|
<< " OF SCEV: " << *ISE << "\n";
|
2005-08-05 03:08:16 +08:00
|
|
|
AddUserToIVUsers = true;
|
2005-08-05 01:40:30 +08:00
|
|
|
} else if (!AddUsersIfInteresting(User, L, Processed)) {
|
2006-11-26 17:46:52 +08:00
|
|
|
DOUT << "FOUND USER: " << *User
|
|
|
|
<< " OF SCEV: " << *ISE << "\n";
|
2005-08-05 03:08:16 +08:00
|
|
|
AddUserToIVUsers = true;
|
|
|
|
}
|
2005-04-22 07:48:37 +08:00
|
|
|
|
2005-08-05 03:08:16 +08:00
|
|
|
if (AddUserToIVUsers) {
|
2005-10-09 14:20:55 +08:00
|
|
|
IVUsersOfOneStride &StrideUses = IVUsesByStride[Stride];
|
|
|
|
if (StrideUses.Users.empty()) // First occurance of this stride?
|
|
|
|
StrideOrder.push_back(Stride);
|
|
|
|
|
2005-08-04 08:40:47 +08:00
|
|
|
// Okay, we found a user that we cannot reduce. Analyze the instruction
|
_test:
li r2, 0
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r2, 1
stw r2, 0(r4)
blr
[zion ~/llvm]$ cat > ~/xx
Uses of IV's outside of the loop should use hte post-incremented version
of the IV, not the preincremented version. This helps many loops (e.g. in sixtrack)
which used to generate code like this (this is the code from the
dont-hoist-simple-loop-constants.ll testcase):
_test:
li r2, 0 **** IV starts at 0
LBB_test_1: ; no_exit.2
or r5, r2, r2 **** Copy for loop exit
li r2, 0
stw r2, 0(r3)
addi r3, r3, 4
addi r2, r5, 1
addi r6, r5, 2 **** IV+2
cmpwi cr0, r6, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r5, 2 **** IV+2
stw r2, 0(r4)
blr
And now generated code like this:
_test:
li r2, 1 *** IV starts at 1
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701 *** IV.postinc + 0
blt cr0, LBB_test_1
LBB_test_2: ; loopexit.2.loopexit
stw r2, 0(r4) *** IV.postinc + 0
blr
llvm-svn: 23313
2005-09-12 14:04:47 +08:00
|
|
|
// and decide what to do with it. If we are a use inside of the loop, use
|
|
|
|
// the value before incrementation, otherwise use it after incrementation.
|
2007-10-31 07:45:15 +08:00
|
|
|
if (IVUseShouldUsePostIncValue(User, I, L, DT, this, DeadInsts)) {
|
_test:
li r2, 0
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r2, 1
stw r2, 0(r4)
blr
[zion ~/llvm]$ cat > ~/xx
Uses of IV's outside of the loop should use hte post-incremented version
of the IV, not the preincremented version. This helps many loops (e.g. in sixtrack)
which used to generate code like this (this is the code from the
dont-hoist-simple-loop-constants.ll testcase):
_test:
li r2, 0 **** IV starts at 0
LBB_test_1: ; no_exit.2
or r5, r2, r2 **** Copy for loop exit
li r2, 0
stw r2, 0(r3)
addi r3, r3, 4
addi r2, r5, 1
addi r6, r5, 2 **** IV+2
cmpwi cr0, r6, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r5, 2 **** IV+2
stw r2, 0(r4)
blr
And now generated code like this:
_test:
li r2, 1 *** IV starts at 1
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701 *** IV.postinc + 0
blt cr0, LBB_test_1
LBB_test_2: ; loopexit.2.loopexit
stw r2, 0(r4) *** IV.postinc + 0
blr
llvm-svn: 23313
2005-09-12 14:04:47 +08:00
|
|
|
// The value used will be incremented by the stride more than we are
|
|
|
|
// expecting, so subtract this off.
|
2007-10-23 02:31:58 +08:00
|
|
|
SCEVHandle NewStart = SE->getMinusSCEV(Start, Stride);
|
2005-10-09 14:20:55 +08:00
|
|
|
StrideUses.addUser(NewStart, User, I);
|
|
|
|
StrideUses.Users.back().isUseOfPostIncrementedValue = true;
|
2006-11-26 17:46:52 +08:00
|
|
|
DOUT << " USING POSTINC SCEV, START=" << *NewStart<< "\n";
|
2005-10-03 09:04:44 +08:00
|
|
|
} else {
|
2005-10-09 14:20:55 +08:00
|
|
|
StrideUses.addUser(Start, User, I);
|
_test:
li r2, 0
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r2, 1
stw r2, 0(r4)
blr
[zion ~/llvm]$ cat > ~/xx
Uses of IV's outside of the loop should use hte post-incremented version
of the IV, not the preincremented version. This helps many loops (e.g. in sixtrack)
which used to generate code like this (this is the code from the
dont-hoist-simple-loop-constants.ll testcase):
_test:
li r2, 0 **** IV starts at 0
LBB_test_1: ; no_exit.2
or r5, r2, r2 **** Copy for loop exit
li r2, 0
stw r2, 0(r3)
addi r3, r3, 4
addi r2, r5, 1
addi r6, r5, 2 **** IV+2
cmpwi cr0, r6, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r5, 2 **** IV+2
stw r2, 0(r4)
blr
And now generated code like this:
_test:
li r2, 1 *** IV starts at 1
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701 *** IV.postinc + 0
blt cr0, LBB_test_1
LBB_test_2: ; loopexit.2.loopexit
stw r2, 0(r4) *** IV.postinc + 0
blr
llvm-svn: 23313
2005-09-12 14:04:47 +08:00
|
|
|
}
|
2005-07-30 08:15:07 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
/// BasedUser - For a particular base value, keep information about how we've
|
|
|
|
/// partitioned the expression so far.
|
|
|
|
struct BasedUser {
|
2007-10-23 02:31:58 +08:00
|
|
|
/// SE - The current ScalarEvolution object.
|
|
|
|
ScalarEvolution *SE;
|
|
|
|
|
2005-08-09 06:56:21 +08:00
|
|
|
/// Base - The Base value for the PHI node that needs to be inserted for
|
|
|
|
/// this use. As the use is processed, information gets moved from this
|
|
|
|
/// field to the Imm field (below). BasedUser values are sorted by this
|
|
|
|
/// field.
|
|
|
|
SCEVHandle Base;
|
|
|
|
|
2005-07-30 08:15:07 +08:00
|
|
|
/// Inst - The instruction using the induction variable.
|
|
|
|
Instruction *Inst;
|
|
|
|
|
2005-08-04 06:21:05 +08:00
|
|
|
/// OperandValToReplace - The operand value of Inst to replace with the
|
|
|
|
/// EmittedBase.
|
|
|
|
Value *OperandValToReplace;
|
2005-07-30 08:15:07 +08:00
|
|
|
|
|
|
|
/// Imm - The immediate value that should be added to the base immediately
|
|
|
|
/// before Inst, because it will be folded into the imm field of the
|
|
|
|
/// instruction.
|
|
|
|
SCEVHandle Imm;
|
|
|
|
|
|
|
|
/// EmittedBase - The actual value* to use for the base value of this
|
|
|
|
/// operation. This is null if we should just use zero so far.
|
|
|
|
Value *EmittedBase;
|
|
|
|
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
// isUseOfPostIncrementedValue - True if this should use the
|
|
|
|
// post-incremented version of this IV, not the preincremented version.
|
|
|
|
// This can only be set in special cases, such as the terminating setcc
|
_test:
li r2, 0
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r2, 1
stw r2, 0(r4)
blr
[zion ~/llvm]$ cat > ~/xx
Uses of IV's outside of the loop should use the post-incremented version
of the IV, not the preincremented version. This helps many loops (e.g. in sixtrack)
which used to generate code like this (this is the code from the
dont-hoist-simple-loop-constants.ll testcase):
_test:
li r2, 0 **** IV starts at 0
LBB_test_1: ; no_exit.2
or r5, r2, r2 **** Copy for loop exit
li r2, 0
stw r2, 0(r3)
addi r3, r3, 4
addi r2, r5, 1
addi r6, r5, 2 **** IV+2
cmpwi cr0, r6, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r5, 2 **** IV+2
stw r2, 0(r4)
blr
And now generated code like this:
_test:
li r2, 1 *** IV starts at 1
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701 *** IV.postinc + 0
blt cr0, LBB_test_1
LBB_test_2: ; loopexit.2.loopexit
stw r2, 0(r4) *** IV.postinc + 0
blr
llvm-svn: 23313
2005-09-12 14:04:47 +08:00
|
|
|
// instruction for a loop and uses outside the loop that are dominated by
|
|
|
|
// the loop.
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
bool isUseOfPostIncrementedValue;
|
2005-08-09 06:56:21 +08:00
|
|
|
|
2007-10-23 02:31:58 +08:00
|
|
|
BasedUser(IVStrideUse &IVSU, ScalarEvolution *se)
|
|
|
|
: SE(se), Base(IVSU.Offset), Inst(IVSU.User),
|
2005-08-09 06:56:21 +08:00
|
|
|
OperandValToReplace(IVSU.OperandValToReplace),
|
2007-10-23 02:31:58 +08:00
|
|
|
Imm(SE->getIntegerSCEV(0, Base->getType())), EmittedBase(0),
|
2005-08-09 06:56:21 +08:00
|
|
|
isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue) {}
|
2005-07-30 08:15:07 +08:00
|
|
|
|
2005-08-05 04:03:32 +08:00
|
|
|
// Once we rewrite the code to insert the new IVs we want, update the
|
|
|
|
// operands of Inst to use the new expression 'NewBase', with 'Imm' added
|
|
|
|
// to it.
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
void RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
|
2008-05-16 07:26:57 +08:00
|
|
|
Instruction *InsertPt,
|
2007-10-31 07:45:15 +08:00
|
|
|
SCEVExpander &Rewriter, Loop *L, Pass *P,
|
2008-12-01 14:27:41 +08:00
|
|
|
SmallVectorImpl<Instruction*> &DeadInsts);
|
2006-02-04 15:36:50 +08:00
|
|
|
|
|
|
|
Value *InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
|
|
|
|
SCEVExpander &Rewriter,
|
|
|
|
Instruction *IP, Loop *L);
|
2005-07-30 08:15:07 +08:00
|
|
|
void dump() const;
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
void BasedUser::dump() const {
|
2006-12-07 09:30:32 +08:00
|
|
|
cerr << " Base=" << *Base;
|
|
|
|
cerr << " Imm=" << *Imm;
|
2005-07-30 08:15:07 +08:00
|
|
|
if (EmittedBase)
|
2006-12-07 09:30:32 +08:00
|
|
|
cerr << " EB=" << *EmittedBase;
|
2005-07-30 08:15:07 +08:00
|
|
|
|
2006-12-07 09:30:32 +08:00
|
|
|
cerr << " Inst: " << *Inst;
|
2005-07-30 08:15:07 +08:00
|
|
|
}
|
|
|
|
|
2006-02-04 15:36:50 +08:00
|
|
|
/// InsertCodeForBaseAtPosition - Expand NewBase (plus this use's Imm, when
/// Imm is non-zero) and return the resulting value.
///
/// NewBase is expanded at IP unless IP lies inside L, in which case the
/// expansion is hoisted into the preheader of the outermost enclosing loop
/// (walking out from IP's innermost loop) in which NewBase is still loop
/// invariant. The immediate, if any, is always added at IP so that it stays
/// next to the user.
Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
                                              SCEVExpander &Rewriter,
                                              Instruction *IP, Loop *L) {
  LoopInfo &LI = Rewriter.getLoopInfo();

  // Start from the innermost loop containing IP and, while NewBase does not
  // vary in that loop, move the base's insertion point out to the loop's
  // preheader and continue with the parent loop. This is only attempted when
  // IP is inside L.
  Instruction *BaseIP = IP;
  Loop *EnclosingLoop = LI.getLoopFor(IP->getParent());
  if (L->contains(IP->getParent())) {
    for (Loop *Cur = EnclosingLoop; Cur && NewBase->isLoopInvariant(Cur);
         Cur = Cur->getParentLoop())
      BaseIP = Cur->getLoopPreheader()->getTerminator();
  }

  // Materialize the base. With a zero immediate there is nothing to add.
  Value *BaseVal = Rewriter.expandCodeFor(NewBase, BaseIP);
  if (Imm->isZero())
    return BaseVal;

  // When the base and the immediate go into the same place, pick up the
  // expander's current insertion point: expanding the base may have reused an
  // existing value and moved it.
  if (IP == BaseIP)
    IP = Rewriter.getInsertionPoint();

  // Always emit the (non-zero) immediate add at the user's position.
  SCEVHandle BasePlusImm = SE->getAddExpr(SE->getUnknown(BaseVal), Imm);
  return Rewriter.expandCodeFor(BasePlusImm, IP);
}
|
|
|
|
|
|
|
|
|
2005-08-05 04:03:32 +08:00
|
|
|
// Once we rewrite the code to insert the new IVs we want, update the
|
|
|
|
// operands of Inst to use the new expression 'NewBase', with 'Imm' added
|
2008-05-16 07:26:57 +08:00
|
|
|
// to it. NewBasePt is the last instruction which contributes to the
|
|
|
|
// value of NewBase in the case that it's a different instruction from
|
|
|
|
// the PHI that NewBase is computed from, or null otherwise.
|
|
|
|
//
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
|
2008-05-16 07:26:57 +08:00
|
|
|
Instruction *NewBasePt,
|
2007-10-31 07:45:15 +08:00
|
|
|
SCEVExpander &Rewriter, Loop *L, Pass *P,
|
2008-12-01 14:27:41 +08:00
|
|
|
SmallVectorImpl<Instruction*> &DeadInsts){
|
2005-08-05 04:03:32 +08:00
|
|
|
if (!isa<PHINode>(Inst)) {
|
Now that codegen prepare isn't defeating me, I can finally fix what I set
out to do! :)
This fixes a problem where LSR would insert a bunch of code into each MBB
that uses a particular subexpression (e.g. IV+base+C). The problem is that
this code cannot be CSE'd back together if inserted into different blocks.
This patch changes LSR to attempt to insert a single copy of this code and
share it, allowing codegenprepare to duplicate the code if it can be sunk
into various addressing modes. On CodeGen/ARM/lsr-code-insertion.ll,
for example, this gives us code like:
add r8, r0, r5
str r6, [r8, #+4]
..
ble LBB1_4 @cond_next
LBB1_3: @cond_true
str r10, [r8, #+4]
LBB1_4: @cond_next
...
LBB1_5: @cond_true55
ldr r6, LCPI1_1
str r6, [r8, #+4]
instead of:
add r10, r0, r6
str r8, [r10, #+4]
...
ble LBB1_4 @cond_next
LBB1_3: @cond_true
add r8, r0, r6
str r10, [r8, #+4]
LBB1_4: @cond_next
...
LBB1_5: @cond_true55
add r8, r0, r6
ldr r10, LCPI1_1
str r10, [r8, #+4]
Besides being smaller and more efficient, this makes it immediately
obvious that it is profitable to predicate LBB1_3 now :)
llvm-svn: 35972
2007-04-14 04:42:26 +08:00
|
|
|
// By default, insert code at the user instruction.
|
|
|
|
BasicBlock::iterator InsertPt = Inst;
|
|
|
|
|
|
|
|
// However, if the Operand is itself an instruction, the (potentially
|
|
|
|
// complex) inserted code may be shared by many users. Because of this, we
|
|
|
|
// want to emit code for the computation of the operand right before its old
|
|
|
|
// computation. This is usually safe, because we obviously used to use the
|
|
|
|
// computation when it was computed in its current block. However, in some
|
|
|
|
// cases (e.g. use of a post-incremented induction variable) the NewBase
|
|
|
|
// value will be pinned to live somewhere after the original computation.
|
|
|
|
// In this case, we have to back off.
|
2008-12-02 12:52:26 +08:00
|
|
|
//
|
|
|
|
// If this is a use outside the loop (which means after, since it is based
|
|
|
|
// on a loop indvar) we use the post-incremented value, so that we don't
|
|
|
|
// artificially make the preinc value live out the bottom of the loop.
|
2008-12-02 06:00:01 +08:00
|
|
|
if (!isUseOfPostIncrementedValue && L->contains(Inst->getParent())) {
|
2008-05-20 11:01:48 +08:00
|
|
|
if (NewBasePt && isa<PHINode>(OperandValToReplace)) {
|
2008-05-16 07:26:57 +08:00
|
|
|
InsertPt = NewBasePt;
|
|
|
|
++InsertPt;
|
2008-06-12 05:38:51 +08:00
|
|
|
} else if (Instruction *OpInst
|
|
|
|
= dyn_cast<Instruction>(OperandValToReplace)) {
|
Now that codegen prepare isn't defeating me, I can finally fix what I set
out to do! :)
This fixes a problem where LSR would insert a bunch of code into each MBB
that uses a particular subexpression (e.g. IV+base+C). The problem is that
this code cannot be CSE'd back together if inserted into different blocks.
This patch changes LSR to attempt to insert a single copy of this code and
share it, allowing codegenprepare to duplicate the code if it can be sunk
into various addressing modes. On CodeGen/ARM/lsr-code-insertion.ll,
for example, this gives us code like:
add r8, r0, r5
str r6, [r8, #+4]
..
ble LBB1_4 @cond_next
LBB1_3: @cond_true
str r10, [r8, #+4]
LBB1_4: @cond_next
...
LBB1_5: @cond_true55
ldr r6, LCPI1_1
str r6, [r8, #+4]
instead of:
add r10, r0, r6
str r8, [r10, #+4]
...
ble LBB1_4 @cond_next
LBB1_3: @cond_true
add r8, r0, r6
str r10, [r8, #+4]
LBB1_4: @cond_next
...
LBB1_5: @cond_true55
add r8, r0, r6
ldr r10, LCPI1_1
str r10, [r8, #+4]
Besides being smaller and more efficient, this makes it immediately
obvious that it is profitable to predicate LBB1_3 now :)
llvm-svn: 35972
2007-04-14 04:42:26 +08:00
|
|
|
InsertPt = OpInst;
|
|
|
|
while (isa<PHINode>(InsertPt)) ++InsertPt;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Value *NewVal = InsertCodeForBaseAtPosition(NewBase, Rewriter, InsertPt, L);
|
2007-08-01 01:22:27 +08:00
|
|
|
// Adjust the type back to match the Inst. Note that we can't use InsertPt
|
|
|
|
// here because the SCEVExpander may have inserted the instructions after
|
|
|
|
// that point, in its efforts to avoid inserting redundant expressions.
|
2007-06-15 22:38:12 +08:00
|
|
|
if (isa<PointerType>(OperandValToReplace->getType())) {
|
2007-08-01 01:22:27 +08:00
|
|
|
NewVal = SCEVExpander::InsertCastOfTo(Instruction::IntToPtr,
|
|
|
|
NewVal,
|
|
|
|
OperandValToReplace->getType());
|
2007-06-15 22:38:12 +08:00
|
|
|
}
|
2005-08-05 04:03:32 +08:00
|
|
|
// Replace the use of the operand Value with the new value we just computed.
|
|
|
|
Inst->replaceUsesOfWith(OperandValToReplace, NewVal);
|
2007-05-12 06:40:34 +08:00
|
|
|
DOUT << " CHANGED: IMM =" << *Imm;
|
|
|
|
DOUT << " \tNEWBASE =" << *NewBase;
|
|
|
|
DOUT << " \tInst = " << *Inst;
|
2005-08-05 04:03:32 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// PHI nodes are more complex. We have to insert one copy of the NewBase+Imm
|
2005-08-10 08:35:32 +08:00
|
|
|
// expression into each operand block that uses it. Note that PHI nodes can
|
|
|
|
// have multiple entries for the same predecessor. We use a map to make sure
|
|
|
|
// that a PHI node only has a single Value* for each predecessor (which also
|
|
|
|
// prevents us from inserting duplicate code in some blocks).
|
2007-10-31 06:27:26 +08:00
|
|
|
DenseMap<BasicBlock*, Value*> InsertedCode;
|
2005-08-05 04:03:32 +08:00
|
|
|
PHINode *PN = cast<PHINode>(Inst);
|
|
|
|
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
|
|
|
|
if (PN->getIncomingValue(i) == OperandValToReplace) {
|
Fix a FIXME: if we are inserting code for a PHI argument, split the critical
edge so that the code is not always executed for both operands. This
prevents LSR from inserting code into loops whose exit blocks contain
PHI uses of IV expressions (which are outside of loops). On gzip, for
example, we turn this ugly code:
.LBB_test_1: ; loopentry
add r27, r3, r28
lhz r27, 3(r27)
add r26, r4, r28
lhz r26, 3(r26)
add r25, r30, r28 ;; Only live if exiting the loop
add r24, r29, r28 ;; Only live if exiting the loop
cmpw cr0, r27, r26
bne .LBB_test_5 ; loopexit
into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_test_9 ; loopexit
.LBB_test_2: ; shortcirc_next.0
...
blt .LBB_test_1
into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_t_3: ; shortcirc_next.0
.LBB_test_3: ; shortcirc_next.0
...
blt .LBB_test_1
Next step: get the block out of the loop so that the loop is all
fall-throughs again.
llvm-svn: 22766
2005-08-13 06:06:11 +08:00
|
|
|
// If this is a critical edge, split the edge so that we do not insert the
|
2005-09-13 10:09:55 +08:00
|
|
|
// code on all predecessor/successor paths. We do this unless this is the
|
|
|
|
// canonical backedge for this loop, as this can make some inserted code
|
|
|
|
// be in an illegal position.
|
2005-10-03 08:31:52 +08:00
|
|
|
BasicBlock *PHIPred = PN->getIncomingBlock(i);
|
|
|
|
if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 &&
|
|
|
|
(PN->getParent() != L->getHeader() || !L->contains(PHIPred))) {
|
|
|
|
|
2005-08-17 14:35:16 +08:00
|
|
|
// First step, split the critical edge.
|
2007-10-31 06:27:26 +08:00
|
|
|
SplitCriticalEdge(PHIPred, PN->getParent(), P, false);
|
When splitting critical edges, make sure not to leave the new block in the
middle of the loop. This turns a critical loop in gzip into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
bne .LBB_test_8 ; loopentry.loopexit_crit_edge
.LBB_test_2: ; shortcirc_next.0
add r28, r3, r27
lhz r28, 5(r28)
add r26, r4, r27
lhz r26, 5(r26)
cmpw cr0, r28, r26
bne .LBB_test_7 ; shortcirc_next.0.loopexit_crit_edge
.LBB_test_3: ; shortcirc_next.1
add r28, r3, r27
lhz r28, 7(r28)
add r26, r4, r27
lhz r26, 7(r26)
cmpw cr0, r28, r26
bne .LBB_test_6 ; shortcirc_next.1.loopexit_crit_edge
.LBB_test_4: ; shortcirc_next.2
add r28, r3, r27
lhz r26, 9(r28)
add r28, r4, r27
lhz r25, 9(r28)
addi r28, r27, 8
cmpw cr7, r26, r25
mfcr r26, 1
rlwinm r26, r26, 31, 31, 31
add r25, r8, r27
cmpw cr7, r25, r7
mfcr r25, 1
rlwinm r25, r25, 29, 31, 31
and. r26, r26, r25
bne .LBB_test_1 ; loopentry
instead of this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_test_9 ; loopexit
.LBB_test_3: ; shortcirc_next.0
add r28, r3, r27
lhz r28, 5(r28)
add r26, r4, r27
lhz r26, 5(r26)
cmpw cr0, r28, r26
beq .LBB_test_5 ; shortcirc_next.1
.LBB_test_4: ; shortcirc_next.0.loopexit_crit_edge
add r2, r11, r27
add r8, r12, r27
b .LBB_test_9 ; loopexit
.LBB_test_5: ; shortcirc_next.1
add r28, r3, r27
lhz r28, 7(r28)
add r26, r4, r27
lhz r26, 7(r26)
cmpw cr0, r28, r26
beq .LBB_test_7 ; shortcirc_next.2
.LBB_test_6: ; shortcirc_next.1.loopexit_crit_edge
add r2, r9, r27
add r8, r10, r27
b .LBB_test_9 ; loopexit
.LBB_test_7: ; shortcirc_next.2
add r28, r3, r27
lhz r26, 9(r28)
add r28, r4, r27
lhz r25, 9(r28)
addi r28, r27, 8
cmpw cr7, r26, r25
mfcr r26, 1
rlwinm r26, r26, 31, 31, 31
add r25, r8, r27
cmpw cr7, r25, r7
mfcr r25, 1
rlwinm r25, r25, 29, 31, 31
and. r26, r26, r25
bne .LBB_test_1 ; loopentry
Next up, improve the code for the loop.
llvm-svn: 22769
2005-08-13 06:22:17 +08:00
|
|
|
|
2005-08-17 14:35:16 +08:00
|
|
|
// Next step: move the basic block. In particular, if the PHI node
|
|
|
|
// is outside of the loop, and PredTI is in the loop, we want to
|
|
|
|
// move the block to be immediately before the PHI block, not
|
|
|
|
// immediately after PredTI.
|
2005-10-03 08:31:52 +08:00
|
|
|
if (L->contains(PHIPred) && !L->contains(PN->getParent())) {
|
2005-08-17 14:35:16 +08:00
|
|
|
BasicBlock *NewBB = PN->getIncomingBlock(i);
|
|
|
|
NewBB->moveBefore(PN->getParent());
|
Fix a FIXME: if we are inserting code for a PHI argument, split the critical
edge so that the code is not always executed for both operands. This
prevents LSR from inserting code into loops whose exit blocks contain
PHI uses of IV expressions (which are outside of loops). On gzip, for
example, we turn this ugly code:
.LBB_test_1: ; loopentry
add r27, r3, r28
lhz r27, 3(r27)
add r26, r4, r28
lhz r26, 3(r26)
add r25, r30, r28 ;; Only live if exiting the loop
add r24, r29, r28 ;; Only live if exiting the loop
cmpw cr0, r27, r26
bne .LBB_test_5 ; loopexit
into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_test_9 ; loopexit
.LBB_test_2: ; shortcirc_next.0
...
blt .LBB_test_1
into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_t_3: ; shortcirc_next.0
.LBB_test_3: ; shortcirc_next.0
...
blt .LBB_test_1
Next step: get the block out of the loop so that the loop is all
fall-throughs again.
llvm-svn: 22766
2005-08-13 06:06:11 +08:00
|
|
|
}
|
2006-10-28 08:59:20 +08:00
|
|
|
|
|
|
|
// Splitting the edge can reduce the number of PHI entries we have.
|
|
|
|
e = PN->getNumIncomingValues();
|
Fix a FIXME: if we are inserting code for a PHI argument, split the critical
edge so that the code is not always executed for both operands. This
prevents LSR from inserting code into loops whose exit blocks contain
PHI uses of IV expressions (which are outside of loops). On gzip, for
example, we turn this ugly code:
.LBB_test_1: ; loopentry
add r27, r3, r28
lhz r27, 3(r27)
add r26, r4, r28
lhz r26, 3(r26)
add r25, r30, r28 ;; Only live if exiting the loop
add r24, r29, r28 ;; Only live if exiting the loop
cmpw cr0, r27, r26
bne .LBB_test_5 ; loopexit
into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_test_9 ; loopexit
.LBB_test_2: ; shortcirc_next.0
...
blt .LBB_test_1
into this:
.LBB_test_1: ; loopentry
or r27, r28, r28
add r28, r3, r27
lhz r28, 3(r28)
add r26, r4, r27
lhz r26, 3(r26)
cmpw cr0, r28, r26
beq .LBB_test_3 ; shortcirc_next.0
.LBB_test_2: ; loopentry.loopexit_crit_edge
add r2, r30, r27
add r8, r29, r27
b .LBB_t_3: ; shortcirc_next.0
.LBB_test_3: ; shortcirc_next.0
...
blt .LBB_test_1
Next step: get the block out of the loop so that the loop is all
fall-throughs again.
llvm-svn: 22766
2005-08-13 06:06:11 +08:00
|
|
|
}
|
2005-08-05 04:03:32 +08:00
|
|
|
|
2005-08-10 08:35:32 +08:00
|
|
|
Value *&Code = InsertedCode[PN->getIncomingBlock(i)];
|
|
|
|
if (!Code) {
|
|
|
|
// Insert the code into the end of the predecessor block.
|
2006-02-04 15:36:50 +08:00
|
|
|
Instruction *InsertPt = PN->getIncomingBlock(i)->getTerminator();
|
|
|
|
Code = InsertCodeForBaseAtPosition(NewBase, Rewriter, InsertPt, L);
|
2007-06-15 22:38:12 +08:00
|
|
|
|
2007-08-03 00:53:43 +08:00
|
|
|
// Adjust the type back to match the PHI. Note that we can't use
|
|
|
|
// InsertPt here because the SCEVExpander may have inserted its
|
|
|
|
// instructions after that point, in its efforts to avoid inserting
|
|
|
|
// redundant expressions.
|
2007-06-15 22:38:12 +08:00
|
|
|
if (isa<PointerType>(PN->getType())) {
|
2007-08-01 01:22:27 +08:00
|
|
|
Code = SCEVExpander::InsertCastOfTo(Instruction::IntToPtr,
|
|
|
|
Code,
|
|
|
|
PN->getType());
|
2007-06-15 22:38:12 +08:00
|
|
|
}
|
2005-08-10 08:35:32 +08:00
|
|
|
}
|
2005-08-05 04:03:32 +08:00
|
|
|
|
|
|
|
// Replace this incoming value of the PHI with the code we just inserted.
|
2005-08-10 08:35:32 +08:00
|
|
|
PN->setIncomingValue(i, Code);
|
2005-08-05 04:03:32 +08:00
|
|
|
Rewriter.clear();
|
|
|
|
}
|
|
|
|
}
|
2007-10-31 07:45:15 +08:00
|
|
|
|
|
|
|
// PHI node might have become a constant value after SplitCriticalEdge.
|
2008-12-01 14:27:41 +08:00
|
|
|
DeadInsts.push_back(Inst);
|
2007-10-31 07:45:15 +08:00
|
|
|
|
2006-11-26 17:46:52 +08:00
|
|
|
DOUT << " CHANGED: IMM =" << *Imm << " Inst = " << *Inst;
|
2005-08-05 04:03:32 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-07-30 08:15:07 +08:00
|
|
|
/// isTargetConstant - Return true if the following can be referenced by the
|
|
|
|
/// immediate field of a target instruction.
|
2007-03-14 04:34:37 +08:00
|
|
|
static bool isTargetConstant(const SCEVHandle &V, const Type *UseTy,
|
|
|
|
const TargetLowering *TLI) {
|
2005-08-08 14:25:50 +08:00
|
|
|
if (SCEVConstant *SC = dyn_cast<SCEVConstant>(V)) {
|
2007-03-13 07:27:37 +08:00
|
|
|
int64_t VC = SC->getValue()->getSExtValue();
|
2007-04-10 06:20:14 +08:00
|
|
|
if (TLI) {
|
|
|
|
TargetLowering::AddrMode AM;
|
|
|
|
AM.BaseOffs = VC;
|
|
|
|
return TLI->isLegalAddressingMode(AM, UseTy);
|
|
|
|
} else {
|
2006-03-14 07:14:23 +08:00
|
|
|
// Defaults to PPC. PPC allows a sign-extended 16-bit immediate field.
|
2007-03-13 07:27:37 +08:00
|
|
|
return (VC > -(1 << 16) && VC < (1 << 16)-1);
|
2007-04-10 06:20:14 +08:00
|
|
|
}
|
2005-08-08 14:25:50 +08:00
|
|
|
}
|
2005-07-31 02:33:25 +08:00
|
|
|
|
2005-07-30 08:15:07 +08:00
|
|
|
if (SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V))
|
|
|
|
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(SU->getValue()))
|
2007-04-10 06:20:14 +08:00
|
|
|
if (TLI && CE->getOpcode() == Instruction::PtrToInt) {
|
2006-03-14 07:14:23 +08:00
|
|
|
Constant *Op0 = CE->getOperand(0);
|
2007-04-10 06:20:14 +08:00
|
|
|
if (GlobalValue *GV = dyn_cast<GlobalValue>(Op0)) {
|
|
|
|
TargetLowering::AddrMode AM;
|
|
|
|
AM.BaseGV = GV;
|
|
|
|
return TLI->isLegalAddressingMode(AM, UseTy);
|
|
|
|
}
|
2006-03-14 07:14:23 +08:00
|
|
|
}
|
2005-07-30 08:15:07 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2005-08-09 06:32:34 +08:00
|
|
|
/// MoveLoopVariantsToImediateField - Move any subexpressions from Val that are
|
|
|
|
/// loop varying to the Imm operand.
|
|
|
|
static void MoveLoopVariantsToImediateField(SCEVHandle &Val, SCEVHandle &Imm,
|
2007-10-23 02:31:58 +08:00
|
|
|
Loop *L, ScalarEvolution *SE) {
|
2005-08-09 06:32:34 +08:00
|
|
|
if (Val->isLoopInvariant(L)) return; // Nothing to do.
|
|
|
|
|
|
|
|
if (SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
|
|
|
|
std::vector<SCEVHandle> NewOps;
|
|
|
|
NewOps.reserve(SAE->getNumOperands());
|
|
|
|
|
|
|
|
for (unsigned i = 0; i != SAE->getNumOperands(); ++i)
|
|
|
|
if (!SAE->getOperand(i)->isLoopInvariant(L)) {
|
|
|
|
// If this is a loop-variant expression, it must stay in the immediate
|
|
|
|
// field of the expression.
|
2007-10-23 02:31:58 +08:00
|
|
|
Imm = SE->getAddExpr(Imm, SAE->getOperand(i));
|
2005-08-09 06:32:34 +08:00
|
|
|
} else {
|
|
|
|
NewOps.push_back(SAE->getOperand(i));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (NewOps.empty())
|
2007-10-23 02:31:58 +08:00
|
|
|
Val = SE->getIntegerSCEV(0, Val->getType());
|
2005-08-09 06:32:34 +08:00
|
|
|
else
|
2007-10-23 02:31:58 +08:00
|
|
|
Val = SE->getAddExpr(NewOps);
|
2005-08-09 06:32:34 +08:00
|
|
|
} else if (SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
|
|
|
|
// Try to pull immediates out of the start value of nested addrec's.
|
|
|
|
SCEVHandle Start = SARE->getStart();
|
2007-10-23 02:31:58 +08:00
|
|
|
MoveLoopVariantsToImediateField(Start, Imm, L, SE);
|
2005-08-09 06:32:34 +08:00
|
|
|
|
|
|
|
std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
|
|
|
|
Ops[0] = Start;
|
2007-10-23 02:31:58 +08:00
|
|
|
Val = SE->getAddRecExpr(Ops, SARE->getLoop());
|
2005-08-09 06:32:34 +08:00
|
|
|
} else {
|
|
|
|
// Otherwise, all of Val is variant, move the whole thing over.
|
2007-10-23 02:31:58 +08:00
|
|
|
Imm = SE->getAddExpr(Imm, Val);
|
|
|
|
Val = SE->getIntegerSCEV(0, Val->getType());
|
2005-08-09 06:32:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-08-05 06:34:05 +08:00
|
|
|
/// MoveImmediateValues - Look at Val, and pull out any additions of constants
|
2005-07-30 08:15:07 +08:00
|
|
|
/// that can fit into the immediate field of instructions in the target.
|
2005-08-05 06:34:05 +08:00
|
|
|
/// Accumulate these immediate values into the Imm value.
|
2006-03-14 07:14:23 +08:00
|
|
|
static void MoveImmediateValues(const TargetLowering *TLI,
|
2007-03-14 04:34:37 +08:00
|
|
|
Instruction *User,
|
2006-03-14 07:14:23 +08:00
|
|
|
SCEVHandle &Val, SCEVHandle &Imm,
|
2007-10-23 02:31:58 +08:00
|
|
|
bool isAddress, Loop *L,
|
|
|
|
ScalarEvolution *SE) {
|
2007-03-14 04:34:37 +08:00
|
|
|
const Type *UseTy = User->getType();
|
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(User))
|
|
|
|
UseTy = SI->getOperand(0)->getType();
|
|
|
|
|
Teach loop-reduce to see into nested loops, to pull out immediate values
pushed down by SCEV.
In a nested loop case, this allows us to emit this:
lis r3, ha16(L_A$non_lazy_ptr)
lwz r3, lo16(L_A$non_lazy_ptr)(r3)
add r2, r2, r3
li r3, 1
.LBB_foo_2: ; no_exit.1
lfd f0, 8(r2) ;; Uses offset of 8 instead of 0
stfd f0, 0(r2)
addi r4, r3, 1
addi r2, r2, 8
cmpwi cr0, r3, 100
or r3, r4, r4
bne .LBB_foo_2 ; no_exit.1
instead of this:
lis r3, ha16(L_A$non_lazy_ptr)
lwz r3, lo16(L_A$non_lazy_ptr)(r3)
add r2, r2, r3
addi r3, r3, 8
li r4, 1
.LBB_foo_2: ; no_exit.1
lfd f0, 0(r3)
stfd f0, 0(r2)
addi r5, r4, 1
addi r2, r2, 8
addi r3, r3, 8
cmpwi cr0, r4, 100
or r4, r5, r5
bne .LBB_foo_2 ; no_exit.1
llvm-svn: 22639
2005-08-04 07:44:42 +08:00
|
|
|
if (SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
|
2005-08-05 06:34:05 +08:00
|
|
|
std::vector<SCEVHandle> NewOps;
|
|
|
|
NewOps.reserve(SAE->getNumOperands());
|
|
|
|
|
2006-02-04 15:36:50 +08:00
|
|
|
for (unsigned i = 0; i != SAE->getNumOperands(); ++i) {
|
|
|
|
SCEVHandle NewOp = SAE->getOperand(i);
|
2007-10-23 02:31:58 +08:00
|
|
|
MoveImmediateValues(TLI, User, NewOp, Imm, isAddress, L, SE);
|
2006-02-04 15:36:50 +08:00
|
|
|
|
|
|
|
if (!NewOp->isLoopInvariant(L)) {
|
2005-08-05 03:08:16 +08:00
|
|
|
// If this is a loop-variant expression, it must stay in the immediate
|
|
|
|
// field of the expression.
|
2007-10-23 02:31:58 +08:00
|
|
|
Imm = SE->getAddExpr(Imm, NewOp);
|
2005-08-05 06:34:05 +08:00
|
|
|
} else {
|
2006-02-04 15:36:50 +08:00
|
|
|
NewOps.push_back(NewOp);
|
2005-07-30 08:15:07 +08:00
|
|
|
}
|
2006-02-04 15:36:50 +08:00
|
|
|
}
|
2005-08-05 06:34:05 +08:00
|
|
|
|
|
|
|
if (NewOps.empty())
|
2007-10-23 02:31:58 +08:00
|
|
|
Val = SE->getIntegerSCEV(0, Val->getType());
|
2005-08-05 06:34:05 +08:00
|
|
|
else
|
2007-10-23 02:31:58 +08:00
|
|
|
Val = SE->getAddExpr(NewOps);
|
2005-08-05 06:34:05 +08:00
|
|
|
return;
|
Teach loop-reduce to see into nested loops, to pull out immediate values
pushed down by SCEV.
In a nested loop case, this allows us to emit this:
lis r3, ha16(L_A$non_lazy_ptr)
lwz r3, lo16(L_A$non_lazy_ptr)(r3)
add r2, r2, r3
li r3, 1
.LBB_foo_2: ; no_exit.1
lfd f0, 8(r2) ;; Uses offset of 8 instead of 0
stfd f0, 0(r2)
addi r4, r3, 1
addi r2, r2, 8
cmpwi cr0, r3, 100
or r3, r4, r4
bne .LBB_foo_2 ; no_exit.1
instead of this:
lis r3, ha16(L_A$non_lazy_ptr)
lwz r3, lo16(L_A$non_lazy_ptr)(r3)
add r2, r2, r3
addi r3, r3, 8
li r4, 1
.LBB_foo_2: ; no_exit.1
lfd f0, 0(r3)
stfd f0, 0(r2)
addi r5, r4, 1
addi r2, r2, 8
addi r3, r3, 8
cmpwi cr0, r4, 100
or r4, r5, r5
bne .LBB_foo_2 ; no_exit.1
llvm-svn: 22639
2005-08-04 07:44:42 +08:00
|
|
|
} else if (SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
|
|
|
|
// Try to pull immediates out of the start value of nested addrec's.
|
2005-08-05 06:34:05 +08:00
|
|
|
SCEVHandle Start = SARE->getStart();
|
2007-10-23 02:31:58 +08:00
|
|
|
MoveImmediateValues(TLI, User, Start, Imm, isAddress, L, SE);
|
2005-08-05 06:34:05 +08:00
|
|
|
|
|
|
|
if (Start != SARE->getStart()) {
|
|
|
|
std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
|
|
|
|
Ops[0] = Start;
|
2007-10-23 02:31:58 +08:00
|
|
|
Val = SE->getAddRecExpr(Ops, SARE->getLoop());
|
2005-08-05 06:34:05 +08:00
|
|
|
}
|
|
|
|
return;
|
2006-02-04 15:36:50 +08:00
|
|
|
} else if (SCEVMulExpr *SME = dyn_cast<SCEVMulExpr>(Val)) {
|
|
|
|
// Transform "8 * (4 + v)" -> "32 + 8*V" if "32" fits in the immed field.
|
2007-03-14 04:34:37 +08:00
|
|
|
if (isAddress && isTargetConstant(SME->getOperand(0), UseTy, TLI) &&
|
2006-02-04 15:36:50 +08:00
|
|
|
SME->getNumOperands() == 2 && SME->isLoopInvariant(L)) {
|
|
|
|
|
2007-10-23 02:31:58 +08:00
|
|
|
SCEVHandle SubImm = SE->getIntegerSCEV(0, Val->getType());
|
2006-02-04 15:36:50 +08:00
|
|
|
SCEVHandle NewOp = SME->getOperand(1);
|
2007-10-23 02:31:58 +08:00
|
|
|
MoveImmediateValues(TLI, User, NewOp, SubImm, isAddress, L, SE);
|
2006-02-04 15:36:50 +08:00
|
|
|
|
|
|
|
// If we extracted something out of the subexpressions, see if we can
|
|
|
|
// simplify this!
|
|
|
|
if (NewOp != SME->getOperand(1)) {
|
|
|
|
// Scale SubImm up by "8". If the result is a target constant, we are
|
|
|
|
// good.
|
2007-10-23 02:31:58 +08:00
|
|
|
SubImm = SE->getMulExpr(SubImm, SME->getOperand(0));
|
2007-03-14 04:34:37 +08:00
|
|
|
if (isTargetConstant(SubImm, UseTy, TLI)) {
|
2006-02-04 15:36:50 +08:00
|
|
|
// Accumulate the immediate.
|
2007-10-23 02:31:58 +08:00
|
|
|
Imm = SE->getAddExpr(Imm, SubImm);
|
2006-02-04 15:36:50 +08:00
|
|
|
|
|
|
|
// Update what is left of 'Val'.
|
2007-10-23 02:31:58 +08:00
|
|
|
Val = SE->getMulExpr(SME->getOperand(0), NewOp);
|
2006-02-04 15:36:50 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2005-07-30 08:15:07 +08:00
|
|
|
}
|
|
|
|
|
2005-08-05 06:34:05 +08:00
|
|
|
// Loop-variant expressions must stay in the immediate field of the
|
|
|
|
// expression.
|
2007-03-14 04:34:37 +08:00
|
|
|
if ((isAddress && isTargetConstant(Val, UseTy, TLI)) ||
|
2005-08-05 06:34:05 +08:00
|
|
|
!Val->isLoopInvariant(L)) {
|
2007-10-23 02:31:58 +08:00
|
|
|
Imm = SE->getAddExpr(Imm, Val);
|
|
|
|
Val = SE->getIntegerSCEV(0, Val->getType());
|
2005-08-05 06:34:05 +08:00
|
|
|
return;
|
2005-08-05 03:26:19 +08:00
|
|
|
}
|
2005-08-05 06:34:05 +08:00
|
|
|
|
|
|
|
// Otherwise, no immediates to move.
|
2005-07-30 08:15:07 +08:00
|
|
|
}
|
|
|
|
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
|
2006-08-03 14:34:50 +08:00
|
|
|
/// SeparateSubExprs - Decompose Expr into all of the subexpressions that are
|
|
|
|
/// added together. This is used to reassociate common addition subexprs
|
|
|
|
/// together for maximal sharing when rewriting bases.
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp 12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
static void SeparateSubExprs(std::vector<SCEVHandle> &SubExprs,
|
2007-10-23 02:31:58 +08:00
|
|
|
SCEVHandle Expr,
|
|
|
|
ScalarEvolution *SE) {
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp 12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
if (SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(Expr)) {
|
|
|
|
for (unsigned j = 0, e = AE->getNumOperands(); j != e; ++j)
|
2007-10-23 02:31:58 +08:00
|
|
|
SeparateSubExprs(SubExprs, AE->getOperand(j), SE);
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp 12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
} else if (SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Expr)) {
|
2007-10-23 02:31:58 +08:00
|
|
|
SCEVHandle Zero = SE->getIntegerSCEV(0, Expr->getType());
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp 12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
if (SARE->getOperand(0) == Zero) {
|
|
|
|
SubExprs.push_back(Expr);
|
|
|
|
} else {
|
|
|
|
// Compute the addrec with zero as its base.
|
|
|
|
std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
|
|
|
|
Ops[0] = Zero; // Start with zero base.
|
2007-10-23 02:31:58 +08:00
|
|
|
SubExprs.push_back(SE->getAddRecExpr(Ops, SARE->getLoop()));
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp 12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
|
|
|
|
|
2007-10-23 02:31:58 +08:00
|
|
|
SeparateSubExprs(SubExprs, SARE->getOperand(0), SE);
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp 12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
}
|
2008-06-19 00:23:07 +08:00
|
|
|
} else if (!Expr->isZero()) {
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp 12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
// Do not add zero.
|
|
|
|
SubExprs.push_back(Expr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
/// RemoveCommonExpressionsFromUseBases - Look through all of the uses in Bases,
|
|
|
|
/// removing any common subexpressions from it. Anything truly common is
|
|
|
|
/// removed, accumulated, and returned. This looks for things like (a+b+c) and
|
2008-12-02 12:52:26 +08:00
|
|
|
/// (a+c+d) and computes the common (a+c) subexpression. The common expression
|
|
|
|
/// is *removed* from the Bases and returned.
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
static SCEVHandle
|
2007-10-23 02:31:58 +08:00
|
|
|
RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
|
2008-12-02 06:00:01 +08:00
|
|
|
ScalarEvolution *SE, Loop *L) {
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
unsigned NumUses = Uses.size();
|
|
|
|
|
2008-12-02 12:52:26 +08:00
|
|
|
// Only one use? This is a very common case, so we handle it specially and
|
|
|
|
// cheaply.
|
2007-10-23 02:31:58 +08:00
|
|
|
SCEVHandle Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType());
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
SCEVHandle Result = Zero;
|
|
|
|
if (NumUses == 1) {
|
2008-12-02 12:52:26 +08:00
|
|
|
// If the use is inside the loop, use its base, regardless of what it is:
|
|
|
|
// it is clearly shared across all the IV's. If the use is outside the loop
|
|
|
|
// (which means after it) we don't want to factor anything *into* the loop,
|
|
|
|
// so just use 0 as the base.
|
2008-12-02 06:00:01 +08:00
|
|
|
if (L->contains(Uses[0].Inst->getParent()))
|
|
|
|
std::swap(Result, Uses[0].Base);
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
|
|
|
// To find common subexpressions, count how many of Uses use each expression.
|
|
|
|
// If any subexpressions are used Uses.size() times, they are common.
|
|
|
|
std::map<SCEVHandle, unsigned> SubExpressionUseCounts;
|
|
|
|
|
2005-10-12 02:41:04 +08:00
|
|
|
// UniqueSubExprs - Keep track of all of the subexpressions we see in the
|
|
|
|
// order we see them.
|
|
|
|
std::vector<SCEVHandle> UniqueSubExprs;
|
|
|
|
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
std::vector<SCEVHandle> SubExprs;
|
2008-12-02 12:52:26 +08:00
|
|
|
unsigned NumUsesInsideLoop = 0;
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
for (unsigned i = 0; i != NumUses; ++i) {
|
2008-12-02 12:52:26 +08:00
|
|
|
// If the user is outside the loop, just ignore it for base computation.
|
|
|
|
// Since the user is outside the loop, it must be *after* the loop (if it
|
|
|
|
// were before, it could not be based on the loop IV). We don't want users
|
|
|
|
// after the loop to affect base computation of values *inside* the loop,
|
|
|
|
// because we can always add their offsets to the result IV after the loop
|
|
|
|
// is done, ensuring we get good code inside the loop.
|
2008-12-02 06:00:01 +08:00
|
|
|
if (!L->contains(Uses[i].Inst->getParent()))
|
|
|
|
continue;
|
|
|
|
NumUsesInsideLoop++;
|
|
|
|
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
// If the base is zero (which is common), return zero now, there are no
|
|
|
|
// CSEs we can find.
|
|
|
|
if (Uses[i].Base == Zero) return Zero;
|
|
|
|
|
|
|
|
// Split the expression into subexprs.
|
2007-10-23 02:31:58 +08:00
|
|
|
SeparateSubExprs(SubExprs, Uses[i].Base, SE);
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
// Add one to SubExpressionUseCounts for each subexpr present.
|
|
|
|
for (unsigned j = 0, e = SubExprs.size(); j != e; ++j)
|
2005-10-12 02:41:04 +08:00
|
|
|
if (++SubExpressionUseCounts[SubExprs[j]] == 1)
|
|
|
|
UniqueSubExprs.push_back(SubExprs[j]);
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
SubExprs.clear();
|
|
|
|
}
|
|
|
|
|
2005-10-12 02:41:04 +08:00
|
|
|
// Now that we know how many times each is used, build Result. Iterate over
|
|
|
|
// UniqueSubexprs so that we have a stable ordering.
|
|
|
|
for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) {
|
|
|
|
std::map<SCEVHandle, unsigned>::iterator I =
|
|
|
|
SubExpressionUseCounts.find(UniqueSubExprs[i]);
|
|
|
|
assert(I != SubExpressionUseCounts.end() && "Entry not found?");
|
2008-12-02 12:52:26 +08:00
|
|
|
if (I->second == NumUsesInsideLoop) // Found CSE!
|
2007-10-23 02:31:58 +08:00
|
|
|
Result = SE->getAddExpr(Result, I->first);
|
2008-12-02 12:52:26 +08:00
|
|
|
else
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live-in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
// Remove non-cse's from SubExpressionUseCounts.
|
2005-10-12 02:41:04 +08:00
|
|
|
SubExpressionUseCounts.erase(I);
|
|
|
|
}
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live-in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
|
|
|
|
// If we found no CSE's, return now.
|
|
|
|
if (Result == Zero) return Result;
|
|
|
|
|
|
|
|
// Otherwise, remove all of the CSE's we found from each of the base values.
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
for (unsigned i = 0; i != NumUses; ++i) {
|
2008-12-03 05:17:11 +08:00
|
|
|
// Uses outside the loop don't necessarily include the common base, but
|
|
|
|
// the final IV value coming into those uses does. Instead of trying to
|
|
|
|
// remove the pieces of the common base, which might not be there,
|
|
|
|
// subtract off the base to compensate for this.
|
|
|
|
if (!L->contains(Uses[i].Inst->getParent())) {
|
|
|
|
Uses[i].Base = SE->getMinusSCEV(Uses[i].Base, Result);
|
2008-12-02 06:00:01 +08:00
|
|
|
continue;
|
2008-12-03 05:17:11 +08:00
|
|
|
}
|
2008-12-02 06:00:01 +08:00
|
|
|
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
// Split the expression into subexprs.
|
2007-10-23 02:31:58 +08:00
|
|
|
SeparateSubExprs(SubExprs, Uses[i].Base, SE);
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
|
|
|
|
// Remove any common subexpressions.
|
|
|
|
for (unsigned j = 0, e = SubExprs.size(); j != e; ++j)
|
|
|
|
if (SubExpressionUseCounts.count(SubExprs[j])) {
|
|
|
|
SubExprs.erase(SubExprs.begin()+j);
|
|
|
|
--j; --e;
|
|
|
|
}
|
|
|
|
|
2008-12-02 12:52:26 +08:00
|
|
|
// Finally, add the non-shared expressions together.
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
if (SubExprs.empty())
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live-in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
Uses[i].Base = Zero;
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
else
|
2007-10-23 02:31:58 +08:00
|
|
|
Uses[i].Base = SE->getAddExpr(SubExprs);
|
Oops, don't forget to clear this. The real inner loop is now:
.LBB_foo_3: ; no_exit.1
lfd f2, 0(r9)
lfd f3, 8(r9)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r9)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfd f2, 0(r9)
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22782
2005-08-13 15:42:01 +08:00
|
|
|
SubExprs.clear();
|
Recursively scan scev expressions for common subexpressions. This allows us
to handle nested loops much better, for example, by being able to tell that
these two expressions:
{( 8 + ( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
Have the following common part that can be shared:
{(( 16 * ( 1 + %Tmp11 + %Tmp12)) + %c_),+,( 16 * %Tmp12)}<loopentry.1>
This allows us to codegen an important inner loop in 168.wupwise as:
.LBB_foo_4: ; no_exit.1
lfd f2, 16(r9)
fmul f3, f0, f2
fmul f2, f1, f2
fadd f4, f3, f2
stfd f4, 8(r9)
fsub f2, f3, f2
stfd f2, 16(r9)
addi r8, r8, 1
addi r9, r9, 16
cmpw cr0, r8, r4
ble .LBB_foo_4 ; no_exit.1
instead of:
.LBB_foo_3: ; no_exit.1
lfdx f2, r6, r9
add r10, r6, r9
lfd f3, 8(r10)
fmul f4, f1, f2
fmadd f4, f0, f3, f4
stfd f4, 8(r10)
fmul f3, f1, f3
fmsub f2, f0, f2, f3
stfdx f2, r6, r9
addi r9, r9, 16
addi r8, r8, 1
cmpw cr0, r8, r4
ble .LBB_foo_3 ; no_exit.1
llvm-svn: 22781
2005-08-13 15:27:18 +08:00
|
|
|
}
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live-in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
2007-03-20 08:47:50 +08:00
|
|
|
/// ValidStride - Check whether the given Scale is valid for all loads and
|
2007-04-10 06:20:14 +08:00
|
|
|
/// stores in UsersToProcess.
|
2007-03-20 08:47:50 +08:00
|
|
|
///
/// For every user in UsersToProcess (PHI-node users are skipped), an
/// addressing mode is assembled from the user's constant immediate (when it
/// has one), a base-register flag, and Scale; the target is then asked
/// whether that mode is legal for the type being accessed.  Returns false as
/// soon as the target rejects one user's mode and true if none is rejected.
/// When no target lowering information (TLI) is available, true is returned
/// without inspecting any user.
///
/// HasBaseReg forces the base-register flag on for every user; when it is
/// false the flag is still set for any user whose Base is non-zero.
|
2007-10-23 04:40:42 +08:00
|
|
|
bool LoopStrengthReduce::ValidStride(bool HasBaseReg,
|
|
|
|
int64_t Scale,
|
2007-03-20 08:47:50 +08:00
|
|
|
const std::vector<BasedUser>& UsersToProcess) {
|
2007-12-20 07:33:23 +08:00
|
|
|
// With no target lowering information there is nothing to query, so the
// scale is accepted.
if (!TLI)
|
|
|
|
return true;
|
|
|
|
|
2007-03-21 05:54:54 +08:00
|
|
|
for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) {
|
Pass the type of the store access, not the type of the store, into the
target hook. This allows us to codegen a loop as:
LBB1_1: @cond_next
mov r2, #0
str r2, [r0, +r3, lsl #2]
add r3, r3, #1
cmn r3, #1
bne LBB1_1 @cond_next
instead of:
LBB1_1: @cond_next
mov r2, #0
str r2, [r0], #+4
add r3, r3, #1
cmn r3, #1
bne LBB1_1 @cond_next
This looks the same, but has one fewer induction variable (and therefore,
one fewer register) live in the loop.
llvm-svn: 35592
2007-04-02 14:34:44 +08:00
|
|
|
// Work out the type being accessed so it can be passed to the target hook:
// the stored value's type for a store, the result type for a load.  Any
// other non-PHI user leaves AccessTy as VoidTy.
|
|
|
|
const Type *AccessTy = Type::VoidTy;
|
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(UsersToProcess[i].Inst))
|
|
|
|
AccessTy = SI->getOperand(0)->getType();
|
|
|
|
else if (LoadInst *LI = dyn_cast<LoadInst>(UsersToProcess[i].Inst))
|
|
|
|
AccessTy = LI->getType();
|
2008-03-20 06:02:26 +08:00
|
|
|
// PHI users are skipped: no addressing-mode query is made for them.
else if (isa<PHINode>(UsersToProcess[i].Inst))
|
|
|
|
continue;
|
Pass the type of the store access, not the type of the store, into the
target hook. This allows us to codegen a loop as:
LBB1_1: @cond_next
mov r2, #0
str r2, [r0, +r3, lsl #2]
add r3, r3, #1
cmn r3, #1
bne LBB1_1 @cond_next
instead of:
LBB1_1: @cond_next
mov r2, #0
str r2, [r0], #+4
add r3, r3, #1
cmn r3, #1
bne LBB1_1 @cond_next
This looks the same, but has one fewer induction variable (and therefore,
one fewer register) live in the loop.
llvm-svn: 35592
2007-04-02 14:34:44 +08:00
|
|
|
|
2007-04-10 06:20:14 +08:00
|
|
|
// Describe the candidate addressing mode for this user.
TargetLowering::AddrMode AM;
|
|
|
|
// Only a constant immediate is folded into the offset.  For any other Imm,
// BaseOffs keeps whatever AddrMode's constructor gave it (presumably 0 --
// TODO confirm against TargetLowering::AddrMode).
if (SCEVConstant *SC = dyn_cast<SCEVConstant>(UsersToProcess[i].Imm))
|
|
|
|
AM.BaseOffs = SC->getValue()->getSExtValue();
|
2008-06-19 00:23:07 +08:00
|
|
|
// A base register is present if the caller says so or if this user has a
// non-zero Base of its own.
AM.HasBaseReg = HasBaseReg || !UsersToProcess[i].Base->isZero();
|
2007-04-10 06:20:14 +08:00
|
|
|
AM.Scale = Scale;
|
|
|
|
|
|
|
|
// If load[imm+r*scale] is illegal, bail out: a single rejected user makes
// the whole Scale invalid.
|
2007-12-20 07:33:23 +08:00
|
|
|
if (!TLI->isLegalAddressingMode(AM, AccessTy))
|
2007-03-20 08:47:50 +08:00
|
|
|
return false;
|
2007-03-21 05:54:54 +08:00
|
|
|
}
|
2007-03-20 08:47:50 +08:00
|
|
|
// No user's addressing mode was rejected by the target.
return true;
|
|
|
|
}
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
|
2007-10-26 06:45:20 +08:00
|
|
|
/// RequiresTypeConversion - Returns true if converting Ty to NewTy is not
|
|
|
|
/// a nop.
|
Loosen up iv reuse to allow reuse of the same stride but a larger type when truncating from the larger type to smaller type is free.
e.g.
Turns this loop:
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
movw %dx, %si
LBB1_2: # bb
movl L_X$non_lazy_ptr, %edi
movw %si, (%edi)
movl L_Y$non_lazy_ptr, %edi
movw %dx, (%edi)
addw $4, %dx
incw %si
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
into
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
LBB1_2: # bb
movl L_X$non_lazy_ptr, %esi
movw %cx, (%esi)
movl L_Y$non_lazy_ptr, %esi
movw %dx, (%esi)
addw $4, %dx
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
llvm-svn: 43375
2007-10-26 09:56:11 +08:00
|
|
|
bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1,
|
|
|
|
const Type *Ty2) {
|
|
|
|
if (Ty1 == Ty2)
|
2007-10-26 06:45:20 +08:00
|
|
|
return false;
|
Loosen up iv reuse to allow reuse of the same stride but a larger type when truncating from the larger type to smaller type is free.
e.g.
Turns this loop:
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
movw %dx, %si
LBB1_2: # bb
movl L_X$non_lazy_ptr, %edi
movw %si, (%edi)
movl L_Y$non_lazy_ptr, %edi
movw %dx, (%edi)
addw $4, %dx
incw %si
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
into
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
LBB1_2: # bb
movl L_X$non_lazy_ptr, %esi
movw %cx, (%esi)
movl L_Y$non_lazy_ptr, %esi
movw %dx, (%esi)
addw $4, %dx
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
llvm-svn: 43375
2007-10-26 09:56:11 +08:00
|
|
|
if (TLI && TLI->isTruncateFree(Ty1, Ty2))
|
|
|
|
return false;
|
|
|
|
return (!Ty1->canLosslesslyBitCastTo(Ty2) &&
|
|
|
|
!(isa<PointerType>(Ty2) &&
|
|
|
|
Ty1->canLosslesslyBitCastTo(UIntPtrTy)) &&
|
|
|
|
!(isa<PointerType>(Ty1) &&
|
|
|
|
Ty2->canLosslesslyBitCastTo(UIntPtrTy)));
|
2007-10-26 06:45:20 +08:00
|
|
|
}
|
|
|
|
|
2006-03-18 03:52:23 +08:00
|
|
|
/// CheckForIVReuse - Returns the multiple if the stride is the multiple
|
|
|
|
/// of a previous stride and it is a legal value for the target addressing
|
2007-10-23 04:40:42 +08:00
|
|
|
/// mode scale component and optional base reg. This allows the users of
|
|
|
|
/// this stride to be rewritten as prev iv * factor. It returns 0 if no
|
|
|
|
/// reuse is possible.
|
|
|
|
unsigned LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
|
Loosen up iv reuse to allow reuse of the same stride but a larger type when truncating from the larger type to smaller type is free.
e.g.
Turns this loop:
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
movw %dx, %si
LBB1_2: # bb
movl L_X$non_lazy_ptr, %edi
movw %si, (%edi)
movl L_Y$non_lazy_ptr, %edi
movw %dx, (%edi)
addw $4, %dx
incw %si
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
into
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
LBB1_2: # bb
movl L_X$non_lazy_ptr, %esi
movw %cx, (%esi)
movl L_Y$non_lazy_ptr, %esi
movw %dx, (%esi)
addw $4, %dx
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
llvm-svn: 43375
2007-10-26 09:56:11 +08:00
|
|
|
bool AllUsesAreAddresses,
|
2007-10-23 04:40:42 +08:00
|
|
|
const SCEVHandle &Stride,
|
2007-03-20 08:47:50 +08:00
|
|
|
IVExpr &IV, const Type *Ty,
|
|
|
|
const std::vector<BasedUser>& UsersToProcess) {
|
2006-03-18 03:52:23 +08:00
|
|
|
if (SCEVConstant *SC = dyn_cast<SCEVConstant>(Stride)) {
|
2007-03-03 07:37:53 +08:00
|
|
|
int64_t SInt = SC->getValue()->getSExtValue();
|
2007-11-17 10:48:01 +08:00
|
|
|
for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e;
|
|
|
|
++NewStride) {
|
|
|
|
std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
|
|
|
|
IVsByStride.find(StrideOrder[NewStride]);
|
|
|
|
if (SI == IVsByStride.end())
|
|
|
|
continue;
|
2007-03-13 07:27:37 +08:00
|
|
|
int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
|
Loosen up iv reuse to allow reuse of the same stride but a larger type when truncating from the larger type to smaller type is free.
e.g.
Turns this loop:
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
movw %dx, %si
LBB1_2: # bb
movl L_X$non_lazy_ptr, %edi
movw %si, (%edi)
movl L_Y$non_lazy_ptr, %edi
movw %dx, (%edi)
addw $4, %dx
incw %si
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
into
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
LBB1_2: # bb
movl L_X$non_lazy_ptr, %esi
movw %cx, (%esi)
movl L_Y$non_lazy_ptr, %esi
movw %dx, (%esi)
addw $4, %dx
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
llvm-svn: 43375
2007-10-26 09:56:11 +08:00
|
|
|
if (SI->first != Stride &&
|
2007-04-03 06:51:58 +08:00
|
|
|
(unsigned(abs(SInt)) < SSInt || (SInt % SSInt) != 0))
|
2006-03-18 03:52:23 +08:00
|
|
|
continue;
|
2007-03-13 07:27:37 +08:00
|
|
|
int64_t Scale = SInt / SSInt;
|
2007-03-20 08:47:50 +08:00
|
|
|
// Check that this stride is valid for all the types used for loads and
|
|
|
|
// stores; if it can be used for some and not others, we might as well use
|
|
|
|
// the original stride everywhere, since we have to create the IV for it
|
2007-10-30 03:23:53 +08:00
|
|
|
// anyway. If the scale is 1, then we don't need to worry about folding
|
|
|
|
// multiplications.
|
|
|
|
if (Scale == 1 ||
|
|
|
|
(AllUsesAreAddresses &&
|
|
|
|
ValidStride(HasBaseReg, Scale, UsersToProcess)))
|
2007-03-13 07:27:37 +08:00
|
|
|
for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
|
|
|
|
IE = SI->second.IVs.end(); II != IE; ++II)
|
|
|
|
// FIXME: Only handle base == 0 for now.
|
|
|
|
// Only reuse previous IV if it would not require a type conversion.
|
2008-06-19 00:23:07 +08:00
|
|
|
if (II->Base->isZero() &&
|
Loosen up iv reuse to allow reuse of the same stride but a larger type when truncating from the larger type to smaller type is free.
e.g.
Turns this loop:
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
movw %dx, %si
LBB1_2: # bb
movl L_X$non_lazy_ptr, %edi
movw %si, (%edi)
movl L_Y$non_lazy_ptr, %edi
movw %dx, (%edi)
addw $4, %dx
incw %si
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
into
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
LBB1_2: # bb
movl L_X$non_lazy_ptr, %esi
movw %cx, (%esi)
movl L_Y$non_lazy_ptr, %esi
movw %dx, (%esi)
addw $4, %dx
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
llvm-svn: 43375
2007-10-26 09:56:11 +08:00
|
|
|
!RequiresTypeConversion(II->Base->getType(), Ty)) {
|
2007-03-13 07:27:37 +08:00
|
|
|
IV = *II;
|
|
|
|
return Scale;
|
|
|
|
}
|
2006-03-18 03:52:23 +08:00
|
|
|
}
|
|
|
|
}
|
2006-03-18 16:03:12 +08:00
|
|
|
return 0;
|
2006-03-18 03:52:23 +08:00
|
|
|
}
|
|
|
|
|
2006-08-03 14:34:50 +08:00
|
|
|
/// PartitionByIsUseOfPostIncrementedValue - Simple boolean predicate that
|
|
|
|
/// returns true if Val's isUseOfPostIncrementedValue is true.
|
|
|
|
static bool PartitionByIsUseOfPostIncrementedValue(const BasedUser &Val) {
|
|
|
|
return Val.isUseOfPostIncrementedValue;
|
|
|
|
}
|
2006-03-18 03:52:23 +08:00
|
|
|
|
2008-04-15 02:26:16 +08:00
|
|
|
/// isNonConstantNegative - Return true if the specified scev is negated, but
|
Handle negative strides much more optimally. This compiles X86/lsr-negative-stride.ll
into:
_t:
movl 8(%esp), %ecx
movl 4(%esp), %eax
cmpl %ecx, %eax
je LBB1_3 #bb17
LBB1_1: #bb
cmpl %ecx, %eax
jg LBB1_4 #cond_true
LBB1_2: #cond_false
subl %eax, %ecx
cmpl %ecx, %eax
jne LBB1_1 #bb
LBB1_3: #bb17
ret
LBB1_4: #cond_true
subl %ecx, %eax
cmpl %ecx, %eax
jne LBB1_1 #bb
jmp LBB1_3 #bb17
instead of:
_t:
subl $4, %esp
movl %esi, (%esp)
movl 12(%esp), %ecx
movl 8(%esp), %eax
cmpl %ecx, %eax
je LBB1_4 #bb17
LBB1_1: #bb.outer
movl %ecx, %edx
negl %edx
LBB1_2: #bb
cmpl %ecx, %eax
jle LBB1_5 #cond_false
LBB1_3: #cond_true
addl %edx, %eax
cmpl %ecx, %eax
jne LBB1_2 #bb
LBB1_4: #bb17
movl (%esp), %esi
addl $4, %esp
ret
LBB1_5: #cond_false
movl %ecx, %edx
subl %eax, %edx
movl %eax, %esi
addl %esi, %esi
cmpl %ecx, %esi
je LBB1_4 #bb17
LBB1_6: #cond_false.bb.outer_crit_edge
movl %edx, %ecx
jmp LBB1_1 #bb.outer
llvm-svn: 37252
2007-05-19 09:22:21 +08:00
|
|
|
/// not a constant.
|
|
|
|
static bool isNonConstantNegative(const SCEVHandle &Expr) {
|
|
|
|
SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Expr);
|
|
|
|
if (!Mul) return false;
|
|
|
|
|
|
|
|
// If there is a constant factor, it will be first.
|
|
|
|
SCEVConstant *SC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
|
|
|
|
if (!SC) return false;
|
|
|
|
|
|
|
|
// Return true if the value is negative, this matches things like (-42 * V).
|
|
|
|
return SC->getValue()->getValue().isNegative();
|
|
|
|
}
|
|
|
|
|
2007-12-20 07:33:23 +08:00
|
|
|
/// isAddress - Returns true if the specified instruction is using the
|
|
|
|
/// specified value as an address.
|
|
|
|
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
|
|
|
|
bool isAddress = isa<LoadInst>(Inst);
|
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
|
|
|
|
if (SI->getOperand(1) == OperandVal)
|
|
|
|
isAddress = true;
|
|
|
|
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
|
|
|
|
// Addressing modes can also be folded into prefetches and a variety
|
|
|
|
// of intrinsics.
|
|
|
|
switch (II->getIntrinsicID()) {
|
|
|
|
default: break;
|
|
|
|
case Intrinsic::prefetch:
|
|
|
|
case Intrinsic::x86_sse2_loadu_dq:
|
|
|
|
case Intrinsic::x86_sse2_loadu_pd:
|
|
|
|
case Intrinsic::x86_sse_loadu_ps:
|
|
|
|
case Intrinsic::x86_sse_storeu_ps:
|
|
|
|
case Intrinsic::x86_sse2_storeu_pd:
|
|
|
|
case Intrinsic::x86_sse2_storeu_dq:
|
|
|
|
case Intrinsic::x86_sse2_storel_dq:
|
|
|
|
if (II->getOperand(1) == OperandVal)
|
|
|
|
isAddress = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return isAddress;
|
|
|
|
}
|
|
|
|
|
2007-10-26 06:45:20 +08:00
|
|
|
// CollectIVUsers - Transform our list of users and offsets to a bit more
|
2008-06-24 06:11:52 +08:00
|
|
|
// complex table. In this new vector, each 'BasedUser' contains 'Base', the base
|
|
|
|
// of the strided accesses, as well as the old information from Uses. We
|
2007-10-26 06:45:20 +08:00
|
|
|
// progressively move information from the Base field to the Imm field, until
|
|
|
|
// we eventually have the full access expression to rewrite the use.
|
|
|
|
SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
|
|
|
|
IVUsersOfOneStride &Uses,
|
|
|
|
Loop *L,
|
|
|
|
bool &AllUsesAreAddresses,
|
|
|
|
std::vector<BasedUser> &UsersToProcess) {
|
2005-07-30 08:15:07 +08:00
|
|
|
UsersToProcess.reserve(Uses.Users.size());
|
2005-08-09 06:56:21 +08:00
|
|
|
for (unsigned i = 0, e = Uses.Users.size(); i != e; ++i) {
|
2007-10-23 02:31:58 +08:00
|
|
|
UsersToProcess.push_back(BasedUser(Uses.Users[i], SE));
|
2005-08-09 06:56:21 +08:00
|
|
|
|
2008-12-04 03:25:46 +08:00
|
|
|
// Move any loop variant operands from the offset field to the immediate
|
2005-08-09 06:56:21 +08:00
|
|
|
// field of the use, so that we don't try to use something before it is
|
|
|
|
// computed.
|
|
|
|
MoveLoopVariantsToImediateField(UsersToProcess.back().Base,
|
2007-10-23 02:31:58 +08:00
|
|
|
UsersToProcess.back().Imm, L, SE);
|
2005-08-09 06:56:21 +08:00
|
|
|
assert(UsersToProcess.back().Base->isLoopInvariant(L) &&
|
2005-08-05 06:34:05 +08:00
|
|
|
"Base value is not loop invariant!");
|
2005-03-07 05:58:22 +08:00
|
|
|
}
|
2006-03-18 03:52:23 +08:00
|
|
|
|
2006-07-19 03:07:58 +08:00
|
|
|
// We now have a whole bunch of uses of like-strided induction variables, but
|
|
|
|
// they might all have different bases. We want to emit one PHI node for this
|
|
|
|
// stride which we fold as many common expressions (between the IVs) into as
|
|
|
|
// possible. Start by identifying the common expressions in the base values
|
|
|
|
// for the strides (e.g. if we have "A+C+B" and "A+B+D" as our bases, find
|
|
|
|
// "A+B"), emit it to the preheader, then remove the expression from the
|
|
|
|
// UsersToProcess base values.
|
|
|
|
SCEVHandle CommonExprs =
|
2008-12-02 06:00:01 +08:00
|
|
|
RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L);
|
2007-10-23 04:40:42 +08:00
|
|
|
|
2005-08-09 06:32:34 +08:00
|
|
|
// Next, figure out what we can represent in the immediate fields of
|
|
|
|
// instructions. If we can represent anything there, move it to the imm
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
// fields of the BasedUsers. We do this so that it increases the commonality
|
|
|
|
// of the remaining uses.
|
2007-12-20 10:20:53 +08:00
|
|
|
unsigned NumPHI = 0;
|
2005-08-09 06:32:34 +08:00
|
|
|
for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
|
2005-08-16 08:38:11 +08:00
|
|
|
// If the user is not in the current loop, this means it is using the exit
|
|
|
|
// value of the IV. Do not put anything in the base, make sure it's all in
|
|
|
|
// the immediate field to allow as much factoring as possible.
|
|
|
|
if (!L->contains(UsersToProcess[i].Inst->getParent())) {
|
2007-10-23 02:31:58 +08:00
|
|
|
UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm,
|
|
|
|
UsersToProcess[i].Base);
|
2005-08-18 05:22:41 +08:00
|
|
|
UsersToProcess[i].Base =
|
2007-10-23 02:31:58 +08:00
|
|
|
SE->getIntegerSCEV(0, UsersToProcess[i].Base->getType());
|
2005-08-16 08:38:11 +08:00
|
|
|
} else {
|
|
|
|
|
|
|
|
// Addressing modes can be folded into loads and stores. Be careful that
|
|
|
|
// the store is through the expression, not of the expression though.
|
2007-12-20 10:20:53 +08:00
|
|
|
bool isPHI = false;
|
2007-12-20 07:33:23 +08:00
|
|
|
bool isAddress = isAddressUse(UsersToProcess[i].Inst,
|
|
|
|
UsersToProcess[i].OperandValToReplace);
|
|
|
|
if (isa<PHINode>(UsersToProcess[i].Inst)) {
|
2007-12-20 10:20:53 +08:00
|
|
|
isPHI = true;
|
|
|
|
++NumPHI;
|
2007-05-04 07:20:33 +08:00
|
|
|
}
|
2007-10-23 04:40:42 +08:00
|
|
|
|
|
|
|
// If this use isn't an address, then not all uses are addresses.
|
2008-03-20 06:02:26 +08:00
|
|
|
if (!isAddress && !isPHI)
|
2007-10-23 04:40:42 +08:00
|
|
|
AllUsesAreAddresses = false;
|
2005-08-16 08:38:11 +08:00
|
|
|
|
2007-03-14 04:34:37 +08:00
|
|
|
MoveImmediateValues(TLI, UsersToProcess[i].Inst, UsersToProcess[i].Base,
|
2007-10-23 02:31:58 +08:00
|
|
|
UsersToProcess[i].Imm, isAddress, L, SE);
|
2005-08-16 08:38:11 +08:00
|
|
|
}
|
2005-08-09 06:32:34 +08:00
|
|
|
}
|
2006-03-17 05:53:05 +08:00
|
|
|
|
2007-12-20 10:20:53 +08:00
|
|
|
// If one of the use if a PHI node and all other uses are addresses, still
|
|
|
|
// allow iv reuse. Essentially we are trading one constant multiplication
|
|
|
|
// for one fewer iv.
|
|
|
|
if (NumPHI > 1)
|
|
|
|
AllUsesAreAddresses = false;
|
|
|
|
|
2007-10-26 06:45:20 +08:00
|
|
|
return CommonExprs;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single
|
|
|
|
/// stride of IV. All of the users may have different starting values, and this
|
|
|
|
/// may not be the only stride (we know it is if isOnlyStride is true).
|
|
|
|
void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
|
|
|
|
IVUsersOfOneStride &Uses,
|
|
|
|
Loop *L,
|
|
|
|
bool isOnlyStride) {
|
|
|
|
// If all the users are moved to another stride, then there is nothing to do.
|
2008-01-29 21:02:09 +08:00
|
|
|
if (Uses.Users.empty())
|
2007-10-26 06:45:20 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
// Keep track if every use in UsersToProcess is an address. If they all are,
|
|
|
|
// we may be able to rewrite the entire collection of them in terms of a
|
|
|
|
// smaller-stride IV.
|
|
|
|
bool AllUsesAreAddresses = true;
|
|
|
|
|
|
|
|
// Transform our list of users and offsets to a bit more complex table. In
|
|
|
|
// this new vector, each 'BasedUser' contains 'Base' the base of the
|
|
|
|
// strided access as well as the old information from Uses. We progressively
|
|
|
|
// move information from the Base field to the Imm field, until we eventually
|
|
|
|
// have the full access expression to rewrite the use.
|
|
|
|
std::vector<BasedUser> UsersToProcess;
|
|
|
|
SCEVHandle CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses,
|
|
|
|
UsersToProcess);
|
|
|
|
|
|
|
|
// If we managed to find some expressions in common, we'll need to carry
|
|
|
|
// their value in a register and add it in for each use. This will take up
|
|
|
|
// a register operand, which potentially restricts what stride values are
|
|
|
|
// valid.
|
2008-06-19 00:23:07 +08:00
|
|
|
bool HaveCommonExprs = !CommonExprs->isZero();
|
2007-10-26 06:45:20 +08:00
|
|
|
|
2007-10-23 04:40:42 +08:00
|
|
|
// If all uses are addresses, check if it is possible to reuse an IV with a
|
|
|
|
// stride that is a factor of this stride. And that the multiple is a number
|
|
|
|
// that can be encoded in the scale field of the target addressing mode. And
|
|
|
|
// that we will have a valid instruction after this substitution, including the
|
|
|
|
// immediate field, if any.
|
2007-03-21 05:54:54 +08:00
|
|
|
PHINode *NewPHI = NULL;
|
|
|
|
Value *IncV = NULL;
|
2007-10-23 02:31:58 +08:00
|
|
|
IVExpr ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty),
|
|
|
|
SE->getIntegerSCEV(0, Type::Int32Ty),
|
|
|
|
0, 0);
|
2007-10-23 04:40:42 +08:00
|
|
|
unsigned RewriteFactor = 0;
|
Loosen up iv reuse to allow reuse of the same stride but a larger type when truncating from the larger type to smaller type is free.
e.g.
Turns this loop:
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
movw %dx, %si
LBB1_2: # bb
movl L_X$non_lazy_ptr, %edi
movw %si, (%edi)
movl L_Y$non_lazy_ptr, %edi
movw %dx, (%edi)
addw $4, %dx
incw %si
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
into
LBB1_1: # entry.bb_crit_edge
xorl %ecx, %ecx
xorw %dx, %dx
LBB1_2: # bb
movl L_X$non_lazy_ptr, %esi
movw %cx, (%esi)
movl L_Y$non_lazy_ptr, %esi
movw %dx, (%esi)
addw $4, %dx
incl %ecx
cmpl %eax, %ecx
jne LBB1_2 # bb
llvm-svn: 43375
2007-10-26 09:56:11 +08:00
|
|
|
RewriteFactor = CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses,
|
|
|
|
Stride, ReuseIV, CommonExprs->getType(),
|
|
|
|
UsersToProcess);
|
2007-03-21 05:54:54 +08:00
|
|
|
if (RewriteFactor != 0) {
|
|
|
|
DOUT << "BASED ON IV of STRIDE " << *ReuseIV.Stride
|
|
|
|
<< " and BASE " << *ReuseIV.Base << " :\n";
|
|
|
|
NewPHI = ReuseIV.PHI;
|
|
|
|
IncV = ReuseIV.IncV;
|
|
|
|
}
|
|
|
|
|
2007-04-02 06:21:39 +08:00
|
|
|
const Type *ReplacedTy = CommonExprs->getType();
|
|
|
|
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
// Now that we know what we need to do, insert the PHI node itself.
|
|
|
|
//
|
2007-04-02 06:21:39 +08:00
|
|
|
DOUT << "INSERTING IV of TYPE " << *ReplacedTy << " of STRIDE "
|
2007-05-12 06:40:34 +08:00
|
|
|
<< *Stride << " and BASE " << *CommonExprs << ": ";
|
2006-03-17 05:53:05 +08:00
|
|
|
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
SCEVExpander Rewriter(*SE, *LI);
|
|
|
|
SCEVExpander PreheaderRewriter(*SE, *LI);
|
|
|
|
|
|
|
|
BasicBlock *Preheader = L->getLoopPreheader();
|
|
|
|
Instruction *PreInsertPt = Preheader->getTerminator();
|
|
|
|
Instruction *PhiInsertBefore = L->getHeader()->begin();
|
|
|
|
|
2005-09-13 01:11:27 +08:00
|
|
|
BasicBlock *LatchBlock = L->getLoopLatch();
|
2006-03-17 05:53:05 +08:00
|
|
|
|
2006-03-18 03:52:23 +08:00
|
|
|
|
|
|
|
// Emit the initial base value into the loop preheader.
|
|
|
|
Value *CommonBaseV
|
2007-06-15 22:38:12 +08:00
|
|
|
= PreheaderRewriter.expandCodeFor(CommonExprs, PreInsertPt);
|
2006-03-18 03:52:23 +08:00
|
|
|
|
2006-03-18 16:03:12 +08:00
|
|
|
if (RewriteFactor == 0) {
|
2006-03-17 05:53:05 +08:00
|
|
|
// Create a new Phi for this base, and stick it in the loop header.
|
2008-04-07 04:25:17 +08:00
|
|
|
NewPHI = PHINode::Create(ReplacedTy, "iv.", PhiInsertBefore);
|
2006-03-17 05:53:05 +08:00
|
|
|
++NumInserted;
|
2005-08-09 06:32:34 +08:00
|
|
|
|
2006-03-18 03:52:23 +08:00
|
|
|
// Add common base to the new Phi node.
|
|
|
|
NewPHI->addIncoming(CommonBaseV, Preheader);
|
|
|
|
|
Handle negative strides much more optimally. This compiles X86/lsr-negative-stride.ll
into:
_t:
movl 8(%esp), %ecx
movl 4(%esp), %eax
cmpl %ecx, %eax
je LBB1_3 #bb17
LBB1_1: #bb
cmpl %ecx, %eax
jg LBB1_4 #cond_true
LBB1_2: #cond_false
subl %eax, %ecx
cmpl %ecx, %eax
jne LBB1_1 #bb
LBB1_3: #bb17
ret
LBB1_4: #cond_true
subl %ecx, %eax
cmpl %ecx, %eax
jne LBB1_1 #bb
jmp LBB1_3 #bb17
instead of:
_t:
subl $4, %esp
movl %esi, (%esp)
movl 12(%esp), %ecx
movl 8(%esp), %eax
cmpl %ecx, %eax
je LBB1_4 #bb17
LBB1_1: #bb.outer
movl %ecx, %edx
negl %edx
LBB1_2: #bb
cmpl %ecx, %eax
jle LBB1_5 #cond_false
LBB1_3: #cond_true
addl %edx, %eax
cmpl %ecx, %eax
jne LBB1_2 #bb
LBB1_4: #bb17
movl (%esp), %esi
addl $4, %esp
ret
LBB1_5: #cond_false
movl %ecx, %edx
subl %eax, %edx
movl %eax, %esi
addl %esi, %esi
cmpl %ecx, %esi
je LBB1_4 #bb17
LBB1_6: #cond_false.bb.outer_crit_edge
movl %edx, %ecx
jmp LBB1_1 #bb.outer
llvm-svn: 37252
2007-05-19 09:22:21 +08:00
|
|
|
// If the stride is negative, insert a sub instead of an add for the
|
|
|
|
// increment.
|
|
|
|
bool isNegative = isNonConstantNegative(Stride);
|
|
|
|
SCEVHandle IncAmount = Stride;
|
|
|
|
if (isNegative)
|
2007-10-23 02:31:58 +08:00
|
|
|
IncAmount = SE->getNegativeSCEV(Stride);
|
Handle negative strides much more optimally. This compiles X86/lsr-negative-stride.ll
into:
_t:
movl 8(%esp), %ecx
movl 4(%esp), %eax
cmpl %ecx, %eax
je LBB1_3 #bb17
LBB1_1: #bb
cmpl %ecx, %eax
jg LBB1_4 #cond_true
LBB1_2: #cond_false
subl %eax, %ecx
cmpl %ecx, %eax
jne LBB1_1 #bb
LBB1_3: #bb17
ret
LBB1_4: #cond_true
subl %ecx, %eax
cmpl %ecx, %eax
jne LBB1_1 #bb
jmp LBB1_3 #bb17
instead of:
_t:
subl $4, %esp
movl %esi, (%esp)
movl 12(%esp), %ecx
movl 8(%esp), %eax
cmpl %ecx, %eax
je LBB1_4 #bb17
LBB1_1: #bb.outer
movl %ecx, %edx
negl %edx
LBB1_2: #bb
cmpl %ecx, %eax
jle LBB1_5 #cond_false
LBB1_3: #cond_true
addl %edx, %eax
cmpl %ecx, %eax
jne LBB1_2 #bb
LBB1_4: #bb17
movl (%esp), %esi
addl $4, %esp
ret
LBB1_5: #cond_false
movl %ecx, %edx
subl %eax, %edx
movl %eax, %esi
addl %esi, %esi
cmpl %ecx, %esi
je LBB1_4 #bb17
LBB1_6: #cond_false.bb.outer_crit_edge
movl %edx, %ecx
jmp LBB1_1 #bb.outer
llvm-svn: 37252
2007-05-19 09:22:21 +08:00
|
|
|
|
2006-03-17 05:53:05 +08:00
|
|
|
// Insert the stride into the preheader.
|
2007-06-15 22:38:12 +08:00
|
|
|
Value *StrideV = PreheaderRewriter.expandCodeFor(IncAmount, PreInsertPt);
|
2006-03-17 05:53:05 +08:00
|
|
|
if (!isa<ConstantInt>(StrideV)) ++NumVariable;
|
Teach LSR to strength reduce IVs that have a loop-invariant but non-constant stride.
For code like this:
void foo(float *a, float *b, int n, int stride_a, int stride_b) {
int i;
for (i=0; i<n; i++)
a[i*stride_a] = b[i*stride_b];
}
we now emit:
.LBB_foo2_2: ; no_exit
lfs f0, 0(r4)
stfs f0, 0(r3)
addi r7, r7, 1
add r4, r2, r4
add r3, r6, r3
cmpw cr0, r7, r5
blt .LBB_foo2_2 ; no_exit
instead of:
.LBB_foo_2: ; no_exit
mullw r8, r2, r7 ;; multiply!
slwi r8, r8, 2
lfsx f0, r4, r8
mullw r8, r2, r6 ;; multiply!
slwi r8, r8, 2
stfsx f0, r3, r8
addi r2, r2, 1
cmpw cr0, r2, r5
blt .LBB_foo_2 ; no_exit
loops with variable strides occur pretty often. For example, in SPECFP2K
there are 317 variable strides in 177.mesa, 3 in 179.art, 14 in 188.ammp,
56 in 168.wupwise, 36 in 172.mgrid.
Now we can allow indvars to turn functions written like this:
void foo2(float *a, float *b, int n, int stride_a, int stride_b) {
int i, ai = 0, bi = 0;
for (i=0; i<n; i++)
{
a[ai] = b[bi];
ai += stride_a;
bi += stride_b;
}
}
into code like the above for better analysis. With this patch, they generate
identical code.
llvm-svn: 22740
2005-08-10 08:45:21 +08:00
|
|
|
|
2006-03-17 05:53:05 +08:00
|
|
|
// Emit the increment of the base value before the terminator of the loop
|
|
|
|
// latch block, and add it to the Phi node.
|
2007-10-23 02:31:58 +08:00
|
|
|
SCEVHandle IncExp = SE->getUnknown(StrideV);
|
Handle negative strides much more optimally. This compiles X86/lsr-negative-stride.ll
into:
_t:
movl 8(%esp), %ecx
movl 4(%esp), %eax
cmpl %ecx, %eax
je LBB1_3 #bb17
LBB1_1: #bb
cmpl %ecx, %eax
jg LBB1_4 #cond_true
LBB1_2: #cond_false
subl %eax, %ecx
cmpl %ecx, %eax
jne LBB1_1 #bb
LBB1_3: #bb17
ret
LBB1_4: #cond_true
subl %ecx, %eax
cmpl %ecx, %eax
jne LBB1_1 #bb
jmp LBB1_3 #bb17
instead of:
_t:
subl $4, %esp
movl %esi, (%esp)
movl 12(%esp), %ecx
movl 8(%esp), %eax
cmpl %ecx, %eax
je LBB1_4 #bb17
LBB1_1: #bb.outer
movl %ecx, %edx
negl %edx
LBB1_2: #bb
cmpl %ecx, %eax
jle LBB1_5 #cond_false
LBB1_3: #cond_true
addl %edx, %eax
cmpl %ecx, %eax
jne LBB1_2 #bb
LBB1_4: #bb17
movl (%esp), %esi
addl $4, %esp
ret
LBB1_5: #cond_false
movl %ecx, %edx
subl %eax, %edx
movl %eax, %esi
addl %esi, %esi
cmpl %ecx, %esi
je LBB1_4 #bb17
LBB1_6: #cond_false.bb.outer_crit_edge
movl %edx, %ecx
jmp LBB1_1 #bb.outer
llvm-svn: 37252
2007-05-19 09:22:21 +08:00
|
|
|
if (isNegative)
|
2007-10-23 02:31:58 +08:00
|
|
|
IncExp = SE->getNegativeSCEV(IncExp);
|
|
|
|
IncExp = SE->getAddExpr(SE->getUnknown(NewPHI), IncExp);
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
|
2007-06-15 22:38:12 +08:00
|
|
|
IncV = Rewriter.expandCodeFor(IncExp, LatchBlock->getTerminator());
|
2006-03-17 05:53:05 +08:00
|
|
|
IncV->setName(NewPHI->getName()+".inc");
|
|
|
|
NewPHI->addIncoming(IncV, LatchBlock);
|
|
|
|
|
2006-03-18 03:52:23 +08:00
|
|
|
// Remember this in case a later stride is a multiple of this.
|
2006-03-18 16:03:12 +08:00
|
|
|
IVsByStride[Stride].addIV(Stride, CommonExprs, NewPHI, IncV);
|
2007-05-12 06:40:34 +08:00
|
|
|
|
|
|
|
DOUT << " IV=%" << NewPHI->getNameStr() << " INC=%" << IncV->getNameStr();
|
2006-03-18 03:52:23 +08:00
|
|
|
} else {
|
|
|
|
Constant *C = dyn_cast<Constant>(CommonBaseV);
|
|
|
|
if (!C ||
|
|
|
|
(!C->isNullValue() &&
|
2007-10-23 02:31:58 +08:00
|
|
|
!isTargetConstant(SE->getUnknown(CommonBaseV), ReplacedTy, TLI)))
|
2006-11-27 09:05:10 +08:00
|
|
|
// We want the common base emitted into the preheader! This is just
|
|
|
|
// using cast as a copy so BitCast (no-op cast) is appropriate
|
|
|
|
CommonBaseV = new BitCastInst(CommonBaseV, CommonBaseV->getType(),
|
|
|
|
"commonbase", PreInsertPt);
|
2006-03-17 05:53:05 +08:00
|
|
|
}
|
2007-05-12 06:40:34 +08:00
|
|
|
DOUT << "\n";
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
|
2006-08-03 14:34:50 +08:00
|
|
|
// We want to emit code for users inside the loop first. To do this, we
|
|
|
|
// rearrange BasedUser so that the entries at the end have
|
|
|
|
// isUseOfPostIncrementedValue = false, because we pop off the end of the
|
|
|
|
// vector (so we handle them first).
|
|
|
|
std::partition(UsersToProcess.begin(), UsersToProcess.end(),
|
|
|
|
PartitionByIsUseOfPostIncrementedValue);
|
|
|
|
|
|
|
|
// Sort this by base, so that things with the same base are handled
|
|
|
|
// together. By partitioning first and stable-sorting later, we are
|
|
|
|
// guaranteed that within each base we will pop off users from within the
|
|
|
|
// loop before users outside of the loop with a particular base.
|
|
|
|
//
|
|
|
|
// We would like to use stable_sort here, but we can't. The problem is that
|
|
|
|
// SCEVHandles don't have a deterministic ordering w.r.t. each other, so
|
|
|
|
// we don't have anything to do a '<' comparison on. Because we think the
|
|
|
|
// number of uses is small, do a horrible bubble sort which just relies on
|
|
|
|
// ==.
|
|
|
|
for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
|
|
|
|
// Get a base value.
|
|
|
|
SCEVHandle Base = UsersToProcess[i].Base;
|
|
|
|
|
2007-10-31 06:27:26 +08:00
|
|
|
// Compact everything with this base to be consecutive with this one.
|
2006-08-03 14:34:50 +08:00
|
|
|
for (unsigned j = i+1; j != e; ++j) {
|
|
|
|
if (UsersToProcess[j].Base == Base) {
|
|
|
|
std::swap(UsersToProcess[i+1], UsersToProcess[j]);
|
|
|
|
++i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Process all the users now. This outer loop handles all bases, the inner
|
|
|
|
// loop handles all users of a particular base.
|
2005-07-30 08:15:07 +08:00
|
|
|
while (!UsersToProcess.empty()) {
|
2005-10-12 02:30:57 +08:00
|
|
|
SCEVHandle Base = UsersToProcess.back().Base;
|
2005-08-04 07:30:08 +08:00
|
|
|
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
// Emit the code for Base into the preheader.
|
2007-06-15 22:38:12 +08:00
|
|
|
Value *BaseV = PreheaderRewriter.expandCodeFor(Base, PreInsertPt);
|
2007-05-12 06:40:34 +08:00
|
|
|
|
|
|
|
DOUT << " INSERTING code for BASE = " << *Base << ":";
|
|
|
|
if (BaseV->hasName())
|
|
|
|
DOUT << " Result value name = %" << BaseV->getNameStr();
|
|
|
|
DOUT << "\n";
|
|
|
|
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
// If BaseV is a constant other than 0, make sure that it gets inserted into
|
|
|
|
// the preheader, instead of being forward substituted into the uses. We do
|
2006-11-27 09:05:10 +08:00
|
|
|
// this by forcing a BitCast (noop cast) to be inserted into the preheader
|
|
|
|
// in this case.
|
2006-08-03 14:34:50 +08:00
|
|
|
if (Constant *C = dyn_cast<Constant>(BaseV)) {
|
2007-03-14 04:34:37 +08:00
|
|
|
if (!C->isNullValue() && !isTargetConstant(Base, ReplacedTy, TLI)) {
|
2006-11-27 09:05:10 +08:00
|
|
|
// We want this constant emitted into the preheader! This is just
|
|
|
|
// using cast as a copy so BitCast (no-op cast) is appropriate
|
|
|
|
BaseV = new BitCastInst(BaseV, BaseV->getType(), "preheaderinsert",
|
2008-04-15 02:26:16 +08:00
|
|
|
PreInsertPt);
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
}
|
2006-08-03 14:34:50 +08:00
|
|
|
}
|
|
|
|
|
2005-07-30 08:15:07 +08:00
|
|
|
// Emit the code to add the immediate offset to the Phi value, just before
|
Move from Stage 0 to Stage 1.
Only emit one PHI node for IV uses with identical bases and strides (after
moving foldable immediates to the load/store instruction).
This implements LoopStrengthReduce/dont_insert_redundant_ops.ll, allowing
us to generate this PPC code for test1:
or r30, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r30)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
instead of this code:
or r30, r3, r3
or r29, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r29)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8 ;; Two iv's with step of 8
addi r29, r29, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
llvm-svn: 22635
2005-08-04 06:51:21 +08:00
|
|
|
// the instructions that we identified as using this stride and base.
|
2005-10-12 02:30:57 +08:00
|
|
|
do {
|
2006-08-03 14:34:50 +08:00
|
|
|
// FIXME: Use emitted users to emit other users.
|
2005-10-12 02:30:57 +08:00
|
|
|
BasedUser &User = UsersToProcess.back();
|
Move from Stage 0 to Stage 1.
Only emit one PHI node for IV uses with identical bases and strides (after
moving foldable immediates to the load/store instruction).
This implements LoopStrengthReduce/dont_insert_redundant_ops.ll, allowing
us to generate this PPC code for test1:
or r30, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r30)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
instead of this code:
or r30, r3, r3
or r29, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r29)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8 ;; Two iv's with step of 8
addi r29, r29, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
llvm-svn: 22635
2005-08-04 06:51:21 +08:00
|
|
|
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
// If this instruction wants to use the post-incremented value, move it
|
|
|
|
// after the post-inc and use its value instead of the PHI.
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
Value *RewriteOp = NewPHI;
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
if (User.isUseOfPostIncrementedValue) {
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
RewriteOp = IncV;
|
_test:
li r2, 0
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r2, 1
stw r2, 0(r4)
blr
[zion ~/llvm]$ cat > ~/xx
Uses of IV's outside of the loop should use the post-incremented version
of the IV, not the preincremented version. This helps many loops (e.g. in sixtrack)
which used to generate code like this (this is the code from the
dont-hoist-simple-loop-constants.ll testcase):
_test:
li r2, 0 **** IV starts at 0
LBB_test_1: ; no_exit.2
or r5, r2, r2 **** Copy for loop exit
li r2, 0
stw r2, 0(r3)
addi r3, r3, 4
addi r2, r5, 1
addi r6, r5, 2 **** IV+2
cmpwi cr0, r6, 701
blt cr0, LBB_test_1 ; no_exit.2
LBB_test_2: ; loopexit.2.loopexit
addi r2, r5, 2 **** IV+2
stw r2, 0(r4)
blr
And now generate code like this:
_test:
li r2, 1 *** IV starts at 1
LBB_test_1: ; no_exit.2
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
addi r3, r3, 4
cmpwi cr0, r2, 701 *** IV.postinc + 0
blt cr0, LBB_test_1
LBB_test_2: ; loopexit.2.loopexit
stw r2, 0(r4) *** IV.postinc + 0
blr
llvm-svn: 23313
2005-09-12 14:04:47 +08:00
|
|
|
|
|
|
|
// If this user is in the loop, make sure it is the last thing in the
|
|
|
|
// loop to ensure it is dominated by the increment.
|
|
|
|
if (L->contains(User.Inst->getParent()))
|
|
|
|
User.Inst->moveBefore(LatchBlock->getTerminator());
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
}
|
2006-12-13 16:06:42 +08:00
|
|
|
if (RewriteOp->getType() != ReplacedTy) {
|
|
|
|
Instruction::CastOps opcode = Instruction::Trunc;
|
|
|
|
if (ReplacedTy->getPrimitiveSizeInBits() ==
|
|
|
|
RewriteOp->getType()->getPrimitiveSizeInBits())
|
|
|
|
opcode = Instruction::BitCast;
|
|
|
|
RewriteOp = SCEVExpander::InsertCastOfTo(opcode, RewriteOp, ReplacedTy);
|
|
|
|
}
|
2006-06-09 08:12:42 +08:00
|
|
|
|
2007-10-23 02:31:58 +08:00
|
|
|
SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp);
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
|
2008-05-16 07:26:57 +08:00
|
|
|
// If we had to insert new instructions for RewriteOp, we have to
|
|
|
|
// consider that they may not have been able to end up immediately
|
|
|
|
// next to RewriteOp, because non-PHI instructions may never precede
|
|
|
|
// PHI instructions in a block. In this case, remember where the last
|
2008-05-20 11:01:48 +08:00
|
|
|
// instruction was inserted so that if we're replacing a different
|
|
|
|
// PHI node, we can use the later point to expand the final
|
|
|
|
// RewriteExpr.
|
2008-05-16 07:26:57 +08:00
|
|
|
Instruction *NewBasePt = dyn_cast<Instruction>(RewriteOp);
|
|
|
|
if (RewriteOp == NewPHI) NewBasePt = 0;
|
|
|
|
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
// Clear the SCEVExpander's expression map so that we are guaranteed
|
|
|
|
// to have the code emitted where we expect it.
|
|
|
|
Rewriter.clear();
|
2006-03-17 05:53:05 +08:00
|
|
|
|
|
|
|
// If we are reusing the iv, then it must be multiplied by a constant
|
|
|
|
// factor to take advantage of the addressing mode scale component.
|
2006-03-18 16:03:12 +08:00
|
|
|
if (RewriteFactor != 0) {
|
2007-10-31 06:27:26 +08:00
|
|
|
RewriteExpr = SE->getMulExpr(SE->getIntegerSCEV(RewriteFactor,
|
|
|
|
RewriteExpr->getType()),
|
|
|
|
RewriteExpr);
|
2006-03-18 03:52:23 +08:00
|
|
|
|
|
|
|
// The common base is emitted in the loop preheader. But since we
|
|
|
|
// are reusing an IV, it has not been used to initialize the PHI node.
|
|
|
|
// Add it to the expression used to rewrite the uses.
|
|
|
|
if (!isa<ConstantInt>(CommonBaseV) ||
|
2007-03-03 07:51:25 +08:00
|
|
|
!cast<ConstantInt>(CommonBaseV)->isZero())
|
2007-10-23 02:31:58 +08:00
|
|
|
RewriteExpr = SE->getAddExpr(RewriteExpr,
|
|
|
|
SE->getUnknown(CommonBaseV));
|
2006-03-18 03:52:23 +08:00
|
|
|
}
|
2006-03-17 05:53:05 +08:00
|
|
|
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
// Now that we know what we need to do, insert code before User for the
|
|
|
|
// immediate and any loop-variant expressions.
|
2007-03-03 07:51:25 +08:00
|
|
|
if (!isa<ConstantInt>(BaseV) || !cast<ConstantInt>(BaseV)->isZero())
|
Implement: LoopStrengthReduce/share_ivs.ll
Two changes:
* Only insert one PHI node for each stride. Other values are live in
values. This cannot introduce higher register pressure than the
previous approach, and can take advantage of reg+reg addressing modes.
* Factor common base values out of uses before moving values from the
base to the immediate fields. This improves codegen by starting the
stride-specific PHI node out at a common place for each IV use.
As an example, we used to generate this for a loop in swim:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfd f0, 0(r8)
stfd f0, 0(r3)
lfd f0, 0(r6)
stfd f0, 0(r7)
lfd f0, 0(r2)
stfd f0, 0(r5)
addi r9, r9, 1
addi r2, r2, 8
addi r5, r5, 8
addi r6, r6, 8
addi r7, r7, 8
addi r8, r8, 8
addi r3, r3, 8
cmpw cr0, r9, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
now we emit:
.LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_2: ; no_exit.7.i
lfdx f0, r8, r2
stfdx f0, r9, r2
lfdx f0, r5, r2
stfdx f0, r7, r2
lfdx f0, r3, r2
stfdx f0, r6, r2
addi r10, r10, 1
addi r2, r2, 8
cmpw cr0, r10, r4
bgt .LBB_main_no_exit_2E_6_2E_i_no_exit_2E_7_2E_i_1
As another more dramatic example, we used to emit this:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfd f0, 8(r21)
lfd f4, 8(r3)
lfd f5, 8(r27)
lfd f6, 8(r22)
lfd f7, 8(r5)
lfd f8, 8(r6)
lfd f9, 8(r30)
lfd f10, 8(r11)
lfd f11, 8(r12)
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfd f0, 8(r4)
lfd f0, 8(r25)
lfd f5, 8(r26)
lfd f6, 8(r23)
lfd f9, 8(r28)
lfd f10, 8(r10)
lfd f12, 8(r9)
lfd f13, 8(r29)
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfd f0, 8(r24)
lfd f0, 8(r8)
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfd f0, 8(r2)
addi r20, r20, 1
addi r2, r2, 8
addi r8, r8, 8
addi r10, r10, 8
addi r12, r12, 8
addi r6, r6, 8
addi r29, r29, 8
addi r28, r28, 8
addi r26, r26, 8
addi r25, r25, 8
addi r24, r24, 8
addi r5, r5, 8
addi r23, r23, 8
addi r22, r22, 8
addi r3, r3, 8
addi r9, r9, 8
addi r11, r11, 8
addi r30, r30, 8
addi r27, r27, 8
addi r21, r21, 8
addi r4, r4, 8
cmpw cr0, r20, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
we now emit:
.LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_2: ; no_exit.1.i19
lfdx f0, r21, r20
lfdx f4, r3, r20
lfdx f5, r27, r20
lfdx f6, r22, r20
lfdx f7, r5, r20
lfdx f8, r6, r20
lfdx f9, r30, r20
lfdx f10, r11, r20
lfdx f11, r12, r20
fsub f10, f10, f11
fadd f5, f4, f5
fmul f5, f5, f1
fadd f6, f6, f7
fadd f6, f6, f8
fadd f6, f6, f9
fmadd f0, f5, f6, f0
fnmsub f0, f10, f2, f0
stfdx f0, r4, r20
lfdx f0, r25, r20
lfdx f5, r26, r20
lfdx f6, r23, r20
lfdx f9, r28, r20
lfdx f10, r10, r20
lfdx f12, r9, r20
lfdx f13, r29, r20
fsub f11, f13, f11
fadd f4, f4, f5
fmul f4, f4, f1
fadd f5, f6, f9
fadd f5, f5, f10
fadd f5, f5, f12
fnmsub f0, f4, f5, f0
fnmsub f0, f11, f3, f0
stfdx f0, r24, r20
lfdx f0, r8, r20
fsub f4, f7, f8
fsub f5, f12, f10
fnmsub f0, f5, f2, f0
fnmsub f0, f4, f3, f0
stfdx f0, r2, r20
addi r19, r19, 1
addi r20, r20, 8
cmpw cr0, r19, r7
bgt .LBB_main_L_90_no_exit_2E_0_2E_i16_no_exit_2E_1_2E_i19_1
llvm-svn: 22722
2005-08-09 08:18:09 +08:00
|
|
|
// Add BaseV to the PHI value if needed.
|
2007-10-23 02:31:58 +08:00
|
|
|
RewriteExpr = SE->getAddExpr(RewriteExpr, SE->getUnknown(BaseV));
|
2006-03-17 05:53:05 +08:00
|
|
|
|
2008-05-16 07:26:57 +08:00
|
|
|
User.RewriteInstructionToUseNewBase(RewriteExpr, NewBasePt,
|
|
|
|
Rewriter, L, this,
|
2007-10-31 07:45:15 +08:00
|
|
|
DeadInsts);
|
Move from Stage 0 to Stage 1.
Only emit one PHI node for IV uses with identical bases and strides (after
moving foldable immediates to the load/store instruction).
This implements LoopStrengthReduce/dont_insert_redundant_ops.ll, allowing
us to generate this PPC code for test1:
or r30, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r30)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
instead of this code:
or r30, r3, r3
or r29, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r29)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8 ;; Two iv's with step of 8
addi r29, r29, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
llvm-svn: 22635
2005-08-04 06:51:21 +08:00
|
|
|
|
2008-12-01 14:14:28 +08:00
|
|
|
// Mark old value we replaced as possibly dead, so that it is eliminated
|
Move from Stage 0 to Stage 1.
Only emit one PHI node for IV uses with identical bases and strides (after
moving foldable immediates to the load/store instruction).
This implements LoopStrengthReduce/dont_insert_redundant_ops.ll, allowing
us to generate this PPC code for test1:
or r30, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r30)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
instead of this code:
or r30, r3, r3
or r29, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r29)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8 ;; Two iv's with step of 8
addi r29, r29, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
llvm-svn: 22635
2005-08-04 06:51:21 +08:00
|
|
|
// if we just replaced the last use of that value.
|
2008-12-01 14:27:41 +08:00
|
|
|
DeadInsts.push_back(cast<Instruction>(User.OperandValToReplace));
|
Move from Stage 0 to Stage 1.
Only emit one PHI node for IV uses with identical bases and strides (after
moving foldable immediates to the load/store instruction).
This implements LoopStrengthReduce/dont_insert_redundant_ops.ll, allowing
us to generate this PPC code for test1:
or r30, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r30)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
instead of this code:
or r30, r3, r3
or r29, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r29)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8 ;; Two iv's with step of 8
addi r29, r29, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
llvm-svn: 22635
2005-08-04 06:51:21 +08:00
|
|
|
|
2005-10-12 02:30:57 +08:00
|
|
|
UsersToProcess.pop_back();
|
Move from Stage 0 to Stage 1.
Only emit one PHI node for IV uses with identical bases and strides (after
moving foldable immediates to the load/store instruction).
This implements LoopStrengthReduce/dont_insert_redundant_ops.ll, allowing
us to generate this PPC code for test1:
or r30, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r30)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
instead of this code:
or r30, r3, r3
or r29, r3, r3
.LBB_test1_1: ; Loop
li r2, 0
stw r2, 0(r29)
stw r2, 4(r30)
bl L_pred$stub
addi r30, r30, 8 ;; Two iv's with step of 8
addi r29, r29, 8
cmplwi cr0, r3, 0
bne .LBB_test1_1 ; Loop
llvm-svn: 22635
2005-08-04 06:51:21 +08:00
|
|
|
++NumReduced;
|
2005-10-12 02:30:57 +08:00
|
|
|
|
2006-08-03 14:34:50 +08:00
|
|
|
// If there are any more users to process with the same base, process them
|
|
|
|
// now. We sorted by base above, so we just have to check the last elt.
|
2005-10-12 02:30:57 +08:00
|
|
|
} while (!UsersToProcess.empty() && UsersToProcess.back().Base == Base);
|
2005-07-30 08:15:07 +08:00
|
|
|
// TODO: Next, find out which base index is the most common, pull it out.
|
|
|
|
}
|
|
|
|
|
|
|
|
// IMPORTANT TODO: Figure out how to partition the IV's with this stride, but
|
|
|
|
// different starting values, into different PHIs.
|
2004-10-19 05:08:22 +08:00
|
|
|
}
|
|
|
|
|
2008-08-14 04:31:11 +08:00
|
|
|
/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
|
2007-04-03 13:11:24 +08:00
|
|
|
/// set the IV user and stride information and return true, otherwise return
|
|
|
|
/// false.
|
2008-08-14 04:31:11 +08:00
|
|
|
bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
|
2007-04-03 13:11:24 +08:00
|
|
|
const SCEVHandle *&CondStride) {
|
|
|
|
for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e && !CondUse;
|
|
|
|
++Stride) {
|
|
|
|
std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
|
|
|
|
IVUsesByStride.find(StrideOrder[Stride]);
|
|
|
|
assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
|
|
|
|
|
|
|
|
for (std::vector<IVStrideUse>::iterator UI = SI->second.Users.begin(),
|
|
|
|
E = SI->second.Users.end(); UI != E; ++UI)
|
|
|
|
if (UI->User == Cond) {
|
|
|
|
// NOTE: we could handle setcc instructions with multiple uses here, but
|
|
|
|
// InstCombine does it as well for simple uses, it's not clear that it
|
|
|
|
// occurs enough in real life to handle.
|
|
|
|
CondUse = &*UI;
|
|
|
|
CondStride = &SI->first;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2007-10-25 17:11:16 +08:00
|
|
|
namespace {
|
|
|
|
// Constant strides come first which in turns are sorted by their absolute
|
|
|
|
// values. If absolute values are the same, then positive strides comes first.
|
|
|
|
// e.g.
|
|
|
|
// 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X
|
|
|
|
struct StrideCompare {
|
|
|
|
bool operator()(const SCEVHandle &LHS, const SCEVHandle &RHS) {
|
|
|
|
SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
|
|
|
|
SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
|
|
|
|
if (LHSC && RHSC) {
|
|
|
|
int64_t LV = LHSC->getValue()->getSExtValue();
|
|
|
|
int64_t RV = RHSC->getValue()->getSExtValue();
|
|
|
|
uint64_t ALV = (LV < 0) ? -LV : LV;
|
|
|
|
uint64_t ARV = (RV < 0) ? -RV : RV;
|
|
|
|
if (ALV == ARV)
|
|
|
|
return LV > RV;
|
|
|
|
else
|
|
|
|
return ALV < ARV;
|
|
|
|
}
|
|
|
|
return (LHSC && !RHSC);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
/// ChangeCompareStride - If a loop termination compare instruction is the
|
|
|
|
/// only use of its stride, and the comparison is against a constant value,
|
|
|
|
/// try to eliminate the stride by moving the compare instruction to another
|
|
|
|
/// stride and change its constant operand accordingly. e.g.
|
|
|
|
///
|
|
|
|
/// loop:
|
|
|
|
/// ...
|
|
|
|
/// v1 = v1 + 3
|
|
|
|
/// v2 = v2 + 1
|
|
|
|
/// if (v2 < 10) goto loop
|
|
|
|
/// =>
|
|
|
|
/// loop:
|
|
|
|
/// ...
|
|
|
|
/// v1 = v1 + 3
|
|
|
|
/// if (v1 < 30) goto loop
|
|
|
|
ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
|
2007-10-31 07:45:15 +08:00
|
|
|
IVStrideUse* &CondUse,
|
2007-10-25 17:11:16 +08:00
|
|
|
const SCEVHandle* &CondStride) {
|
|
|
|
if (StrideOrder.size() < 2 ||
|
|
|
|
IVUsesByStride[*CondStride].Users.size() != 1)
|
|
|
|
return Cond;
|
|
|
|
const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride);
|
|
|
|
if (!SC) return Cond;
|
|
|
|
ConstantInt *C = dyn_cast<ConstantInt>(Cond->getOperand(1));
|
|
|
|
if (!C) return Cond;
|
|
|
|
|
|
|
|
ICmpInst::Predicate Predicate = Cond->getPredicate();
|
|
|
|
int64_t CmpSSInt = SC->getValue()->getSExtValue();
|
|
|
|
int64_t CmpVal = C->getValue().getSExtValue();
|
2007-10-27 07:08:19 +08:00
|
|
|
unsigned BitWidth = C->getValue().getBitWidth();
|
|
|
|
uint64_t SignBit = 1ULL << (BitWidth-1);
|
|
|
|
const Type *CmpTy = C->getType();
|
|
|
|
const Type *NewCmpTy = NULL;
|
2007-10-30 06:07:18 +08:00
|
|
|
unsigned TyBits = CmpTy->getPrimitiveSizeInBits();
|
|
|
|
unsigned NewTyBits = 0;
|
2007-10-25 17:11:16 +08:00
|
|
|
int64_t NewCmpVal = CmpVal;
|
|
|
|
SCEVHandle *NewStride = NULL;
|
|
|
|
Value *NewIncV = NULL;
|
|
|
|
int64_t Scale = 1;
|
|
|
|
|
2008-08-13 10:05:14 +08:00
|
|
|
// Check stride constant and the comparision constant signs to detect
|
|
|
|
// overflow.
|
2008-09-10 04:54:34 +08:00
|
|
|
if ((CmpVal & SignBit) != (CmpSSInt & SignBit))
|
2008-08-13 10:05:14 +08:00
|
|
|
return Cond;
|
|
|
|
|
2007-10-25 17:11:16 +08:00
|
|
|
// Look for a suitable stride / iv as replacement.
|
|
|
|
std::stable_sort(StrideOrder.begin(), StrideOrder.end(), StrideCompare());
|
|
|
|
for (unsigned i = 0, e = StrideOrder.size(); i != e; ++i) {
|
|
|
|
std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
|
|
|
|
IVUsesByStride.find(StrideOrder[i]);
|
|
|
|
if (!isa<SCEVConstant>(SI->first))
|
|
|
|
continue;
|
|
|
|
int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
|
2007-10-27 07:08:19 +08:00
|
|
|
if (abs(SSInt) <= abs(CmpSSInt) || (SSInt % CmpSSInt) != 0)
|
2007-10-25 17:11:16 +08:00
|
|
|
continue;
|
|
|
|
|
2007-10-27 07:08:19 +08:00
|
|
|
Scale = SSInt / CmpSSInt;
|
|
|
|
NewCmpVal = CmpVal * Scale;
|
|
|
|
APInt Mul = APInt(BitWidth, NewCmpVal);
|
|
|
|
// Check for overflow.
|
|
|
|
if (Mul.getSExtValue() != NewCmpVal) {
|
|
|
|
NewCmpVal = CmpVal;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2007-10-25 17:11:16 +08:00
|
|
|
// Watch out for overflow.
|
2007-10-27 07:08:19 +08:00
|
|
|
if (ICmpInst::isSignedPredicate(Predicate) &&
|
|
|
|
(CmpVal & SignBit) != (NewCmpVal & SignBit))
|
2007-10-25 17:11:16 +08:00
|
|
|
NewCmpVal = CmpVal;
|
2007-10-27 07:08:19 +08:00
|
|
|
|
2007-10-25 17:11:16 +08:00
|
|
|
if (NewCmpVal != CmpVal) {
|
|
|
|
// Pick the best iv to use trying to avoid a cast.
|
|
|
|
NewIncV = NULL;
|
|
|
|
for (std::vector<IVStrideUse>::iterator UI = SI->second.Users.begin(),
|
|
|
|
E = SI->second.Users.end(); UI != E; ++UI) {
|
|
|
|
NewIncV = UI->OperandValToReplace;
|
|
|
|
if (NewIncV->getType() == CmpTy)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!NewIncV) {
|
|
|
|
NewCmpVal = CmpVal;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
NewCmpTy = NewIncV->getType();
|
2007-10-30 06:07:18 +08:00
|
|
|
NewTyBits = isa<PointerType>(NewCmpTy)
|
|
|
|
? UIntPtrTy->getPrimitiveSizeInBits()
|
|
|
|
: NewCmpTy->getPrimitiveSizeInBits();
|
|
|
|
if (RequiresTypeConversion(NewCmpTy, CmpTy)) {
|
2008-06-12 05:38:51 +08:00
|
|
|
// Check if it is possible to rewrite it using
|
|
|
|
// an iv / stride of a smaller integer type.
|
2007-10-30 06:07:18 +08:00
|
|
|
bool TruncOk = false;
|
|
|
|
if (NewCmpTy->isInteger()) {
|
|
|
|
unsigned Bits = NewTyBits;
|
|
|
|
if (ICmpInst::isSignedPredicate(Predicate))
|
|
|
|
--Bits;
|
|
|
|
uint64_t Mask = (1ULL << Bits) - 1;
|
|
|
|
if (((uint64_t)NewCmpVal & Mask) == (uint64_t)NewCmpVal)
|
|
|
|
TruncOk = true;
|
|
|
|
}
|
|
|
|
if (!TruncOk) {
|
|
|
|
NewCmpVal = CmpVal;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Don't rewrite if use offset is non-constant and the new type is
|
|
|
|
// of a different type.
|
|
|
|
// FIXME: too conservative?
|
|
|
|
if (NewTyBits != TyBits && !isa<SCEVConstant>(CondUse->Offset)) {
|
2007-10-26 06:45:20 +08:00
|
|
|
NewCmpVal = CmpVal;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool AllUsesAreAddresses = true;
|
|
|
|
std::vector<BasedUser> UsersToProcess;
|
|
|
|
SCEVHandle CommonExprs = CollectIVUsers(SI->first, SI->second, L,
|
|
|
|
AllUsesAreAddresses,
|
|
|
|
UsersToProcess);
|
|
|
|
// Avoid rewriting the compare instruction with an iv of new stride
|
|
|
|
// if it's likely the new stride uses will be rewritten using the
|
|
|
|
if (AllUsesAreAddresses &&
|
2008-06-19 00:23:07 +08:00
|
|
|
ValidStride(!CommonExprs->isZero(), Scale, UsersToProcess)) {
|
2007-10-25 17:11:16 +08:00
|
|
|
NewCmpVal = CmpVal;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2008-08-07 02:04:43 +08:00
|
|
|
// If scale is negative, use swapped predicate unless it's testing
|
2007-10-25 17:11:16 +08:00
|
|
|
// for equality.
|
|
|
|
if (Scale < 0 && !Cond->isEquality())
|
2008-08-07 02:04:43 +08:00
|
|
|
Predicate = ICmpInst::getSwappedPredicate(Predicate);
|
2007-10-25 17:11:16 +08:00
|
|
|
|
|
|
|
NewStride = &StrideOrder[i];
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-06-17 06:34:15 +08:00
|
|
|
// Forgo this transformation if it the increment happens to be
|
|
|
|
// unfortunately positioned after the condition, and the condition
|
|
|
|
// has multiple uses which prevent it from being moved immediately
|
|
|
|
// before the branch. See
|
|
|
|
// test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-*.ll
|
|
|
|
// for an example of this situation.
|
2008-08-13 10:05:14 +08:00
|
|
|
if (!Cond->hasOneUse()) {
|
2008-06-17 06:34:15 +08:00
|
|
|
for (BasicBlock::iterator I = Cond, E = Cond->getParent()->end();
|
|
|
|
I != E; ++I)
|
|
|
|
if (I == NewIncV)
|
|
|
|
return Cond;
|
2008-08-13 10:05:14 +08:00
|
|
|
}
|
2008-06-17 06:34:15 +08:00
|
|
|
|
2007-10-25 17:11:16 +08:00
|
|
|
if (NewCmpVal != CmpVal) {
|
|
|
|
// Create a new compare instruction using new stride / iv.
|
|
|
|
ICmpInst *OldCond = Cond;
|
2007-10-30 06:07:18 +08:00
|
|
|
Value *RHS;
|
|
|
|
if (!isa<PointerType>(NewCmpTy))
|
|
|
|
RHS = ConstantInt::get(NewCmpTy, NewCmpVal);
|
|
|
|
else {
|
|
|
|
RHS = ConstantInt::get(UIntPtrTy, NewCmpVal);
|
|
|
|
RHS = SCEVExpander::InsertCastOfTo(Instruction::IntToPtr, RHS, NewCmpTy);
|
2007-10-25 17:11:16 +08:00
|
|
|
}
|
2007-10-27 07:08:19 +08:00
|
|
|
// Insert new compare instruction.
|
2008-06-14 05:43:41 +08:00
|
|
|
Cond = new ICmpInst(Predicate, NewIncV, RHS,
|
|
|
|
L->getHeader()->getName() + ".termcond",
|
|
|
|
OldCond);
|
2007-10-27 07:08:19 +08:00
|
|
|
|
|
|
|
// Remove the old compare instruction. The old indvar is probably dead too.
|
2008-12-01 14:27:41 +08:00
|
|
|
DeadInsts.push_back(cast<Instruction>(CondUse->OperandValToReplace));
|
2007-10-27 07:08:19 +08:00
|
|
|
SE->deleteValueFromRecords(OldCond);
|
2008-05-21 08:54:12 +08:00
|
|
|
OldCond->replaceAllUsesWith(Cond);
|
2007-10-25 17:11:16 +08:00
|
|
|
OldCond->eraseFromParent();
|
2007-10-27 07:08:19 +08:00
|
|
|
|
2007-10-25 17:11:16 +08:00
|
|
|
IVUsesByStride[*CondStride].Users.pop_back();
|
2007-10-30 06:07:18 +08:00
|
|
|
SCEVHandle NewOffset = TyBits == NewTyBits
|
|
|
|
? SE->getMulExpr(CondUse->Offset,
|
|
|
|
SE->getConstant(ConstantInt::get(CmpTy, Scale)))
|
|
|
|
: SE->getConstant(ConstantInt::get(NewCmpTy,
|
|
|
|
cast<SCEVConstant>(CondUse->Offset)->getValue()->getSExtValue()*Scale));
|
2007-10-25 17:11:16 +08:00
|
|
|
IVUsesByStride[*NewStride].addUser(NewOffset, Cond, NewIncV);
|
|
|
|
CondUse = &IVUsesByStride[*NewStride].Users.back();
|
|
|
|
CondStride = NewStride;
|
|
|
|
++NumEliminated;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Cond;
|
|
|
|
}
|
|
|
|
|
2008-09-16 05:22:06 +08:00
|
|
|
/// OptimizeSMax - Rewrite the loop's terminating condition if it uses
|
|
|
|
/// an smax computation.
|
|
|
|
///
|
|
|
|
/// This is a narrow solution to a specific, but acute, problem. For loops
|
|
|
|
/// like this:
|
|
|
|
///
|
|
|
|
/// i = 0;
|
|
|
|
/// do {
|
|
|
|
/// p[i] = 0.0;
|
|
|
|
/// } while (++i < n);
|
|
|
|
///
|
|
|
|
/// where the comparison is signed, the trip count isn't just 'n', because
|
|
|
|
/// 'n' could be negative. And unfortunately this can come up even for loops
|
|
|
|
/// where the user didn't use a C do-while loop. For example, seemingly
|
|
|
|
/// well-behaved top-test loops will commonly be lowered like this:
|
|
|
|
//
|
|
|
|
/// if (n > 0) {
|
|
|
|
/// i = 0;
|
|
|
|
/// do {
|
|
|
|
/// p[i] = 0.0;
|
|
|
|
/// } while (++i < n);
|
|
|
|
/// }
|
|
|
|
///
|
|
|
|
/// and then it's possible for subsequent optimization to obscure the if
|
|
|
|
/// test in such a way that indvars can't find it.
|
|
|
|
///
|
|
|
|
/// When indvars can't find the if test in loops like this, it creates a
|
|
|
|
/// signed-max expression, which allows it to give the loop a canonical
|
|
|
|
/// induction variable:
|
|
|
|
///
|
|
|
|
/// i = 0;
|
|
|
|
/// smax = n < 1 ? 1 : n;
|
|
|
|
/// do {
|
|
|
|
/// p[i] = 0.0;
|
|
|
|
/// } while (++i != smax);
|
|
|
|
///
|
|
|
|
/// Canonical induction variables are necessary because the loop passes
|
|
|
|
/// are designed around them. The most obvious example of this is the
|
|
|
|
/// LoopInfo analysis, which doesn't remember trip count values. It
|
|
|
|
/// expects to be able to rediscover the trip count each time it is
|
|
|
|
/// needed, and it does this using a simple analyis that only succeeds if
|
|
|
|
/// the loop has a canonical induction variable.
|
|
|
|
///
|
|
|
|
/// However, when it comes time to generate code, the maximum operation
|
|
|
|
/// can be quite costly, especially if it's inside of an outer loop.
|
|
|
|
///
|
|
|
|
/// This function solves this problem by detecting this type of loop and
|
|
|
|
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
|
|
|
|
/// the instructions for the maximum computation.
|
|
|
|
///
|
|
|
|
ICmpInst *LoopStrengthReduce::OptimizeSMax(Loop *L, ICmpInst *Cond,
|
|
|
|
IVStrideUse* &CondUse) {
|
|
|
|
// Check that the loop matches the pattern we're looking for.
|
|
|
|
if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
|
|
|
|
Cond->getPredicate() != CmpInst::ICMP_NE)
|
|
|
|
return Cond;
|
|
|
|
|
|
|
|
SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
|
|
|
|
if (!Sel || !Sel->hasOneUse()) return Cond;
|
|
|
|
|
|
|
|
SCEVHandle IterationCount = SE->getIterationCount(L);
|
|
|
|
if (isa<SCEVCouldNotCompute>(IterationCount))
|
|
|
|
return Cond;
|
|
|
|
SCEVHandle One = SE->getIntegerSCEV(1, IterationCount->getType());
|
|
|
|
|
|
|
|
// Adjust for an annoying getIterationCount quirk.
|
|
|
|
IterationCount = SE->getAddExpr(IterationCount, One);
|
|
|
|
|
|
|
|
// Check for a max calculation that matches the pattern.
|
|
|
|
SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(IterationCount);
|
|
|
|
if (!SMax || SMax != SE->getSCEV(Sel)) return Cond;
|
|
|
|
|
|
|
|
SCEVHandle SMaxLHS = SMax->getOperand(0);
|
|
|
|
SCEVHandle SMaxRHS = SMax->getOperand(1);
|
|
|
|
if (!SMaxLHS || SMaxLHS != One) return Cond;
|
|
|
|
|
|
|
|
// Check the relevant induction variable for conformance to
|
|
|
|
// the pattern.
|
|
|
|
SCEVHandle IV = SE->getSCEV(Cond->getOperand(0));
|
|
|
|
SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
|
|
|
|
if (!AR || !AR->isAffine() ||
|
|
|
|
AR->getStart() != One ||
|
|
|
|
AR->getStepRecurrence(*SE) != One)
|
|
|
|
return Cond;
|
|
|
|
|
|
|
|
// Check the right operand of the select, and remember it, as it will
|
|
|
|
// be used in the new comparison instruction.
|
|
|
|
Value *NewRHS = 0;
|
|
|
|
if (SE->getSCEV(Sel->getOperand(1)) == SMaxRHS)
|
|
|
|
NewRHS = Sel->getOperand(1);
|
|
|
|
else if (SE->getSCEV(Sel->getOperand(2)) == SMaxRHS)
|
|
|
|
NewRHS = Sel->getOperand(2);
|
|
|
|
if (!NewRHS) return Cond;
|
|
|
|
|
|
|
|
// Ok, everything looks ok to change the condition into an SLT or SGE and
|
|
|
|
// delete the max calculation.
|
|
|
|
ICmpInst *NewCond =
|
|
|
|
new ICmpInst(Cond->getPredicate() == CmpInst::ICMP_NE ?
|
|
|
|
CmpInst::ICMP_SLT :
|
|
|
|
CmpInst::ICMP_SGE,
|
|
|
|
Cond->getOperand(0), NewRHS, "scmp", Cond);
|
|
|
|
|
|
|
|
// Delete the max calculation instructions.
|
2008-10-01 10:02:03 +08:00
|
|
|
SE->deleteValueFromRecords(Cond);
|
2008-09-16 05:22:06 +08:00
|
|
|
Cond->replaceAllUsesWith(NewCond);
|
|
|
|
Cond->eraseFromParent();
|
|
|
|
Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
|
|
|
|
SE->deleteValueFromRecords(Sel);
|
2008-10-01 10:02:03 +08:00
|
|
|
Sel->eraseFromParent();
|
2008-09-16 05:22:06 +08:00
|
|
|
if (Cmp->use_empty()) {
|
|
|
|
SE->deleteValueFromRecords(Cmp);
|
2008-10-01 10:02:03 +08:00
|
|
|
Cmp->eraseFromParent();
|
2008-09-16 05:22:06 +08:00
|
|
|
}
|
|
|
|
CondUse->User = NewCond;
|
|
|
|
return NewCond;
|
|
|
|
}
|
|
|
|
|
2008-08-27 01:57:54 +08:00
|
|
|
/// OptimizeShadowIV - If IV is used in a int-to-float cast
|
|
|
|
/// inside the loop then try to eliminate the cast opeation.
|
|
|
|
void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
|
|
|
|
|
|
|
|
SCEVHandle IterationCount = SE->getIterationCount(L);
|
|
|
|
if (isa<SCEVCouldNotCompute>(IterationCount))
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e;
|
|
|
|
++Stride) {
|
|
|
|
std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
|
|
|
|
IVUsesByStride.find(StrideOrder[Stride]);
|
|
|
|
assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
|
|
|
|
if (!isa<SCEVConstant>(SI->first))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
for (std::vector<IVStrideUse>::iterator UI = SI->second.Users.begin(),
|
|
|
|
E = SI->second.Users.end(); UI != E; /* empty */) {
|
|
|
|
std::vector<IVStrideUse>::iterator CandidateUI = UI;
|
2008-08-28 01:50:18 +08:00
|
|
|
++UI;
|
2008-08-27 01:57:54 +08:00
|
|
|
Instruction *ShadowUse = CandidateUI->User;
|
|
|
|
const Type *DestTy = NULL;
|
|
|
|
|
|
|
|
/* If shadow use is a int->float cast then insert a second IV
|
2008-08-28 01:50:18 +08:00
|
|
|
to eliminate this cast.
|
2008-08-27 01:57:54 +08:00
|
|
|
|
|
|
|
for (unsigned i = 0; i < n; ++i)
|
|
|
|
foo((double)i);
|
|
|
|
|
2008-08-28 01:50:18 +08:00
|
|
|
is transformed into
|
2008-08-27 01:57:54 +08:00
|
|
|
|
|
|
|
double d = 0.0;
|
|
|
|
for (unsigned i = 0; i < n; ++i, ++d)
|
|
|
|
foo(d);
|
|
|
|
*/
|
2008-08-28 01:50:18 +08:00
|
|
|
if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->User))
|
2008-08-27 01:57:54 +08:00
|
|
|
DestTy = UCast->getDestTy();
|
2008-08-28 01:50:18 +08:00
|
|
|
else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->User))
|
2008-08-27 01:57:54 +08:00
|
|
|
DestTy = SCast->getDestTy();
|
2008-08-28 04:55:23 +08:00
|
|
|
if (!DestTy) continue;
|
|
|
|
|
|
|
|
if (TLI) {
|
|
|
|
/* If target does not support DestTy natively then do not apply
|
|
|
|
this transformation. */
|
|
|
|
MVT DVT = TLI->getValueType(DestTy);
|
|
|
|
if (!TLI->isTypeLegal(DVT)) continue;
|
|
|
|
}
|
|
|
|
|
2008-08-27 01:57:54 +08:00
|
|
|
PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
|
|
|
|
if (!PH) continue;
|
|
|
|
if (PH->getNumIncomingValues() != 2) continue;
|
|
|
|
|
|
|
|
const Type *SrcTy = PH->getType();
|
|
|
|
int Mantissa = DestTy->getFPMantissaWidth();
|
|
|
|
if (Mantissa == -1) continue;
|
|
|
|
if ((int)TD->getTypeSizeInBits(SrcTy) > Mantissa)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
unsigned Entry, Latch;
|
|
|
|
if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
|
|
|
|
Entry = 0;
|
|
|
|
Latch = 1;
|
|
|
|
} else {
|
|
|
|
Entry = 1;
|
|
|
|
Latch = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
|
|
|
|
if (!Init) continue;
|
|
|
|
ConstantFP *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
|
|
|
|
|
|
|
|
BinaryOperator *Incr =
|
|
|
|
dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
|
|
|
|
if (!Incr) continue;
|
|
|
|
if (Incr->getOpcode() != Instruction::Add
|
|
|
|
&& Incr->getOpcode() != Instruction::Sub)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Initialize new IV, double d = 0.0 in above example. */
|
|
|
|
ConstantInt *C = NULL;
|
|
|
|
if (Incr->getOperand(0) == PH)
|
|
|
|
C = dyn_cast<ConstantInt>(Incr->getOperand(1));
|
|
|
|
else if (Incr->getOperand(1) == PH)
|
|
|
|
C = dyn_cast<ConstantInt>(Incr->getOperand(0));
|
|
|
|
else
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!C) continue;
|
|
|
|
|
|
|
|
/* Add new PHINode. */
|
|
|
|
PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);
|
|
|
|
|
2008-08-28 01:50:18 +08:00
|
|
|
/* create new increment. '++d' in above example. */
|
2008-08-27 01:57:54 +08:00
|
|
|
ConstantFP *CFP = ConstantFP::get(DestTy, C->getZExtValue());
|
|
|
|
BinaryOperator *NewIncr =
|
|
|
|
BinaryOperator::Create(Incr->getOpcode(),
|
|
|
|
NewPH, CFP, "IV.S.next.", Incr);
|
|
|
|
|
|
|
|
NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
|
|
|
|
NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
|
|
|
|
|
|
|
|
/* Remove cast operation */
|
|
|
|
SE->deleteValueFromRecords(ShadowUse);
|
|
|
|
ShadowUse->replaceAllUsesWith(NewPH);
|
|
|
|
ShadowUse->eraseFromParent();
|
|
|
|
SI->second.Users.erase(CandidateUI);
|
|
|
|
NumShadow++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
// OptimizeIndvars - Now that IVUsesByStride is set up with all of the indvar
|
|
|
|
// uses in the loop, look to see if we can eliminate some, in favor of using
|
|
|
|
// common indvars for the different uses.
|
|
|
|
void LoopStrengthReduce::OptimizeIndvars(Loop *L) {
|
|
|
|
// TODO: implement optzns here.
|
|
|
|
|
2008-08-27 01:57:54 +08:00
|
|
|
OptimizeShadowIV(L);
|
|
|
|
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
// Finally, get the terminating condition for the loop if possible. If we
|
|
|
|
// can, we want to change it to use a post-incremented version of its
|
2006-03-24 15:14:34 +08:00
|
|
|
// induction variable, to allow coalescing the live ranges for the IV into
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
// one register value.
|
|
|
|
PHINode *SomePHI = cast<PHINode>(L->getHeader()->begin());
|
|
|
|
BasicBlock *Preheader = L->getLoopPreheader();
|
|
|
|
BasicBlock *LatchBlock =
|
|
|
|
SomePHI->getIncomingBlock(SomePHI->getIncomingBlock(0) == Preheader);
|
|
|
|
BranchInst *TermBr = dyn_cast<BranchInst>(LatchBlock->getTerminator());
|
2006-12-23 14:05:41 +08:00
|
|
|
if (!TermBr || TermBr->isUnconditional() ||
|
|
|
|
!isa<ICmpInst>(TermBr->getCondition()))
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
return;
|
2006-12-23 14:05:41 +08:00
|
|
|
ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
|
|
|
|
// Search IVUsesByStride to find Cond's IVUse if there is one.
|
|
|
|
IVStrideUse *CondUse = 0;
|
Teach LSR to strength reduce IVs that have a loop-invariant but non-constant stride.
For code like this:
void foo(float *a, float *b, int n, int stride_a, int stride_b) {
int i;
for (i=0; i<n; i++)
a[i*stride_a] = b[i*stride_b];
}
we now emit:
.LBB_foo2_2: ; no_exit
lfs f0, 0(r4)
stfs f0, 0(r3)
addi r7, r7, 1
add r4, r2, r4
add r3, r6, r3
cmpw cr0, r7, r5
blt .LBB_foo2_2 ; no_exit
instead of:
.LBB_foo_2: ; no_exit
mullw r8, r2, r7 ;; multiply!
slwi r8, r8, 2
lfsx f0, r4, r8
mullw r8, r2, r6 ;; multiply!
slwi r8, r8, 2
stfsx f0, r3, r8
addi r2, r2, 1
cmpw cr0, r2, r5
blt .LBB_foo_2 ; no_exit
loops with variable strides occur pretty often. For example, in SPECFP2K
there are 317 variable strides in 177.mesa, 3 in 179.art, 14 in 188.ammp,
56 in 168.wupwise, 36 in 172.mgrid.
Now we can allow indvars to turn functions written like this:
void foo2(float *a, float *b, int n, int stride_a, int stride_b) {
int i, ai = 0, bi = 0;
for (i=0; i<n; i++)
{
a[ai] = b[bi];
ai += stride_a;
bi += stride_b;
}
}
into code like the above for better analysis. With this patch, they generate
identical code.
llvm-svn: 22740
2005-08-10 08:45:21 +08:00
|
|
|
const SCEVHandle *CondStride = 0;
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
|
2008-08-14 04:31:11 +08:00
|
|
|
if (!FindIVUserForCond(Cond, CondUse, CondStride))
|
2007-04-03 13:11:24 +08:00
|
|
|
return; // setcc doesn't use the IV.
|
2007-10-25 17:11:16 +08:00
|
|
|
|
2008-09-16 05:22:06 +08:00
|
|
|
// If the trip count is computed in terms of an smax (due to ScalarEvolution
|
|
|
|
// being unable to find a sufficient guard, for example), change the loop
|
|
|
|
// comparison to use SLT instead of NE.
|
|
|
|
Cond = OptimizeSMax(L, Cond, CondUse);
|
|
|
|
|
2007-10-25 17:11:16 +08:00
|
|
|
// If possible, change stride and operands of the compare instruction to
|
|
|
|
// eliminate one stride.
|
|
|
|
Cond = ChangeCompareStride(L, Cond, CondUse, CondStride);
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
|
|
|
|
// It's possible for the setcc instruction to be anywhere in the loop, and
|
|
|
|
// possible for it to have multiple users. If it is not immediately before
|
|
|
|
// the latch block branch, move it.
|
|
|
|
if (&*++BasicBlock::iterator(Cond) != (Instruction*)TermBr) {
|
|
|
|
if (Cond->hasOneUse()) { // Condition has a single use, just move it.
|
|
|
|
Cond->moveBefore(TermBr);
|
|
|
|
} else {
|
|
|
|
// Otherwise, clone the terminating condition and insert into the loopend.
|
2006-12-23 14:05:41 +08:00
|
|
|
Cond = cast<ICmpInst>(Cond->clone());
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
Cond->setName(L->getHeader()->getName() + ".termcond");
|
|
|
|
LatchBlock->getInstList().insert(TermBr, Cond);
|
|
|
|
|
|
|
|
// Clone the IVUse, as the old use still exists!
|
Teach LSR to strength reduce IVs that have a loop-invariant but non-constant stride.
For code like this:
void foo(float *a, float *b, int n, int stride_a, int stride_b) {
int i;
for (i=0; i<n; i++)
a[i*stride_a] = b[i*stride_b];
}
we now emit:
.LBB_foo2_2: ; no_exit
lfs f0, 0(r4)
stfs f0, 0(r3)
addi r7, r7, 1
add r4, r2, r4
add r3, r6, r3
cmpw cr0, r7, r5
blt .LBB_foo2_2 ; no_exit
instead of:
.LBB_foo_2: ; no_exit
mullw r8, r2, r7 ;; multiply!
slwi r8, r8, 2
lfsx f0, r4, r8
mullw r8, r2, r6 ;; multiply!
slwi r8, r8, 2
stfsx f0, r3, r8
addi r2, r2, 1
cmpw cr0, r2, r5
blt .LBB_foo_2 ; no_exit
loops with variable strides occur pretty often. For example, in SPECFP2K
there are 317 variable strides in 177.mesa, 3 in 179.art, 14 in 188.ammp,
56 in 168.wupwise, 36 in 172.mgrid.
Now we can allow indvars to turn functions written like this:
void foo2(float *a, float *b, int n, int stride_a, int stride_b) {
int i, ai = 0, bi = 0;
for (i=0; i<n; i++)
{
a[ai] = b[bi];
ai += stride_a;
bi += stride_b;
}
}
into code like the above for better analysis. With this patch, they generate
identical code.
llvm-svn: 22740
2005-08-10 08:45:21 +08:00
|
|
|
IVUsesByStride[*CondStride].addUser(CondUse->Offset, Cond,
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
CondUse->OperandValToReplace);
|
Teach LSR to strength reduce IVs that have a loop-invariant but non-constant stride.
For code like this:
void foo(float *a, float *b, int n, int stride_a, int stride_b) {
int i;
for (i=0; i<n; i++)
a[i*stride_a] = b[i*stride_b];
}
we now emit:
.LBB_foo2_2: ; no_exit
lfs f0, 0(r4)
stfs f0, 0(r3)
addi r7, r7, 1
add r4, r2, r4
add r3, r6, r3
cmpw cr0, r7, r5
blt .LBB_foo2_2 ; no_exit
instead of:
.LBB_foo_2: ; no_exit
mullw r8, r2, r7 ;; multiply!
slwi r8, r8, 2
lfsx f0, r4, r8
mullw r8, r2, r6 ;; multiply!
slwi r8, r8, 2
stfsx f0, r3, r8
addi r2, r2, 1
cmpw cr0, r2, r5
blt .LBB_foo_2 ; no_exit
loops with variable strides occur pretty often. For example, in SPECFP2K
there are 317 variable strides in 177.mesa, 3 in 179.art, 14 in 188.ammp,
56 in 168.wupwise, 36 in 172.mgrid.
Now we can allow indvars to turn functions written like this:
void foo2(float *a, float *b, int n, int stride_a, int stride_b) {
int i, ai = 0, bi = 0;
for (i=0; i<n; i++)
{
a[ai] = b[bi];
ai += stride_a;
bi += stride_b;
}
}
into code like the above for better analysis. With this patch, they generate
identical code.
llvm-svn: 22740
2005-08-10 08:45:21 +08:00
|
|
|
CondUse = &IVUsesByStride[*CondStride].Users.back();
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we get to here, we know that we can transform the setcc instruction to
|
2006-03-24 15:14:34 +08:00
|
|
|
// use the post-incremented version of the IV, allowing us to coalesce the
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
// live ranges for the IV correctly.
|
2007-10-23 02:31:58 +08:00
|
|
|
CondUse->Offset = SE->getMinusSCEV(CondUse->Offset, *CondStride);
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
CondUse->isUseOfPostIncrementedValue = true;
|
2008-07-08 03:51:32 +08:00
|
|
|
Changed = true;
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
}
|
2005-07-30 08:15:07 +08:00
|
|
|
|
2007-03-07 05:14:09 +08:00
|
|
|
bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
|
2004-10-19 05:08:22 +08:00
|
|
|
|
2007-03-07 05:14:09 +08:00
|
|
|
LI = &getAnalysis<LoopInfo>();
|
2007-06-08 05:42:15 +08:00
|
|
|
DT = &getAnalysis<DominatorTree>();
|
2007-03-07 05:14:09 +08:00
|
|
|
SE = &getAnalysis<ScalarEvolution>();
|
|
|
|
TD = &getAnalysis<TargetData>();
|
|
|
|
UIntPtrTy = TD->getIntPtrType();
|
2008-07-15 01:55:01 +08:00
|
|
|
Changed = false;
|
2007-03-07 05:14:09 +08:00
|
|
|
|
|
|
|
// Find all uses of induction variables in this loop, and categorize
|
2005-07-30 08:15:07 +08:00
|
|
|
// them by stride. Start by finding all of the PHI nodes in the header for
|
|
|
|
// this loop. If they are induction variables, inspect their uses.
|
2007-10-27 07:08:19 +08:00
|
|
|
SmallPtrSet<Instruction*,16> Processed; // Don't reprocess instructions.
|
2005-07-30 08:15:07 +08:00
|
|
|
for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
|
2005-08-05 01:40:30 +08:00
|
|
|
AddUsersIfInteresting(I, L, Processed);
|
2005-07-30 08:15:07 +08:00
|
|
|
|
2008-07-08 03:51:32 +08:00
|
|
|
if (!IVUsesByStride.empty()) {
|
|
|
|
// Optimize induction variables. Some indvar uses can be transformed to use
|
|
|
|
// strides that will be needed for other purposes. A common example of this
|
|
|
|
// is the exit test for the loop, which can often be rewritten to use the
|
|
|
|
// computation of some other indvar to decide when to terminate the loop.
|
|
|
|
OptimizeIndvars(L);
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
|
2008-07-08 03:51:32 +08:00
|
|
|
// FIXME: We can widen subreg IV's here for RISC targets. e.g. instead of
|
|
|
|
// doing computation in byte values, promote to 32-bit values if safe.
|
Implement a simple optimization for the termination condition of the loop.
The termination condition actually wants to use the post-incremented value
of the loop, not a new indvar with an unusual base.
On PPC, for example, this allows us to compile
LoopStrengthReduce/exit_compare_live_range.ll to:
_foo:
li r2, 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r2, r2, 1
cmpw cr0, r2, r4
bne .LBB_foo_1 ; no_exit
blr
instead of:
_foo:
li r2, 1 ;; IV starts at 1, not 0
.LBB_foo_1: ; no_exit
li r5, 0
stw r5, 0(r3)
addi r5, r2, 1
cmpw cr0, r2, r4
or r2, r5, r5 ;; Reg-reg copy, extra live range
bne .LBB_foo_1 ; no_exit
blr
This implements LoopStrengthReduce/exit_compare_live_range.ll
llvm-svn: 22699
2005-08-08 13:28:22 +08:00
|
|
|
|
2008-07-08 03:51:32 +08:00
|
|
|
// FIXME: Attempt to reuse values across multiple IV's. In particular, we
|
|
|
|
// could have something like "for(i) { foo(i*8); bar(i*16) }", which should
|
|
|
|
// be codegened as "for (j = 0;; j+=8) { foo(j); bar(j+j); }" on X86/PPC.
|
|
|
|
// Need to be careful that IV's are all the same type. Only works for
|
|
|
|
// intptr_t indvars.
|
2005-07-30 08:15:07 +08:00
|
|
|
|
2008-07-08 03:51:32 +08:00
|
|
|
// If we only have one stride, we can more aggressively eliminate some
|
|
|
|
// things.
|
|
|
|
bool HasOneStride = IVUsesByStride.size() == 1;
|
2006-03-17 05:53:05 +08:00
|
|
|
|
|
|
|
#ifndef NDEBUG
|
2008-07-08 03:51:32 +08:00
|
|
|
DOUT << "\nLSR on ";
|
|
|
|
DEBUG(L->dump());
|
2006-03-17 05:53:05 +08:00
|
|
|
#endif
|
|
|
|
|
2008-07-08 03:51:32 +08:00
|
|
|
// IVsByStride keeps IVs for one particular loop.
|
|
|
|
assert(IVsByStride.empty() && "Stale entries in IVsByStride?");
|
|
|
|
|
|
|
|
// Sort the StrideOrder so we process larger strides first.
|
|
|
|
std::stable_sort(StrideOrder.begin(), StrideOrder.end(), StrideCompare());
|
|
|
|
|
|
|
|
// Note: this processes each stride/type pair individually. All users
|
|
|
|
// passed into StrengthReduceStridedIVUsers have the same type AND stride.
|
|
|
|
// Also, note that we iterate over IVUsesByStride indirectly by using
|
|
|
|
// StrideOrder. This extra layer of indirection makes the ordering of
|
|
|
|
// strides deterministic - not dependent on map order.
|
|
|
|
for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; ++Stride) {
|
|
|
|
std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
|
|
|
|
IVUsesByStride.find(StrideOrder[Stride]);
|
|
|
|
assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
|
|
|
|
StrengthReduceStridedIVUsers(SI->first, SI->second, L, HasOneStride);
|
|
|
|
}
|
2005-10-09 14:20:55 +08:00
|
|
|
}
|
2004-10-19 05:08:22 +08:00
|
|
|
|
2008-05-21 08:54:12 +08:00
|
|
|
// We're done analyzing this loop; release all the state we built up for it.
|
|
|
|
CastedPointers.clear();
|
|
|
|
IVUsesByStride.clear();
|
|
|
|
IVsByStride.clear();
|
|
|
|
StrideOrder.clear();
|
|
|
|
|
2004-10-19 05:08:22 +08:00
|
|
|
// Clean up after ourselves
|
|
|
|
if (!DeadInsts.empty()) {
|
2008-12-01 14:14:28 +08:00
|
|
|
DeleteTriviallyDeadInstructions();
|
2004-10-19 05:08:22 +08:00
|
|
|
|
2005-07-30 08:15:07 +08:00
|
|
|
BasicBlock::iterator I = L->getHeader()->begin();
|
2008-06-23 04:44:02 +08:00
|
|
|
while (PHINode *PN = dyn_cast<PHINode>(I++)) {
|
|
|
|
// At this point, we know that we have killed one or more IV users.
|
2008-12-01 14:11:32 +08:00
|
|
|
// It is worth checking to see if the canonical indvar is also
|
2008-06-23 04:44:02 +08:00
|
|
|
// dead, so that we can remove it as well.
|
|
|
|
//
|
|
|
|
// We can remove a PHI if it is on a cycle in the def-use graph
|
|
|
|
// where each node in the cycle has degree one, i.e. only one use,
|
|
|
|
// and is an instruction with no side effects.
|
|
|
|
//
|
2005-07-30 08:15:07 +08:00
|
|
|
// FIXME: this needs to eliminate an induction variable even if it's being
|
|
|
|
// compared against some value to decide loop termination.
|
2008-11-28 07:00:20 +08:00
|
|
|
if (!PN->hasOneUse())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
SmallPtrSet<PHINode *, 4> PHIs;
|
|
|
|
for (Instruction *J = dyn_cast<Instruction>(*PN->use_begin());
|
|
|
|
J && J->hasOneUse() && !J->mayWriteToMemory();
|
|
|
|
J = dyn_cast<Instruction>(*J->use_begin())) {
|
|
|
|
// If we find the original PHI, we've discovered a cycle.
|
|
|
|
if (J == PN) {
|
|
|
|
// Break the cycle and mark the PHI for deletion.
|
|
|
|
SE->deleteValueFromRecords(PN);
|
|
|
|
PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
|
2008-12-01 14:27:41 +08:00
|
|
|
DeadInsts.push_back(PN);
|
2008-11-28 07:00:20 +08:00
|
|
|
Changed = true;
|
|
|
|
break;
|
2005-08-02 10:52:02 +08:00
|
|
|
}
|
2008-11-28 07:00:20 +08:00
|
|
|
// If we find a PHI more than once, we're on a cycle that
|
|
|
|
// won't prove fruitful.
|
|
|
|
if (isa<PHINode>(J) && !PHIs.insert(cast<PHINode>(J)))
|
|
|
|
break;
|
2005-07-30 08:15:07 +08:00
|
|
|
}
|
2004-10-19 05:08:22 +08:00
|
|
|
}
|
2008-12-01 14:14:28 +08:00
|
|
|
DeleteTriviallyDeadInstructions();
|
2004-10-19 05:08:22 +08:00
|
|
|
}
|
2008-07-08 03:51:32 +08:00
|
|
|
return Changed;
|
2004-10-19 05:08:22 +08:00
|
|
|
}
|