2005-01-07 15:49:41 +08:00
|
|
|
//===-- X86ISelPattern.cpp - A pattern matching inst selector for X86 -----===//
|
2005-01-11 06:10:13 +08:00
|
|
|
//
|
2005-01-07 15:49:41 +08:00
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file was developed by the LLVM research group and is distributed under
|
|
|
|
// the University of Illinois Open Source License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This file defines a pattern matching instruction selector for X86.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "X86.h"
|
|
|
|
#include "X86InstrBuilder.h"
|
|
|
|
#include "X86RegisterInfo.h"
|
2005-01-10 02:52:44 +08:00
|
|
|
#include "llvm/Constants.h" // FIXME: REMOVE
|
2005-01-07 15:49:41 +08:00
|
|
|
#include "llvm/Function.h"
|
2005-01-10 02:52:44 +08:00
|
|
|
#include "llvm/CodeGen/MachineConstantPool.h" // FIXME: REMOVE
|
2005-01-07 15:49:41 +08:00
|
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
|
|
#include "llvm/CodeGen/SelectionDAG.h"
|
|
|
|
#include "llvm/CodeGen/SelectionDAGISel.h"
|
|
|
|
#include "llvm/CodeGen/SSARegMap.h"
|
|
|
|
#include "llvm/Target/TargetData.h"
|
|
|
|
#include "llvm/Target/TargetLowering.h"
|
|
|
|
#include "llvm/Support/MathExtras.h"
|
|
|
|
#include "llvm/ADT/Statistic.h"
|
|
|
|
#include <set>
|
2005-01-12 12:29:05 +08:00
|
|
|
#include <algorithm>
|
2005-01-07 15:49:41 +08:00
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// X86TargetLowering - X86 Implementation of the TargetLowering interface
|
|
|
|
namespace {
|
|
|
|
class X86TargetLowering : public TargetLowering {
|
|
|
|
int VarArgsFrameIndex; // FrameIndex for start of varargs area.
|
2005-01-09 08:01:27 +08:00
|
|
|
int ReturnAddrIndex; // FrameIndex for return slot.
|
2005-01-07 15:49:41 +08:00
|
|
|
public:
|
|
|
|
X86TargetLowering(TargetMachine &TM) : TargetLowering(TM) {
|
|
|
|
// Set up the TargetLowering object.
|
2005-01-17 08:00:33 +08:00
|
|
|
|
|
|
|
// X86 is wierd, it always uses i8 for shift amounts and setcc results.
|
|
|
|
setShiftAmountType(MVT::i8);
|
|
|
|
setSetCCResultType(MVT::i8);
|
2005-01-19 11:36:30 +08:00
|
|
|
setShiftAmountFlavor(Mask); // shl X, 32 == shl X, 0
|
2005-01-17 08:00:33 +08:00
|
|
|
|
|
|
|
// Set up the register classes.
|
2005-01-07 15:49:41 +08:00
|
|
|
addRegisterClass(MVT::i8, X86::R8RegisterClass);
|
|
|
|
addRegisterClass(MVT::i16, X86::R16RegisterClass);
|
|
|
|
addRegisterClass(MVT::i32, X86::R32RegisterClass);
|
|
|
|
addRegisterClass(MVT::f64, X86::RFPRegisterClass);
|
|
|
|
|
|
|
|
// FIXME: Eliminate these two classes when legalize can handle promotions
|
|
|
|
// well.
|
2005-01-16 15:34:08 +08:00
|
|
|
/**/ addRegisterClass(MVT::i1, X86::R8RegisterClass);
|
|
|
|
|
|
|
|
setOperationAction(ISD::MEMMOVE , MVT::Other, Expand);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Expand);
|
|
|
|
setOperationAction(ISD::ZERO_EXTEND_INREG, MVT::i16 , Expand);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
|
|
|
|
setOperationAction(ISD::ZERO_EXTEND_INREG, MVT::i1 , Expand);
|
|
|
|
setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
|
|
|
|
setOperationAction(ISD::SEXTLOAD , MVT::i1 , Expand);
|
|
|
|
setOperationAction(ISD::SREM , MVT::f64 , Expand);
|
|
|
|
|
|
|
|
// These should be promoted to a larger select which is supported.
|
|
|
|
/**/ setOperationAction(ISD::SELECT , MVT::i1 , Promote);
|
|
|
|
setOperationAction(ISD::SELECT , MVT::i8 , Promote);
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
computeRegisterProperties();
|
|
|
|
|
|
|
|
addLegalFPImmediate(+0.0); // FLD0
|
|
|
|
addLegalFPImmediate(+1.0); // FLD1
|
|
|
|
addLegalFPImmediate(-0.0); // FLD0/FCHS
|
|
|
|
addLegalFPImmediate(-1.0); // FLD1/FCHS
|
|
|
|
}
|
|
|
|
|
|
|
|
/// LowerArguments - This hook must be implemented to indicate how we should
|
|
|
|
/// lower the arguments for the specified function, into the specified DAG.
|
|
|
|
virtual std::vector<SDOperand>
|
|
|
|
LowerArguments(Function &F, SelectionDAG &DAG);
|
|
|
|
|
|
|
|
/// LowerCallTo - This hook lowers an abstract call to a function into an
|
|
|
|
/// actual call.
|
2005-01-09 03:28:19 +08:00
|
|
|
virtual std::pair<SDOperand, SDOperand>
|
|
|
|
LowerCallTo(SDOperand Chain, const Type *RetTy, SDOperand Callee,
|
|
|
|
ArgListTy &Args, SelectionDAG &DAG);
|
2005-01-09 08:01:27 +08:00
|
|
|
|
|
|
|
virtual std::pair<SDOperand, SDOperand>
|
|
|
|
LowerVAStart(SDOperand Chain, SelectionDAG &DAG);
|
|
|
|
|
|
|
|
virtual std::pair<SDOperand,SDOperand>
|
|
|
|
LowerVAArgNext(bool isVANext, SDOperand Chain, SDOperand VAList,
|
|
|
|
const Type *ArgTy, SelectionDAG &DAG);
|
|
|
|
|
|
|
|
virtual std::pair<SDOperand, SDOperand>
|
|
|
|
LowerFrameReturnAddress(bool isFrameAddr, SDOperand Chain, unsigned Depth,
|
|
|
|
SelectionDAG &DAG);
|
2005-01-07 15:49:41 +08:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<SDOperand>
|
|
|
|
X86TargetLowering::LowerArguments(Function &F, SelectionDAG &DAG) {
|
|
|
|
std::vector<SDOperand> ArgValues;
|
|
|
|
|
|
|
|
// Add DAG nodes to load the arguments... On entry to a function on the X86,
|
|
|
|
// the stack frame looks like this:
|
|
|
|
//
|
|
|
|
// [ESP] -- return address
|
|
|
|
// [ESP + 4] -- first argument (leftmost lexically)
|
|
|
|
// [ESP + 8] -- second argument, if first argument is four bytes in size
|
|
|
|
// ...
|
|
|
|
//
|
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
|
|
MachineFrameInfo *MFI = MF.getFrameInfo();
|
|
|
|
|
|
|
|
unsigned ArgOffset = 0; // Frame mechanisms handle retaddr slot
|
|
|
|
for (Function::aiterator I = F.abegin(), E = F.aend(); I != E; ++I) {
|
|
|
|
MVT::ValueType ObjectVT = getValueType(I->getType());
|
|
|
|
unsigned ArgIncrement = 4;
|
|
|
|
unsigned ObjSize;
|
|
|
|
switch (ObjectVT) {
|
|
|
|
default: assert(0 && "Unhandled argument type!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: ObjSize = 1; break;
|
|
|
|
case MVT::i16: ObjSize = 2; break;
|
|
|
|
case MVT::i32: ObjSize = 4; break;
|
|
|
|
case MVT::i64: ObjSize = ArgIncrement = 8; break;
|
|
|
|
case MVT::f32: ObjSize = 4; break;
|
|
|
|
case MVT::f64: ObjSize = ArgIncrement = 8; break;
|
|
|
|
}
|
|
|
|
// Create the frame index object for this incoming parameter...
|
|
|
|
int FI = MFI->CreateFixedObject(ObjSize, ArgOffset);
|
|
|
|
|
|
|
|
// Create the SelectionDAG nodes corresponding to a load from this parameter
|
|
|
|
SDOperand FIN = DAG.getFrameIndex(FI, MVT::i32);
|
|
|
|
|
|
|
|
// Don't codegen dead arguments. FIXME: remove this check when we can nuke
|
|
|
|
// dead loads.
|
|
|
|
SDOperand ArgValue;
|
|
|
|
if (!I->use_empty())
|
|
|
|
ArgValue = DAG.getLoad(ObjectVT, DAG.getEntryNode(), FIN);
|
|
|
|
else {
|
|
|
|
if (MVT::isInteger(ObjectVT))
|
|
|
|
ArgValue = DAG.getConstant(0, ObjectVT);
|
|
|
|
else
|
|
|
|
ArgValue = DAG.getConstantFP(0, ObjectVT);
|
|
|
|
}
|
|
|
|
ArgValues.push_back(ArgValue);
|
|
|
|
|
|
|
|
ArgOffset += ArgIncrement; // Move on to the next argument...
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the function takes variable number of arguments, make a frame index for
|
|
|
|
// the start of the first vararg value... for expansion of llvm.va_start.
|
|
|
|
if (F.isVarArg())
|
|
|
|
VarArgsFrameIndex = MFI->CreateFixedObject(1, ArgOffset);
|
2005-01-09 08:01:27 +08:00
|
|
|
ReturnAddrIndex = 0; // No return address slot generated yet.
|
2005-01-07 15:49:41 +08:00
|
|
|
return ArgValues;
|
|
|
|
}
|
|
|
|
|
2005-01-09 03:28:19 +08:00
|
|
|
std::pair<SDOperand, SDOperand>
|
|
|
|
X86TargetLowering::LowerCallTo(SDOperand Chain,
|
|
|
|
const Type *RetTy, SDOperand Callee,
|
|
|
|
ArgListTy &Args, SelectionDAG &DAG) {
|
2005-01-07 15:49:41 +08:00
|
|
|
// Count how many bytes are to be pushed on the stack.
|
|
|
|
unsigned NumBytes = 0;
|
|
|
|
|
|
|
|
if (Args.empty()) {
|
|
|
|
// Save zero bytes.
|
2005-01-09 03:28:19 +08:00
|
|
|
Chain = DAG.getNode(ISD::ADJCALLSTACKDOWN, MVT::Other, Chain,
|
|
|
|
DAG.getConstant(0, getPointerTy()));
|
2005-01-07 15:49:41 +08:00
|
|
|
} else {
|
|
|
|
for (unsigned i = 0, e = Args.size(); i != e; ++i)
|
|
|
|
switch (getValueType(Args[i].second)) {
|
|
|
|
default: assert(0 && "Unknown value type!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8:
|
|
|
|
case MVT::i16:
|
|
|
|
case MVT::i32:
|
|
|
|
case MVT::f32:
|
|
|
|
NumBytes += 4;
|
|
|
|
break;
|
|
|
|
case MVT::i64:
|
|
|
|
case MVT::f64:
|
|
|
|
NumBytes += 8;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2005-01-09 03:28:19 +08:00
|
|
|
Chain = DAG.getNode(ISD::ADJCALLSTACKDOWN, MVT::Other, Chain,
|
|
|
|
DAG.getConstant(NumBytes, getPointerTy()));
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
// Arguments go on the stack in reverse order, as specified by the ABI.
|
|
|
|
unsigned ArgOffset = 0;
|
2005-01-15 06:37:41 +08:00
|
|
|
SDOperand StackPtr = DAG.getCopyFromReg(X86::ESP, MVT::i32,
|
|
|
|
DAG.getEntryNode());
|
2005-01-22 03:46:38 +08:00
|
|
|
std::vector<SDOperand> Stores;
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
|
|
|
|
unsigned ArgReg;
|
|
|
|
SDOperand PtrOff = DAG.getConstant(ArgOffset, getPointerTy());
|
|
|
|
PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
|
|
|
|
|
|
|
|
switch (getValueType(Args[i].second)) {
|
|
|
|
default: assert(0 && "Unexpected ValueType for argument!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8:
|
|
|
|
case MVT::i16:
|
|
|
|
// Promote the integer to 32 bits. If the input type is signed use a
|
|
|
|
// sign extend, otherwise use a zero extend.
|
|
|
|
if (Args[i].second->isSigned())
|
|
|
|
Args[i].first =DAG.getNode(ISD::SIGN_EXTEND, MVT::i32, Args[i].first);
|
|
|
|
else
|
|
|
|
Args[i].first =DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Args[i].first);
|
|
|
|
|
|
|
|
// FALL THROUGH
|
|
|
|
case MVT::i32:
|
|
|
|
case MVT::f32:
|
2005-01-22 03:46:38 +08:00
|
|
|
Stores.push_back(DAG.getNode(ISD::STORE, MVT::Other, Chain,
|
|
|
|
Args[i].first, PtrOff));
|
2005-01-07 15:49:41 +08:00
|
|
|
ArgOffset += 4;
|
|
|
|
break;
|
|
|
|
case MVT::i64:
|
|
|
|
case MVT::f64:
|
2005-01-22 03:46:38 +08:00
|
|
|
Stores.push_back(DAG.getNode(ISD::STORE, MVT::Other, Chain,
|
|
|
|
Args[i].first, PtrOff));
|
2005-01-07 15:49:41 +08:00
|
|
|
ArgOffset += 8;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2005-01-22 03:46:38 +08:00
|
|
|
Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, Stores);
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<MVT::ValueType> RetVals;
|
|
|
|
MVT::ValueType RetTyVT = getValueType(RetTy);
|
|
|
|
if (RetTyVT != MVT::isVoid)
|
|
|
|
RetVals.push_back(RetTyVT);
|
|
|
|
RetVals.push_back(MVT::Other);
|
|
|
|
|
2005-01-09 03:28:19 +08:00
|
|
|
SDOperand TheCall = SDOperand(DAG.getCall(RetVals, Chain, Callee), 0);
|
2005-01-09 04:51:36 +08:00
|
|
|
Chain = TheCall.getValue(RetTyVT != MVT::isVoid);
|
2005-01-09 03:28:19 +08:00
|
|
|
Chain = DAG.getNode(ISD::ADJCALLSTACKUP, MVT::Other, Chain,
|
|
|
|
DAG.getConstant(NumBytes, getPointerTy()));
|
|
|
|
return std::make_pair(TheCall, Chain);
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
|
|
|
|
2005-01-09 08:01:27 +08:00
|
|
|
std::pair<SDOperand, SDOperand>
|
|
|
|
X86TargetLowering::LowerVAStart(SDOperand Chain, SelectionDAG &DAG) {
|
|
|
|
// vastart just returns the address of the VarArgsFrameIndex slot.
|
|
|
|
return std::make_pair(DAG.getFrameIndex(VarArgsFrameIndex, MVT::i32), Chain);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::pair<SDOperand,SDOperand> X86TargetLowering::
|
|
|
|
LowerVAArgNext(bool isVANext, SDOperand Chain, SDOperand VAList,
|
|
|
|
const Type *ArgTy, SelectionDAG &DAG) {
|
|
|
|
MVT::ValueType ArgVT = getValueType(ArgTy);
|
|
|
|
SDOperand Result;
|
|
|
|
if (!isVANext) {
|
|
|
|
Result = DAG.getLoad(ArgVT, DAG.getEntryNode(), VAList);
|
|
|
|
} else {
|
|
|
|
unsigned Amt;
|
|
|
|
if (ArgVT == MVT::i32)
|
|
|
|
Amt = 4;
|
|
|
|
else {
|
|
|
|
assert((ArgVT == MVT::i64 || ArgVT == MVT::f64) &&
|
|
|
|
"Other types should have been promoted for varargs!");
|
|
|
|
Amt = 8;
|
|
|
|
}
|
|
|
|
Result = DAG.getNode(ISD::ADD, VAList.getValueType(), VAList,
|
|
|
|
DAG.getConstant(Amt, VAList.getValueType()));
|
|
|
|
}
|
|
|
|
return std::make_pair(Result, Chain);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::pair<SDOperand, SDOperand> X86TargetLowering::
|
|
|
|
LowerFrameReturnAddress(bool isFrameAddress, SDOperand Chain, unsigned Depth,
|
|
|
|
SelectionDAG &DAG) {
|
|
|
|
SDOperand Result;
|
|
|
|
if (Depth) // Depths > 0 not supported yet!
|
|
|
|
Result = DAG.getConstant(0, getPointerTy());
|
|
|
|
else {
|
|
|
|
if (ReturnAddrIndex == 0) {
|
|
|
|
// Set up a frame object for the return address.
|
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
|
|
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(4, -4);
|
|
|
|
}
|
|
|
|
|
|
|
|
SDOperand RetAddrFI = DAG.getFrameIndex(ReturnAddrIndex, MVT::i32);
|
|
|
|
|
|
|
|
if (!isFrameAddress)
|
|
|
|
// Just load the return address
|
|
|
|
Result = DAG.getLoad(MVT::i32, DAG.getEntryNode(), RetAddrFI);
|
|
|
|
else
|
|
|
|
Result = DAG.getNode(ISD::SUB, MVT::i32, RetAddrFI,
|
|
|
|
DAG.getConstant(4, MVT::i32));
|
|
|
|
}
|
|
|
|
return std::make_pair(Result, Chain);
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
namespace {
|
|
|
|
/// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
|
|
|
|
/// SDOperand's instead of register numbers for the leaves of the matched
|
|
|
|
/// tree.
|
|
|
|
struct X86ISelAddressMode {
|
|
|
|
enum {
|
|
|
|
RegBase,
|
|
|
|
FrameIndexBase,
|
|
|
|
} BaseType;
|
|
|
|
|
|
|
|
struct { // This is really a union, discriminated by BaseType!
|
|
|
|
SDOperand Reg;
|
|
|
|
int FrameIndex;
|
|
|
|
} Base;
|
|
|
|
|
|
|
|
unsigned Scale;
|
|
|
|
SDOperand IndexReg;
|
|
|
|
unsigned Disp;
|
|
|
|
GlobalValue *GV;
|
|
|
|
|
|
|
|
X86ISelAddressMode()
|
|
|
|
: BaseType(RegBase), Scale(1), IndexReg(), Disp(), GV(0) {
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
Statistic<>
|
|
|
|
NumFPKill("x86-codegen", "Number of FP_REG_KILL instructions added");
|
|
|
|
|
|
|
|
//===--------------------------------------------------------------------===//
|
|
|
|
/// ISel - X86 specific code to select X86 machine instructions for
|
|
|
|
/// SelectionDAG operations.
|
|
|
|
///
|
|
|
|
class ISel : public SelectionDAGISel {
|
|
|
|
/// ContainsFPCode - Every instruction we select that uses or defines a FP
|
|
|
|
/// register should set this to true.
|
|
|
|
bool ContainsFPCode;
|
|
|
|
|
|
|
|
/// X86Lowering - This object fully describes how to lower LLVM code to an
|
|
|
|
/// X86-specific SelectionDAG.
|
|
|
|
X86TargetLowering X86Lowering;
|
|
|
|
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
/// RegPressureMap - This keeps an approximate count of the number of
|
|
|
|
/// registers required to evaluate each node in the graph.
|
|
|
|
std::map<SDNode*, unsigned> RegPressureMap;
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
/// ExprMap - As shared expressions are codegen'd, we keep track of which
|
|
|
|
/// vreg the value is produced in, so we only emit one copy of each compiled
|
|
|
|
/// tree.
|
|
|
|
std::map<SDOperand, unsigned> ExprMap;
|
|
|
|
|
|
|
|
public:
|
|
|
|
ISel(TargetMachine &TM) : SelectionDAGISel(X86Lowering), X86Lowering(TM) {
|
|
|
|
}
|
|
|
|
|
2005-01-22 05:35:14 +08:00
|
|
|
virtual const char *getPassName() const {
|
|
|
|
return "X86 Pattern Instruction Selection";
|
|
|
|
}
|
|
|
|
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
unsigned getRegPressure(SDOperand O) {
|
|
|
|
return RegPressureMap[O.Val];
|
|
|
|
}
|
|
|
|
unsigned ComputeRegPressure(SDOperand O);
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
/// InstructionSelectBasicBlock - This callback is invoked by
|
|
|
|
/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
|
2005-01-12 12:21:28 +08:00
|
|
|
virtual void InstructionSelectBasicBlock(SelectionDAG &DAG);
|
2005-01-07 15:49:41 +08:00
|
|
|
|
2005-01-26 04:03:11 +08:00
|
|
|
bool isFoldableLoad(SDOperand Op, SDOperand OtherOp,
|
|
|
|
bool FloatPromoteOk = false);
|
2005-01-12 05:19:59 +08:00
|
|
|
void EmitFoldedLoad(SDOperand Op, X86AddressMode &AM);
|
2005-01-18 03:25:26 +08:00
|
|
|
bool TryToFoldLoadOpStore(SDNode *Node);
|
2005-01-12 05:19:59 +08:00
|
|
|
|
2005-01-19 15:37:26 +08:00
|
|
|
bool EmitOrOpOp(SDOperand Op1, SDOperand Op2, unsigned DestReg);
|
2005-01-17 09:34:14 +08:00
|
|
|
void EmitCMP(SDOperand LHS, SDOperand RHS, bool isOnlyUse);
|
2005-01-11 12:06:27 +08:00
|
|
|
bool EmitBranchCC(MachineBasicBlock *Dest, SDOperand Chain, SDOperand Cond);
|
2005-01-11 06:10:13 +08:00
|
|
|
void EmitSelectCC(SDOperand Cond, MVT::ValueType SVT,
|
|
|
|
unsigned RTrue, unsigned RFalse, unsigned RDest);
|
2005-01-07 15:49:41 +08:00
|
|
|
unsigned SelectExpr(SDOperand N);
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
|
|
|
|
X86AddressMode SelectAddrExprs(const X86ISelAddressMode &IAM);
|
|
|
|
bool MatchAddress(SDOperand N, X86ISelAddressMode &AM);
|
|
|
|
void SelectAddress(SDOperand N, X86AddressMode &AM);
|
2005-01-07 15:49:41 +08:00
|
|
|
void Select(SDOperand N);
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2005-01-12 12:21:28 +08:00
|
|
|
/// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
|
|
|
|
/// when it has created a SelectionDAG for us to codegen.
|
|
|
|
void ISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
|
|
|
|
// While we're doing this, keep track of whether we see any FP code for
|
|
|
|
// FP_REG_KILL insertion.
|
|
|
|
ContainsFPCode = false;
|
|
|
|
|
|
|
|
// Scan the PHI nodes that already are inserted into this basic block. If any
|
|
|
|
// of them is a PHI of a floating point value, we need to insert an
|
|
|
|
// FP_REG_KILL.
|
|
|
|
SSARegMap *RegMap = BB->getParent()->getSSARegMap();
|
|
|
|
for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
|
|
|
|
I != E; ++I) {
|
|
|
|
assert(I->getOpcode() == X86::PHI &&
|
|
|
|
"Isn't just PHI nodes?");
|
|
|
|
if (RegMap->getRegClass(I->getOperand(0).getReg()) ==
|
|
|
|
X86::RFPRegisterClass) {
|
|
|
|
ContainsFPCode = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compute the RegPressureMap, which is an approximation for the number of
|
|
|
|
// registers required to compute each node.
|
|
|
|
ComputeRegPressure(DAG.getRoot());
|
|
|
|
|
|
|
|
// Codegen the basic block.
|
|
|
|
Select(DAG.getRoot());
|
|
|
|
|
|
|
|
// Finally, look at all of the successors of this block. If any contain a PHI
|
|
|
|
// node of FP type, we need to insert an FP_REG_KILL in this block.
|
|
|
|
for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
|
|
|
|
E = BB->succ_end(); SI != E && !ContainsFPCode; ++SI)
|
|
|
|
for (MachineBasicBlock::iterator I = (*SI)->begin(), E = (*SI)->end();
|
|
|
|
I != E && I->getOpcode() == X86::PHI; ++I) {
|
|
|
|
if (RegMap->getRegClass(I->getOperand(0).getReg()) ==
|
|
|
|
X86::RFPRegisterClass) {
|
|
|
|
ContainsFPCode = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Insert FP_REG_KILL instructions into basic blocks that need them. This
|
|
|
|
// only occurs due to the floating point stackifier not being aggressive
|
|
|
|
// enough to handle arbitrary global stackification.
|
|
|
|
//
|
|
|
|
// Currently we insert an FP_REG_KILL instruction into each block that uses or
|
|
|
|
// defines a floating point virtual register.
|
|
|
|
//
|
|
|
|
// When the global register allocators (like linear scan) finally update live
|
|
|
|
// variable analysis, we can keep floating point values in registers across
|
|
|
|
// basic blocks. This will be a huge win, but we are waiting on the global
|
|
|
|
// allocators before we can do this.
|
|
|
|
//
|
|
|
|
if (ContainsFPCode && BB->succ_size()) {
|
|
|
|
BuildMI(*BB, BB->getFirstTerminator(), X86::FP_REG_KILL, 0);
|
|
|
|
++NumFPKill;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Clear state used for selection.
|
|
|
|
ExprMap.clear();
|
|
|
|
RegPressureMap.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
// ComputeRegPressure - Compute the RegPressureMap, which is an approximation
|
|
|
|
// for the number of registers required to compute each node. This is basically
|
|
|
|
// computing a generalized form of the Sethi-Ullman number for each node.
|
|
|
|
unsigned ISel::ComputeRegPressure(SDOperand O) {
|
|
|
|
SDNode *N = O.Val;
|
|
|
|
unsigned &Result = RegPressureMap[N];
|
|
|
|
if (Result) return Result;
|
|
|
|
|
2005-01-11 11:37:59 +08:00
|
|
|
// FIXME: Should operations like CALL (which clobber lots o regs) have a
|
|
|
|
// higher fixed cost??
|
|
|
|
|
2005-01-12 06:29:12 +08:00
|
|
|
if (N->getNumOperands() == 0) {
|
|
|
|
Result = 1;
|
|
|
|
} else {
|
|
|
|
unsigned MaxRegUse = 0;
|
|
|
|
unsigned NumExtraMaxRegUsers = 0;
|
|
|
|
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
|
|
|
|
unsigned Regs;
|
|
|
|
if (N->getOperand(i).getOpcode() == ISD::Constant)
|
|
|
|
Regs = 0;
|
|
|
|
else
|
|
|
|
Regs = ComputeRegPressure(N->getOperand(i));
|
|
|
|
if (Regs > MaxRegUse) {
|
|
|
|
MaxRegUse = Regs;
|
|
|
|
NumExtraMaxRegUsers = 0;
|
|
|
|
} else if (Regs == MaxRegUse &&
|
|
|
|
N->getOperand(i).getValueType() != MVT::Other) {
|
|
|
|
++NumExtraMaxRegUsers;
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
}
|
2005-01-18 06:56:09 +08:00
|
|
|
|
|
|
|
if (O.getOpcode() != ISD::TokenFactor)
|
|
|
|
Result = MaxRegUse+NumExtraMaxRegUsers;
|
|
|
|
else
|
2005-01-18 07:02:13 +08:00
|
|
|
Result = MaxRegUse == 1 ? 0 : MaxRegUse-1;
|
2005-01-12 06:29:12 +08:00
|
|
|
}
|
2005-01-12 10:19:06 +08:00
|
|
|
|
Comment out debug code :)
Select [mem] += Val operations. For constants, we used to get:
mov %ECX, -32768
add %ECX, DWORD PTR [l4_match_start]
mov DWORD PTR [l4_match_start], %ECX
Now we get:
add DWORD PTR [l4_match_start], -32768
For other values we used to get:
mov %EBP, %EDI ;; because the add destroys the value
add %EBP, DWORD PTR [l4_input_len]
mov DWORD PTR [l4_input_len], %EBP
now we get:
add DWORD PTR [l4_input_len], %EDI
Both of these use less registers than the alternative, are faster and smaller.
llvm-svn: 19488
2005-01-12 07:21:30 +08:00
|
|
|
//std::cerr << " WEIGHT: " << Result << " "; N->dump(); std::cerr << "\n";
|
2005-01-12 06:29:12 +08:00
|
|
|
return Result;
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
}
|
|
|
|
|
2005-01-21 00:50:16 +08:00
|
|
|
/// NodeTransitivelyUsesValue - Return true if N or any of its uses uses Op.
|
|
|
|
/// The DAG cannot have cycles in it, by definition, so the visited set is not
|
|
|
|
/// needed to prevent infinite loops. The DAG CAN, however, have unbounded
|
|
|
|
/// reuse, so it prevents exponential cases.
|
|
|
|
///
|
|
|
|
static bool NodeTransitivelyUsesValue(SDOperand N, SDOperand Op,
|
|
|
|
std::set<SDNode*> &Visited) {
|
|
|
|
if (N == Op) return true; // Found it.
|
|
|
|
SDNode *Node = N.Val;
|
2005-01-22 05:43:02 +08:00
|
|
|
if (Node->getNumOperands() == 0 || // Leaf?
|
|
|
|
Node->getNodeDepth() <= Op.getNodeDepth()) return false; // Can't find it?
|
2005-01-21 00:50:16 +08:00
|
|
|
if (!Visited.insert(Node).second) return false; // Already visited?
|
|
|
|
|
|
|
|
// Recurse for the first N-1 operands.
|
|
|
|
for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
|
|
|
|
if (NodeTransitivelyUsesValue(Node->getOperand(i), Op, Visited))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Tail recurse for the last operand.
|
|
|
|
return NodeTransitivelyUsesValue(Node->getOperand(0), Op, Visited);
|
|
|
|
}
|
|
|
|
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
X86AddressMode ISel::SelectAddrExprs(const X86ISelAddressMode &IAM) {
|
|
|
|
X86AddressMode Result;
|
|
|
|
|
|
|
|
// If we need to emit two register operands, emit the one with the highest
|
|
|
|
// register pressure first.
|
|
|
|
if (IAM.BaseType == X86ISelAddressMode::RegBase &&
|
|
|
|
IAM.Base.Reg.Val && IAM.IndexReg.Val) {
|
2005-01-21 00:50:16 +08:00
|
|
|
bool EmitBaseThenIndex;
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
if (getRegPressure(IAM.Base.Reg) > getRegPressure(IAM.IndexReg)) {
|
2005-01-21 00:50:16 +08:00
|
|
|
std::set<SDNode*> Visited;
|
|
|
|
EmitBaseThenIndex = true;
|
|
|
|
// If Base ends up pointing to Index, we must emit index first. This is
|
|
|
|
// because of the way we fold loads, we may end up doing bad things with
|
|
|
|
// the folded add.
|
|
|
|
if (NodeTransitivelyUsesValue(IAM.Base.Reg, IAM.IndexReg, Visited))
|
|
|
|
EmitBaseThenIndex = false;
|
|
|
|
} else {
|
|
|
|
std::set<SDNode*> Visited;
|
|
|
|
EmitBaseThenIndex = false;
|
|
|
|
// If Base ends up pointing to Index, we must emit index first. This is
|
|
|
|
// because of the way we fold loads, we may end up doing bad things with
|
|
|
|
// the folded add.
|
|
|
|
if (NodeTransitivelyUsesValue(IAM.IndexReg, IAM.Base.Reg, Visited))
|
|
|
|
EmitBaseThenIndex = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (EmitBaseThenIndex) {
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
Result.Base.Reg = SelectExpr(IAM.Base.Reg);
|
|
|
|
Result.IndexReg = SelectExpr(IAM.IndexReg);
|
|
|
|
} else {
|
|
|
|
Result.IndexReg = SelectExpr(IAM.IndexReg);
|
|
|
|
Result.Base.Reg = SelectExpr(IAM.Base.Reg);
|
|
|
|
}
|
2005-01-21 00:50:16 +08:00
|
|
|
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
} else if (IAM.BaseType == X86ISelAddressMode::RegBase && IAM.Base.Reg.Val) {
|
|
|
|
Result.Base.Reg = SelectExpr(IAM.Base.Reg);
|
|
|
|
} else if (IAM.IndexReg.Val) {
|
|
|
|
Result.IndexReg = SelectExpr(IAM.IndexReg);
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (IAM.BaseType) {
|
|
|
|
case X86ISelAddressMode::RegBase:
|
|
|
|
Result.BaseType = X86AddressMode::RegBase;
|
|
|
|
break;
|
|
|
|
case X86ISelAddressMode::FrameIndexBase:
|
|
|
|
Result.BaseType = X86AddressMode::FrameIndexBase;
|
|
|
|
Result.Base.FrameIndex = IAM.Base.FrameIndex;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(0 && "Unknown base type!");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
Result.Scale = IAM.Scale;
|
|
|
|
Result.Disp = IAM.Disp;
|
|
|
|
Result.GV = IAM.GV;
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// SelectAddress - Pattern match the maximal addressing mode for this node and
|
|
|
|
/// emit all of the leaf registers.
|
|
|
|
void ISel::SelectAddress(SDOperand N, X86AddressMode &AM) {
|
|
|
|
X86ISelAddressMode IAM;
|
|
|
|
MatchAddress(N, IAM);
|
|
|
|
AM = SelectAddrExprs(IAM);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// MatchAddress - Add the specified node to the specified addressing mode,
|
|
|
|
/// returning true if it cannot be done. This just pattern matches for the
|
|
|
|
/// addressing mode, it does not cause any code to be emitted. For that, use
|
|
|
|
/// SelectAddress.
|
|
|
|
bool ISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM) {
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getOpcode()) {
|
|
|
|
default: break;
|
|
|
|
case ISD::FrameIndex:
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base.Reg.Val == 0) {
|
|
|
|
AM.BaseType = X86ISelAddressMode::FrameIndexBase;
|
2005-01-07 15:49:41 +08:00
|
|
|
AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case ISD::GlobalAddress:
|
|
|
|
if (AM.GV == 0) {
|
|
|
|
AM.GV = cast<GlobalAddressSDNode>(N)->getGlobal();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case ISD::Constant:
|
|
|
|
AM.Disp += cast<ConstantSDNode>(N)->getValue();
|
|
|
|
return false;
|
|
|
|
case ISD::SHL:
|
2005-01-13 13:53:16 +08:00
|
|
|
// We might have folded the load into this shift, so don't regen the value
|
|
|
|
// if so.
|
|
|
|
if (ExprMap.count(N)) break;
|
|
|
|
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
if (AM.IndexReg.Val == 0 && AM.Scale == 1)
|
2005-01-07 15:49:41 +08:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1))) {
|
|
|
|
unsigned Val = CN->getValue();
|
|
|
|
if (Val == 1 || Val == 2 || Val == 3) {
|
|
|
|
AM.Scale = 1 << Val;
|
2005-01-11 14:36:20 +08:00
|
|
|
SDOperand ShVal = N.Val->getOperand(0);
|
|
|
|
|
|
|
|
// Okay, we know that we have a scale by now. However, if the scaled
|
|
|
|
// value is an add of something and a constant, we can fold the
|
|
|
|
// constant into the disp field here.
|
2005-01-18 12:18:32 +08:00
|
|
|
if (ShVal.Val->getOpcode() == ISD::ADD && ShVal.hasOneUse() &&
|
2005-01-11 14:36:20 +08:00
|
|
|
isa<ConstantSDNode>(ShVal.Val->getOperand(1))) {
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
AM.IndexReg = ShVal.Val->getOperand(0);
|
2005-01-11 14:36:20 +08:00
|
|
|
ConstantSDNode *AddVal =
|
|
|
|
cast<ConstantSDNode>(ShVal.Val->getOperand(1));
|
|
|
|
AM.Disp += AddVal->getValue() << Val;
|
2005-01-13 13:53:16 +08:00
|
|
|
} else {
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
AM.IndexReg = ShVal;
|
2005-01-11 14:36:20 +08:00
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2005-01-12 03:37:02 +08:00
|
|
|
case ISD::MUL:
|
2005-01-13 13:53:16 +08:00
|
|
|
// We might have folded the load into this mul, so don't regen the value if
|
|
|
|
// so.
|
|
|
|
if (ExprMap.count(N)) break;
|
|
|
|
|
2005-01-12 03:37:02 +08:00
|
|
|
// X*[3,5,9] -> X+X*[2,4,8]
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
if (AM.IndexReg.Val == 0 && AM.BaseType == X86ISelAddressMode::RegBase &&
|
|
|
|
AM.Base.Reg.Val == 0)
|
2005-01-12 03:37:02 +08:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1)))
|
|
|
|
if (CN->getValue() == 3 || CN->getValue() == 5 || CN->getValue() == 9) {
|
|
|
|
AM.Scale = unsigned(CN->getValue())-1;
|
|
|
|
|
|
|
|
SDOperand MulVal = N.Val->getOperand(0);
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
SDOperand Reg;
|
2005-01-12 03:37:02 +08:00
|
|
|
|
|
|
|
// Okay, we know that we have a scale by now. However, if the scaled
|
|
|
|
// value is an add of something and a constant, we can fold the
|
|
|
|
// constant into the disp field here.
|
2005-01-18 12:18:32 +08:00
|
|
|
if (MulVal.Val->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
|
2005-01-12 03:37:02 +08:00
|
|
|
isa<ConstantSDNode>(MulVal.Val->getOperand(1))) {
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
Reg = MulVal.Val->getOperand(0);
|
2005-01-12 03:37:02 +08:00
|
|
|
ConstantSDNode *AddVal =
|
|
|
|
cast<ConstantSDNode>(MulVal.Val->getOperand(1));
|
|
|
|
AM.Disp += AddVal->getValue() * CN->getValue();
|
|
|
|
} else {
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
Reg = N.Val->getOperand(0);
|
2005-01-12 03:37:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
AM.IndexReg = AM.Base.Reg = Reg;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
break;
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
case ISD::ADD: {
|
2005-01-13 13:53:16 +08:00
|
|
|
// We might have folded the load into this mul, so don't regen the value if
|
|
|
|
// so.
|
|
|
|
if (ExprMap.count(N)) break;
|
|
|
|
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
X86ISelAddressMode Backup = AM;
|
|
|
|
if (!MatchAddress(N.Val->getOperand(0), AM) &&
|
|
|
|
!MatchAddress(N.Val->getOperand(1), AM))
|
2005-01-07 15:49:41 +08:00
|
|
|
return false;
|
|
|
|
AM = Backup;
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
if (!MatchAddress(N.Val->getOperand(1), AM) &&
|
|
|
|
!MatchAddress(N.Val->getOperand(0), AM))
|
Try both ways to fold an add together. This allows us to generate this code
imul %EAX, %EAX, 400
add %ECX, %EAX
add %ESI, DWORD PTR [%ECX + 4*%EDX]
inc %EDX
cmp %EDX, 100
instead of this:
imul %EAX, %EAX, 400
add %ECX, %EAX
mov %EAX, %EDX
shl %EAX, 2
add %ECX, %EAX
add %ESI, DWORD PTR [%ECX]
inc %EDX
cmp %EDX, 100
llvm-svn: 19513
2005-01-13 02:08:53 +08:00
|
|
|
return false;
|
|
|
|
AM = Backup;
|
2005-01-07 15:49:41 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-11 12:40:19 +08:00
|
|
|
// Is the base register already occupied?
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base.Reg.Val) {
|
2005-01-11 12:40:19 +08:00
|
|
|
// If so, check to see if the scale index register is set.
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
if (AM.IndexReg.Val == 0) {
|
|
|
|
AM.IndexReg = N;
|
2005-01-11 12:40:19 +08:00
|
|
|
AM.Scale = 1;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Otherwise, we cannot select it.
|
2005-01-07 15:49:41 +08:00
|
|
|
return true;
|
2005-01-11 12:40:19 +08:00
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
// Default, generate it as a register.
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
AM.BaseType = X86ISelAddressMode::RegBase;
|
|
|
|
AM.Base.Reg = N;
|
2005-01-07 15:49:41 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Emit2SetCCsAndLogical - Emit the following sequence of instructions,
|
|
|
|
/// assuming that the temporary registers are in the 8-bit register class.
|
|
|
|
///
|
|
|
|
/// Tmp1 = setcc1
|
|
|
|
/// Tmp2 = setcc2
|
|
|
|
/// DestReg = logicalop Tmp1, Tmp2
|
|
|
|
///
|
|
|
|
static void Emit2SetCCsAndLogical(MachineBasicBlock *BB, unsigned SetCC1,
|
|
|
|
unsigned SetCC2, unsigned LogicalOp,
|
|
|
|
unsigned DestReg) {
|
|
|
|
SSARegMap *RegMap = BB->getParent()->getSSARegMap();
|
|
|
|
unsigned Tmp1 = RegMap->createVirtualRegister(X86::R8RegisterClass);
|
|
|
|
unsigned Tmp2 = RegMap->createVirtualRegister(X86::R8RegisterClass);
|
|
|
|
BuildMI(BB, SetCC1, 0, Tmp1);
|
|
|
|
BuildMI(BB, SetCC2, 0, Tmp2);
|
|
|
|
BuildMI(BB, LogicalOp, 2, DestReg).addReg(Tmp1).addReg(Tmp2);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// EmitSetCC - Emit the code to set the specified 8-bit register to 1 if the
|
|
|
|
/// condition codes match the specified SetCCOpcode. Note that some conditions
|
|
|
|
/// require multiple instructions to generate the correct value.
|
|
|
|
static void EmitSetCC(MachineBasicBlock *BB, unsigned DestReg,
|
|
|
|
ISD::CondCode SetCCOpcode, bool isFP) {
|
|
|
|
unsigned Opc;
|
|
|
|
if (!isFP) {
|
|
|
|
switch (SetCCOpcode) {
|
|
|
|
default: assert(0 && "Illegal integer SetCC!");
|
|
|
|
case ISD::SETEQ: Opc = X86::SETEr; break;
|
|
|
|
case ISD::SETGT: Opc = X86::SETGr; break;
|
|
|
|
case ISD::SETGE: Opc = X86::SETGEr; break;
|
|
|
|
case ISD::SETLT: Opc = X86::SETLr; break;
|
|
|
|
case ISD::SETLE: Opc = X86::SETLEr; break;
|
|
|
|
case ISD::SETNE: Opc = X86::SETNEr; break;
|
|
|
|
case ISD::SETULT: Opc = X86::SETBr; break;
|
|
|
|
case ISD::SETUGT: Opc = X86::SETAr; break;
|
|
|
|
case ISD::SETULE: Opc = X86::SETBEr; break;
|
|
|
|
case ISD::SETUGE: Opc = X86::SETAEr; break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// On a floating point condition, the flags are set as follows:
|
|
|
|
// ZF PF CF op
|
|
|
|
// 0 | 0 | 0 | X > Y
|
|
|
|
// 0 | 0 | 1 | X < Y
|
|
|
|
// 1 | 0 | 0 | X == Y
|
|
|
|
// 1 | 1 | 1 | unordered
|
|
|
|
//
|
|
|
|
switch (SetCCOpcode) {
|
|
|
|
default: assert(0 && "Invalid FP setcc!");
|
|
|
|
case ISD::SETUEQ:
|
|
|
|
case ISD::SETEQ:
|
|
|
|
Opc = X86::SETEr; // True if ZF = 1
|
|
|
|
break;
|
|
|
|
case ISD::SETOGT:
|
|
|
|
case ISD::SETGT:
|
|
|
|
Opc = X86::SETAr; // True if CF = 0 and ZF = 0
|
|
|
|
break;
|
|
|
|
case ISD::SETOGE:
|
|
|
|
case ISD::SETGE:
|
|
|
|
Opc = X86::SETAEr; // True if CF = 0
|
|
|
|
break;
|
|
|
|
case ISD::SETULT:
|
|
|
|
case ISD::SETLT:
|
|
|
|
Opc = X86::SETBr; // True if CF = 1
|
|
|
|
break;
|
|
|
|
case ISD::SETULE:
|
|
|
|
case ISD::SETLE:
|
|
|
|
Opc = X86::SETBEr; // True if CF = 1 or ZF = 1
|
|
|
|
break;
|
|
|
|
case ISD::SETONE:
|
|
|
|
case ISD::SETNE:
|
|
|
|
Opc = X86::SETNEr; // True if ZF = 0
|
|
|
|
break;
|
|
|
|
case ISD::SETUO:
|
|
|
|
Opc = X86::SETPr; // True if PF = 1
|
|
|
|
break;
|
|
|
|
case ISD::SETO:
|
|
|
|
Opc = X86::SETNPr; // True if PF = 0
|
|
|
|
break;
|
|
|
|
case ISD::SETOEQ: // !PF & ZF
|
|
|
|
Emit2SetCCsAndLogical(BB, X86::SETNPr, X86::SETEr, X86::AND8rr, DestReg);
|
|
|
|
return;
|
|
|
|
case ISD::SETOLT: // !PF & CF
|
|
|
|
Emit2SetCCsAndLogical(BB, X86::SETNPr, X86::SETBr, X86::AND8rr, DestReg);
|
|
|
|
return;
|
|
|
|
case ISD::SETOLE: // !PF & (CF || ZF)
|
|
|
|
Emit2SetCCsAndLogical(BB, X86::SETNPr, X86::SETBEr, X86::AND8rr, DestReg);
|
|
|
|
return;
|
|
|
|
case ISD::SETUGT: // PF | (!ZF & !CF)
|
|
|
|
Emit2SetCCsAndLogical(BB, X86::SETPr, X86::SETAr, X86::OR8rr, DestReg);
|
|
|
|
return;
|
|
|
|
case ISD::SETUGE: // PF | !CF
|
|
|
|
Emit2SetCCsAndLogical(BB, X86::SETPr, X86::SETAEr, X86::OR8rr, DestReg);
|
|
|
|
return;
|
|
|
|
case ISD::SETUNE: // PF | !ZF
|
|
|
|
Emit2SetCCsAndLogical(BB, X86::SETPr, X86::SETNEr, X86::OR8rr, DestReg);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
BuildMI(BB, Opc, 0, DestReg);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// EmitBranchCC - Emit code into BB that arranges for control to transfer to
|
|
|
|
/// the Dest block if the Cond condition is true. If we cannot fold this
|
|
|
|
/// condition into the branch, return true.
|
|
|
|
///
|
2005-01-11 12:06:27 +08:00
|
|
|
bool ISel::EmitBranchCC(MachineBasicBlock *Dest, SDOperand Chain,
|
|
|
|
SDOperand Cond) {
|
2005-01-07 15:49:41 +08:00
|
|
|
// FIXME: Evaluate whether it would be good to emit code like (X < Y) | (A >
|
|
|
|
// B) using two conditional branches instead of one condbr, two setcc's, and
|
|
|
|
// an or.
|
|
|
|
if ((Cond.getOpcode() == ISD::OR ||
|
|
|
|
Cond.getOpcode() == ISD::AND) && Cond.Val->hasOneUse()) {
|
|
|
|
// And and or set the flags for us, so there is no need to emit a TST of the
|
|
|
|
// result. It is only safe to do this if there is only a single use of the
|
|
|
|
// AND/OR though, otherwise we don't know it will be emitted here.
|
2005-01-11 12:06:27 +08:00
|
|
|
Select(Chain);
|
2005-01-07 15:49:41 +08:00
|
|
|
SelectExpr(Cond);
|
|
|
|
BuildMI(BB, X86::JNE, 1).addMBB(Dest);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Codegen br not C -> JE.
|
|
|
|
if (Cond.getOpcode() == ISD::XOR)
|
|
|
|
if (ConstantSDNode *NC = dyn_cast<ConstantSDNode>(Cond.Val->getOperand(1)))
|
|
|
|
if (NC->isAllOnesValue()) {
|
2005-01-11 12:06:27 +08:00
|
|
|
unsigned CondR;
|
|
|
|
if (getRegPressure(Chain) > getRegPressure(Cond)) {
|
|
|
|
Select(Chain);
|
|
|
|
CondR = SelectExpr(Cond.Val->getOperand(0));
|
|
|
|
} else {
|
|
|
|
CondR = SelectExpr(Cond.Val->getOperand(0));
|
|
|
|
Select(Chain);
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, X86::TEST8rr, 2).addReg(CondR).addReg(CondR);
|
|
|
|
BuildMI(BB, X86::JE, 1).addMBB(Dest);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
SetCCSDNode *SetCC = dyn_cast<SetCCSDNode>(Cond);
|
|
|
|
if (SetCC == 0)
|
|
|
|
return true; // Can only handle simple setcc's so far.
|
|
|
|
|
|
|
|
unsigned Opc;
|
|
|
|
|
|
|
|
// Handle integer conditions first.
|
|
|
|
if (MVT::isInteger(SetCC->getOperand(0).getValueType())) {
|
|
|
|
switch (SetCC->getCondition()) {
|
|
|
|
default: assert(0 && "Illegal integer SetCC!");
|
|
|
|
case ISD::SETEQ: Opc = X86::JE; break;
|
|
|
|
case ISD::SETGT: Opc = X86::JG; break;
|
|
|
|
case ISD::SETGE: Opc = X86::JGE; break;
|
|
|
|
case ISD::SETLT: Opc = X86::JL; break;
|
|
|
|
case ISD::SETLE: Opc = X86::JLE; break;
|
|
|
|
case ISD::SETNE: Opc = X86::JNE; break;
|
|
|
|
case ISD::SETULT: Opc = X86::JB; break;
|
|
|
|
case ISD::SETUGT: Opc = X86::JA; break;
|
|
|
|
case ISD::SETULE: Opc = X86::JBE; break;
|
|
|
|
case ISD::SETUGE: Opc = X86::JAE; break;
|
|
|
|
}
|
2005-01-11 12:06:27 +08:00
|
|
|
Select(Chain);
|
2005-01-17 09:34:14 +08:00
|
|
|
EmitCMP(SetCC->getOperand(0), SetCC->getOperand(1), SetCC->hasOneUse());
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc, 1).addMBB(Dest);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned Opc2 = 0; // Second branch if needed.
|
|
|
|
|
|
|
|
// On a floating point condition, the flags are set as follows:
|
|
|
|
// ZF PF CF op
|
|
|
|
// 0 | 0 | 0 | X > Y
|
|
|
|
// 0 | 0 | 1 | X < Y
|
|
|
|
// 1 | 0 | 0 | X == Y
|
|
|
|
// 1 | 1 | 1 | unordered
|
|
|
|
//
|
|
|
|
switch (SetCC->getCondition()) {
|
|
|
|
default: assert(0 && "Invalid FP setcc!");
|
|
|
|
case ISD::SETUEQ:
|
|
|
|
case ISD::SETEQ: Opc = X86::JE; break; // True if ZF = 1
|
|
|
|
case ISD::SETOGT:
|
|
|
|
case ISD::SETGT: Opc = X86::JA; break; // True if CF = 0 and ZF = 0
|
|
|
|
case ISD::SETOGE:
|
|
|
|
case ISD::SETGE: Opc = X86::JAE; break; // True if CF = 0
|
|
|
|
case ISD::SETULT:
|
|
|
|
case ISD::SETLT: Opc = X86::JB; break; // True if CF = 1
|
|
|
|
case ISD::SETULE:
|
|
|
|
case ISD::SETLE: Opc = X86::JBE; break; // True if CF = 1 or ZF = 1
|
|
|
|
case ISD::SETONE:
|
|
|
|
case ISD::SETNE: Opc = X86::JNE; break; // True if ZF = 0
|
|
|
|
case ISD::SETUO: Opc = X86::JP; break; // True if PF = 1
|
|
|
|
case ISD::SETO: Opc = X86::JNP; break; // True if PF = 0
|
|
|
|
case ISD::SETUGT: // PF = 1 | (ZF = 0 & CF = 0)
|
|
|
|
Opc = X86::JA; // ZF = 0 & CF = 0
|
|
|
|
Opc2 = X86::JP; // PF = 1
|
|
|
|
break;
|
|
|
|
case ISD::SETUGE: // PF = 1 | CF = 0
|
|
|
|
Opc = X86::JAE; // CF = 0
|
|
|
|
Opc2 = X86::JP; // PF = 1
|
|
|
|
break;
|
|
|
|
case ISD::SETUNE: // PF = 1 | ZF = 0
|
|
|
|
Opc = X86::JNE; // ZF = 0
|
|
|
|
Opc2 = X86::JP; // PF = 1
|
|
|
|
break;
|
|
|
|
case ISD::SETOEQ: // PF = 0 & ZF = 1
|
|
|
|
//X86::JNP, X86::JE
|
|
|
|
//X86::AND8rr
|
|
|
|
return true; // FIXME: Emit more efficient code for this branch.
|
|
|
|
case ISD::SETOLT: // PF = 0 & CF = 1
|
|
|
|
//X86::JNP, X86::JB
|
|
|
|
//X86::AND8rr
|
|
|
|
return true; // FIXME: Emit more efficient code for this branch.
|
|
|
|
case ISD::SETOLE: // PF = 0 & (CF = 1 || ZF = 1)
|
|
|
|
//X86::JNP, X86::JBE
|
|
|
|
//X86::AND8rr
|
|
|
|
return true; // FIXME: Emit more efficient code for this branch.
|
|
|
|
}
|
|
|
|
|
2005-01-11 12:06:27 +08:00
|
|
|
Select(Chain);
|
2005-01-17 09:34:14 +08:00
|
|
|
EmitCMP(SetCC->getOperand(0), SetCC->getOperand(1), SetCC->hasOneUse());
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc, 1).addMBB(Dest);
|
|
|
|
if (Opc2)
|
|
|
|
BuildMI(BB, Opc2, 1).addMBB(Dest);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2005-01-11 06:10:13 +08:00
|
|
|
/// EmitSelectCC - Emit code into BB that performs a select operation between
|
|
|
|
/// the two registers RTrue and RFalse, generating a result into RDest. Return
|
|
|
|
/// true if the fold cannot be performed.
|
|
|
|
///
|
|
|
|
void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT,
|
|
|
|
unsigned RTrue, unsigned RFalse, unsigned RDest) {
|
|
|
|
enum Condition {
|
|
|
|
EQ, NE, LT, LE, GT, GE, B, BE, A, AE, P, NP,
|
|
|
|
NOT_SET
|
|
|
|
} CondCode = NOT_SET;
|
|
|
|
|
|
|
|
static const unsigned CMOVTAB16[] = {
|
|
|
|
X86::CMOVE16rr, X86::CMOVNE16rr, X86::CMOVL16rr, X86::CMOVLE16rr,
|
|
|
|
X86::CMOVG16rr, X86::CMOVGE16rr, X86::CMOVB16rr, X86::CMOVBE16rr,
|
|
|
|
X86::CMOVA16rr, X86::CMOVAE16rr, X86::CMOVP16rr, X86::CMOVNP16rr,
|
|
|
|
};
|
|
|
|
static const unsigned CMOVTAB32[] = {
|
|
|
|
X86::CMOVE32rr, X86::CMOVNE32rr, X86::CMOVL32rr, X86::CMOVLE32rr,
|
|
|
|
X86::CMOVG32rr, X86::CMOVGE32rr, X86::CMOVB32rr, X86::CMOVBE32rr,
|
|
|
|
X86::CMOVA32rr, X86::CMOVAE32rr, X86::CMOVP32rr, X86::CMOVNP32rr,
|
|
|
|
};
|
|
|
|
static const unsigned CMOVTABFP[] = {
|
|
|
|
X86::FCMOVE , X86::FCMOVNE, /*missing*/0, /*missing*/0,
|
|
|
|
/*missing*/0, /*missing*/0, X86::FCMOVB , X86::FCMOVBE,
|
|
|
|
X86::FCMOVA , X86::FCMOVAE, X86::FCMOVP , X86::FCMOVNP
|
|
|
|
};
|
|
|
|
|
|
|
|
if (SetCCSDNode *SetCC = dyn_cast<SetCCSDNode>(Cond)) {
|
|
|
|
if (MVT::isInteger(SetCC->getOperand(0).getValueType())) {
|
|
|
|
switch (SetCC->getCondition()) {
|
|
|
|
default: assert(0 && "Unknown integer comparison!");
|
|
|
|
case ISD::SETEQ: CondCode = EQ; break;
|
|
|
|
case ISD::SETGT: CondCode = GT; break;
|
|
|
|
case ISD::SETGE: CondCode = GE; break;
|
|
|
|
case ISD::SETLT: CondCode = LT; break;
|
|
|
|
case ISD::SETLE: CondCode = LE; break;
|
|
|
|
case ISD::SETNE: CondCode = NE; break;
|
|
|
|
case ISD::SETULT: CondCode = B; break;
|
|
|
|
case ISD::SETUGT: CondCode = A; break;
|
|
|
|
case ISD::SETULE: CondCode = BE; break;
|
|
|
|
case ISD::SETUGE: CondCode = AE; break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// On a floating point condition, the flags are set as follows:
|
|
|
|
// ZF PF CF op
|
|
|
|
// 0 | 0 | 0 | X > Y
|
|
|
|
// 0 | 0 | 1 | X < Y
|
|
|
|
// 1 | 0 | 0 | X == Y
|
|
|
|
// 1 | 1 | 1 | unordered
|
|
|
|
//
|
|
|
|
switch (SetCC->getCondition()) {
|
|
|
|
default: assert(0 && "Unknown FP comparison!");
|
|
|
|
case ISD::SETUEQ:
|
|
|
|
case ISD::SETEQ: CondCode = EQ; break; // True if ZF = 1
|
|
|
|
case ISD::SETOGT:
|
|
|
|
case ISD::SETGT: CondCode = A; break; // True if CF = 0 and ZF = 0
|
|
|
|
case ISD::SETOGE:
|
|
|
|
case ISD::SETGE: CondCode = AE; break; // True if CF = 0
|
|
|
|
case ISD::SETULT:
|
|
|
|
case ISD::SETLT: CondCode = B; break; // True if CF = 1
|
|
|
|
case ISD::SETULE:
|
|
|
|
case ISD::SETLE: CondCode = BE; break; // True if CF = 1 or ZF = 1
|
|
|
|
case ISD::SETONE:
|
|
|
|
case ISD::SETNE: CondCode = NE; break; // True if ZF = 0
|
|
|
|
case ISD::SETUO: CondCode = P; break; // True if PF = 1
|
|
|
|
case ISD::SETO: CondCode = NP; break; // True if PF = 0
|
|
|
|
case ISD::SETUGT: // PF = 1 | (ZF = 0 & CF = 0)
|
|
|
|
case ISD::SETUGE: // PF = 1 | CF = 0
|
|
|
|
case ISD::SETUNE: // PF = 1 | ZF = 0
|
|
|
|
case ISD::SETOEQ: // PF = 0 & ZF = 1
|
|
|
|
case ISD::SETOLT: // PF = 0 & CF = 1
|
|
|
|
case ISD::SETOLE: // PF = 0 & (CF = 1 || ZF = 1)
|
|
|
|
// We cannot emit this comparison as a single cmov.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned Opc = 0;
|
|
|
|
if (CondCode != NOT_SET) {
|
|
|
|
switch (SVT) {
|
|
|
|
default: assert(0 && "Cannot select this type!");
|
|
|
|
case MVT::i16: Opc = CMOVTAB16[CondCode]; break;
|
|
|
|
case MVT::i32: Opc = CMOVTAB32[CondCode]; break;
|
2005-01-11 11:50:45 +08:00
|
|
|
case MVT::f64: Opc = CMOVTABFP[CondCode]; break;
|
2005-01-11 06:10:13 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Finally, if we weren't able to fold this, just emit the condition and test
|
|
|
|
// it.
|
|
|
|
if (CondCode == NOT_SET || Opc == 0) {
|
|
|
|
// Get the condition into the zero flag.
|
|
|
|
unsigned CondReg = SelectExpr(Cond);
|
|
|
|
BuildMI(BB, X86::TEST8rr, 2).addReg(CondReg).addReg(CondReg);
|
|
|
|
|
|
|
|
switch (SVT) {
|
|
|
|
default: assert(0 && "Cannot select this type!");
|
|
|
|
case MVT::i16: Opc = X86::CMOVE16rr; break;
|
|
|
|
case MVT::i32: Opc = X86::CMOVE32rr; break;
|
2005-01-11 11:50:45 +08:00
|
|
|
case MVT::f64: Opc = X86::FCMOVE; break;
|
2005-01-11 06:10:13 +08:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// FIXME: CMP R, 0 -> TEST R, R
|
2005-01-17 09:34:14 +08:00
|
|
|
EmitCMP(Cond.getOperand(0), Cond.getOperand(1), Cond.Val->hasOneUse());
|
2005-01-11 11:37:59 +08:00
|
|
|
std::swap(RTrue, RFalse);
|
2005-01-11 06:10:13 +08:00
|
|
|
}
|
|
|
|
BuildMI(BB, Opc, 2, RDest).addReg(RTrue).addReg(RFalse);
|
|
|
|
}
|
|
|
|
|
2005-01-17 09:34:14 +08:00
|
|
|
void ISel::EmitCMP(SDOperand LHS, SDOperand RHS, bool HasOneUse) {
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
unsigned Opc;
|
2005-01-07 15:49:41 +08:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(RHS)) {
|
|
|
|
Opc = 0;
|
2005-01-17 14:26:58 +08:00
|
|
|
if (HasOneUse && isFoldableLoad(LHS, RHS)) {
|
2005-01-12 10:02:48 +08:00
|
|
|
switch (RHS.getValueType()) {
|
|
|
|
default: break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::CMP8mi; break;
|
|
|
|
case MVT::i16: Opc = X86::CMP16mi; break;
|
|
|
|
case MVT::i32: Opc = X86::CMP32mi; break;
|
|
|
|
}
|
|
|
|
if (Opc) {
|
|
|
|
X86AddressMode AM;
|
|
|
|
EmitFoldedLoad(LHS, AM);
|
|
|
|
addFullAddress(BuildMI(BB, Opc, 5), AM).addImm(CN->getValue());
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (RHS.getValueType()) {
|
|
|
|
default: break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::CMP8ri; break;
|
|
|
|
case MVT::i16: Opc = X86::CMP16ri; break;
|
|
|
|
case MVT::i32: Opc = X86::CMP32ri; break;
|
|
|
|
}
|
|
|
|
if (Opc) {
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
unsigned Tmp1 = SelectExpr(LHS);
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc, 2).addReg(Tmp1).addImm(CN->getValue());
|
|
|
|
return;
|
|
|
|
}
|
2005-01-15 06:37:41 +08:00
|
|
|
} else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(RHS)) {
|
|
|
|
if (CN->isExactlyValue(+0.0) ||
|
|
|
|
CN->isExactlyValue(-0.0)) {
|
|
|
|
unsigned Reg = SelectExpr(LHS);
|
|
|
|
BuildMI(BB, X86::FTST, 1).addReg(Reg);
|
|
|
|
BuildMI(BB, X86::FNSTSW8r, 0);
|
|
|
|
BuildMI(BB, X86::SAHF, 1);
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
|
|
|
|
2005-01-12 10:02:48 +08:00
|
|
|
Opc = 0;
|
2005-01-17 14:26:58 +08:00
|
|
|
if (HasOneUse && isFoldableLoad(LHS, RHS)) {
|
2005-01-12 10:02:48 +08:00
|
|
|
switch (RHS.getValueType()) {
|
|
|
|
default: break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::CMP8mr; break;
|
|
|
|
case MVT::i16: Opc = X86::CMP16mr; break;
|
|
|
|
case MVT::i32: Opc = X86::CMP32mr; break;
|
|
|
|
}
|
|
|
|
if (Opc) {
|
|
|
|
X86AddressMode AM;
|
2005-01-13 13:53:16 +08:00
|
|
|
EmitFoldedLoad(LHS, AM);
|
|
|
|
unsigned Reg = SelectExpr(RHS);
|
2005-01-12 10:02:48 +08:00
|
|
|
addFullAddress(BuildMI(BB, Opc, 5), AM).addReg(Reg);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (LHS.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot compare this value!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::CMP8rr; break;
|
|
|
|
case MVT::i16: Opc = X86::CMP16rr; break;
|
|
|
|
case MVT::i32: Opc = X86::CMP32rr; break;
|
2005-01-11 11:50:45 +08:00
|
|
|
case MVT::f64: Opc = X86::FUCOMIr; break;
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
unsigned Tmp1, Tmp2;
|
|
|
|
if (getRegPressure(LHS) > getRegPressure(RHS)) {
|
|
|
|
Tmp1 = SelectExpr(LHS);
|
|
|
|
Tmp2 = SelectExpr(RHS);
|
|
|
|
} else {
|
|
|
|
Tmp2 = SelectExpr(RHS);
|
|
|
|
Tmp1 = SelectExpr(LHS);
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc, 2).addReg(Tmp1).addReg(Tmp2);
|
|
|
|
}
|
|
|
|
|
2005-01-12 05:19:59 +08:00
|
|
|
/// isFoldableLoad - Return true if this is a load instruction that can safely
|
|
|
|
/// be folded into an operation that uses it.
|
2005-01-26 04:03:11 +08:00
|
|
|
bool ISel::isFoldableLoad(SDOperand Op, SDOperand OtherOp, bool FloatPromoteOk){
|
|
|
|
if (Op.getOpcode() == ISD::LOAD) {
|
|
|
|
// FIXME: currently can't fold constant pool indexes.
|
|
|
|
if (isa<ConstantPoolSDNode>(Op.getOperand(1)))
|
|
|
|
return false;
|
|
|
|
} else if (FloatPromoteOk && Op.getOpcode() == ISD::EXTLOAD &&
|
|
|
|
cast<MVTSDNode>(Op)->getExtraValueType() == MVT::f32) {
|
|
|
|
// FIXME: currently can't fold constant pool indexes.
|
|
|
|
if (isa<ConstantPoolSDNode>(Op.getOperand(1)))
|
|
|
|
return false;
|
|
|
|
} else {
|
2005-01-12 05:19:59 +08:00
|
|
|
return false;
|
2005-01-26 04:03:11 +08:00
|
|
|
}
|
2005-01-12 05:19:59 +08:00
|
|
|
|
|
|
|
// If this load has already been emitted, we clearly can't fold it.
|
2005-01-13 13:53:16 +08:00
|
|
|
assert(Op.ResNo == 0 && "Not a use of the value of the load?");
|
|
|
|
if (ExprMap.count(Op.getValue(1))) return false;
|
|
|
|
assert(!ExprMap.count(Op.getValue(0)) && "Value in map but not token chain?");
|
2005-01-18 11:51:59 +08:00
|
|
|
assert(!ExprMap.count(Op.getValue(1))&&"Token lowered but value not in map?");
|
2005-01-12 05:19:59 +08:00
|
|
|
|
2005-01-17 14:26:58 +08:00
|
|
|
// If there is not just one use of its value, we cannot fold.
|
|
|
|
if (!Op.Val->hasNUsesOfValue(1, 0)) return false;
|
|
|
|
|
|
|
|
// Finally, we cannot fold the load into the operation if this would induce a
|
|
|
|
// cycle into the resultant dag. To check for this, see if OtherOp (the other
|
|
|
|
// operand of the operation we are folding the load into) can possible use the
|
|
|
|
// chain node defined by the load.
|
|
|
|
if (OtherOp.Val && !Op.Val->hasNUsesOfValue(0, 1)) { // Has uses of chain?
|
|
|
|
std::set<SDNode*> Visited;
|
|
|
|
if (NodeTransitivelyUsesValue(OtherOp, Op.getValue(1), Visited))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
2005-01-12 05:19:59 +08:00
|
|
|
}
|
|
|
|
|
2005-01-17 14:26:58 +08:00
|
|
|
|
2005-01-12 05:19:59 +08:00
|
|
|
/// EmitFoldedLoad - Ensure that the arguments of the load are code generated,
|
|
|
|
/// and compute the address being loaded into AM.
|
|
|
|
void ISel::EmitFoldedLoad(SDOperand Op, X86AddressMode &AM) {
|
|
|
|
SDOperand Chain = Op.getOperand(0);
|
|
|
|
SDOperand Address = Op.getOperand(1);
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
|
2005-01-12 05:19:59 +08:00
|
|
|
if (getRegPressure(Chain) > getRegPressure(Address)) {
|
|
|
|
Select(Chain);
|
|
|
|
SelectAddress(Address, AM);
|
|
|
|
} else {
|
|
|
|
SelectAddress(Address, AM);
|
|
|
|
Select(Chain);
|
|
|
|
}
|
|
|
|
|
|
|
|
// The chain for this load is now lowered.
|
2005-01-13 13:53:16 +08:00
|
|
|
assert(ExprMap.count(SDOperand(Op.Val, 1)) == 0 &&
|
|
|
|
"Load emitted more than once?");
|
2005-01-18 11:51:59 +08:00
|
|
|
if (!ExprMap.insert(std::make_pair(Op.getValue(1), 1)).second)
|
2005-01-13 13:53:16 +08:00
|
|
|
assert(0 && "Load emitted more than once!");
|
2005-01-12 05:19:59 +08:00
|
|
|
}
|
|
|
|
|
2005-01-19 15:37:26 +08:00
|
|
|
// EmitOrOpOp - Pattern match the expression (Op1|Op2), where we know that op1
|
|
|
|
// and op2 are i8/i16/i32 values with one use each (the or). If we can form a
|
|
|
|
// SHLD or SHRD, emit the instruction (generating the value into DestReg) and
|
|
|
|
// return true.
|
|
|
|
bool ISel::EmitOrOpOp(SDOperand Op1, SDOperand Op2, unsigned DestReg) {
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
if (Op1.getOpcode() == ISD::SHL && Op2.getOpcode() == ISD::SRL) {
|
|
|
|
// good!
|
|
|
|
} else if (Op2.getOpcode() == ISD::SHL && Op1.getOpcode() == ISD::SRL) {
|
|
|
|
std::swap(Op1, Op2); // Op1 is the SHL now.
|
|
|
|
} else {
|
|
|
|
return false; // No match
|
|
|
|
}
|
|
|
|
|
|
|
|
SDOperand ShlVal = Op1.getOperand(0);
|
|
|
|
SDOperand ShlAmt = Op1.getOperand(1);
|
|
|
|
SDOperand ShrVal = Op2.getOperand(0);
|
|
|
|
SDOperand ShrAmt = Op2.getOperand(1);
|
|
|
|
|
2005-01-19 15:37:26 +08:00
|
|
|
unsigned RegSize = MVT::getSizeInBits(Op1.getValueType());
|
|
|
|
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
// Find out if ShrAmt = 32-ShlAmt or ShlAmt = 32-ShrAmt.
|
|
|
|
if (ShlAmt.getOpcode() == ISD::SUB && ShlAmt.getOperand(1) == ShrAmt)
|
|
|
|
if (ConstantSDNode *SubCST = dyn_cast<ConstantSDNode>(ShlAmt.getOperand(0)))
|
Implement Regression/CodeGen/X86/rotate.ll: emit rotate instructions (which
typically cost 1 cycle) instead of shld/shrd instruction (which are typically
6 or more cycles). This also saves code space.
For example, instead of emitting:
rotr:
mov %EAX, DWORD PTR [%ESP + 4]
mov %CL, BYTE PTR [%ESP + 8]
shrd %EAX, %EAX, %CL
ret
rotli:
mov %EAX, DWORD PTR [%ESP + 4]
shrd %EAX, %EAX, 27
ret
Emit:
rotr32:
mov %CL, BYTE PTR [%ESP + 8]
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, %CL
ret
rotli32:
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, 27
ret
We also emit byte rotate instructions which do not have a sh[lr]d counterpart
at all.
llvm-svn: 19692
2005-01-19 16:07:05 +08:00
|
|
|
if (SubCST->getValue() == RegSize) {
|
|
|
|
// (A >> ShrAmt) | (A << (32-ShrAmt)) ==> ROR A, ShrAmt
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
// (A >> ShrAmt) | (B << (32-ShrAmt)) ==> SHRD A, B, ShrAmt
|
Implement Regression/CodeGen/X86/rotate.ll: emit rotate instructions (which
typically cost 1 cycle) instead of shld/shrd instruction (which are typically
6 or more cycles). This also saves code space.
For example, instead of emitting:
rotr:
mov %EAX, DWORD PTR [%ESP + 4]
mov %CL, BYTE PTR [%ESP + 8]
shrd %EAX, %EAX, %CL
ret
rotli:
mov %EAX, DWORD PTR [%ESP + 4]
shrd %EAX, %EAX, 27
ret
Emit:
rotr32:
mov %CL, BYTE PTR [%ESP + 8]
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, %CL
ret
rotli32:
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, 27
ret
We also emit byte rotate instructions which do not have a sh[lr]d counterpart
at all.
llvm-svn: 19692
2005-01-19 16:07:05 +08:00
|
|
|
if (ShrVal == ShlVal) {
|
|
|
|
unsigned Reg, ShAmt;
|
|
|
|
if (getRegPressure(ShrVal) > getRegPressure(ShrAmt)) {
|
|
|
|
Reg = SelectExpr(ShrVal);
|
|
|
|
ShAmt = SelectExpr(ShrAmt);
|
|
|
|
} else {
|
|
|
|
ShAmt = SelectExpr(ShrAmt);
|
|
|
|
Reg = SelectExpr(ShrVal);
|
|
|
|
}
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::CL).addReg(ShAmt);
|
|
|
|
unsigned Opc = RegSize == 8 ? X86::ROR8rCL :
|
|
|
|
(RegSize == 16 ? X86::ROR16rCL : X86::ROR32rCL);
|
|
|
|
BuildMI(BB, Opc, 1, DestReg).addReg(Reg);
|
|
|
|
return true;
|
|
|
|
} else if (RegSize != 8) {
|
|
|
|
unsigned AReg, BReg;
|
|
|
|
if (getRegPressure(ShlVal) > getRegPressure(ShrVal)) {
|
|
|
|
BReg = SelectExpr(ShlVal);
|
2005-01-20 01:24:34 +08:00
|
|
|
AReg = SelectExpr(ShrVal);
|
Implement Regression/CodeGen/X86/rotate.ll: emit rotate instructions (which
typically cost 1 cycle) instead of shld/shrd instruction (which are typically
6 or more cycles). This also saves code space.
For example, instead of emitting:
rotr:
mov %EAX, DWORD PTR [%ESP + 4]
mov %CL, BYTE PTR [%ESP + 8]
shrd %EAX, %EAX, %CL
ret
rotli:
mov %EAX, DWORD PTR [%ESP + 4]
shrd %EAX, %EAX, 27
ret
Emit:
rotr32:
mov %CL, BYTE PTR [%ESP + 8]
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, %CL
ret
rotli32:
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, 27
ret
We also emit byte rotate instructions which do not have a sh[lr]d counterpart
at all.
llvm-svn: 19692
2005-01-19 16:07:05 +08:00
|
|
|
} else {
|
|
|
|
AReg = SelectExpr(ShrVal);
|
2005-01-20 01:24:34 +08:00
|
|
|
BReg = SelectExpr(ShlVal);
|
Implement Regression/CodeGen/X86/rotate.ll: emit rotate instructions (which
typically cost 1 cycle) instead of shld/shrd instruction (which are typically
6 or more cycles). This also saves code space.
For example, instead of emitting:
rotr:
mov %EAX, DWORD PTR [%ESP + 4]
mov %CL, BYTE PTR [%ESP + 8]
shrd %EAX, %EAX, %CL
ret
rotli:
mov %EAX, DWORD PTR [%ESP + 4]
shrd %EAX, %EAX, 27
ret
Emit:
rotr32:
mov %CL, BYTE PTR [%ESP + 8]
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, %CL
ret
rotli32:
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, 27
ret
We also emit byte rotate instructions which do not have a sh[lr]d counterpart
at all.
llvm-svn: 19692
2005-01-19 16:07:05 +08:00
|
|
|
}
|
|
|
|
unsigned ShAmt = SelectExpr(ShrAmt);
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::CL).addReg(ShAmt);
|
|
|
|
unsigned Opc = RegSize == 16 ? X86::SHRD16rrCL : X86::SHRD32rrCL;
|
|
|
|
BuildMI(BB, Opc, 2, DestReg).addReg(AReg).addReg(BReg);
|
|
|
|
return true;
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ShrAmt.getOpcode() == ISD::SUB && ShrAmt.getOperand(1) == ShlAmt)
|
|
|
|
if (ConstantSDNode *SubCST = dyn_cast<ConstantSDNode>(ShrAmt.getOperand(0)))
|
Implement Regression/CodeGen/X86/rotate.ll: emit rotate instructions (which
typically cost 1 cycle) instead of shld/shrd instruction (which are typically
6 or more cycles). This also saves code space.
For example, instead of emitting:
rotr:
mov %EAX, DWORD PTR [%ESP + 4]
mov %CL, BYTE PTR [%ESP + 8]
shrd %EAX, %EAX, %CL
ret
rotli:
mov %EAX, DWORD PTR [%ESP + 4]
shrd %EAX, %EAX, 27
ret
Emit:
rotr32:
mov %CL, BYTE PTR [%ESP + 8]
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, %CL
ret
rotli32:
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, 27
ret
We also emit byte rotate instructions which do not have a sh[lr]d counterpart
at all.
llvm-svn: 19692
2005-01-19 16:07:05 +08:00
|
|
|
if (SubCST->getValue() == RegSize) {
|
|
|
|
// (A << ShlAmt) | (A >> (32-ShlAmt)) ==> ROL A, ShrAmt
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
// (A << ShlAmt) | (B >> (32-ShlAmt)) ==> SHLD A, B, ShrAmt
|
Implement Regression/CodeGen/X86/rotate.ll: emit rotate instructions (which
typically cost 1 cycle) instead of shld/shrd instruction (which are typically
6 or more cycles). This also saves code space.
For example, instead of emitting:
rotr:
mov %EAX, DWORD PTR [%ESP + 4]
mov %CL, BYTE PTR [%ESP + 8]
shrd %EAX, %EAX, %CL
ret
rotli:
mov %EAX, DWORD PTR [%ESP + 4]
shrd %EAX, %EAX, 27
ret
Emit:
rotr32:
mov %CL, BYTE PTR [%ESP + 8]
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, %CL
ret
rotli32:
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, 27
ret
We also emit byte rotate instructions which do not have a sh[lr]d counterpart
at all.
llvm-svn: 19692
2005-01-19 16:07:05 +08:00
|
|
|
if (ShrVal == ShlVal) {
|
|
|
|
unsigned Reg, ShAmt;
|
|
|
|
if (getRegPressure(ShrVal) > getRegPressure(ShlAmt)) {
|
|
|
|
Reg = SelectExpr(ShrVal);
|
|
|
|
ShAmt = SelectExpr(ShlAmt);
|
|
|
|
} else {
|
|
|
|
ShAmt = SelectExpr(ShlAmt);
|
|
|
|
Reg = SelectExpr(ShrVal);
|
|
|
|
}
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::CL).addReg(ShAmt);
|
|
|
|
unsigned Opc = RegSize == 8 ? X86::ROL8rCL :
|
|
|
|
(RegSize == 16 ? X86::ROL16rCL : X86::ROL32rCL);
|
|
|
|
BuildMI(BB, Opc, 1, DestReg).addReg(Reg);
|
|
|
|
return true;
|
|
|
|
} else if (RegSize != 8) {
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
unsigned AReg, BReg;
|
|
|
|
if (getRegPressure(ShlVal) > getRegPressure(ShrVal)) {
|
2005-01-20 01:24:34 +08:00
|
|
|
AReg = SelectExpr(ShlVal);
|
|
|
|
BReg = SelectExpr(ShrVal);
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
} else {
|
2005-01-20 01:24:34 +08:00
|
|
|
BReg = SelectExpr(ShrVal);
|
|
|
|
AReg = SelectExpr(ShlVal);
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
}
|
Implement Regression/CodeGen/X86/rotate.ll: emit rotate instructions (which
typically cost 1 cycle) instead of shld/shrd instruction (which are typically
6 or more cycles). This also saves code space.
For example, instead of emitting:
rotr:
mov %EAX, DWORD PTR [%ESP + 4]
mov %CL, BYTE PTR [%ESP + 8]
shrd %EAX, %EAX, %CL
ret
rotli:
mov %EAX, DWORD PTR [%ESP + 4]
shrd %EAX, %EAX, 27
ret
Emit:
rotr32:
mov %CL, BYTE PTR [%ESP + 8]
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, %CL
ret
rotli32:
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, 27
ret
We also emit byte rotate instructions which do not have a sh[lr]d counterpart
at all.
llvm-svn: 19692
2005-01-19 16:07:05 +08:00
|
|
|
unsigned ShAmt = SelectExpr(ShlAmt);
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::CL).addReg(ShAmt);
|
|
|
|
unsigned Opc = RegSize == 16 ? X86::SHLD16rrCL : X86::SHLD32rrCL;
|
|
|
|
BuildMI(BB, Opc, 2, DestReg).addReg(AReg).addReg(BReg);
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Implement Regression/CodeGen/X86/rotate.ll: emit rotate instructions (which
typically cost 1 cycle) instead of shld/shrd instruction (which are typically
6 or more cycles). This also saves code space.
For example, instead of emitting:
rotr:
mov %EAX, DWORD PTR [%ESP + 4]
mov %CL, BYTE PTR [%ESP + 8]
shrd %EAX, %EAX, %CL
ret
rotli:
mov %EAX, DWORD PTR [%ESP + 4]
shrd %EAX, %EAX, 27
ret
Emit:
rotr32:
mov %CL, BYTE PTR [%ESP + 8]
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, %CL
ret
rotli32:
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, 27
ret
We also emit byte rotate instructions which do not have a sh[lr]d counterpart
at all.
llvm-svn: 19692
2005-01-19 16:07:05 +08:00
|
|
|
if (ConstantSDNode *ShrCst = dyn_cast<ConstantSDNode>(ShrAmt))
|
|
|
|
if (ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(ShlAmt))
|
|
|
|
if (ShrCst->getValue() < RegSize && ShlCst->getValue() < RegSize)
|
|
|
|
if (ShrCst->getValue() == RegSize-ShlCst->getValue()) {
|
|
|
|
// (A >> 5) | (A << 27) --> ROR A, 5
|
|
|
|
// (A >> 5) | (B << 27) --> SHRD A, B, 5
|
|
|
|
if (ShrVal == ShlVal) {
|
|
|
|
unsigned Reg = SelectExpr(ShrVal);
|
|
|
|
unsigned Opc = RegSize == 8 ? X86::ROR8ri :
|
|
|
|
(RegSize == 16 ? X86::ROR16ri : X86::ROR32ri);
|
|
|
|
BuildMI(BB, Opc, 2, DestReg).addReg(Reg).addImm(ShrCst->getValue());
|
|
|
|
return true;
|
|
|
|
} else if (RegSize != 8) {
|
|
|
|
unsigned AReg, BReg;
|
|
|
|
if (getRegPressure(ShlVal) > getRegPressure(ShrVal)) {
|
|
|
|
BReg = SelectExpr(ShlVal);
|
2005-01-20 01:24:34 +08:00
|
|
|
AReg = SelectExpr(ShrVal);
|
Implement Regression/CodeGen/X86/rotate.ll: emit rotate instructions (which
typically cost 1 cycle) instead of shld/shrd instruction (which are typically
6 or more cycles). This also saves code space.
For example, instead of emitting:
rotr:
mov %EAX, DWORD PTR [%ESP + 4]
mov %CL, BYTE PTR [%ESP + 8]
shrd %EAX, %EAX, %CL
ret
rotli:
mov %EAX, DWORD PTR [%ESP + 4]
shrd %EAX, %EAX, 27
ret
Emit:
rotr32:
mov %CL, BYTE PTR [%ESP + 8]
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, %CL
ret
rotli32:
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, 27
ret
We also emit byte rotate instructions which do not have a sh[lr]d counterpart
at all.
llvm-svn: 19692
2005-01-19 16:07:05 +08:00
|
|
|
} else {
|
|
|
|
AReg = SelectExpr(ShrVal);
|
2005-01-20 01:24:34 +08:00
|
|
|
BReg = SelectExpr(ShlVal);
|
Implement Regression/CodeGen/X86/rotate.ll: emit rotate instructions (which
typically cost 1 cycle) instead of shld/shrd instruction (which are typically
6 or more cycles). This also saves code space.
For example, instead of emitting:
rotr:
mov %EAX, DWORD PTR [%ESP + 4]
mov %CL, BYTE PTR [%ESP + 8]
shrd %EAX, %EAX, %CL
ret
rotli:
mov %EAX, DWORD PTR [%ESP + 4]
shrd %EAX, %EAX, 27
ret
Emit:
rotr32:
mov %CL, BYTE PTR [%ESP + 8]
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, %CL
ret
rotli32:
mov %EAX, DWORD PTR [%ESP + 4]
ror %EAX, 27
ret
We also emit byte rotate instructions which do not have a sh[lr]d counterpart
at all.
llvm-svn: 19692
2005-01-19 16:07:05 +08:00
|
|
|
}
|
|
|
|
unsigned Opc = RegSize == 16 ? X86::SHRD16rri8 : X86::SHRD32rri8;
|
|
|
|
BuildMI(BB, Opc, 3, DestReg).addReg(AReg).addReg(BReg)
|
|
|
|
.addImm(ShrCst->getValue());
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
unsigned ISel::SelectExpr(SDOperand N) {
|
|
|
|
unsigned Result;
|
|
|
|
unsigned Tmp1, Tmp2, Tmp3;
|
|
|
|
unsigned Opc = 0;
|
2005-01-09 03:28:19 +08:00
|
|
|
SDNode *Node = N.Val;
|
2005-01-12 05:19:59 +08:00
|
|
|
SDOperand Op0, Op1;
|
2005-01-09 03:28:19 +08:00
|
|
|
|
2005-01-15 06:37:41 +08:00
|
|
|
if (Node->getOpcode() == ISD::CopyFromReg) {
|
|
|
|
// FIXME: Handle copy from physregs!
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
// Just use the specified register as our input.
|
2005-01-14 04:50:02 +08:00
|
|
|
return dyn_cast<RegSDNode>(Node)->getReg();
|
2005-01-15 06:37:41 +08:00
|
|
|
}
|
2005-01-12 05:19:59 +08:00
|
|
|
|
|
|
|
unsigned &Reg = ExprMap[N];
|
|
|
|
if (Reg) return Reg;
|
|
|
|
|
2005-01-21 02:53:00 +08:00
|
|
|
if (N.getOpcode() != ISD::CALL && N.getOpcode() != ISD::ADD_PARTS &&
|
|
|
|
N.getOpcode() != ISD::SUB_PARTS)
|
2005-01-12 05:19:59 +08:00
|
|
|
Reg = Result = (N.getValueType() != MVT::Other) ?
|
|
|
|
MakeReg(N.getValueType()) : 1;
|
|
|
|
else {
|
|
|
|
// If this is a call instruction, make sure to prepare ALL of the result
|
|
|
|
// values as well as the chain.
|
2005-01-21 02:53:00 +08:00
|
|
|
if (N.getOpcode() == ISD::CALL) {
|
|
|
|
if (Node->getNumValues() == 1)
|
|
|
|
Reg = Result = 1; // Void call, just a chain.
|
|
|
|
else {
|
|
|
|
Result = MakeReg(Node->getValueType(0));
|
|
|
|
ExprMap[N.getValue(0)] = Result;
|
|
|
|
for (unsigned i = 1, e = N.Val->getNumValues()-1; i != e; ++i)
|
|
|
|
ExprMap[N.getValue(i)] = MakeReg(Node->getValueType(i));
|
|
|
|
ExprMap[SDOperand(Node, Node->getNumValues()-1)] = 1;
|
|
|
|
}
|
|
|
|
} else {
|
2005-01-12 05:19:59 +08:00
|
|
|
Result = MakeReg(Node->getValueType(0));
|
|
|
|
ExprMap[N.getValue(0)] = Result;
|
2005-01-21 02:53:00 +08:00
|
|
|
for (unsigned i = 1, e = N.Val->getNumValues(); i != e; ++i)
|
2005-01-12 05:19:59 +08:00
|
|
|
ExprMap[N.getValue(i)] = MakeReg(Node->getValueType(i));
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
|
|
|
}
|
2005-01-12 05:19:59 +08:00
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getOpcode()) {
|
|
|
|
default:
|
2005-01-09 03:28:19 +08:00
|
|
|
Node->dump();
|
2005-01-07 15:49:41 +08:00
|
|
|
assert(0 && "Node not handled!\n");
|
|
|
|
case ISD::FrameIndex:
|
|
|
|
Tmp1 = cast<FrameIndexSDNode>(N)->getIndex();
|
|
|
|
addFrameReference(BuildMI(BB, X86::LEA32r, 4, Result), (int)Tmp1);
|
|
|
|
return Result;
|
|
|
|
case ISD::ConstantPool:
|
|
|
|
Tmp1 = cast<ConstantPoolSDNode>(N)->getIndex();
|
|
|
|
addConstantPoolReference(BuildMI(BB, X86::LEA32r, 4, Result), Tmp1);
|
|
|
|
return Result;
|
|
|
|
case ISD::ConstantFP:
|
|
|
|
ContainsFPCode = true;
|
|
|
|
Tmp1 = Result; // Intermediate Register
|
|
|
|
if (cast<ConstantFPSDNode>(N)->getValue() < 0.0 ||
|
|
|
|
cast<ConstantFPSDNode>(N)->isExactlyValue(-0.0))
|
|
|
|
Tmp1 = MakeReg(MVT::f64);
|
|
|
|
|
|
|
|
if (cast<ConstantFPSDNode>(N)->isExactlyValue(+0.0) ||
|
|
|
|
cast<ConstantFPSDNode>(N)->isExactlyValue(-0.0))
|
|
|
|
BuildMI(BB, X86::FLD0, 0, Tmp1);
|
|
|
|
else if (cast<ConstantFPSDNode>(N)->isExactlyValue(+1.0) ||
|
|
|
|
cast<ConstantFPSDNode>(N)->isExactlyValue(-1.0))
|
|
|
|
BuildMI(BB, X86::FLD1, 0, Tmp1);
|
|
|
|
else
|
|
|
|
assert(0 && "Unexpected constant!");
|
|
|
|
if (Tmp1 != Result)
|
|
|
|
BuildMI(BB, X86::FCHS, 1, Result).addReg(Tmp1);
|
|
|
|
return Result;
|
|
|
|
case ISD::Constant:
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot use constants of this type!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::MOV8ri; break;
|
|
|
|
case MVT::i16: Opc = X86::MOV16ri; break;
|
|
|
|
case MVT::i32: Opc = X86::MOV32ri; break;
|
|
|
|
}
|
|
|
|
BuildMI(BB, Opc, 1,Result).addImm(cast<ConstantSDNode>(N)->getValue());
|
|
|
|
return Result;
|
|
|
|
case ISD::GlobalAddress: {
|
|
|
|
GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
|
|
|
|
BuildMI(BB, X86::MOV32ri, 1, Result).addGlobalAddress(GV);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
case ISD::ExternalSymbol: {
|
|
|
|
const char *Sym = cast<ExternalSymbolSDNode>(N)->getSymbol();
|
|
|
|
BuildMI(BB, X86::MOV32ri, 1, Result).addExternalSymbol(Sym);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
case ISD::ZERO_EXTEND: {
|
|
|
|
int DestIs16 = N.getValueType() == MVT::i16;
|
|
|
|
int SrcIs16 = N.getOperand(0).getValueType() == MVT::i16;
|
2005-01-10 02:52:44 +08:00
|
|
|
|
|
|
|
// FIXME: This hack is here for zero extension casts from bool to i8. This
|
|
|
|
// would not be needed if bools were promoted by Legalize.
|
|
|
|
if (N.getValueType() == MVT::i8) {
|
2005-01-12 07:33:00 +08:00
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
2005-01-10 02:52:44 +08:00
|
|
|
BuildMI(BB, X86::MOV8rr, 1, Result).addReg(Tmp1);
|
|
|
|
return Result;
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
|
2005-01-17 14:26:58 +08:00
|
|
|
if (isFoldableLoad(N.getOperand(0), SDOperand())) {
|
2005-01-12 07:33:00 +08:00
|
|
|
static const unsigned Opc[3] = {
|
|
|
|
X86::MOVZX32rm8, X86::MOVZX32rm16, X86::MOVZX16rm8
|
|
|
|
};
|
|
|
|
|
|
|
|
X86AddressMode AM;
|
|
|
|
EmitFoldedLoad(N.getOperand(0), AM);
|
|
|
|
addFullAddress(BuildMI(BB, Opc[SrcIs16+DestIs16*2], 4, Result), AM);
|
|
|
|
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
static const unsigned Opc[3] = {
|
|
|
|
X86::MOVZX32rr8, X86::MOVZX32rr16, X86::MOVZX16rr8
|
|
|
|
};
|
2005-01-12 07:33:00 +08:00
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc[SrcIs16+DestIs16*2], 1, Result).addReg(Tmp1);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
case ISD::SIGN_EXTEND: {
|
|
|
|
int DestIs16 = N.getValueType() == MVT::i16;
|
|
|
|
int SrcIs16 = N.getOperand(0).getValueType() == MVT::i16;
|
|
|
|
|
2005-01-10 02:52:44 +08:00
|
|
|
// FIXME: Legalize should promote bools to i8!
|
|
|
|
assert(N.getOperand(0).getValueType() != MVT::i1 &&
|
|
|
|
"Sign extend from bool not implemented!");
|
|
|
|
|
2005-01-17 14:26:58 +08:00
|
|
|
if (isFoldableLoad(N.getOperand(0), SDOperand())) {
|
2005-01-12 07:33:00 +08:00
|
|
|
static const unsigned Opc[3] = {
|
|
|
|
X86::MOVSX32rm8, X86::MOVSX32rm16, X86::MOVSX16rm8
|
|
|
|
};
|
|
|
|
|
|
|
|
X86AddressMode AM;
|
|
|
|
EmitFoldedLoad(N.getOperand(0), AM);
|
|
|
|
addFullAddress(BuildMI(BB, Opc[SrcIs16+DestIs16*2], 4, Result), AM);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
static const unsigned Opc[3] = {
|
|
|
|
X86::MOVSX32rr8, X86::MOVSX32rr16, X86::MOVSX16rr8
|
|
|
|
};
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
BuildMI(BB, Opc[SrcIs16+DestIs16*2], 1, Result).addReg(Tmp1);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
case ISD::TRUNCATE:
|
2005-01-12 10:19:06 +08:00
|
|
|
// Fold TRUNCATE (LOAD P) into a smaller load from P.
|
2005-01-19 04:05:56 +08:00
|
|
|
// FIXME: This should be performed by the DAGCombiner.
|
2005-01-17 14:26:58 +08:00
|
|
|
if (isFoldableLoad(N.getOperand(0), SDOperand())) {
|
2005-01-12 10:19:06 +08:00
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Unknown truncate!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::MOV8rm; break;
|
|
|
|
case MVT::i16: Opc = X86::MOV16rm; break;
|
|
|
|
}
|
|
|
|
X86AddressMode AM;
|
|
|
|
EmitFoldedLoad(N.getOperand(0), AM);
|
|
|
|
addFullAddress(BuildMI(BB, Opc, 4, Result), AM);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
// Handle cast of LARGER int to SMALLER int using a move to EAX followed by
|
|
|
|
// a move out of AX or AL.
|
|
|
|
switch (N.getOperand(0).getValueType()) {
|
|
|
|
default: assert(0 && "Unknown truncate!");
|
|
|
|
case MVT::i8: Tmp2 = X86::AL; Opc = X86::MOV8rr; break;
|
|
|
|
case MVT::i16: Tmp2 = X86::AX; Opc = X86::MOV16rr; break;
|
|
|
|
case MVT::i32: Tmp2 = X86::EAX; Opc = X86::MOV32rr; break;
|
|
|
|
}
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
BuildMI(BB, Opc, 1, Tmp2).addReg(Tmp1);
|
|
|
|
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Unknown truncate!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Tmp2 = X86::AL; Opc = X86::MOV8rr; break;
|
|
|
|
case MVT::i16: Tmp2 = X86::AX; Opc = X86::MOV16rr; break;
|
|
|
|
}
|
|
|
|
BuildMI(BB, Opc, 1, Result).addReg(Tmp2);
|
|
|
|
return Result;
|
|
|
|
|
2005-01-10 02:52:44 +08:00
|
|
|
case ISD::SINT_TO_FP:
|
|
|
|
case ISD::UINT_TO_FP: {
|
|
|
|
// FIXME: Most of this grunt work should be done by legalize!
|
2005-01-11 11:50:45 +08:00
|
|
|
ContainsFPCode = true;
|
2005-01-10 02:52:44 +08:00
|
|
|
|
|
|
|
// Promote the integer to a type supported by FLD. We do this because there
|
|
|
|
// are no unsigned FLD instructions, so we must promote an unsigned value to
|
|
|
|
// a larger signed value, then use FLD on the larger value.
|
|
|
|
//
|
|
|
|
MVT::ValueType PromoteType = MVT::Other;
|
|
|
|
MVT::ValueType SrcTy = N.getOperand(0).getValueType();
|
|
|
|
unsigned PromoteOpcode = 0;
|
|
|
|
unsigned RealDestReg = Result;
|
|
|
|
switch (SrcTy) {
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8:
|
|
|
|
// We don't have the facilities for directly loading byte sized data from
|
|
|
|
// memory (even signed). Promote it to 16 bits.
|
|
|
|
PromoteType = MVT::i16;
|
|
|
|
PromoteOpcode = Node->getOpcode() == ISD::SINT_TO_FP ?
|
|
|
|
X86::MOVSX16rr8 : X86::MOVZX16rr8;
|
|
|
|
break;
|
|
|
|
case MVT::i16:
|
|
|
|
if (Node->getOpcode() == ISD::UINT_TO_FP) {
|
|
|
|
PromoteType = MVT::i32;
|
|
|
|
PromoteOpcode = X86::MOVZX32rr16;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
// Don't fild into the real destination.
|
|
|
|
if (Node->getOpcode() == ISD::UINT_TO_FP)
|
|
|
|
Result = MakeReg(Node->getValueType(0));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0)); // Get the operand register
|
|
|
|
|
|
|
|
if (PromoteType != MVT::Other) {
|
|
|
|
Tmp2 = MakeReg(PromoteType);
|
|
|
|
BuildMI(BB, PromoteOpcode, 1, Tmp2).addReg(Tmp1);
|
|
|
|
SrcTy = PromoteType;
|
|
|
|
Tmp1 = Tmp2;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Spill the integer to memory and reload it from there.
|
|
|
|
unsigned Size = MVT::getSizeInBits(SrcTy)/8;
|
|
|
|
MachineFunction *F = BB->getParent();
|
|
|
|
int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size);
|
|
|
|
|
|
|
|
switch (SrcTy) {
|
|
|
|
case MVT::i64:
|
2005-01-12 12:21:28 +08:00
|
|
|
assert(0 && "Cast ulong to FP not implemented yet!");
|
2005-01-10 02:52:44 +08:00
|
|
|
// FIXME: this won't work for cast [u]long to FP
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV32mr, 5),
|
|
|
|
FrameIdx).addReg(Tmp1);
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV32mr, 5),
|
|
|
|
FrameIdx, 4).addReg(Tmp1+1);
|
|
|
|
addFrameReference(BuildMI(BB, X86::FILD64m, 5, Result), FrameIdx);
|
|
|
|
break;
|
|
|
|
case MVT::i32:
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV32mr, 5),
|
|
|
|
FrameIdx).addReg(Tmp1);
|
|
|
|
addFrameReference(BuildMI(BB, X86::FILD32m, 5, Result), FrameIdx);
|
|
|
|
break;
|
|
|
|
case MVT::i16:
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV16mr, 5),
|
|
|
|
FrameIdx).addReg(Tmp1);
|
|
|
|
addFrameReference(BuildMI(BB, X86::FILD16m, 5, Result), FrameIdx);
|
|
|
|
break;
|
|
|
|
default: break; // No promotion required.
|
|
|
|
}
|
|
|
|
|
2005-01-12 12:00:00 +08:00
|
|
|
if (Node->getOpcode() == ISD::UINT_TO_FP && Result != RealDestReg) {
|
2005-01-10 02:52:44 +08:00
|
|
|
// If this is a cast from uint -> double, we need to be careful when if
|
|
|
|
// the "sign" bit is set. If so, we don't want to make a negative number,
|
|
|
|
// we want to make a positive number. Emit code to add an offset if the
|
|
|
|
// sign bit is set.
|
|
|
|
|
|
|
|
// Compute whether the sign bit is set by shifting the reg right 31 bits.
|
|
|
|
unsigned IsNeg = MakeReg(MVT::i32);
|
|
|
|
BuildMI(BB, X86::SHR32ri, 2, IsNeg).addReg(Tmp1).addImm(31);
|
|
|
|
|
|
|
|
// Create a CP value that has the offset in one word and 0 in the other.
|
|
|
|
static ConstantInt *TheOffset = ConstantUInt::get(Type::ULongTy,
|
|
|
|
0x4f80000000000000ULL);
|
|
|
|
unsigned CPI = F->getConstantPool()->getConstantPoolIndex(TheOffset);
|
|
|
|
BuildMI(BB, X86::FADD32m, 5, RealDestReg).addReg(Result)
|
|
|
|
.addConstantPoolIndex(CPI).addZImm(4).addReg(IsNeg).addSImm(0);
|
|
|
|
|
|
|
|
} else if (Node->getOpcode() == ISD::UINT_TO_FP && SrcTy == MVT::i64) {
|
|
|
|
// We need special handling for unsigned 64-bit integer sources. If the
|
|
|
|
// input number has the "sign bit" set, then we loaded it incorrectly as a
|
|
|
|
// negative 64-bit number. In this case, add an offset value.
|
|
|
|
|
|
|
|
// Emit a test instruction to see if the dynamic input value was signed.
|
|
|
|
BuildMI(BB, X86::TEST32rr, 2).addReg(Tmp1+1).addReg(Tmp1+1);
|
|
|
|
|
|
|
|
// If the sign bit is set, get a pointer to an offset, otherwise get a
|
|
|
|
// pointer to a zero.
|
|
|
|
MachineConstantPool *CP = F->getConstantPool();
|
|
|
|
unsigned Zero = MakeReg(MVT::i32);
|
|
|
|
Constant *Null = Constant::getNullValue(Type::UIntTy);
|
|
|
|
addConstantPoolReference(BuildMI(BB, X86::LEA32r, 5, Zero),
|
|
|
|
CP->getConstantPoolIndex(Null));
|
|
|
|
unsigned Offset = MakeReg(MVT::i32);
|
|
|
|
Constant *OffsetCst = ConstantUInt::get(Type::UIntTy, 0x5f800000);
|
|
|
|
|
|
|
|
addConstantPoolReference(BuildMI(BB, X86::LEA32r, 5, Offset),
|
|
|
|
CP->getConstantPoolIndex(OffsetCst));
|
|
|
|
unsigned Addr = MakeReg(MVT::i32);
|
|
|
|
BuildMI(BB, X86::CMOVS32rr, 2, Addr).addReg(Zero).addReg(Offset);
|
|
|
|
|
|
|
|
// Load the constant for an add. FIXME: this could make an 'fadd' that
|
|
|
|
// reads directly from memory, but we don't support these yet.
|
|
|
|
unsigned ConstReg = MakeReg(MVT::f64);
|
|
|
|
addDirectMem(BuildMI(BB, X86::FLD32m, 4, ConstReg), Addr);
|
|
|
|
|
|
|
|
BuildMI(BB, X86::FpADD, 2, RealDestReg).addReg(ConstReg).addReg(Result);
|
|
|
|
}
|
|
|
|
return RealDestReg;
|
|
|
|
}
|
|
|
|
case ISD::FP_TO_SINT:
|
|
|
|
case ISD::FP_TO_UINT: {
|
|
|
|
// FIXME: Most of this grunt work should be done by legalize!
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0)); // Get the operand register
|
|
|
|
|
|
|
|
// Change the floating point control register to use "round towards zero"
|
|
|
|
// mode when truncating to an integer value.
|
|
|
|
//
|
|
|
|
MachineFunction *F = BB->getParent();
|
|
|
|
int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
|
|
|
|
addFrameReference(BuildMI(BB, X86::FNSTCW16m, 4), CWFrameIdx);
|
|
|
|
|
|
|
|
// Load the old value of the high byte of the control word...
|
|
|
|
unsigned HighPartOfCW = MakeReg(MVT::i8);
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV8rm, 4, HighPartOfCW),
|
|
|
|
CWFrameIdx, 1);
|
|
|
|
|
|
|
|
// Set the high part to be round to zero...
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV8mi, 5),
|
|
|
|
CWFrameIdx, 1).addImm(12);
|
|
|
|
|
|
|
|
// Reload the modified control word now...
|
|
|
|
addFrameReference(BuildMI(BB, X86::FLDCW16m, 4), CWFrameIdx);
|
|
|
|
|
|
|
|
// Restore the memory image of control word to original value
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV8mr, 5),
|
|
|
|
CWFrameIdx, 1).addReg(HighPartOfCW);
|
|
|
|
|
|
|
|
// We don't have the facilities for directly storing byte sized data to
|
|
|
|
// memory. Promote it to 16 bits. We also must promote unsigned values to
|
|
|
|
// larger classes because we only have signed FP stores.
|
|
|
|
MVT::ValueType StoreClass = Node->getValueType(0);
|
|
|
|
if (StoreClass == MVT::i8 || Node->getOpcode() == ISD::FP_TO_UINT)
|
|
|
|
switch (StoreClass) {
|
|
|
|
case MVT::i8: StoreClass = MVT::i16; break;
|
|
|
|
case MVT::i16: StoreClass = MVT::i32; break;
|
|
|
|
case MVT::i32: StoreClass = MVT::i64; break;
|
|
|
|
// The following treatment of cLong may not be perfectly right,
|
|
|
|
// but it survives chains of casts of the form
|
|
|
|
// double->ulong->double.
|
|
|
|
case MVT::i64: StoreClass = MVT::i64; break;
|
|
|
|
default: assert(0 && "Unknown store class!");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Spill the integer to memory and reload it from there.
|
|
|
|
unsigned Size = MVT::getSizeInBits(StoreClass)/8;
|
|
|
|
int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size);
|
|
|
|
|
|
|
|
switch (StoreClass) {
|
|
|
|
default: assert(0 && "Unknown store class!");
|
|
|
|
case MVT::i16:
|
|
|
|
addFrameReference(BuildMI(BB, X86::FIST16m, 5), FrameIdx).addReg(Tmp1);
|
|
|
|
break;
|
|
|
|
case MVT::i32:
|
2005-01-10 03:49:59 +08:00
|
|
|
addFrameReference(BuildMI(BB, X86::FIST32m, 5), FrameIdx).addReg(Tmp1);
|
2005-01-10 02:52:44 +08:00
|
|
|
break;
|
|
|
|
case MVT::i64:
|
2005-01-10 03:49:59 +08:00
|
|
|
addFrameReference(BuildMI(BB, X86::FISTP64m, 5), FrameIdx).addReg(Tmp1);
|
2005-01-10 02:52:44 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (Node->getValueType(0)) {
|
|
|
|
default:
|
|
|
|
assert(0 && "Unknown integer type!");
|
|
|
|
case MVT::i64:
|
|
|
|
// FIXME: this isn't gunna work.
|
2005-01-12 12:21:28 +08:00
|
|
|
assert(0 && "Cast FP to long not implemented yet!");
|
2005-01-10 02:52:44 +08:00
|
|
|
addFrameReference(BuildMI(BB, X86::MOV32rm, 4, Result), FrameIdx);
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV32rm, 4, Result+1), FrameIdx, 4);
|
|
|
|
case MVT::i32:
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV32rm, 4, Result), FrameIdx);
|
|
|
|
break;
|
|
|
|
case MVT::i16:
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV16rm, 4, Result), FrameIdx);
|
|
|
|
break;
|
|
|
|
case MVT::i8:
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV8rm, 4, Result), FrameIdx);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reload the original control word now.
|
|
|
|
addFrameReference(BuildMI(BB, X86::FLDCW16m, 4), CWFrameIdx);
|
|
|
|
return Result;
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
case ISD::ADD:
|
2005-01-12 05:19:59 +08:00
|
|
|
Op0 = N.getOperand(0);
|
|
|
|
Op1 = N.getOperand(1);
|
|
|
|
|
2005-01-26 04:03:11 +08:00
|
|
|
if (isFoldableLoad(Op0, Op1, true)) {
|
2005-01-12 05:19:59 +08:00
|
|
|
std::swap(Op0, Op1);
|
2005-01-17 14:26:58 +08:00
|
|
|
goto FoldAdd;
|
|
|
|
}
|
2005-01-12 05:19:59 +08:00
|
|
|
|
2005-01-26 04:03:11 +08:00
|
|
|
if (isFoldableLoad(Op1, Op0, true)) {
|
2005-01-17 14:26:58 +08:00
|
|
|
FoldAdd:
|
2005-01-12 05:19:59 +08:00
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot add this type!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::ADD8rm; break;
|
|
|
|
case MVT::i16: Opc = X86::ADD16rm; break;
|
|
|
|
case MVT::i32: Opc = X86::ADD32rm; break;
|
2005-01-26 04:03:11 +08:00
|
|
|
case MVT::f64:
|
|
|
|
// For F64, handle promoted load operations (from F32) as well!
|
|
|
|
Opc = Op1.getOpcode() == ISD::LOAD ? X86::FADD64m : X86::FADD32m;
|
|
|
|
break;
|
2005-01-12 05:19:59 +08:00
|
|
|
}
|
|
|
|
X86AddressMode AM;
|
2005-01-13 13:53:16 +08:00
|
|
|
EmitFoldedLoad(Op1, AM);
|
|
|
|
Tmp1 = SelectExpr(Op0);
|
2005-01-12 05:19:59 +08:00
|
|
|
addFullAddress(BuildMI(BB, Opc, 5, Result).addReg(Tmp1), AM);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
// See if we can codegen this as an LEA to fold operations together.
|
|
|
|
if (N.getValueType() == MVT::i32) {
|
2005-01-18 10:25:52 +08:00
|
|
|
ExprMap.erase(N);
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
X86ISelAddressMode AM;
|
2005-01-18 10:25:52 +08:00
|
|
|
MatchAddress(N, AM);
|
|
|
|
ExprMap[N] = Result;
|
|
|
|
|
|
|
|
// If this is not just an add, emit the LEA. For a simple add (like
|
|
|
|
// reg+reg or reg+imm), we just emit an add. It might be a good idea to
|
|
|
|
// leave this as LEA, then peephole it to 'ADD' after two address elim
|
|
|
|
// happens.
|
|
|
|
if (AM.Scale != 1 || AM.BaseType == X86ISelAddressMode::FrameIndexBase||
|
|
|
|
AM.GV || (AM.Base.Reg.Val && AM.IndexReg.Val && AM.Disp)) {
|
|
|
|
X86AddressMode XAM = SelectAddrExprs(AM);
|
|
|
|
addFullAddress(BuildMI(BB, X86::LEA32r, 4, Result), XAM);
|
|
|
|
return Result;
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
|
2005-01-12 05:19:59 +08:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
|
2005-01-07 15:49:41 +08:00
|
|
|
Opc = 0;
|
|
|
|
if (CN->getValue() == 1) { // add X, 1 -> inc X
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot integer add this type!");
|
|
|
|
case MVT::i8: Opc = X86::INC8r; break;
|
|
|
|
case MVT::i16: Opc = X86::INC16r; break;
|
|
|
|
case MVT::i32: Opc = X86::INC32r; break;
|
|
|
|
}
|
|
|
|
} else if (CN->isAllOnesValue()) { // add X, -1 -> dec X
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot integer add this type!");
|
|
|
|
case MVT::i8: Opc = X86::DEC8r; break;
|
|
|
|
case MVT::i16: Opc = X86::DEC16r; break;
|
|
|
|
case MVT::i32: Opc = X86::DEC32r; break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Opc) {
|
2005-01-12 05:19:59 +08:00
|
|
|
Tmp1 = SelectExpr(Op0);
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc, 1, Result).addReg(Tmp1);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot add this type!");
|
|
|
|
case MVT::i8: Opc = X86::ADD8ri; break;
|
|
|
|
case MVT::i16: Opc = X86::ADD16ri; break;
|
|
|
|
case MVT::i32: Opc = X86::ADD32ri; break;
|
|
|
|
}
|
|
|
|
if (Opc) {
|
2005-01-12 05:19:59 +08:00
|
|
|
Tmp1 = SelectExpr(Op0);
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addImm(CN->getValue());
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot add this type!");
|
|
|
|
case MVT::i8: Opc = X86::ADD8rr; break;
|
|
|
|
case MVT::i16: Opc = X86::ADD16rr; break;
|
|
|
|
case MVT::i32: Opc = X86::ADD32rr; break;
|
2005-01-11 11:50:45 +08:00
|
|
|
case MVT::f64: Opc = X86::FpADD; break;
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
|
2005-01-12 05:19:59 +08:00
|
|
|
if (getRegPressure(Op0) > getRegPressure(Op1)) {
|
|
|
|
Tmp1 = SelectExpr(Op0);
|
|
|
|
Tmp2 = SelectExpr(Op1);
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
} else {
|
2005-01-12 05:19:59 +08:00
|
|
|
Tmp2 = SelectExpr(Op1);
|
|
|
|
Tmp1 = SelectExpr(Op0);
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2);
|
|
|
|
return Result;
|
|
|
|
case ISD::SUB:
|
2005-01-12 05:19:59 +08:00
|
|
|
case ISD::MUL:
|
|
|
|
case ISD::AND:
|
|
|
|
case ISD::OR:
|
2005-01-12 12:23:22 +08:00
|
|
|
case ISD::XOR: {
|
2005-01-12 05:19:59 +08:00
|
|
|
static const unsigned SUBTab[] = {
|
|
|
|
X86::SUB8ri, X86::SUB16ri, X86::SUB32ri, 0, 0,
|
|
|
|
X86::SUB8rm, X86::SUB16rm, X86::SUB32rm, X86::FSUB32m, X86::FSUB64m,
|
|
|
|
X86::SUB8rr, X86::SUB16rr, X86::SUB32rr, X86::FpSUB , X86::FpSUB,
|
|
|
|
};
|
|
|
|
static const unsigned MULTab[] = {
|
|
|
|
0, X86::IMUL16rri, X86::IMUL32rri, 0, 0,
|
|
|
|
0, X86::IMUL16rm , X86::IMUL32rm, X86::FMUL32m, X86::FMUL64m,
|
|
|
|
0, X86::IMUL16rr , X86::IMUL32rr, X86::FpMUL , X86::FpMUL,
|
|
|
|
};
|
|
|
|
static const unsigned ANDTab[] = {
|
|
|
|
X86::AND8ri, X86::AND16ri, X86::AND32ri, 0, 0,
|
|
|
|
X86::AND8rm, X86::AND16rm, X86::AND32rm, 0, 0,
|
|
|
|
X86::AND8rr, X86::AND16rr, X86::AND32rr, 0, 0,
|
|
|
|
};
|
|
|
|
static const unsigned ORTab[] = {
|
|
|
|
X86::OR8ri, X86::OR16ri, X86::OR32ri, 0, 0,
|
|
|
|
X86::OR8rm, X86::OR16rm, X86::OR32rm, 0, 0,
|
|
|
|
X86::OR8rr, X86::OR16rr, X86::OR32rr, 0, 0,
|
|
|
|
};
|
|
|
|
static const unsigned XORTab[] = {
|
|
|
|
X86::XOR8ri, X86::XOR16ri, X86::XOR32ri, 0, 0,
|
|
|
|
X86::XOR8rm, X86::XOR16rm, X86::XOR32rm, 0, 0,
|
|
|
|
X86::XOR8rr, X86::XOR16rr, X86::XOR32rr, 0, 0,
|
|
|
|
};
|
|
|
|
|
|
|
|
Op0 = Node->getOperand(0);
|
|
|
|
Op1 = Node->getOperand(1);
|
|
|
|
|
2005-01-19 15:37:26 +08:00
|
|
|
if (Node->getOpcode() == ISD::OR && Op0.hasOneUse() && Op1.hasOneUse())
|
|
|
|
if (EmitOrOpOp(Op0, Op1, Result)) // Match SHLD, SHRD, and rotates.
|
Codegen long >> 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
shrd %EAX, %EDX, 2
sar %EDX, 2
ret
instead of this:
test1:
mov %ECX, DWORD PTR [%ESP + 4]
shr %ECX, 2
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %EDX
shl %EAX, 30
or %EAX, %ECX
sar %EDX, 2
ret
and long << 2 to this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %EAX
shrd %EDX, %ECX, 30
shl %EAX, 2
ret
instead of this:
foo:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
shr %ECX, 30
mov %EDX, DWORD PTR [%ESP + 8]
shl %EDX, 2
or %EDX, %ECX
shl %EAX, 2
ret
The extra copy (marked ***) can be eliminated when I teach the code generator
that shrd32rri8 is really commutative.
llvm-svn: 19681
2005-01-19 14:18:43 +08:00
|
|
|
return Result;
|
|
|
|
|
|
|
|
if (Node->getOpcode() == ISD::SUB)
|
2005-01-07 15:49:41 +08:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(0)))
|
|
|
|
if (CN->isNullValue()) { // 0 - N -> neg N
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot sub this type!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::NEG8r; break;
|
|
|
|
case MVT::i16: Opc = X86::NEG16r; break;
|
|
|
|
case MVT::i32: Opc = X86::NEG32r; break;
|
|
|
|
}
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
BuildMI(BB, Opc, 1, Result).addReg(Tmp1);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
2005-01-12 05:19:59 +08:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
|
|
|
|
if (CN->isAllOnesValue() && Node->getOpcode() == ISD::XOR) {
|
2005-01-17 08:23:16 +08:00
|
|
|
Opc = 0;
|
2005-01-11 12:31:30 +08:00
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot add this type!");
|
2005-01-17 08:23:16 +08:00
|
|
|
case MVT::i1: break; // Not supported, don't invert upper bits!
|
2005-01-11 12:31:30 +08:00
|
|
|
case MVT::i8: Opc = X86::NOT8r; break;
|
|
|
|
case MVT::i16: Opc = X86::NOT16r; break;
|
|
|
|
case MVT::i32: Opc = X86::NOT32r; break;
|
|
|
|
}
|
2005-01-17 08:23:16 +08:00
|
|
|
if (Opc) {
|
|
|
|
Tmp1 = SelectExpr(Op0);
|
|
|
|
BuildMI(BB, Opc, 1, Result).addReg(Tmp1);
|
|
|
|
return Result;
|
|
|
|
}
|
2005-01-11 12:31:30 +08:00
|
|
|
}
|
|
|
|
|
2005-01-17 14:48:02 +08:00
|
|
|
// Fold common multiplies into LEA instructions.
|
|
|
|
if (Node->getOpcode() == ISD::MUL && N.getValueType() == MVT::i32) {
|
|
|
|
switch ((int)CN->getValue()) {
|
|
|
|
default: break;
|
|
|
|
case 3:
|
|
|
|
case 5:
|
|
|
|
case 9:
|
|
|
|
// Remove N from exprmap so SelectAddress doesn't get confused.
|
|
|
|
ExprMap.erase(N);
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
X86AddressMode AM;
|
2005-01-17 14:48:02 +08:00
|
|
|
SelectAddress(N, AM);
|
|
|
|
// Restore it to the map.
|
|
|
|
ExprMap[N] = Result;
|
|
|
|
addFullAddress(BuildMI(BB, X86::LEA32r, 4, Result), AM);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getValueType()) {
|
2005-01-11 12:31:30 +08:00
|
|
|
default: assert(0 && "Cannot xor this type!");
|
2005-01-07 15:49:41 +08:00
|
|
|
case MVT::i1:
|
2005-01-12 05:19:59 +08:00
|
|
|
case MVT::i8: Opc = 0; break;
|
|
|
|
case MVT::i16: Opc = 1; break;
|
|
|
|
case MVT::i32: Opc = 2; break;
|
|
|
|
}
|
|
|
|
switch (Node->getOpcode()) {
|
|
|
|
default: assert(0 && "Unreachable!");
|
|
|
|
case ISD::SUB: Opc = SUBTab[Opc]; break;
|
|
|
|
case ISD::MUL: Opc = MULTab[Opc]; break;
|
|
|
|
case ISD::AND: Opc = ANDTab[Opc]; break;
|
|
|
|
case ISD::OR: Opc = ORTab[Opc]; break;
|
|
|
|
case ISD::XOR: Opc = XORTab[Opc]; break;
|
|
|
|
}
|
|
|
|
if (Opc) { // Can't fold MUL:i8 R, imm
|
|
|
|
Tmp1 = SelectExpr(Op0);
|
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addImm(CN->getValue());
|
|
|
|
return Result;
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
}
|
|
|
|
|
2005-01-26 04:03:11 +08:00
|
|
|
if (isFoldableLoad(Op0, Op1, true))
|
2005-01-12 05:19:59 +08:00
|
|
|
if (Node->getOpcode() != ISD::SUB) {
|
|
|
|
std::swap(Op0, Op1);
|
2005-01-17 14:26:58 +08:00
|
|
|
goto FoldOps;
|
2005-01-12 05:19:59 +08:00
|
|
|
} else {
|
2005-01-26 04:03:11 +08:00
|
|
|
// For FP, emit 'reverse' subract, with a memory operand.
|
|
|
|
if (N.getValueType() == MVT::f64) {
|
|
|
|
if (Op0.getOpcode() == ISD::EXTLOAD)
|
|
|
|
Opc = X86::FSUBR32m;
|
|
|
|
else
|
|
|
|
Opc = X86::FSUBR64m;
|
|
|
|
|
2005-01-12 05:19:59 +08:00
|
|
|
X86AddressMode AM;
|
2005-01-13 13:53:16 +08:00
|
|
|
EmitFoldedLoad(Op0, AM);
|
|
|
|
Tmp1 = SelectExpr(Op1);
|
2005-01-12 05:19:59 +08:00
|
|
|
addFullAddress(BuildMI(BB, Opc, 5, Result).addReg(Tmp1), AM);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
|
2005-01-26 04:03:11 +08:00
|
|
|
if (isFoldableLoad(Op1, Op0, true)) {
|
2005-01-17 14:26:58 +08:00
|
|
|
FoldOps:
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getValueType()) {
|
2005-01-12 05:19:59 +08:00
|
|
|
default: assert(0 && "Cannot operate on this type!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = 5; break;
|
|
|
|
case MVT::i16: Opc = 6; break;
|
|
|
|
case MVT::i32: Opc = 7; break;
|
2005-01-26 04:03:11 +08:00
|
|
|
// For F64, handle promoted load operations (from F32) as well!
|
|
|
|
case MVT::f64: Opc = Op1.getOpcode() == ISD::LOAD ? 9 : 8; break;
|
2005-01-12 05:19:59 +08:00
|
|
|
}
|
|
|
|
switch (Node->getOpcode()) {
|
|
|
|
default: assert(0 && "Unreachable!");
|
|
|
|
case ISD::SUB: Opc = SUBTab[Opc]; break;
|
|
|
|
case ISD::MUL: Opc = MULTab[Opc]; break;
|
|
|
|
case ISD::AND: Opc = ANDTab[Opc]; break;
|
|
|
|
case ISD::OR: Opc = ORTab[Opc]; break;
|
|
|
|
case ISD::XOR: Opc = XORTab[Opc]; break;
|
|
|
|
}
|
|
|
|
|
|
|
|
X86AddressMode AM;
|
2005-01-13 13:53:16 +08:00
|
|
|
EmitFoldedLoad(Op1, AM);
|
|
|
|
Tmp1 = SelectExpr(Op0);
|
2005-01-07 15:49:41 +08:00
|
|
|
if (Opc) {
|
2005-01-12 05:19:59 +08:00
|
|
|
addFullAddress(BuildMI(BB, Opc, 5, Result).addReg(Tmp1), AM);
|
|
|
|
} else {
|
|
|
|
assert(Node->getOpcode() == ISD::MUL &&
|
|
|
|
N.getValueType() == MVT::i8 && "Unexpected situation!");
|
|
|
|
// Must use the MUL instruction, which forces use of AL.
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::AL).addReg(Tmp1);
|
|
|
|
addFullAddress(BuildMI(BB, X86::MUL8m, 1), AM);
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, Result).addReg(X86::AL);
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
2005-01-12 05:19:59 +08:00
|
|
|
return Result;
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
|
2005-01-12 05:19:59 +08:00
|
|
|
if (getRegPressure(Op0) > getRegPressure(Op1)) {
|
|
|
|
Tmp1 = SelectExpr(Op0);
|
|
|
|
Tmp2 = SelectExpr(Op1);
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
} else {
|
2005-01-12 05:19:59 +08:00
|
|
|
Tmp2 = SelectExpr(Op1);
|
|
|
|
Tmp1 = SelectExpr(Op0);
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
}
|
2005-01-12 05:19:59 +08:00
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot add this type!");
|
2005-01-12 05:19:59 +08:00
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = 10; break;
|
|
|
|
case MVT::i16: Opc = 11; break;
|
|
|
|
case MVT::i32: Opc = 12; break;
|
|
|
|
case MVT::f32: Opc = 13; break;
|
|
|
|
case MVT::f64: Opc = 14; break;
|
|
|
|
}
|
|
|
|
switch (Node->getOpcode()) {
|
|
|
|
default: assert(0 && "Unreachable!");
|
|
|
|
case ISD::SUB: Opc = SUBTab[Opc]; break;
|
|
|
|
case ISD::MUL: Opc = MULTab[Opc]; break;
|
|
|
|
case ISD::AND: Opc = ANDTab[Opc]; break;
|
|
|
|
case ISD::OR: Opc = ORTab[Opc]; break;
|
|
|
|
case ISD::XOR: Opc = XORTab[Opc]; break;
|
|
|
|
}
|
|
|
|
if (Opc) {
|
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2);
|
|
|
|
} else {
|
|
|
|
assert(Node->getOpcode() == ISD::MUL &&
|
|
|
|
N.getValueType() == MVT::i8 && "Unexpected situation!");
|
2005-01-11 04:55:48 +08:00
|
|
|
// Must use the MUL instruction, which forces use of AL.
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::AL).addReg(Tmp1);
|
|
|
|
BuildMI(BB, X86::MUL8r, 1).addReg(Tmp2);
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, Result).addReg(X86::AL);
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
|
|
|
return Result;
|
2005-01-12 12:23:22 +08:00
|
|
|
}
|
2005-01-21 02:53:00 +08:00
|
|
|
case ISD::ADD_PARTS:
|
|
|
|
case ISD::SUB_PARTS: {
|
|
|
|
assert(N.getNumOperands() == 4 && N.getValueType() == MVT::i32 &&
|
|
|
|
"Not an i64 add/sub!");
|
|
|
|
// Emit all of the operands.
|
|
|
|
std::vector<unsigned> InVals;
|
|
|
|
for (unsigned i = 0, e = N.getNumOperands(); i != e; ++i)
|
|
|
|
InVals.push_back(SelectExpr(N.getOperand(i)));
|
|
|
|
if (N.getOpcode() == ISD::ADD_PARTS) {
|
|
|
|
BuildMI(BB, X86::ADD32rr, 2, Result).addReg(InVals[0]).addReg(InVals[2]);
|
|
|
|
BuildMI(BB, X86::ADC32rr,2,Result+1).addReg(InVals[1]).addReg(InVals[3]);
|
|
|
|
} else {
|
|
|
|
BuildMI(BB, X86::SUB32rr, 2, Result).addReg(InVals[0]).addReg(InVals[2]);
|
|
|
|
BuildMI(BB, X86::SBB32rr, 2,Result+1).addReg(InVals[1]).addReg(InVals[3]);
|
|
|
|
}
|
|
|
|
return Result+N.ResNo;
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
case ISD::SELECT:
|
2005-01-16 15:34:08 +08:00
|
|
|
if (getRegPressure(N.getOperand(1)) > getRegPressure(N.getOperand(2))) {
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(1));
|
|
|
|
Tmp3 = SelectExpr(N.getOperand(2));
|
2005-01-07 15:49:41 +08:00
|
|
|
} else {
|
2005-01-16 15:34:08 +08:00
|
|
|
Tmp3 = SelectExpr(N.getOperand(2));
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(1));
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
2005-01-16 15:34:08 +08:00
|
|
|
EmitSelectCC(N.getOperand(0), N.getValueType(), Tmp2, Tmp3, Result);
|
|
|
|
return Result;
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
case ISD::SDIV:
|
|
|
|
case ISD::UDIV:
|
|
|
|
case ISD::SREM:
|
|
|
|
case ISD::UREM: {
|
2005-01-16 15:34:08 +08:00
|
|
|
assert((N.getOpcode() != ISD::SREM || MVT::isInteger(N.getValueType())) &&
|
|
|
|
"We don't support this operator!");
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
if (N.getOpcode() == ISD::SDIV)
|
2005-01-26 04:35:10 +08:00
|
|
|
|
|
|
|
// We can fold loads into FpDIVs, but not really into any others.
|
|
|
|
if (N.getValueType() == MVT::f64) {
|
|
|
|
// Check for reversed and unreversed DIV.
|
|
|
|
if (isFoldableLoad(N.getOperand(0), N.getOperand(1), true)) {
|
|
|
|
if (N.getOperand(0).getOpcode() == ISD::EXTLOAD)
|
|
|
|
Opc = X86::FDIVR32m;
|
|
|
|
else
|
|
|
|
Opc = X86::FDIVR64m;
|
|
|
|
X86AddressMode AM;
|
|
|
|
EmitFoldedLoad(N.getOperand(0), AM);
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
addFullAddress(BuildMI(BB, Opc, 5, Result).addReg(Tmp1), AM);
|
|
|
|
return Result;
|
|
|
|
} else if (isFoldableLoad(N.getOperand(1), N.getOperand(0), true) &&
|
|
|
|
N.getOperand(1).getOpcode() == ISD::LOAD) {
|
|
|
|
if (N.getOperand(1).getOpcode() == ISD::EXTLOAD)
|
|
|
|
Opc = X86::FDIV32m;
|
|
|
|
else
|
|
|
|
Opc = X86::FDIV64m;
|
|
|
|
X86AddressMode AM;
|
|
|
|
EmitFoldedLoad(N.getOperand(1), AM);
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
addFullAddress(BuildMI(BB, Opc, 5, Result).addReg(Tmp1), AM);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
|
|
|
|
// FIXME: These special cases should be handled by the lowering impl!
|
|
|
|
unsigned RHS = CN->getValue();
|
|
|
|
bool isNeg = false;
|
|
|
|
if ((int)RHS < 0) {
|
|
|
|
isNeg = true;
|
|
|
|
RHS = -RHS;
|
|
|
|
}
|
|
|
|
if (RHS && (RHS & (RHS-1)) == 0) { // Signed division by power of 2?
|
|
|
|
unsigned Log = log2(RHS);
|
|
|
|
unsigned TmpReg = MakeReg(N.getValueType());
|
|
|
|
unsigned SAROpc, SHROpc, ADDOpc, NEGOpc;
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert("Unknown type to signed divide!");
|
|
|
|
case MVT::i8:
|
|
|
|
SAROpc = X86::SAR8ri;
|
|
|
|
SHROpc = X86::SHR8ri;
|
|
|
|
ADDOpc = X86::ADD8rr;
|
|
|
|
NEGOpc = X86::NEG8r;
|
|
|
|
break;
|
|
|
|
case MVT::i16:
|
|
|
|
SAROpc = X86::SAR16ri;
|
|
|
|
SHROpc = X86::SHR16ri;
|
|
|
|
ADDOpc = X86::ADD16rr;
|
|
|
|
NEGOpc = X86::NEG16r;
|
|
|
|
break;
|
|
|
|
case MVT::i32:
|
|
|
|
SAROpc = X86::SAR32ri;
|
|
|
|
SHROpc = X86::SHR32ri;
|
|
|
|
ADDOpc = X86::ADD32rr;
|
|
|
|
NEGOpc = X86::NEG32r;
|
|
|
|
break;
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, SAROpc, 2, TmpReg).addReg(Tmp1).addImm(Log-1);
|
|
|
|
unsigned TmpReg2 = MakeReg(N.getValueType());
|
|
|
|
BuildMI(BB, SHROpc, 2, TmpReg2).addReg(TmpReg).addImm(32-Log);
|
|
|
|
unsigned TmpReg3 = MakeReg(N.getValueType());
|
|
|
|
BuildMI(BB, ADDOpc, 2, TmpReg3).addReg(Tmp1).addReg(TmpReg2);
|
|
|
|
|
|
|
|
unsigned TmpReg4 = isNeg ? MakeReg(N.getValueType()) : Result;
|
|
|
|
BuildMI(BB, SAROpc, 2, TmpReg4).addReg(TmpReg3).addImm(Log);
|
|
|
|
if (isNeg)
|
|
|
|
BuildMI(BB, NEGOpc, 1, Result).addReg(TmpReg4);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(1))) {
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(1));
|
|
|
|
} else {
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(1));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
bool isSigned = N.getOpcode() == ISD::SDIV || N.getOpcode() == ISD::SREM;
|
|
|
|
bool isDiv = N.getOpcode() == ISD::SDIV || N.getOpcode() == ISD::UDIV;
|
|
|
|
unsigned LoReg, HiReg, DivOpcode, MovOpcode, ClrOpcode, SExtOpcode;
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot sdiv this type!");
|
|
|
|
case MVT::i8:
|
|
|
|
DivOpcode = isSigned ? X86::IDIV8r : X86::DIV8r;
|
|
|
|
LoReg = X86::AL;
|
|
|
|
HiReg = X86::AH;
|
|
|
|
MovOpcode = X86::MOV8rr;
|
|
|
|
ClrOpcode = X86::MOV8ri;
|
|
|
|
SExtOpcode = X86::CBW;
|
|
|
|
break;
|
|
|
|
case MVT::i16:
|
|
|
|
DivOpcode = isSigned ? X86::IDIV16r : X86::DIV16r;
|
|
|
|
LoReg = X86::AX;
|
|
|
|
HiReg = X86::DX;
|
|
|
|
MovOpcode = X86::MOV16rr;
|
|
|
|
ClrOpcode = X86::MOV16ri;
|
|
|
|
SExtOpcode = X86::CWD;
|
|
|
|
break;
|
|
|
|
case MVT::i32:
|
|
|
|
DivOpcode = isSigned ? X86::IDIV32r : X86::DIV32r;
|
2005-01-12 11:16:09 +08:00
|
|
|
LoReg = X86::EAX;
|
2005-01-07 15:49:41 +08:00
|
|
|
HiReg = X86::EDX;
|
|
|
|
MovOpcode = X86::MOV32rr;
|
|
|
|
ClrOpcode = X86::MOV32ri;
|
|
|
|
SExtOpcode = X86::CDQ;
|
|
|
|
break;
|
|
|
|
case MVT::f64:
|
2005-01-16 15:34:08 +08:00
|
|
|
BuildMI(BB, X86::FpDIV, 2, Result).addReg(Tmp1).addReg(Tmp2);
|
2005-01-07 15:49:41 +08:00
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set up the low part.
|
|
|
|
BuildMI(BB, MovOpcode, 1, LoReg).addReg(Tmp1);
|
|
|
|
|
|
|
|
if (isSigned) {
|
|
|
|
// Sign extend the low part into the high part.
|
|
|
|
BuildMI(BB, SExtOpcode, 0);
|
|
|
|
} else {
|
|
|
|
// Zero out the high part, effectively zero extending the input.
|
|
|
|
BuildMI(BB, ClrOpcode, 1, HiReg).addImm(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Emit the DIV/IDIV instruction.
|
|
|
|
BuildMI(BB, DivOpcode, 1).addReg(Tmp2);
|
|
|
|
|
|
|
|
// Get the result of the divide or rem.
|
|
|
|
BuildMI(BB, MovOpcode, 1, Result).addReg(isDiv ? LoReg : HiReg);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
|
|
|
case ISD::SHL:
|
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
|
2005-01-12 05:19:59 +08:00
|
|
|
if (CN->getValue() == 1) { // X = SHL Y, 1 -> X = ADD Y, Y
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot shift this type!");
|
|
|
|
case MVT::i8: Opc = X86::ADD8rr; break;
|
|
|
|
case MVT::i16: Opc = X86::ADD16rr; break;
|
|
|
|
case MVT::i32: Opc = X86::ADD32rr; break;
|
|
|
|
}
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp1);
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot shift this type!");
|
|
|
|
case MVT::i8: Opc = X86::SHL8ri; break;
|
|
|
|
case MVT::i16: Opc = X86::SHL16ri; break;
|
|
|
|
case MVT::i32: Opc = X86::SHL32ri; break;
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addImm(CN->getValue());
|
|
|
|
return Result;
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
|
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(1))) {
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(1));
|
|
|
|
} else {
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(1));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot shift this type!");
|
|
|
|
case MVT::i8 : Opc = X86::SHL8rCL; break;
|
|
|
|
case MVT::i16: Opc = X86::SHL16rCL; break;
|
|
|
|
case MVT::i32: Opc = X86::SHL32rCL; break;
|
|
|
|
}
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::CL).addReg(Tmp2);
|
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2);
|
|
|
|
return Result;
|
|
|
|
case ISD::SRL:
|
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot shift this type!");
|
|
|
|
case MVT::i8: Opc = X86::SHR8ri; break;
|
|
|
|
case MVT::i16: Opc = X86::SHR16ri; break;
|
|
|
|
case MVT::i32: Opc = X86::SHR32ri; break;
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addImm(CN->getValue());
|
|
|
|
return Result;
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
|
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(1))) {
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(1));
|
|
|
|
} else {
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(1));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot shift this type!");
|
|
|
|
case MVT::i8 : Opc = X86::SHR8rCL; break;
|
|
|
|
case MVT::i16: Opc = X86::SHR16rCL; break;
|
|
|
|
case MVT::i32: Opc = X86::SHR32rCL; break;
|
|
|
|
}
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::CL).addReg(Tmp2);
|
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2);
|
|
|
|
return Result;
|
|
|
|
case ISD::SRA:
|
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
|
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot shift this type!");
|
|
|
|
case MVT::i8: Opc = X86::SAR8ri; break;
|
|
|
|
case MVT::i16: Opc = X86::SAR16ri; break;
|
|
|
|
case MVT::i32: Opc = X86::SAR32ri; break;
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addImm(CN->getValue());
|
|
|
|
return Result;
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
|
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(1))) {
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(1));
|
|
|
|
} else {
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(1));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(0));
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getValueType()) {
|
|
|
|
default: assert(0 && "Cannot shift this type!");
|
|
|
|
case MVT::i8 : Opc = X86::SAR8rCL; break;
|
|
|
|
case MVT::i16: Opc = X86::SAR16rCL; break;
|
|
|
|
case MVT::i32: Opc = X86::SAR32rCL; break;
|
|
|
|
}
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::CL).addReg(Tmp2);
|
|
|
|
BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2);
|
|
|
|
return Result;
|
|
|
|
|
|
|
|
case ISD::SETCC:
|
2005-01-17 09:34:14 +08:00
|
|
|
EmitCMP(N.getOperand(0), N.getOperand(1), Node->hasOneUse());
|
2005-01-07 15:49:41 +08:00
|
|
|
EmitSetCC(BB, Result, cast<SetCCSDNode>(N)->getCondition(),
|
|
|
|
MVT::isFloatingPoint(N.getOperand(1).getValueType()));
|
|
|
|
return Result;
|
2005-01-15 13:22:24 +08:00
|
|
|
case ISD::LOAD:
|
2005-01-07 15:49:41 +08:00
|
|
|
// Make sure we generate both values.
|
2005-01-18 11:51:59 +08:00
|
|
|
if (Result != 1) { // Generate the token
|
|
|
|
if (!ExprMap.insert(std::make_pair(N.getValue(1), 1)).second)
|
|
|
|
assert(0 && "Load already emitted!?");
|
|
|
|
} else
|
2005-01-07 15:49:41 +08:00
|
|
|
Result = ExprMap[N.getValue(0)] = MakeReg(N.getValue(0).getValueType());
|
|
|
|
|
2005-01-09 03:28:19 +08:00
|
|
|
switch (Node->getValueType(0)) {
|
2005-01-07 15:49:41 +08:00
|
|
|
default: assert(0 && "Cannot load this type!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::MOV8rm; break;
|
|
|
|
case MVT::i16: Opc = X86::MOV16rm; break;
|
|
|
|
case MVT::i32: Opc = X86::MOV32rm; break;
|
|
|
|
case MVT::f64: Opc = X86::FLD64m; ContainsFPCode = true; break;
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N.getOperand(1))){
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
Select(N.getOperand(0));
|
2005-01-07 15:49:41 +08:00
|
|
|
addConstantPoolReference(BuildMI(BB, Opc, 4, Result), CP->getIndex());
|
|
|
|
} else {
|
|
|
|
X86AddressMode AM;
|
2005-01-13 13:53:16 +08:00
|
|
|
|
|
|
|
SDOperand Chain = N.getOperand(0);
|
|
|
|
SDOperand Address = N.getOperand(1);
|
|
|
|
if (getRegPressure(Chain) > getRegPressure(Address)) {
|
|
|
|
Select(Chain);
|
|
|
|
SelectAddress(Address, AM);
|
|
|
|
} else {
|
|
|
|
SelectAddress(Address, AM);
|
|
|
|
Select(Chain);
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
addFullAddress(BuildMI(BB, Opc, 4, Result), AM);
|
|
|
|
}
|
|
|
|
return Result;
|
2005-01-15 13:22:24 +08:00
|
|
|
|
|
|
|
case ISD::EXTLOAD: // Arbitrarily codegen extloads as MOVZX*
|
|
|
|
case ISD::ZEXTLOAD: {
|
|
|
|
// Make sure we generate both values.
|
|
|
|
if (Result != 1)
|
|
|
|
ExprMap[N.getValue(1)] = 1; // Generate the token
|
|
|
|
else
|
|
|
|
Result = ExprMap[N.getValue(0)] = MakeReg(N.getValue(0).getValueType());
|
|
|
|
|
2005-01-16 15:34:08 +08:00
|
|
|
if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N.getOperand(1)))
|
|
|
|
if (Node->getValueType(0) == MVT::f64) {
|
|
|
|
assert(cast<MVTSDNode>(Node)->getExtraValueType() == MVT::f32 &&
|
|
|
|
"Bad EXTLOAD!");
|
|
|
|
addConstantPoolReference(BuildMI(BB, X86::FLD32m, 4, Result),
|
|
|
|
CP->getIndex());
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
2005-01-15 13:22:24 +08:00
|
|
|
X86AddressMode AM;
|
|
|
|
if (getRegPressure(Node->getOperand(0)) >
|
|
|
|
getRegPressure(Node->getOperand(1))) {
|
|
|
|
Select(Node->getOperand(0)); // chain
|
|
|
|
SelectAddress(Node->getOperand(1), AM);
|
|
|
|
} else {
|
|
|
|
SelectAddress(Node->getOperand(1), AM);
|
|
|
|
Select(Node->getOperand(0)); // chain
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (Node->getValueType(0)) {
|
|
|
|
default: assert(0 && "Unknown type to sign extend to.");
|
|
|
|
case MVT::f64:
|
|
|
|
assert(cast<MVTSDNode>(Node)->getExtraValueType() == MVT::f32 &&
|
|
|
|
"Bad EXTLOAD!");
|
|
|
|
addFullAddress(BuildMI(BB, X86::FLD32m, 5, Result), AM);
|
|
|
|
break;
|
|
|
|
case MVT::i32:
|
|
|
|
switch (cast<MVTSDNode>(Node)->getExtraValueType()) {
|
|
|
|
default:
|
|
|
|
assert(0 && "Bad zero extend!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8:
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOVZX32rm8, 5, Result), AM);
|
|
|
|
break;
|
|
|
|
case MVT::i16:
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOVZX32rm16, 5, Result), AM);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case MVT::i16:
|
|
|
|
assert(cast<MVTSDNode>(Node)->getExtraValueType() <= MVT::i8 &&
|
|
|
|
"Bad zero extend!");
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOVSX16rm8, 5, Result), AM);
|
|
|
|
break;
|
|
|
|
case MVT::i8:
|
|
|
|
assert(cast<MVTSDNode>(Node)->getExtraValueType() == MVT::i1 &&
|
|
|
|
"Bad zero extend!");
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOV8rm, 5, Result), AM);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return Result;
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
2005-01-15 13:22:24 +08:00
|
|
|
case ISD::SEXTLOAD: {
|
|
|
|
// Make sure we generate both values.
|
|
|
|
if (Result != 1)
|
|
|
|
ExprMap[N.getValue(1)] = 1; // Generate the token
|
|
|
|
else
|
|
|
|
Result = ExprMap[N.getValue(0)] = MakeReg(N.getValue(0).getValueType());
|
|
|
|
|
|
|
|
X86AddressMode AM;
|
|
|
|
if (getRegPressure(Node->getOperand(0)) >
|
|
|
|
getRegPressure(Node->getOperand(1))) {
|
|
|
|
Select(Node->getOperand(0)); // chain
|
|
|
|
SelectAddress(Node->getOperand(1), AM);
|
|
|
|
} else {
|
|
|
|
SelectAddress(Node->getOperand(1), AM);
|
|
|
|
Select(Node->getOperand(0)); // chain
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (Node->getValueType(0)) {
|
|
|
|
case MVT::i8: assert(0 && "Cannot sign extend from bool!");
|
|
|
|
default: assert(0 && "Unknown type to sign extend to.");
|
|
|
|
case MVT::i32:
|
|
|
|
switch (cast<MVTSDNode>(Node)->getExtraValueType()) {
|
|
|
|
default:
|
|
|
|
case MVT::i1: assert(0 && "Cannot sign extend from bool!");
|
|
|
|
case MVT::i8:
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOVSX32rm8, 5, Result), AM);
|
|
|
|
break;
|
|
|
|
case MVT::i16:
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOVSX32rm16, 5, Result), AM);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case MVT::i16:
|
|
|
|
assert(cast<MVTSDNode>(Node)->getExtraValueType() == MVT::i8 &&
|
|
|
|
"Cannot sign extend from bool!");
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOVSX16rm8, 5, Result), AM);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
case ISD::DYNAMIC_STACKALLOC:
|
|
|
|
// Generate both result values.
|
|
|
|
if (Result != 1)
|
|
|
|
ExprMap[N.getValue(1)] = 1; // Generate the token
|
|
|
|
else
|
|
|
|
Result = ExprMap[N.getValue(0)] = MakeReg(N.getValue(0).getValueType());
|
|
|
|
|
|
|
|
// FIXME: We are currently ignoring the requested alignment for handling
|
|
|
|
// greater than the stack alignment. This will need to be revisited at some
|
|
|
|
// point. Align = N.getOperand(2);
|
|
|
|
|
|
|
|
if (!isa<ConstantSDNode>(N.getOperand(2)) ||
|
|
|
|
cast<ConstantSDNode>(N.getOperand(2))->getValue() != 0) {
|
|
|
|
std::cerr << "Cannot allocate stack object with greater alignment than"
|
|
|
|
<< " the stack alignment yet!";
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
Select(N.getOperand(0));
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, X86::SUB32ri, 2, X86::ESP).addReg(X86::ESP)
|
|
|
|
.addImm(CN->getValue());
|
|
|
|
} else {
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(1))) {
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
} else {
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
// Subtract size from stack pointer, thereby allocating some space.
|
|
|
|
BuildMI(BB, X86::SUB32rr, 2, X86::ESP).addReg(X86::ESP).addReg(Tmp1);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Put a pointer to the space into the result register, by copying the stack
|
|
|
|
// pointer.
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, Result).addReg(X86::ESP);
|
|
|
|
return Result;
|
|
|
|
|
|
|
|
case ISD::CALL:
|
2005-01-09 03:28:19 +08:00
|
|
|
// The chain for this call is now lowered.
|
2005-01-18 11:51:59 +08:00
|
|
|
ExprMap.insert(std::make_pair(N.getValue(Node->getNumValues()-1), 1));
|
2005-01-09 03:28:19 +08:00
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
if (GlobalAddressSDNode *GASD =
|
|
|
|
dyn_cast<GlobalAddressSDNode>(N.getOperand(1))) {
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
Select(N.getOperand(0));
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, X86::CALLpcrel32, 1).addGlobalAddress(GASD->getGlobal(),true);
|
|
|
|
} else if (ExternalSymbolSDNode *ESSDN =
|
|
|
|
dyn_cast<ExternalSymbolSDNode>(N.getOperand(1))) {
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
Select(N.getOperand(0));
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, X86::CALLpcrel32,
|
|
|
|
1).addExternalSymbol(ESSDN->getSymbol(), true);
|
|
|
|
} else {
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(1))) {
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
} else {
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, X86::CALL32r, 1).addReg(Tmp1);
|
|
|
|
}
|
2005-01-09 03:28:19 +08:00
|
|
|
switch (Node->getValueType(0)) {
|
2005-01-07 15:49:41 +08:00
|
|
|
default: assert(0 && "Unknown value type for call result!");
|
|
|
|
case MVT::Other: return 1;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8:
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, Result).addReg(X86::AL);
|
|
|
|
break;
|
|
|
|
case MVT::i16:
|
|
|
|
BuildMI(BB, X86::MOV16rr, 1, Result).addReg(X86::AX);
|
|
|
|
break;
|
|
|
|
case MVT::i32:
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, Result).addReg(X86::EAX);
|
2005-01-09 03:28:19 +08:00
|
|
|
if (Node->getValueType(1) == MVT::i32)
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, X86::MOV32rr, 1, Result+1).addReg(X86::EDX);
|
|
|
|
break;
|
|
|
|
case MVT::f64: // Floating-point return values live in %ST(0)
|
|
|
|
ContainsFPCode = true;
|
|
|
|
BuildMI(BB, X86::FpGETRESULT, 1, Result);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return Result+N.ResNo;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-01-18 03:25:26 +08:00
|
|
|
/// TryToFoldLoadOpStore - Given a store node, try to fold together a
|
|
|
|
/// load/op/store instruction. If successful return true.
|
|
|
|
bool ISel::TryToFoldLoadOpStore(SDNode *Node) {
|
|
|
|
assert(Node->getOpcode() == ISD::STORE && "Can only do this for stores!");
|
|
|
|
SDOperand Chain = Node->getOperand(0);
|
|
|
|
SDOperand StVal = Node->getOperand(1);
|
2005-01-18 06:10:42 +08:00
|
|
|
SDOperand StPtr = Node->getOperand(2);
|
2005-01-18 03:25:26 +08:00
|
|
|
|
|
|
|
// The chain has to be a load, the stored value must be an integer binary
|
|
|
|
// operation with one use.
|
2005-01-18 06:10:42 +08:00
|
|
|
if (!StVal.Val->hasOneUse() || StVal.Val->getNumOperands() != 2 ||
|
2005-01-18 03:25:26 +08:00
|
|
|
MVT::isFloatingPoint(StVal.getValueType()))
|
|
|
|
return false;
|
|
|
|
|
2005-01-18 06:10:42 +08:00
|
|
|
// Token chain must either be a factor node or the load to fold.
|
|
|
|
if (Chain.getOpcode() != ISD::LOAD && Chain.getOpcode() != ISD::TokenFactor)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
SDOperand TheLoad;
|
|
|
|
|
|
|
|
// Check to see if there is a load from the same pointer that we're storing
|
|
|
|
// to in either operand of the binop.
|
|
|
|
if (StVal.getOperand(0).getOpcode() == ISD::LOAD &&
|
|
|
|
StVal.getOperand(0).getOperand(1) == StPtr)
|
|
|
|
TheLoad = StVal.getOperand(0);
|
|
|
|
else if (StVal.getOperand(1).getOpcode() == ISD::LOAD &&
|
|
|
|
StVal.getOperand(1).getOperand(1) == StPtr)
|
|
|
|
TheLoad = StVal.getOperand(1);
|
|
|
|
else
|
|
|
|
return false; // No matching load operand.
|
|
|
|
|
|
|
|
// We can only fold the load if there are no intervening side-effecting
|
|
|
|
// operations. This means that the store uses the load as its token chain, or
|
|
|
|
// there are only token factor nodes in between the store and load.
|
|
|
|
if (Chain != TheLoad.getValue(1)) {
|
|
|
|
// Okay, the other option is that we have a store referring to (possibly
|
|
|
|
// nested) token factor nodes. For now, just try peeking through one level
|
|
|
|
// of token factors to see if this is the case.
|
|
|
|
bool ChainOk = false;
|
|
|
|
if (Chain.getOpcode() == ISD::TokenFactor) {
|
|
|
|
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
|
|
|
|
if (Chain.getOperand(i) == TheLoad.getValue(1)) {
|
|
|
|
ChainOk = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2005-01-18 03:25:26 +08:00
|
|
|
|
2005-01-18 06:10:42 +08:00
|
|
|
if (!ChainOk) return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (TheLoad.getOperand(1) != StPtr)
|
2005-01-18 03:25:26 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
// Make sure that one of the operands of the binop is the load, and that the
|
|
|
|
// load folds into the binop.
|
|
|
|
if (((StVal.getOperand(0) != TheLoad ||
|
|
|
|
!isFoldableLoad(TheLoad, StVal.getOperand(1))) &&
|
|
|
|
(StVal.getOperand(1) != TheLoad ||
|
|
|
|
!isFoldableLoad(TheLoad, StVal.getOperand(0)))))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Finally, check to see if this is one of the ops we can handle!
|
|
|
|
static const unsigned ADDTAB[] = {
|
|
|
|
X86::ADD8mi, X86::ADD16mi, X86::ADD32mi,
|
|
|
|
X86::ADD8mr, X86::ADD16mr, X86::ADD32mr,
|
|
|
|
};
|
|
|
|
static const unsigned SUBTAB[] = {
|
|
|
|
X86::SUB8mi, X86::SUB16mi, X86::SUB32mi,
|
|
|
|
X86::SUB8mr, X86::SUB16mr, X86::SUB32mr,
|
|
|
|
};
|
|
|
|
static const unsigned ANDTAB[] = {
|
|
|
|
X86::AND8mi, X86::AND16mi, X86::AND32mi,
|
|
|
|
X86::AND8mr, X86::AND16mr, X86::AND32mr,
|
|
|
|
};
|
|
|
|
static const unsigned ORTAB[] = {
|
|
|
|
X86::OR8mi, X86::OR16mi, X86::OR32mi,
|
|
|
|
X86::OR8mr, X86::OR16mr, X86::OR32mr,
|
|
|
|
};
|
|
|
|
static const unsigned XORTAB[] = {
|
|
|
|
X86::XOR8mi, X86::XOR16mi, X86::XOR32mi,
|
|
|
|
X86::XOR8mr, X86::XOR16mr, X86::XOR32mr,
|
|
|
|
};
|
|
|
|
static const unsigned SHLTAB[] = {
|
|
|
|
X86::SHL8mi, X86::SHL16mi, X86::SHL32mi,
|
|
|
|
/*Have to put the reg in CL*/0, 0, 0,
|
|
|
|
};
|
|
|
|
static const unsigned SARTAB[] = {
|
|
|
|
X86::SAR8mi, X86::SAR16mi, X86::SAR32mi,
|
|
|
|
/*Have to put the reg in CL*/0, 0, 0,
|
|
|
|
};
|
|
|
|
static const unsigned SHRTAB[] = {
|
|
|
|
X86::SHR8mi, X86::SHR16mi, X86::SHR32mi,
|
|
|
|
/*Have to put the reg in CL*/0, 0, 0,
|
|
|
|
};
|
|
|
|
|
|
|
|
const unsigned *TabPtr = 0;
|
|
|
|
switch (StVal.getOpcode()) {
|
|
|
|
default:
|
|
|
|
std::cerr << "CANNOT [mem] op= val: ";
|
|
|
|
StVal.Val->dump(); std::cerr << "\n";
|
|
|
|
case ISD::MUL:
|
|
|
|
case ISD::SDIV:
|
|
|
|
case ISD::UDIV:
|
|
|
|
case ISD::SREM:
|
|
|
|
case ISD::UREM: return false;
|
|
|
|
|
|
|
|
case ISD::ADD: TabPtr = ADDTAB; break;
|
|
|
|
case ISD::SUB: TabPtr = SUBTAB; break;
|
|
|
|
case ISD::AND: TabPtr = ANDTAB; break;
|
|
|
|
case ISD:: OR: TabPtr = ORTAB; break;
|
|
|
|
case ISD::XOR: TabPtr = XORTAB; break;
|
|
|
|
case ISD::SHL: TabPtr = SHLTAB; break;
|
|
|
|
case ISD::SRA: TabPtr = SARTAB; break;
|
|
|
|
case ISD::SRL: TabPtr = SHRTAB; break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Handle: [mem] op= CST
|
|
|
|
SDOperand Op0 = StVal.getOperand(0);
|
|
|
|
SDOperand Op1 = StVal.getOperand(1);
|
2005-01-24 07:20:06 +08:00
|
|
|
unsigned Opc = 0;
|
2005-01-18 03:25:26 +08:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
|
|
|
|
switch (Op0.getValueType()) { // Use Op0's type because of shifts.
|
|
|
|
default: break;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = TabPtr[0]; break;
|
|
|
|
case MVT::i16: Opc = TabPtr[1]; break;
|
|
|
|
case MVT::i32: Opc = TabPtr[2]; break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Opc) {
|
2005-01-18 11:51:59 +08:00
|
|
|
if (!ExprMap.insert(std::make_pair(TheLoad.getValue(1), 1)).second)
|
|
|
|
assert(0 && "Already emitted?");
|
2005-01-18 06:10:42 +08:00
|
|
|
Select(Chain);
|
|
|
|
|
2005-01-18 03:25:26 +08:00
|
|
|
X86AddressMode AM;
|
|
|
|
if (getRegPressure(TheLoad.getOperand(0)) >
|
|
|
|
getRegPressure(TheLoad.getOperand(1))) {
|
|
|
|
Select(TheLoad.getOperand(0));
|
|
|
|
SelectAddress(TheLoad.getOperand(1), AM);
|
|
|
|
} else {
|
|
|
|
SelectAddress(TheLoad.getOperand(1), AM);
|
|
|
|
Select(TheLoad.getOperand(0));
|
|
|
|
}
|
2005-01-18 06:10:42 +08:00
|
|
|
|
|
|
|
if (StVal.getOpcode() == ISD::ADD) {
|
|
|
|
if (CN->getValue() == 1) {
|
|
|
|
switch (Op0.getValueType()) {
|
|
|
|
default: break;
|
|
|
|
case MVT::i8:
|
|
|
|
addFullAddress(BuildMI(BB, X86::INC8m, 4), AM);
|
|
|
|
return true;
|
|
|
|
case MVT::i16: Opc = TabPtr[1];
|
|
|
|
addFullAddress(BuildMI(BB, X86::INC16m, 4), AM);
|
|
|
|
return true;
|
|
|
|
case MVT::i32: Opc = TabPtr[2];
|
|
|
|
addFullAddress(BuildMI(BB, X86::INC32m, 4), AM);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} else if (CN->getValue()+1 == 0) { // [X] += -1 -> DEC [X]
|
|
|
|
switch (Op0.getValueType()) {
|
|
|
|
default: break;
|
|
|
|
case MVT::i8:
|
|
|
|
addFullAddress(BuildMI(BB, X86::DEC8m, 4), AM);
|
|
|
|
return true;
|
|
|
|
case MVT::i16: Opc = TabPtr[1];
|
|
|
|
addFullAddress(BuildMI(BB, X86::DEC16m, 4), AM);
|
|
|
|
return true;
|
|
|
|
case MVT::i32: Opc = TabPtr[2];
|
|
|
|
addFullAddress(BuildMI(BB, X86::DEC32m, 4), AM);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2005-01-18 03:25:26 +08:00
|
|
|
|
|
|
|
addFullAddress(BuildMI(BB, Opc, 4+1),AM).addImm(CN->getValue());
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we have [mem] = V op [mem], try to turn it into:
|
|
|
|
// [mem] = [mem] op V.
|
|
|
|
if (Op1 == TheLoad && StVal.getOpcode() != ISD::SUB &&
|
|
|
|
StVal.getOpcode() != ISD::SHL && StVal.getOpcode() != ISD::SRA &&
|
|
|
|
StVal.getOpcode() != ISD::SRL)
|
|
|
|
std::swap(Op0, Op1);
|
|
|
|
|
|
|
|
if (Op0 != TheLoad) return false;
|
|
|
|
|
|
|
|
switch (Op0.getValueType()) {
|
|
|
|
default: return false;
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = TabPtr[3]; break;
|
|
|
|
case MVT::i16: Opc = TabPtr[4]; break;
|
|
|
|
case MVT::i32: Opc = TabPtr[5]; break;
|
|
|
|
}
|
2005-01-18 06:10:42 +08:00
|
|
|
|
2005-01-19 01:35:28 +08:00
|
|
|
// Table entry doesn't exist?
|
|
|
|
if (Opc == 0) return false;
|
|
|
|
|
2005-01-18 11:51:59 +08:00
|
|
|
if (!ExprMap.insert(std::make_pair(TheLoad.getValue(1), 1)).second)
|
|
|
|
assert(0 && "Already emitted?");
|
2005-01-18 06:10:42 +08:00
|
|
|
Select(Chain);
|
2005-01-18 03:25:26 +08:00
|
|
|
Select(TheLoad.getOperand(0));
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
|
2005-01-18 03:25:26 +08:00
|
|
|
X86AddressMode AM;
|
|
|
|
SelectAddress(TheLoad.getOperand(1), AM);
|
|
|
|
unsigned Reg = SelectExpr(Op1);
|
Fix a problem where probing for addressing modes caused expressions to be
emitted too early. In particular, this fixes
Regression/CodeGen/X86/regpressure.ll:regpressure3.
This also improves the 2nd basic block in 164.gzip:flush_block, which went from
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree + 20]
movzx %ECX, WORD PTR [dyn_ltree + 16]
mov DWORD PTR [%ESP + 32], %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
movzx %EDX, WORD PTR [dyn_ltree + 8]
movzx %EBX, WORD PTR [dyn_ltree + 4]
mov DWORD PTR [%ESP + 36], %EBX
movzx %EBX, WORD PTR [dyn_ltree]
add DWORD PTR [%ESP + 36], %EBX
add %EDX, DWORD PTR [%ESP + 36]
add %ECX, %EDX
add DWORD PTR [%ESP + 32], %ECX
add %EAX, DWORD PTR [%ESP + 32]
movzx %ECX, WORD PTR [dyn_ltree + 24]
add %EAX, %ECX
mov %ECX, 0
mov %EDX, %ECX
to
.LBBflush_block_1: # loopentry.1.i
movzx %EAX, WORD PTR [dyn_ltree]
movzx %ECX, WORD PTR [dyn_ltree + 4]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 8]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 12]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 16]
add %EAX, %ECX
movzx %ECX, WORD PTR [dyn_ltree + 20]
add %ECX, %EAX
movzx %EAX, WORD PTR [dyn_ltree + 24]
add %ECX, %EAX
mov %EAX, 0
mov %EDX, %EAX
... which results in less spilling in the function.
This change alone speeds up 164.gzip from 37.23s to 36.24s on apoc. The
default isel takes 37.31s.
llvm-svn: 19650
2005-01-18 09:06:26 +08:00
|
|
|
addFullAddress(BuildMI(BB, Opc, 4+1), AM).addReg(Reg);
|
2005-01-18 03:25:26 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
void ISel::Select(SDOperand N) {
|
|
|
|
unsigned Tmp1, Tmp2, Opc;
|
|
|
|
|
|
|
|
// FIXME: Disable for our current expansion model!
|
2005-01-18 11:51:59 +08:00
|
|
|
if (/*!N->hasOneUse() &&*/ !ExprMap.insert(std::make_pair(N, 1)).second)
|
2005-01-07 15:49:41 +08:00
|
|
|
return; // Already selected.
|
|
|
|
|
2005-01-11 14:14:36 +08:00
|
|
|
SDNode *Node = N.Val;
|
|
|
|
|
|
|
|
switch (Node->getOpcode()) {
|
2005-01-07 15:49:41 +08:00
|
|
|
default:
|
2005-01-11 14:14:36 +08:00
|
|
|
Node->dump(); std::cerr << "\n";
|
2005-01-07 15:49:41 +08:00
|
|
|
assert(0 && "Node not handled yet!");
|
|
|
|
case ISD::EntryToken: return; // Noop
|
2005-01-14 02:01:36 +08:00
|
|
|
case ISD::TokenFactor:
|
2005-01-14 03:56:00 +08:00
|
|
|
if (Node->getNumOperands() == 2) {
|
|
|
|
bool OneFirst =
|
|
|
|
getRegPressure(Node->getOperand(1))>getRegPressure(Node->getOperand(0));
|
|
|
|
Select(Node->getOperand(OneFirst));
|
|
|
|
Select(Node->getOperand(!OneFirst));
|
|
|
|
} else {
|
|
|
|
std::vector<std::pair<unsigned, unsigned> > OpsP;
|
|
|
|
for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
|
|
|
|
OpsP.push_back(std::make_pair(getRegPressure(Node->getOperand(i)), i));
|
|
|
|
std::sort(OpsP.begin(), OpsP.end());
|
|
|
|
std::reverse(OpsP.begin(), OpsP.end());
|
|
|
|
for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
|
|
|
|
Select(Node->getOperand(OpsP[i].second));
|
|
|
|
}
|
2005-01-14 02:01:36 +08:00
|
|
|
return;
|
2005-01-07 15:49:41 +08:00
|
|
|
case ISD::CopyToReg:
|
2005-01-12 10:02:48 +08:00
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(1))) {
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
} else {
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
}
|
2005-01-14 04:50:02 +08:00
|
|
|
Tmp2 = cast<RegSDNode>(N)->getReg();
|
2005-01-07 15:49:41 +08:00
|
|
|
|
|
|
|
if (Tmp1 != Tmp2) {
|
|
|
|
switch (N.getOperand(1).getValueType()) {
|
|
|
|
default: assert(0 && "Invalid type for operation!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::MOV8rr; break;
|
|
|
|
case MVT::i16: Opc = X86::MOV16rr; break;
|
|
|
|
case MVT::i32: Opc = X86::MOV32rr; break;
|
2005-01-11 11:50:45 +08:00
|
|
|
case MVT::f64: Opc = X86::FpMOV; ContainsFPCode = true; break;
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
|
|
|
BuildMI(BB, Opc, 1, Tmp2).addReg(Tmp1);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
case ISD::RET:
|
|
|
|
switch (N.getNumOperands()) {
|
|
|
|
default:
|
|
|
|
assert(0 && "Unknown return instruction!");
|
|
|
|
case 3:
|
|
|
|
assert(N.getOperand(1).getValueType() == MVT::i32 &&
|
|
|
|
N.getOperand(2).getValueType() == MVT::i32 &&
|
|
|
|
"Unknown two-register value!");
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
if (getRegPressure(N.getOperand(1)) > getRegPressure(N.getOperand(2))) {
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(2));
|
|
|
|
} else {
|
|
|
|
Tmp2 = SelectExpr(N.getOperand(2));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
}
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1);
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::EDX).addReg(Tmp2);
|
|
|
|
// Declare that EAX & EDX are live on exit.
|
|
|
|
BuildMI(BB, X86::IMPLICIT_USE, 3).addReg(X86::EAX).addReg(X86::EDX)
|
|
|
|
.addReg(X86::ESP);
|
|
|
|
break;
|
|
|
|
case 2:
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(1))) {
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
} else {
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getOperand(1).getValueType()) {
|
|
|
|
default: assert(0 && "All other types should have been promoted!!");
|
|
|
|
case MVT::f64:
|
|
|
|
BuildMI(BB, X86::FpSETRESULT, 1).addReg(Tmp1);
|
|
|
|
// Declare that top-of-stack is live on exit
|
|
|
|
BuildMI(BB, X86::IMPLICIT_USE, 2).addReg(X86::ST0).addReg(X86::ESP);
|
|
|
|
break;
|
|
|
|
case MVT::i32:
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1);
|
|
|
|
BuildMI(BB, X86::IMPLICIT_USE, 2).addReg(X86::EAX).addReg(X86::ESP);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case 1:
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
Select(N.getOperand(0));
|
2005-01-07 15:49:41 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
BuildMI(BB, X86::RET, 0); // Just emit a 'ret' instruction
|
|
|
|
return;
|
|
|
|
case ISD::BR: {
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
MachineBasicBlock *Dest =
|
|
|
|
cast<BasicBlockSDNode>(N.getOperand(1))->getBasicBlock();
|
|
|
|
BuildMI(BB, X86::JMP, 1).addMBB(Dest);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
case ISD::BRCOND: {
|
|
|
|
MachineBasicBlock *Dest =
|
|
|
|
cast<BasicBlockSDNode>(N.getOperand(2))->getBasicBlock();
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
// Try to fold a setcc into the branch. If this fails, emit a test/jne
|
|
|
|
// pair.
|
2005-01-11 12:06:27 +08:00
|
|
|
if (EmitBranchCC(Dest, N.getOperand(0), N.getOperand(1))) {
|
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(1))) {
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
} else {
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
BuildMI(BB, X86::TEST8rr, 2).addReg(Tmp1).addReg(Tmp1);
|
|
|
|
BuildMI(BB, X86::JNE, 1).addMBB(Dest);
|
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
return;
|
|
|
|
}
|
2005-01-15 13:22:24 +08:00
|
|
|
|
2005-01-17 08:00:33 +08:00
|
|
|
case ISD::LOAD:
|
|
|
|
// If this load could be folded into the only using instruction, and if it
|
|
|
|
// is safe to emit the instruction here, try to do so now.
|
|
|
|
if (Node->hasNUsesOfValue(1, 0)) {
|
|
|
|
SDOperand TheVal = N.getValue(0);
|
|
|
|
SDNode *User = 0;
|
|
|
|
for (SDNode::use_iterator UI = Node->use_begin(); ; ++UI) {
|
|
|
|
assert(UI != Node->use_end() && "Didn't find use!");
|
|
|
|
SDNode *UN = *UI;
|
|
|
|
for (unsigned i = 0, e = UN->getNumOperands(); i != e; ++i)
|
|
|
|
if (UN->getOperand(i) == TheVal) {
|
|
|
|
User = UN;
|
|
|
|
goto FoundIt;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
FoundIt:
|
|
|
|
// Only handle unary operators right now.
|
|
|
|
if (User->getNumOperands() == 1) {
|
2005-01-18 11:51:59 +08:00
|
|
|
ExprMap.erase(N);
|
2005-01-17 08:00:33 +08:00
|
|
|
SelectExpr(SDOperand(User, 0));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2005-01-18 12:00:54 +08:00
|
|
|
ExprMap.erase(N);
|
2005-01-17 08:00:33 +08:00
|
|
|
SelectExpr(N);
|
|
|
|
return;
|
|
|
|
|
2005-01-15 13:22:24 +08:00
|
|
|
case ISD::EXTLOAD:
|
|
|
|
case ISD::SEXTLOAD:
|
|
|
|
case ISD::ZEXTLOAD:
|
2005-01-07 15:49:41 +08:00
|
|
|
case ISD::CALL:
|
|
|
|
case ISD::DYNAMIC_STACKALLOC:
|
2005-01-18 12:00:54 +08:00
|
|
|
ExprMap.erase(N);
|
2005-01-07 15:49:41 +08:00
|
|
|
SelectExpr(N);
|
|
|
|
return;
|
2005-01-15 13:22:24 +08:00
|
|
|
|
|
|
|
case ISD::TRUNCSTORE: { // truncstore chain, val, ptr :storety
|
|
|
|
// On X86, we can represent all types except for Bool and Float natively.
|
|
|
|
X86AddressMode AM;
|
|
|
|
MVT::ValueType StoredTy = cast<MVTSDNode>(Node)->getExtraValueType();
|
2005-01-16 15:34:08 +08:00
|
|
|
assert((StoredTy == MVT::i1 || StoredTy == MVT::f32 ||
|
|
|
|
StoredTy == MVT::i16 /*FIXME: THIS IS JUST FOR TESTING!*/)
|
|
|
|
&& "Unsupported TRUNCSTORE for this target!");
|
|
|
|
|
|
|
|
if (StoredTy == MVT::i16) {
|
|
|
|
// FIXME: This is here just to allow testing. X86 doesn't really have a
|
|
|
|
// TRUNCSTORE i16 operation, but this is required for targets that do not
|
|
|
|
// have 16-bit integer registers. We occasionally disable 16-bit integer
|
|
|
|
// registers to test the promotion code.
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
Tmp1 = SelectExpr(N.getOperand(1));
|
|
|
|
SelectAddress(N.getOperand(2), AM);
|
|
|
|
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1);
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOV16mr, 5), AM).addReg(X86::AX);
|
|
|
|
return;
|
|
|
|
}
|
2005-01-15 13:22:24 +08:00
|
|
|
|
|
|
|
// Store of constant bool?
|
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
|
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(2))) {
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
SelectAddress(N.getOperand(2), AM);
|
|
|
|
} else {
|
|
|
|
SelectAddress(N.getOperand(2), AM);
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
}
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOV8mi, 5), AM).addImm(CN->getValue());
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (StoredTy) {
|
|
|
|
default: assert(0 && "Cannot truncstore this type!");
|
|
|
|
case MVT::i1: Opc = X86::MOV8mr; break;
|
|
|
|
case MVT::f32: Opc = X86::FST32m; break;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::pair<unsigned, unsigned> > RP;
|
|
|
|
RP.push_back(std::make_pair(getRegPressure(N.getOperand(0)), 0));
|
|
|
|
RP.push_back(std::make_pair(getRegPressure(N.getOperand(1)), 1));
|
|
|
|
RP.push_back(std::make_pair(getRegPressure(N.getOperand(2)), 2));
|
|
|
|
std::sort(RP.begin(), RP.end());
|
|
|
|
|
|
|
|
for (unsigned i = 0; i != 3; ++i)
|
|
|
|
switch (RP[2-i].second) {
|
|
|
|
default: assert(0 && "Unknown operand number!");
|
|
|
|
case 0: Select(N.getOperand(0)); break;
|
|
|
|
case 1: Tmp1 = SelectExpr(N.getOperand(1)); break;
|
|
|
|
case 2: SelectAddress(N.getOperand(2), AM); break;
|
|
|
|
}
|
|
|
|
|
|
|
|
addFullAddress(BuildMI(BB, Opc, 4+1), AM).addReg(Tmp1);
|
|
|
|
return;
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
case ISD::STORE: {
|
|
|
|
X86AddressMode AM;
|
|
|
|
|
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
|
|
|
|
Opc = 0;
|
|
|
|
switch (CN->getValueType(0)) {
|
|
|
|
default: assert(0 && "Invalid type for operation!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::MOV8mi; break;
|
|
|
|
case MVT::i16: Opc = X86::MOV16mi; break;
|
|
|
|
case MVT::i32: Opc = X86::MOV32mi; break;
|
|
|
|
case MVT::f64: break;
|
|
|
|
}
|
|
|
|
if (Opc) {
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(2))) {
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
SelectAddress(N.getOperand(2), AM);
|
|
|
|
} else {
|
|
|
|
SelectAddress(N.getOperand(2), AM);
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
}
|
2005-01-07 15:49:41 +08:00
|
|
|
addFullAddress(BuildMI(BB, Opc, 4+1), AM).addImm(CN->getValue());
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
Comment out debug code :)
Select [mem] += Val operations. For constants, we used to get:
mov %ECX, -32768
add %ECX, DWORD PTR [l4_match_start]
mov DWORD PTR [l4_match_start], %ECX
Now we get:
add DWORD PTR [l4_match_start], -32768
For other values we used to get:
mov %EBP, %EDI ;; because the add destroys the value
add %EBP, DWORD PTR [l4_input_len]
mov DWORD PTR [l4_input_len], %EBP
now we get:
add DWORD PTR [l4_input_len], %EDI
Both of these use less registers than the alternative, are faster and smaller.
llvm-svn: 19488
2005-01-12 07:21:30 +08:00
|
|
|
|
|
|
|
// Check to see if this is a load/op/store combination.
|
2005-01-18 03:25:26 +08:00
|
|
|
if (TryToFoldLoadOpStore(Node))
|
|
|
|
return;
|
Comment out debug code :)
Select [mem] += Val operations. For constants, we used to get:
mov %ECX, -32768
add %ECX, DWORD PTR [l4_match_start]
mov DWORD PTR [l4_match_start], %ECX
Now we get:
add DWORD PTR [l4_match_start], -32768
For other values we used to get:
mov %EBP, %EDI ;; because the add destroys the value
add %EBP, DWORD PTR [l4_input_len]
mov DWORD PTR [l4_input_len], %EBP
now we get:
add DWORD PTR [l4_input_len], %EDI
Both of these use less registers than the alternative, are faster and smaller.
llvm-svn: 19488
2005-01-12 07:21:30 +08:00
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
switch (N.getOperand(1).getValueType()) {
|
|
|
|
default: assert(0 && "Cannot store this type!");
|
|
|
|
case MVT::i1:
|
|
|
|
case MVT::i8: Opc = X86::MOV8mr; break;
|
|
|
|
case MVT::i16: Opc = X86::MOV16mr; break;
|
|
|
|
case MVT::i32: Opc = X86::MOV32mr; break;
|
2005-01-11 11:50:45 +08:00
|
|
|
case MVT::f64: Opc = X86::FST64m; break;
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
|
|
|
|
std::vector<std::pair<unsigned, unsigned> > RP;
|
|
|
|
RP.push_back(std::make_pair(getRegPressure(N.getOperand(0)), 0));
|
|
|
|
RP.push_back(std::make_pair(getRegPressure(N.getOperand(1)), 1));
|
|
|
|
RP.push_back(std::make_pair(getRegPressure(N.getOperand(2)), 2));
|
|
|
|
std::sort(RP.begin(), RP.end());
|
|
|
|
|
|
|
|
for (unsigned i = 0; i != 3; ++i)
|
|
|
|
switch (RP[2-i].second) {
|
|
|
|
default: assert(0 && "Unknown operand number!");
|
|
|
|
case 0: Select(N.getOperand(0)); break;
|
|
|
|
case 1: Tmp1 = SelectExpr(N.getOperand(1)); break;
|
2005-01-11 11:37:59 +08:00
|
|
|
case 2: SelectAddress(N.getOperand(2), AM); break;
|
Take register pressure into account when we have to decide whether to
evaluate the LHS or the RHS of an operation first. This causes good things
to happen. For example, instead of compiling a loop to this:
.LBBstrength_result7_1: # loopentry
movl 16(%esp), %edi
movl (%edi), %edi ;;; LOAD
movl (%ecx), %ebx
movl $2, (%eax,%ebx,4)
movl (%edx), %ebx
movl %esi, %ebp
addl $21, %ebp
addl $42, %esi
cmpl $0, %edi ;;; USE
cmovne %esi, %ebp
cmpl %ebp, %ebx
movl %ebp, %esi
jg .LBBstrength_result7_1
We now compile it to this:
.LBBstrength_result7_1: # loopentry
movl %edi, %ebx
addl $42, %ebx
addl $21, %edi
movl (%ecx), %ebp ;; LOAD
cmpl $0, %ebp ;; USE
cmovne %ebx, %edi
movl (%edx), %ebx
movl $2, (%eax,%ebx,4)
movl (%esi), %ebx
cmpl %edi, %ebx
jg .LBBstrength_result7_1
Which reduces register pressure enough (in this case) to avoid spilling in the
loop.
As another example, consider the CodeGen/X86/regpressure.ll testcase. We
used to generate this code for both cases:
regpressure1:
subl $32, %esp
movl %esi, 12(%esp)
movl %edi, 8(%esp)
movl %ebx, 4(%esp)
movl %ebp, (%esp)
movl 36(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 24(%esp)
movl 8(%ecx), %edx
movl %edx, 16(%esp)
movl 12(%ecx), %edx
movl 16(%ecx), %esi
movl 20(%ecx), %edi
movl 24(%ecx), %ebx
movl %ebx, 28(%esp)
movl 28(%ecx), %ebx
movl 32(%ecx), %ebp
movl %ebp, 20(%esp)
movl 36(%ecx), %ecx
imull 24(%esp), %eax
imull 16(%esp), %eax
imull %edx, %eax
imull %esi, %eax
imull %edi, %eax
imull 28(%esp), %eax
imull %ebx, %eax
imull 20(%esp), %eax
imull %ecx, %eax
movl (%esp), %ebp
movl 4(%esp), %ebx
movl 8(%esp), %edi
movl 12(%esp), %esi
addl $32, %esp
ret
This code is basically trying to do all of the loads first, then execute all
of the multiplies. Because we run out of registers, lots of spill code happens.
We now generate this code for both cases:
regpressure1:
movl 4(%esp), %ecx
movl (%ecx), %eax
movl 4(%ecx), %edx
imull %edx, %eax
movl 8(%ecx), %edx
imull %edx, %eax
movl 12(%ecx), %edx
imull %edx, %eax
movl 16(%ecx), %edx
imull %edx, %eax
movl 20(%ecx), %edx
imull %edx, %eax
movl 24(%ecx), %edx
imull %edx, %eax
movl 28(%ecx), %edx
imull %edx, %eax
movl 32(%ecx), %edx
imull %edx, %eax
movl 36(%ecx), %ecx
imull %ecx, %eax
ret
which is much nicer (when we fold loads into the muls it will be even better).
The old instruction selector used to produce the good code for regpressure1
but not for regpressure2, as it depended on the order of operations in the
LLVM code.
llvm-svn: 19449
2005-01-11 11:11:44 +08:00
|
|
|
}
|
|
|
|
|
2005-01-07 15:49:41 +08:00
|
|
|
addFullAddress(BuildMI(BB, Opc, 4+1), AM).addReg(Tmp1);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
case ISD::ADJCALLSTACKDOWN:
|
|
|
|
case ISD::ADJCALLSTACKUP:
|
|
|
|
Select(N.getOperand(0));
|
|
|
|
Tmp1 = cast<ConstantSDNode>(N.getOperand(1))->getValue();
|
|
|
|
|
|
|
|
Opc = N.getOpcode() == ISD::ADJCALLSTACKDOWN ? X86::ADJCALLSTACKDOWN :
|
|
|
|
X86::ADJCALLSTACKUP;
|
|
|
|
BuildMI(BB, Opc, 1).addImm(Tmp1);
|
|
|
|
return;
|
2005-01-11 14:14:36 +08:00
|
|
|
case ISD::MEMSET: {
|
|
|
|
Select(N.getOperand(0)); // Select the chain.
|
|
|
|
unsigned Align =
|
|
|
|
(unsigned)cast<ConstantSDNode>(Node->getOperand(4))->getValue();
|
|
|
|
if (Align == 0) Align = 1;
|
|
|
|
|
|
|
|
// Turn the byte code into # iterations
|
|
|
|
unsigned CountReg;
|
|
|
|
unsigned Opcode;
|
|
|
|
if (ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Node->getOperand(2))) {
|
|
|
|
unsigned Val = ValC->getValue() & 255;
|
|
|
|
|
|
|
|
// If the value is a constant, then we can potentially use larger sets.
|
|
|
|
switch (Align & 3) {
|
|
|
|
case 2: // WORD aligned
|
|
|
|
CountReg = MakeReg(MVT::i32);
|
|
|
|
if (ConstantSDNode *I = dyn_cast<ConstantSDNode>(Node->getOperand(3))) {
|
|
|
|
BuildMI(BB, X86::MOV32ri, 1, CountReg).addImm(I->getValue()/2);
|
|
|
|
} else {
|
|
|
|
unsigned ByteReg = SelectExpr(Node->getOperand(3));
|
|
|
|
BuildMI(BB, X86::SHR32ri, 2, CountReg).addReg(ByteReg).addImm(1);
|
|
|
|
}
|
|
|
|
BuildMI(BB, X86::MOV16ri, 1, X86::AX).addImm((Val << 8) | Val);
|
|
|
|
Opcode = X86::REP_STOSW;
|
|
|
|
break;
|
|
|
|
case 0: // DWORD aligned
|
|
|
|
CountReg = MakeReg(MVT::i32);
|
|
|
|
if (ConstantSDNode *I = dyn_cast<ConstantSDNode>(Node->getOperand(3))) {
|
|
|
|
BuildMI(BB, X86::MOV32ri, 1, CountReg).addImm(I->getValue()/4);
|
|
|
|
} else {
|
|
|
|
unsigned ByteReg = SelectExpr(Node->getOperand(3));
|
|
|
|
BuildMI(BB, X86::SHR32ri, 2, CountReg).addReg(ByteReg).addImm(2);
|
|
|
|
}
|
|
|
|
Val = (Val << 8) | Val;
|
|
|
|
BuildMI(BB, X86::MOV32ri, 1, X86::EAX).addImm((Val << 16) | Val);
|
|
|
|
Opcode = X86::REP_STOSD;
|
|
|
|
break;
|
|
|
|
default: // BYTE aligned
|
|
|
|
CountReg = SelectExpr(Node->getOperand(3));
|
|
|
|
BuildMI(BB, X86::MOV8ri, 1, X86::AL).addImm(Val);
|
|
|
|
Opcode = X86::REP_STOSB;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// If it's not a constant value we are storing, just fall back. We could
|
|
|
|
// try to be clever to form 16 bit and 32 bit values, but we don't yet.
|
|
|
|
unsigned ValReg = SelectExpr(Node->getOperand(2));
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::AL).addReg(ValReg);
|
|
|
|
CountReg = SelectExpr(Node->getOperand(3));
|
|
|
|
Opcode = X86::REP_STOSB;
|
|
|
|
}
|
|
|
|
|
|
|
|
// No matter what the alignment is, we put the source in ESI, the
|
|
|
|
// destination in EDI, and the count in ECX.
|
|
|
|
unsigned TmpReg1 = SelectExpr(Node->getOperand(1));
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::ECX).addReg(CountReg);
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::EDI).addReg(TmpReg1);
|
|
|
|
BuildMI(BB, Opcode, 0);
|
|
|
|
return;
|
|
|
|
}
|
2005-01-11 14:19:26 +08:00
|
|
|
case ISD::MEMCPY:
|
|
|
|
Select(N.getOperand(0)); // Select the chain.
|
|
|
|
unsigned Align =
|
|
|
|
(unsigned)cast<ConstantSDNode>(Node->getOperand(4))->getValue();
|
|
|
|
if (Align == 0) Align = 1;
|
|
|
|
|
|
|
|
// Turn the byte code into # iterations
|
|
|
|
unsigned CountReg;
|
|
|
|
unsigned Opcode;
|
|
|
|
switch (Align & 3) {
|
|
|
|
case 2: // WORD aligned
|
|
|
|
CountReg = MakeReg(MVT::i32);
|
|
|
|
if (ConstantSDNode *I = dyn_cast<ConstantSDNode>(Node->getOperand(3))) {
|
|
|
|
BuildMI(BB, X86::MOV32ri, 1, CountReg).addImm(I->getValue()/2);
|
|
|
|
} else {
|
|
|
|
unsigned ByteReg = SelectExpr(Node->getOperand(3));
|
|
|
|
BuildMI(BB, X86::SHR32ri, 2, CountReg).addReg(ByteReg).addImm(1);
|
|
|
|
}
|
|
|
|
Opcode = X86::REP_MOVSW;
|
|
|
|
break;
|
|
|
|
case 0: // DWORD aligned
|
|
|
|
CountReg = MakeReg(MVT::i32);
|
|
|
|
if (ConstantSDNode *I = dyn_cast<ConstantSDNode>(Node->getOperand(3))) {
|
|
|
|
BuildMI(BB, X86::MOV32ri, 1, CountReg).addImm(I->getValue()/4);
|
|
|
|
} else {
|
|
|
|
unsigned ByteReg = SelectExpr(Node->getOperand(3));
|
|
|
|
BuildMI(BB, X86::SHR32ri, 2, CountReg).addReg(ByteReg).addImm(2);
|
|
|
|
}
|
|
|
|
Opcode = X86::REP_MOVSD;
|
|
|
|
break;
|
|
|
|
default: // BYTE aligned
|
|
|
|
CountReg = SelectExpr(Node->getOperand(3));
|
|
|
|
Opcode = X86::REP_MOVSB;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// No matter what the alignment is, we put the source in ESI, the
|
|
|
|
// destination in EDI, and the count in ECX.
|
|
|
|
unsigned TmpReg1 = SelectExpr(Node->getOperand(1));
|
|
|
|
unsigned TmpReg2 = SelectExpr(Node->getOperand(2));
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::ECX).addReg(CountReg);
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::EDI).addReg(TmpReg1);
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::ESI).addReg(TmpReg2);
|
|
|
|
BuildMI(BB, Opcode, 0);
|
|
|
|
return;
|
2005-01-07 15:49:41 +08:00
|
|
|
}
|
|
|
|
assert(0 && "Should not be reached!");
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// createX86PatternInstructionSelector - This pass converts an LLVM function
|
|
|
|
/// into a machine code representation using pattern matching and a machine
|
|
|
|
/// description file.
|
|
|
|
///
|
|
|
|
FunctionPass *llvm::createX86PatternInstructionSelector(TargetMachine &TM) {
|
|
|
|
return new ISel(TM);
|
|
|
|
}
|