2005-11-17 06:59:19 +08:00
|
|
|
//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
|
2005-11-16 09:54:32 +08:00
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
2007-12-30 04:36:04 +08:00
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2005-11-16 09:54:32 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This file defines a DAG pattern matching instruction selector for X86,
|
|
|
|
// converting from a legalized dag to a X86 dag.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "X86.h"
|
2006-01-11 14:09:51 +08:00
|
|
|
#include "X86InstrBuilder.h"
|
2008-01-05 08:41:47 +08:00
|
|
|
#include "X86MachineFunctionInfo.h"
|
2006-01-11 09:15:34 +08:00
|
|
|
#include "X86RegisterInfo.h"
|
2005-11-16 09:54:32 +08:00
|
|
|
#include "X86Subtarget.h"
|
2006-03-14 07:20:37 +08:00
|
|
|
#include "X86TargetMachine.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
2006-01-11 04:26:56 +08:00
|
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
2006-01-11 09:15:34 +08:00
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
2007-12-31 12:13:23 +08:00
|
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
2005-11-16 09:54:32 +08:00
|
|
|
#include "llvm/CodeGen/SelectionDAGISel.h"
|
2016-12-09 03:01:00 +08:00
|
|
|
#include "llvm/IR/ConstantRange.h"
|
2014-09-03 06:28:02 +08:00
|
|
|
#include "llvm/IR/Function.h"
|
2013-01-02 19:36:10 +08:00
|
|
|
#include "llvm/IR/Instructions.h"
|
|
|
|
#include "llvm/IR/Intrinsics.h"
|
|
|
|
#include "llvm/IR/Type.h"
|
2006-09-08 14:48:29 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2009-07-09 04:53:28 +08:00
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
2006-09-08 14:48:29 +08:00
|
|
|
#include "llvm/Support/MathExtras.h"
|
2009-07-09 04:53:28 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2012-12-04 00:50:05 +08:00
|
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
#include "llvm/Target/TargetOptions.h"
|
2016-04-06 04:45:04 +08:00
|
|
|
#include <stdint.h>
|
2005-11-16 09:54:32 +08:00
|
|
|
using namespace llvm;
|
|
|
|
|
2014-04-22 10:41:26 +08:00
|
|
|
#define DEBUG_TYPE "x86-isel"
|
|
|
|
|
2006-12-20 06:59:26 +08:00
|
|
|
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
|
|
|
|
|
2005-11-16 09:54:32 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Pattern Matcher Implementation
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2005-11-19 10:11:08 +08:00
|
|
|
namespace {
|
2015-10-13 23:12:27 +08:00
|
|
|
/// This corresponds to X86AddressMode, but uses SDValue's instead of register
|
|
|
|
/// numbers for the leaves of the matched tree.
|
2005-11-19 10:11:08 +08:00
|
|
|
struct X86ISelAddressMode {
|
|
|
|
enum {
|
|
|
|
RegBase,
|
2006-05-25 01:04:05 +08:00
|
|
|
FrameIndexBase
|
2005-11-19 10:11:08 +08:00
|
|
|
} BaseType;
|
|
|
|
|
2010-04-30 07:30:41 +08:00
|
|
|
// This is really a union, discriminated by BaseType!
|
|
|
|
SDValue Base_Reg;
|
|
|
|
int Base_FrameIndex;
|
2005-11-19 10:11:08 +08:00
|
|
|
|
|
|
|
unsigned Scale;
|
2012-08-02 02:39:17 +08:00
|
|
|
SDValue IndexReg;
|
2008-11-11 23:52:29 +08:00
|
|
|
int32_t Disp;
|
2009-04-09 05:14:34 +08:00
|
|
|
SDValue Segment;
|
2010-04-15 09:51:59 +08:00
|
|
|
const GlobalValue *GV;
|
|
|
|
const Constant *CP;
|
|
|
|
const BlockAddress *BlockAddr;
|
2006-09-08 14:48:29 +08:00
|
|
|
const char *ES;
|
2015-06-23 01:46:53 +08:00
|
|
|
MCSymbol *MCSym;
|
2006-09-08 14:48:29 +08:00
|
|
|
int JT;
|
2006-02-25 18:09:08 +08:00
|
|
|
unsigned Align; // CP alignment.
|
2009-06-26 13:51:45 +08:00
|
|
|
unsigned char SymbolFlags; // X86II::MO_*
|
2005-11-19 10:11:08 +08:00
|
|
|
|
|
|
|
X86ISelAddressMode()
|
2015-06-23 01:46:53 +08:00
|
|
|
: BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
|
|
|
|
Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
|
|
|
|
MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
|
2009-02-07 08:43:41 +08:00
|
|
|
|
|
|
|
bool hasSymbolicDisplacement() const {
|
2014-04-25 13:30:21 +08:00
|
|
|
return GV != nullptr || CP != nullptr || ES != nullptr ||
|
2015-06-23 01:46:53 +08:00
|
|
|
MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
|
2009-02-07 08:43:41 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
bool hasBaseOrIndexReg() const {
|
2013-09-19 19:33:53 +08:00
|
|
|
return BaseType == FrameIndexBase ||
|
2014-04-25 13:30:21 +08:00
|
|
|
IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Return true if this addressing mode is already RIP-relative.
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
bool isRIPRelative() const {
|
|
|
|
if (BaseType != RegBase) return false;
|
|
|
|
if (RegisterSDNode *RegNode =
|
2010-04-30 07:30:41 +08:00
|
|
|
dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
return RegNode->getReg() == X86::RIP;
|
|
|
|
return false;
|
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
void setBaseReg(SDValue Reg) {
|
|
|
|
BaseType = RegBase;
|
2010-04-30 07:30:41 +08:00
|
|
|
Base_Reg = Reg;
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
}
|
2009-02-07 08:43:41 +08:00
|
|
|
|
2012-09-12 06:23:19 +08:00
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
2008-08-12 07:46:25 +08:00
|
|
|
void dump() {
|
2010-01-05 09:29:08 +08:00
|
|
|
dbgs() << "X86ISelAddressMode " << this << '\n';
|
2010-04-30 07:30:41 +08:00
|
|
|
dbgs() << "Base_Reg ";
|
2014-04-28 12:05:08 +08:00
|
|
|
if (Base_Reg.getNode())
|
2012-08-02 02:39:17 +08:00
|
|
|
Base_Reg.getNode()->dump();
|
2009-08-08 05:33:25 +08:00
|
|
|
else
|
2010-01-05 09:29:08 +08:00
|
|
|
dbgs() << "nul";
|
2010-04-30 07:30:41 +08:00
|
|
|
dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
|
2009-08-23 19:52:17 +08:00
|
|
|
<< " Scale" << Scale << '\n'
|
|
|
|
<< "IndexReg ";
|
2014-04-28 12:05:08 +08:00
|
|
|
if (IndexReg.getNode())
|
2009-08-08 05:33:25 +08:00
|
|
|
IndexReg.getNode()->dump();
|
|
|
|
else
|
2012-08-02 02:39:17 +08:00
|
|
|
dbgs() << "nul";
|
2010-01-05 09:29:08 +08:00
|
|
|
dbgs() << " Disp " << Disp << '\n'
|
2009-08-23 19:52:17 +08:00
|
|
|
<< "GV ";
|
2009-08-08 05:33:25 +08:00
|
|
|
if (GV)
|
|
|
|
GV->dump();
|
|
|
|
else
|
2010-01-05 09:29:08 +08:00
|
|
|
dbgs() << "nul";
|
|
|
|
dbgs() << " CP ";
|
2009-08-08 05:33:25 +08:00
|
|
|
if (CP)
|
|
|
|
CP->dump();
|
|
|
|
else
|
2010-01-05 09:29:08 +08:00
|
|
|
dbgs() << "nul";
|
|
|
|
dbgs() << '\n'
|
2009-08-23 19:52:17 +08:00
|
|
|
<< "ES ";
|
2009-08-08 05:33:25 +08:00
|
|
|
if (ES)
|
2010-01-05 09:29:08 +08:00
|
|
|
dbgs() << ES;
|
2009-08-08 05:33:25 +08:00
|
|
|
else
|
2010-01-05 09:29:08 +08:00
|
|
|
dbgs() << "nul";
|
2015-06-23 01:46:53 +08:00
|
|
|
dbgs() << " MCSym ";
|
|
|
|
if (MCSym)
|
|
|
|
dbgs() << MCSym;
|
|
|
|
else
|
|
|
|
dbgs() << "nul";
|
2010-01-05 09:29:08 +08:00
|
|
|
dbgs() << " JT" << JT << " Align" << Align << '\n';
|
2008-08-12 07:46:25 +08:00
|
|
|
}
|
2012-09-07 03:06:06 +08:00
|
|
|
#endif
|
2005-11-19 10:11:08 +08:00
|
|
|
};
|
2016-04-06 04:45:04 +08:00
|
|
|
}
|
2005-11-19 10:11:08 +08:00
|
|
|
|
2005-11-16 09:54:32 +08:00
|
|
|
namespace {
|
|
|
|
//===--------------------------------------------------------------------===//
|
2015-10-13 23:12:27 +08:00
|
|
|
/// ISel - X86-specific code to select X86 machine instructions for
/// SelectionDAG operations.
///
class X86DAGToDAGISel final : public SelectionDAGISel {
  /// Keep a pointer to the X86Subtarget around so that we can
  /// make the right decision when generating code for different targets.
  /// Null until runOnMachineFunction() assigns it for the current function.
  const X86Subtarget *Subtarget;

  /// If true, selector should try to optimize for code size instead of
  /// performance.
  bool OptForSize;

  /// If true, selector should try to optimize for minimum code size.
  bool OptForMinSize;

public:
  explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
      : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
        OptForMinSize(false) {}

  StringRef getPassName() const override {
    return "X86 DAG->DAG Instruction Selection";
  }

  /// Refresh the cached subtarget, then run the generic selector on \p MF.
  /// Always returns true.
  bool runOnMachineFunction(MachineFunction &MF) override {
    // Reset the subtarget each time through.
    Subtarget = &MF.getSubtarget<X86Subtarget>();
    SelectionDAGISel::runOnMachineFunction(MF);
    return true;
  }

  void EmitFunctionEntryCode() override;

  bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

  void PreprocessISelDAG() override;

  // Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

private:
  void Select(SDNode *N) override;
  bool tryGather(SDNode *N, unsigned Opc);

  bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
  bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
  bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
  bool matchAddress(SDValue N, X86ISelAddressMode &AM);
  bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
  bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                               unsigned Depth);
  bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
  bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                  SDValue &Scale, SDValue &Index, SDValue &Disp,
                  SDValue &Segment);
  bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                        SDValue &Scale, SDValue &Index, SDValue &Disp,
                        SDValue &Segment);
  bool selectMOV64Imm32(SDValue N, SDValue &Imm);
  bool selectLEAAddr(SDValue N, SDValue &Base,
                     SDValue &Scale, SDValue &Index, SDValue &Disp,
                     SDValue &Segment);
  bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                          SDValue &Scale, SDValue &Index, SDValue &Disp,
                          SDValue &Segment);
  bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                         SDValue &Scale, SDValue &Index, SDValue &Disp,
                         SDValue &Segment);
  bool selectScalarSSELoad(SDNode *Root, SDValue N,
                           SDValue &Base, SDValue &Scale,
                           SDValue &Index, SDValue &Disp,
                           SDValue &Segment,
                           SDValue &NodeWithChain);
  bool selectRelocImm(SDValue N, SDValue &Op);

  bool tryFoldLoad(SDNode *P, SDValue N,
                   SDValue &Base, SDValue &Scale,
                   SDValue &Index, SDValue &Disp,
                   SDValue &Segment);

  /// Implement addressing mode selection for inline asm expressions.
  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                    unsigned ConstraintID,
                                    std::vector<SDValue> &OutOps) override;

  void emitSpecialCodeForMain();

  /// Expand the matched addressing mode \p AM into the five operand values
  /// \p Base, \p Scale, \p Index, \p Disp and \p Segment.
  ///
  /// The displacement is built from the first symbolic component that is set
  /// (GV, CP, ES, MCSym, JT, BlockAddr, in that order), falling back to the
  /// plain constant AM.Disp. When AM carries no segment node, register 0 is
  /// used for \p Segment.
  inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                 SDValue &Base, SDValue &Scale,
                                 SDValue &Index, SDValue &Disp,
                                 SDValue &Segment) {
    Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
               ? CurDAG->getTargetFrameIndex(
                     AM.Base_FrameIndex,
                     TLI->getPointerTy(CurDAG->getDataLayout()))
               : AM.Base_Reg;
    Scale = getI8Imm(AM.Scale, DL);
    Index = AM.IndexReg;
    // These are 32-bit even in 64-bit mode since RIP-relative offset
    // is 32-bit.
    if (AM.GV)
      Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                            MVT::i32, AM.Disp,
                                            AM.SymbolFlags);
    else if (AM.CP)
      Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
                                           AM.Align, AM.Disp, AM.SymbolFlags);
    else if (AM.ES) {
      assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
      Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
    } else if (AM.MCSym) {
      assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
      // getMCSymbol() below takes no flags argument, so none may be pending.
      assert(AM.SymbolFlags == 0 && "Symbol flags are ignored with MCSym.");
      Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
    } else if (AM.JT != -1) {
      assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
      Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
    } else if (AM.BlockAddr)
      Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                           AM.SymbolFlags);
    else
      Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

    if (AM.Segment.getNode())
      Segment = AM.Segment;
    else
      Segment = CurDAG->getRegister(0, MVT::i32);
  }

  // Utility function to determine whether we should avoid selecting
  // immediate forms of instructions for better code size or not.
  // At a high level, we'd like to avoid such instructions when
  // we have similar constants used within the same basic block
  // that can be kept in a register.
  //
  // Returns true when OptForSize is set and at least two qualifying uses of
  // N are found; the scan stops as soon as the second one is counted.
  bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
    uint32_t UseCount = 0;

    // Do not want to hoist if we're not optimizing for size.
    // TODO: We'd like to remove this restriction.
    // See the comment in X86InstrInfo.td for more info.
    if (!OptForSize)
      return false;

    // Walk all the users of the immediate.
    for (SDNode::use_iterator UI = N->use_begin(),
         UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {

      SDNode *User = *UI;

      // This user is already selected. Count it as a legitimate use and
      // move on.
      if (User->isMachineOpcode()) {
        UseCount++;
        continue;
      }

      // We want to count stores of immediates as real uses.
      if (User->getOpcode() == ISD::STORE &&
          User->getOperand(1).getNode() == N) {
        UseCount++;
        continue;
      }

      // We don't currently match users that have > 2 operands (except
      // for stores, which are handled above)
      // Those instruction won't match in ISEL, for now, and would
      // be counted incorrectly.
      // This may change in the future as we add additional instruction
      // types.
      if (User->getNumOperands() != 2)
        continue;

      // Immediates that are used for offsets as part of stack
      // manipulation should be left alone. These are typically
      // used to indicate SP offsets for argument passing and
      // will get pulled into stores/pushes (implicitly).
      if (User->getOpcode() == X86ISD::ADD ||
          User->getOpcode() == ISD::ADD    ||
          User->getOpcode() == X86ISD::SUB ||
          User->getOpcode() == ISD::SUB) {

        // Find the other operand of the add/sub.
        SDValue OtherOp = User->getOperand(0);
        if (OtherOp.getNode() == N)
          OtherOp = User->getOperand(1);

        // Don't count if the other operand is SP.
        RegisterSDNode *RegNode;
        if (OtherOp->getOpcode() == ISD::CopyFromReg &&
            (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
          if ((RegNode->getReg() == X86::ESP) ||
              (RegNode->getReg() == X86::RSP))
            continue;
      }

      // ... otherwise, count this and move on.
      UseCount++;
    }

    // If we have more than 1 use, then recommend for hoisting.
    return (UseCount > 1);
  }

  /// Return a target constant with the specified value of type i8.
  inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
  }

  /// Return a target constant with the specified value, of type i32.
  inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
  }

  /// Return an SDNode that returns the value of the global base register.
  /// Output instructions required to initialize the global base register,
  /// if necessary.
  SDNode *getGlobalBaseReg();

  /// Return a reference to the TargetMachine, casted to the target-specific
  /// type.
  const X86TargetMachine &getTargetMachine() const {
    return static_cast<const X86TargetMachine &>(TM);
  }

  /// Return a pointer to the target-specific TargetInstrInfo, obtained from
  /// the cached Subtarget.
  const X86InstrInfo *getInstrInfo() const {
    return Subtarget->getInstrInfo();
  }

  /// \brief Address-mode matching performs shift-of-and to and-of-shift
  /// reassociation in order to expose more scaled addressing
  /// opportunities.
  bool ComplexPatternFuncMutatesDAG() const override {
    return true;
  }
};
|
2016-04-06 04:45:04 +08:00
|
|
|
}
|
|
|
|
|
2006-08-08 08:31:00 +08:00
|
|
|
|
2010-02-16 03:41:07 +08:00
|
|
|
bool
|
|
|
|
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
|
2009-04-30 07:29:43 +08:00
|
|
|
if (OptLevel == CodeGenOpt::None) return false;
|
2006-10-14 16:33:25 +08:00
|
|
|
|
2010-02-16 03:41:07 +08:00
|
|
|
if (!N.hasOneUse())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (N.getOpcode() != ISD::LOAD)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// If N is a load, do additional profitability checks.
|
|
|
|
if (U == Root) {
|
2008-11-27 08:49:46 +08:00
|
|
|
switch (U->getOpcode()) {
|
|
|
|
default: break;
|
2010-01-05 04:51:50 +08:00
|
|
|
case X86ISD::ADD:
|
|
|
|
case X86ISD::SUB:
|
|
|
|
case X86ISD::AND:
|
|
|
|
case X86ISD::XOR:
|
|
|
|
case X86ISD::OR:
|
2008-11-27 08:49:46 +08:00
|
|
|
case ISD::ADD:
|
|
|
|
case ISD::ADDC:
|
|
|
|
case ISD::ADDE:
|
|
|
|
case ISD::AND:
|
|
|
|
case ISD::OR:
|
|
|
|
case ISD::XOR: {
|
2009-04-10 18:09:34 +08:00
|
|
|
SDValue Op1 = U->getOperand(1);
|
|
|
|
|
2008-11-27 08:49:46 +08:00
|
|
|
// If the other operand is a 8-bit immediate we should fold the immediate
|
|
|
|
// instead. This reduces code size.
|
|
|
|
// e.g.
|
|
|
|
// movl 4(%esp), %eax
|
|
|
|
// addl $4, %eax
|
|
|
|
// vs.
|
|
|
|
// movl $4, %eax
|
|
|
|
// addl 4(%esp), %eax
|
|
|
|
// The former is 2 bytes shorter. In case where the increment is 1, then
|
|
|
|
// the saving can be 4 bytes (by using incl %eax).
|
2009-04-10 18:09:34 +08:00
|
|
|
if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
|
2009-03-14 10:07:16 +08:00
|
|
|
if (Imm->getAPIntValue().isSignedIntN(8))
|
|
|
|
return false;
|
2009-04-10 18:09:34 +08:00
|
|
|
|
|
|
|
// If the other operand is a TLS address, we should fold it instead.
|
|
|
|
// This produces
|
|
|
|
// movl %gs:0, %eax
|
|
|
|
// leal i@NTPOFF(%eax), %eax
|
|
|
|
// instead of
|
|
|
|
// movl $i@NTPOFF, %eax
|
|
|
|
// addl %gs:0, %eax
|
|
|
|
// if the block also has an access to a second TLS address this will save
|
|
|
|
// a load.
|
2013-12-05 13:44:44 +08:00
|
|
|
// FIXME: This is probably also true for non-TLS addresses.
|
2009-04-10 18:09:34 +08:00
|
|
|
if (Op1.getOpcode() == X86ISD::Wrapper) {
|
|
|
|
SDValue Val = Op1.getOperand(0);
|
|
|
|
if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
|
|
|
|
return false;
|
|
|
|
}
|
2008-11-27 08:49:46 +08:00
|
|
|
}
|
|
|
|
}
|
2010-02-16 03:41:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Replace the original chain operand of the call with
|
2010-03-14 11:48:46 +08:00
|
|
|
/// load's chain operand and move load below the call's chain operand.
|
2016-04-06 04:45:04 +08:00
|
|
|
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
|
|
|
|
SDValue Call, SDValue OrigChain) {
|
2008-08-26 05:27:18 +08:00
|
|
|
SmallVector<SDValue, 8> Ops;
|
2010-03-14 11:48:46 +08:00
|
|
|
SDValue Chain = OrigChain.getOperand(0);
|
2009-01-27 02:43:34 +08:00
|
|
|
if (Chain.getNode() == Load.getNode())
|
|
|
|
Ops.push_back(Load.getOperand(0));
|
|
|
|
else {
|
|
|
|
assert(Chain.getOpcode() == ISD::TokenFactor &&
|
2010-03-14 11:48:46 +08:00
|
|
|
"Unexpected chain operand");
|
2009-01-27 02:43:34 +08:00
|
|
|
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
|
|
|
|
if (Chain.getOperand(i).getNode() == Load.getNode())
|
|
|
|
Ops.push_back(Load.getOperand(0));
|
|
|
|
else
|
|
|
|
Ops.push_back(Chain.getOperand(i));
|
|
|
|
SDValue NewChain =
|
2014-04-27 02:35:24 +08:00
|
|
|
CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
|
2009-01-27 02:43:34 +08:00
|
|
|
Ops.clear();
|
|
|
|
Ops.push_back(NewChain);
|
|
|
|
}
|
2015-02-17 23:29:18 +08:00
|
|
|
Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
|
2014-04-28 13:57:50 +08:00
|
|
|
CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
|
2010-06-18 23:30:29 +08:00
|
|
|
CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
|
2008-08-26 05:27:18 +08:00
|
|
|
Load.getOperand(1), Load.getOperand(2));
|
2012-10-03 07:49:13 +08:00
|
|
|
|
2008-08-26 05:27:18 +08:00
|
|
|
Ops.clear();
|
2008-08-29 05:40:38 +08:00
|
|
|
Ops.push_back(SDValue(Load.getNode(), 1));
|
2015-02-17 23:29:18 +08:00
|
|
|
Ops.append(Call->op_begin() + 1, Call->op_end());
|
2014-04-28 13:57:50 +08:00
|
|
|
CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
|
2008-08-26 05:27:18 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Return true if call address is a load and it can be
|
2008-08-26 05:27:18 +08:00
|
|
|
/// moved below CALLSEQ_START and the chains leading up to the call.
|
|
|
|
/// Return the CALLSEQ_START by reference as a second output.
|
2010-03-14 11:48:46 +08:00
|
|
|
/// In the case of a tail call, there isn't a callseq node between the call
|
|
|
|
/// chain and the load.
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
|
2012-10-05 09:48:22 +08:00
|
|
|
// The transformation is somewhat dangerous if the call's chain was glued to
|
|
|
|
// the call. After MoveBelowOrigChain the load is moved between the call and
|
|
|
|
// the chain, this can create a cycle if the load is not folded. So it is
|
|
|
|
// *really* important that we are sure the load will be folded.
|
2008-08-29 05:40:38 +08:00
|
|
|
if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
|
2008-08-26 05:27:18 +08:00
|
|
|
return false;
|
2008-08-29 05:40:38 +08:00
|
|
|
LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
|
2008-08-26 05:27:18 +08:00
|
|
|
if (!LD ||
|
|
|
|
LD->isVolatile() ||
|
|
|
|
LD->getAddressingMode() != ISD::UNINDEXED ||
|
|
|
|
LD->getExtensionType() != ISD::NON_EXTLOAD)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Now let's find the callseq_start.
|
2010-03-14 11:48:46 +08:00
|
|
|
while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
|
2008-08-26 05:27:18 +08:00
|
|
|
if (!Chain.hasOneUse())
|
|
|
|
return false;
|
|
|
|
Chain = Chain.getOperand(0);
|
|
|
|
}
|
2010-03-14 11:48:46 +08:00
|
|
|
|
|
|
|
if (!Chain.getNumOperands())
|
|
|
|
return false;
|
2013-01-07 03:00:15 +08:00
|
|
|
// Since we are not checking for AA here, conservatively abort if the chain
|
|
|
|
// writes to memory. It's not safe to move the callee (a load) across a store.
|
|
|
|
if (isa<MemSDNode>(Chain.getNode()) &&
|
|
|
|
cast<MemSDNode>(Chain.getNode())->writeMem())
|
|
|
|
return false;
|
2009-01-27 02:43:34 +08:00
|
|
|
if (Chain.getOperand(0).getNode() == Callee.getNode())
|
|
|
|
return true;
|
|
|
|
if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
|
2009-09-15 09:22:01 +08:00
|
|
|
Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
|
|
|
|
Callee.getValue(1).hasOneUse())
|
2009-01-27 02:43:34 +08:00
|
|
|
return true;
|
|
|
|
return false;
|
2008-08-26 05:27:18 +08:00
|
|
|
}
|
|
|
|
|
2010-03-03 07:12:51 +08:00
|
|
|
void X86DAGToDAGISel::PreprocessISelDAG() {
|
2016-03-25 09:10:56 +08:00
|
|
|
// OptFor[Min]Size are used in pattern predicates that isel is matching.
|
2015-08-11 00:47:47 +08:00
|
|
|
OptForSize = MF->getFunction()->optForSize();
|
2016-03-25 09:10:56 +08:00
|
|
|
OptForMinSize = MF->getFunction()->optForMinSize();
|
|
|
|
assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2008-08-23 10:25:05 +08:00
|
|
|
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
|
|
|
|
E = CurDAG->allnodes_end(); I != E; ) {
|
2015-10-20 05:48:29 +08:00
|
|
|
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
|
2010-03-03 07:12:51 +08:00
|
|
|
|
2010-03-14 11:48:46 +08:00
|
|
|
if (OptLevel != CodeGenOpt::None &&
|
2013-03-29 07:13:21 +08:00
|
|
|
// Only does this when target favors doesn't favor register indirect
|
|
|
|
// call.
|
|
|
|
((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
|
2012-10-05 09:48:22 +08:00
|
|
|
(N->getOpcode() == X86ISD::TC_RETURN &&
|
2013-01-14 03:03:55 +08:00
|
|
|
// Only does this if load can be folded into TC_RETURN.
|
2012-10-05 09:48:22 +08:00
|
|
|
(Subtarget->is64Bit() ||
|
2016-06-28 05:33:08 +08:00
|
|
|
!getTargetMachine().isPositionIndependent())))) {
|
2010-03-03 07:12:51 +08:00
|
|
|
/// Also try moving call address load from outside callseq_start to just
|
|
|
|
/// before the call to allow it to be folded.
|
|
|
|
///
|
|
|
|
/// [Load chain]
|
|
|
|
/// ^
|
|
|
|
/// |
|
|
|
|
/// [Load]
|
|
|
|
/// ^ ^
|
|
|
|
/// | |
|
|
|
|
/// / \--
|
|
|
|
/// / |
|
|
|
|
///[CALLSEQ_START] |
|
|
|
|
/// ^ |
|
|
|
|
/// | |
|
|
|
|
/// [LOAD/C2Reg] |
|
|
|
|
/// | |
|
|
|
|
/// \ /
|
|
|
|
/// \ /
|
|
|
|
/// [CALL]
|
2010-03-14 11:48:46 +08:00
|
|
|
bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
|
2010-03-03 07:12:51 +08:00
|
|
|
SDValue Chain = N->getOperand(0);
|
|
|
|
SDValue Load = N->getOperand(1);
|
2010-03-14 11:48:46 +08:00
|
|
|
if (!isCalleeLoad(Load, Chain, HasCallSeq))
|
2010-03-03 07:12:51 +08:00
|
|
|
continue;
|
2015-10-14 00:23:00 +08:00
|
|
|
moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
|
2010-03-03 07:12:51 +08:00
|
|
|
++NumLoadMoved;
|
|
|
|
continue;
|
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2010-03-03 07:12:51 +08:00
|
|
|
// Lower fpround and fpextend nodes that target the FP stack to be store and
|
|
|
|
// load to the stack. This is a gross hack. We would like to simply mark
|
|
|
|
// these as being illegal, but when we do that, legalize produces these when
|
|
|
|
// it expands calls, then expands these in the same legalize pass. We would
|
|
|
|
// like dag combine to be able to hack on these between the call expansion
|
|
|
|
// and the node legalization. As such this pass basically does "really
|
|
|
|
// late" legalization of these inline with the X86 isel pass.
|
|
|
|
// FIXME: This should only happen when not compiled with -O0.
|
Significantly simplify and improve handling of FP function results on x86-32.
This case returns the value in ST(0) and then has to convert it to an SSE
register. This causes significant codegen ugliness in some cases. For
example in the trivial fp-stack-direct-ret.ll testcase we used to generate:
_bar:
subl $28, %esp
call L_foo$stub
fstpl 16(%esp)
movsd 16(%esp), %xmm0
movsd %xmm0, 8(%esp)
fldl 8(%esp)
addl $28, %esp
ret
because we move the result of foo() into an XMM register, then have to
move it back for the return of bar.
Instead of hacking ever-more special cases into the call result lowering code
we take a much simpler approach: on x86-32, fp return is modeled as always
returning into an f80 register which is then truncated to f32 or f64 as needed.
Similarly for a result, we model it as an extension to f80 + return.
This exposes the truncate and extensions to the dag combiner, allowing target
independent code to hack on them, eliminating them in this case. This gives
us this code for the example above:
_bar:
subl $12, %esp
call L_foo$stub
addl $12, %esp
ret
The nasty aspect of this is that these conversions are not legal, but we want
the second pass of dag combiner (post-legalize) to be able to hack on them.
To handle this, we lie to legalize and say they are legal, then custom expand
them on entry to the isel pass (PreprocessForFPConvert). This is gross, but
less gross than the code it is replacing :)
This also allows us to generate better code in several other cases. For
example on fp-stack-ret-conv.ll, we now generate:
_test:
subl $12, %esp
call L_foo$stub
fstps 8(%esp)
movl 16(%esp), %eax
cvtss2sd 8(%esp), %xmm0
movsd %xmm0, (%eax)
addl $12, %esp
ret
where before we produced (incidentally, the old bad code is identical to what
gcc produces):
_test:
subl $12, %esp
call L_foo$stub
fstpl (%esp)
cvtsd2ss (%esp), %xmm0
cvtss2sd %xmm0, %xmm0
movl 16(%esp), %eax
movsd %xmm0, (%eax)
addl $12, %esp
ret
Note that we generate slightly worse code on pr1505b.ll due to a scheduling
deficiency that is unrelated to this patch.
llvm-svn: 46307
2008-01-24 16:07:48 +08:00
|
|
|
if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
|
|
|
|
continue;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT SrcVT = N->getOperand(0).getSimpleValueType();
|
|
|
|
MVT DstVT = N->getSimpleValueType(0);
|
2011-08-02 05:54:05 +08:00
|
|
|
|
|
|
|
// If any of the sources are vectors, no fp stack involved.
|
|
|
|
if (SrcVT.isVector() || DstVT.isVector())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// If the source and destination are SSE registers, then this is a legal
|
|
|
|
// conversion that should not be lowered.
|
2013-06-27 19:07:42 +08:00
|
|
|
const X86TargetLowering *X86Lowering =
|
2014-10-08 15:32:17 +08:00
|
|
|
static_cast<const X86TargetLowering *>(TLI);
|
2013-06-20 05:36:55 +08:00
|
|
|
bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
|
|
|
|
bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
|
Significantly simplify and improve handling of FP function results on x86-32.
This case returns the value in ST(0) and then has to convert it to an SSE
register. This causes significant codegen ugliness in some cases. For
example in the trivial fp-stack-direct-ret.ll testcase we used to generate:
_bar:
subl $28, %esp
call L_foo$stub
fstpl 16(%esp)
movsd 16(%esp), %xmm0
movsd %xmm0, 8(%esp)
fldl 8(%esp)
addl $28, %esp
ret
because we move the result of foo() into an XMM register, then have to
move it back for the return of bar.
Instead of hacking ever-more special cases into the call result lowering code
we take a much simpler approach: on x86-32, fp return is modeled as always
returning into an f80 register which is then truncated to f32 or f64 as needed.
Similarly for a result, we model it as an extension to f80 + return.
This exposes the truncate and extensions to the dag combiner, allowing target
independent code to hack on them, eliminating them in this case. This gives
us this code for the example above:
_bar:
subl $12, %esp
call L_foo$stub
addl $12, %esp
ret
The nasty aspect of this is that these conversions are not legal, but we want
the second pass of dag combiner (post-legalize) to be able to hack on them.
To handle this, we lie to legalize and say they are legal, then custom expand
them on entry to the isel pass (PreprocessForFPConvert). This is gross, but
less gross than the code it is replacing :)
This also allows us to generate better code in several other cases. For
example on fp-stack-ret-conv.ll, we now generate:
_test:
subl $12, %esp
call L_foo$stub
fstps 8(%esp)
movl 16(%esp), %eax
cvtss2sd 8(%esp), %xmm0
movsd %xmm0, (%eax)
addl $12, %esp
ret
where before we produced (incidentally, the old bad code is identical to what
gcc produces):
_test:
subl $12, %esp
call L_foo$stub
fstpl (%esp)
cvtsd2ss (%esp), %xmm0
cvtss2sd %xmm0, %xmm0
movl 16(%esp), %eax
movsd %xmm0, (%eax)
addl $12, %esp
ret
Note that we generate slightly worse code on pr1505b.ll due to a scheduling
deficiency that is unrelated to this patch.
llvm-svn: 46307
2008-01-24 16:07:48 +08:00
|
|
|
if (SrcIsSSE && DstIsSSE)
|
|
|
|
continue;
|
|
|
|
|
2008-03-09 15:05:32 +08:00
|
|
|
if (!SrcIsSSE && !DstIsSSE) {
|
|
|
|
// If this is an FPStack extension, it is a noop.
|
|
|
|
if (N->getOpcode() == ISD::FP_EXTEND)
|
|
|
|
continue;
|
|
|
|
// If this is a value-preserving FPStack truncation, it is a noop.
|
|
|
|
if (N->getConstantOperandVal(1))
|
|
|
|
continue;
|
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
Significantly simplify and improve handling of FP function results on x86-32.
This case returns the value in ST(0) and then has to convert it to an SSE
register. This causes significant codegen ugliness in some cases. For
example in the trivial fp-stack-direct-ret.ll testcase we used to generate:
_bar:
subl $28, %esp
call L_foo$stub
fstpl 16(%esp)
movsd 16(%esp), %xmm0
movsd %xmm0, 8(%esp)
fldl 8(%esp)
addl $28, %esp
ret
because we move the result of foo() into an XMM register, then have to
move it back for the return of bar.
Instead of hacking ever-more special cases into the call result lowering code
we take a much simpler approach: on x86-32, fp return is modeled as always
returning into an f80 register which is then truncated to f32 or f64 as needed.
Similarly for a result, we model it as an extension to f80 + return.
This exposes the truncate and extensions to the dag combiner, allowing target
independent code to hack on them, eliminating them in this case. This gives
us this code for the example above:
_bar:
subl $12, %esp
call L_foo$stub
addl $12, %esp
ret
The nasty aspect of this is that these conversions are not legal, but we want
the second pass of dag combiner (post-legalize) to be able to hack on them.
To handle this, we lie to legalize and say they are legal, then custom expand
them on entry to the isel pass (PreprocessForFPConvert). This is gross, but
less gross than the code it is replacing :)
This also allows us to generate better code in several other cases. For
example on fp-stack-ret-conv.ll, we now generate:
_test:
subl $12, %esp
call L_foo$stub
fstps 8(%esp)
movl 16(%esp), %eax
cvtss2sd 8(%esp), %xmm0
movsd %xmm0, (%eax)
addl $12, %esp
ret
where before we produced (incidentally, the old bad code is identical to what
gcc produces):
_test:
subl $12, %esp
call L_foo$stub
fstpl (%esp)
cvtsd2ss (%esp), %xmm0
cvtss2sd %xmm0, %xmm0
movl 16(%esp), %eax
movsd %xmm0, (%eax)
addl $12, %esp
ret
Note that we generate slightly worse code on pr1505b.ll due to a scheduling
deficiency that is unrelated to this patch.
llvm-svn: 46307
2008-01-24 16:07:48 +08:00
|
|
|
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
|
|
|
|
// FPStack has extload and truncstore. SSE can fold direct loads into other
|
|
|
|
// operations. Based on this, decide what we want to do.
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT MemVT;
|
Significantly simplify and improve handling of FP function results on x86-32.
This case returns the value in ST(0) and then has to convert it to an SSE
register. This causes significant codegen ugliness in some cases. For
example in the trivial fp-stack-direct-ret.ll testcase we used to generate:
_bar:
subl $28, %esp
call L_foo$stub
fstpl 16(%esp)
movsd 16(%esp), %xmm0
movsd %xmm0, 8(%esp)
fldl 8(%esp)
addl $28, %esp
ret
because we move the result of foo() into an XMM register, then have to
move it back for the return of bar.
Instead of hacking ever-more special cases into the call result lowering code
we take a much simpler approach: on x86-32, fp return is modeled as always
returning into an f80 register which is then truncated to f32 or f64 as needed.
Similarly for a result, we model it as an extension to f80 + return.
This exposes the truncate and extensions to the dag combiner, allowing target
independent code to hack on them, eliminating them in this case. This gives
us this code for the example above:
_bar:
subl $12, %esp
call L_foo$stub
addl $12, %esp
ret
The nasty aspect of this is that these conversions are not legal, but we want
the second pass of dag combiner (post-legalize) to be able to hack on them.
To handle this, we lie to legalize and say they are legal, then custom expand
them on entry to the isel pass (PreprocessForFPConvert). This is gross, but
less gross than the code it is replacing :)
This also allows us to generate better code in several other cases. For
example on fp-stack-ret-conv.ll, we now generate:
_test:
subl $12, %esp
call L_foo$stub
fstps 8(%esp)
movl 16(%esp), %eax
cvtss2sd 8(%esp), %xmm0
movsd %xmm0, (%eax)
addl $12, %esp
ret
where before we produced (incidentally, the old bad code is identical to what
gcc produces):
_test:
subl $12, %esp
call L_foo$stub
fstpl (%esp)
cvtsd2ss (%esp), %xmm0
cvtss2sd %xmm0, %xmm0
movl 16(%esp), %eax
movsd %xmm0, (%eax)
addl $12, %esp
ret
Note that we generate slightly worse code on pr1505b.ll due to a scheduling
deficiency that is unrelated to this patch.
llvm-svn: 46307
2008-01-24 16:07:48 +08:00
|
|
|
if (N->getOpcode() == ISD::FP_ROUND)
|
|
|
|
MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
|
|
|
|
else
|
|
|
|
MemVT = SrcIsSSE ? SrcVT : DstVT;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2008-08-23 10:25:05 +08:00
|
|
|
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(N);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
Significantly simplify and improve handling of FP function results on x86-32.
This case returns the value in ST(0) and then has to convert it to an SSE
register. This causes significant codegen ugliness in some cases. For
example in the trivial fp-stack-direct-ret.ll testcase we used to generate:
_bar:
subl $28, %esp
call L_foo$stub
fstpl 16(%esp)
movsd 16(%esp), %xmm0
movsd %xmm0, 8(%esp)
fldl 8(%esp)
addl $28, %esp
ret
because we move the result of foo() into an XMM register, then have to
move it back for the return of bar.
Instead of hacking ever-more special cases into the call result lowering code
we take a much simpler approach: on x86-32, fp return is modeled as always
returning into an f80 register which is then truncated to f32 or f64 as needed.
Similarly for a result, we model it as an extension to f80 + return.
This exposes the truncate and extensions to the dag combiner, allowing target
independent code to hack on them, eliminating them in this case. This gives
us this code for the example above:
_bar:
subl $12, %esp
call L_foo$stub
addl $12, %esp
ret
The nasty aspect of this is that these conversions are not legal, but we want
the second pass of dag combiner (post-legalize) to be able to hack on them.
To handle this, we lie to legalize and say they are legal, then custom expand
them on entry to the isel pass (PreprocessForFPConvert). This is gross, but
less gross than the code it is replacing :)
This also allows us to generate better code in several other cases. For
example on fp-stack-ret-conv.ll, we now generate:
_test:
subl $12, %esp
call L_foo$stub
fstps 8(%esp)
movl 16(%esp), %eax
cvtss2sd 8(%esp), %xmm0
movsd %xmm0, (%eax)
addl $12, %esp
ret
where before we produced (incidentally, the old bad code is identical to what
gcc produces):
_test:
subl $12, %esp
call L_foo$stub
fstpl (%esp)
cvtsd2ss (%esp), %xmm0
cvtss2sd %xmm0, %xmm0
movl 16(%esp), %eax
movsd %xmm0, (%eax)
addl $12, %esp
ret
Note that we generate slightly worse code on pr1505b.ll due to a scheduling
deficiency that is unrelated to this patch.
llvm-svn: 46307
2008-01-24 16:07:48 +08:00
|
|
|
// FIXME: optimize the case where the src/dest is a load or store?
|
[SelectionDAG] Get rid of bool parameters in SelectionDAG::getLoad, getStore, and friends.
Summary:
Instead, we take a single flags arg (a bitset).
Also add a default 0 alignment, and change the order of arguments so the
alignment comes before the flags.
This greatly simplifies many callsites, and fixes a bug in
AMDGPUISelLowering, wherein the order of the args to getLoad was
inverted. It also greatly simplifies the process of adding another flag
to getLoad.
Reviewers: chandlerc, tstellarAMD
Subscribers: jholewinski, arsenm, jyknight, dsanders, nemanjai, llvm-commits
Differential Revision: http://reviews.llvm.org/D22249
llvm-svn: 275592
2016-07-16 02:27:10 +08:00
|
|
|
SDValue Store =
|
|
|
|
CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
|
|
|
|
MemTmp, MachinePointerInfo(), MemVT);
|
2011-02-17 00:23:55 +08:00
|
|
|
SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
|
[SelectionDAG] Get rid of bool parameters in SelectionDAG::getLoad, getStore, and friends.
Summary:
Instead, we take a single flags arg (a bitset).
Also add a default 0 alignment, and change the order of arguments so the
alignment comes before the flags.
This greatly simplifies many callsites, and fixes a bug in
AMDGPUISelLowering, wherein the order of the args to getLoad was
inverted. It also greatly simplifies the process of adding another flag
to getLoad.
Reviewers: chandlerc, tstellarAMD
Subscribers: jholewinski, arsenm, jyknight, dsanders, nemanjai, llvm-commits
Differential Revision: http://reviews.llvm.org/D22249
llvm-svn: 275592
2016-07-16 02:27:10 +08:00
|
|
|
MachinePointerInfo(), MemVT);
|
Significantly simplify and improve handling of FP function results on x86-32.
This case returns the value in ST(0) and then has to convert it to an SSE
register. This causes significant codegen ugliness in some cases. For
example in the trivial fp-stack-direct-ret.ll testcase we used to generate:
_bar:
subl $28, %esp
call L_foo$stub
fstpl 16(%esp)
movsd 16(%esp), %xmm0
movsd %xmm0, 8(%esp)
fldl 8(%esp)
addl $28, %esp
ret
because we move the result of foo() into an XMM register, then have to
move it back for the return of bar.
Instead of hacking ever-more special cases into the call result lowering code
we take a much simpler approach: on x86-32, fp return is modeled as always
returning into an f80 register which is then truncated to f32 or f64 as needed.
Similarly for a result, we model it as an extension to f80 + return.
This exposes the truncate and extensions to the dag combiner, allowing target
independent code to hack on them, eliminating them in this case. This gives
us this code for the example above:
_bar:
subl $12, %esp
call L_foo$stub
addl $12, %esp
ret
The nasty aspect of this is that these conversions are not legal, but we want
the second pass of dag combiner (post-legalize) to be able to hack on them.
To handle this, we lie to legalize and say they are legal, then custom expand
them on entry to the isel pass (PreprocessForFPConvert). This is gross, but
less gross than the code it is replacing :)
This also allows us to generate better code in several other cases. For
example on fp-stack-ret-conv.ll, we now generate:
_test:
subl $12, %esp
call L_foo$stub
fstps 8(%esp)
movl 16(%esp), %eax
cvtss2sd 8(%esp), %xmm0
movsd %xmm0, (%eax)
addl $12, %esp
ret
where before we produced (incidentally, the old bad code is identical to what
gcc produces):
_test:
subl $12, %esp
call L_foo$stub
fstpl (%esp)
cvtsd2ss (%esp), %xmm0
cvtss2sd %xmm0, %xmm0
movl 16(%esp), %eax
movsd %xmm0, (%eax)
addl $12, %esp
ret
Note that we generate slightly worse code on pr1505b.ll due to a scheduling
deficiency that is unrelated to this patch.
llvm-svn: 46307
2008-01-24 16:07:48 +08:00
|
|
|
|
|
|
|
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
|
|
|
|
// extload we created. This will cause general havok on the dag because
|
|
|
|
// anything below the conversion could be folded into other existing nodes.
|
|
|
|
// To avoid invalidating 'I', back it up to the convert node.
|
|
|
|
--I;
|
2008-08-23 10:25:05 +08:00
|
|
|
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
Significantly simplify and improve handling of FP function results on x86-32.
This case returns the value in ST(0) and then has to convert it to an SSE
register. This causes significant codegen ugliness in some cases. For
example in the trivial fp-stack-direct-ret.ll testcase we used to generate:
_bar:
subl $28, %esp
call L_foo$stub
fstpl 16(%esp)
movsd 16(%esp), %xmm0
movsd %xmm0, 8(%esp)
fldl 8(%esp)
addl $28, %esp
ret
because we move the result of foo() into an XMM register, then have to
move it back for the return of bar.
Instead of hacking ever-more special cases into the call result lowering code
we take a much simpler approach: on x86-32, fp return is modeled as always
returning into an f80 register which is then truncated to f32 or f64 as needed.
Similarly for a result, we model it as an extension to f80 + return.
This exposes the truncate and extensions to the dag combiner, allowing target
independent code to hack on them, eliminating them in this case. This gives
us this code for the example above:
_bar:
subl $12, %esp
call L_foo$stub
addl $12, %esp
ret
The nasty aspect of this is that these conversions are not legal, but we want
the second pass of dag combiner (post-legalize) to be able to hack on them.
To handle this, we lie to legalize and say they are legal, then custom expand
them on entry to the isel pass (PreprocessForFPConvert). This is gross, but
less gross than the code it is replacing :)
This also allows us to generate better code in several other cases. For
example on fp-stack-ret-conv.ll, we now generate:
_test:
subl $12, %esp
call L_foo$stub
fstps 8(%esp)
movl 16(%esp), %eax
cvtss2sd 8(%esp), %xmm0
movsd %xmm0, (%eax)
addl $12, %esp
ret
where before we produced (incidentally, the old bad code is identical to what
gcc produces):
_test:
subl $12, %esp
call L_foo$stub
fstpl (%esp)
cvtsd2ss (%esp), %xmm0
cvtss2sd %xmm0, %xmm0
movl 16(%esp), %eax
movsd %xmm0, (%eax)
addl $12, %esp
ret
Note that we generate slightly worse code on pr1505b.ll due to a scheduling
deficiency that is unrelated to this patch.
llvm-svn: 46307
2008-01-24 16:07:48 +08:00
|
|
|
// Now that we did that, the node is dead. Increment the iterator to the
|
|
|
|
// next node to process, then delete N.
|
|
|
|
++I;
|
2008-08-23 10:25:05 +08:00
|
|
|
CurDAG->DeleteNode(N);
|
2012-08-02 02:39:17 +08:00
|
|
|
}
|
Significantly simplify and improve handling of FP function results on x86-32.
This case returns the value in ST(0) and then has to convert it to an SSE
register. This causes significant codegen ugliness in some cases. For
example in the trivial fp-stack-direct-ret.ll testcase we used to generate:
_bar:
subl $28, %esp
call L_foo$stub
fstpl 16(%esp)
movsd 16(%esp), %xmm0
movsd %xmm0, 8(%esp)
fldl 8(%esp)
addl $28, %esp
ret
because we move the result of foo() into an XMM register, then have to
move it back for the return of bar.
Instead of hacking ever-more special cases into the call result lowering code
we take a much simpler approach: on x86-32, fp return is modeled as always
returning into an f80 register which is then truncated to f32 or f64 as needed.
Similarly for a result, we model it as an extension to f80 + return.
This exposes the truncate and extensions to the dag combiner, allowing target
independent code to hack on them, eliminating them in this case. This gives
us this code for the example above:
_bar:
subl $12, %esp
call L_foo$stub
addl $12, %esp
ret
The nasty aspect of this is that these conversions are not legal, but we want
the second pass of dag combiner (post-legalize) to be able to hack on them.
To handle this, we lie to legalize and say they are legal, then custom expand
them on entry to the isel pass (PreprocessForFPConvert). This is gross, but
less gross than the code it is replacing :)
This also allows us to generate better code in several other cases. For
example on fp-stack-ret-conv.ll, we now generate:
_test:
subl $12, %esp
call L_foo$stub
fstps 8(%esp)
movl 16(%esp), %eax
cvtss2sd 8(%esp), %xmm0
movsd %xmm0, (%eax)
addl $12, %esp
ret
where before we produced (incidentally, the old bad code is identical to what
gcc produces):
_test:
subl $12, %esp
call L_foo$stub
fstpl (%esp)
cvtsd2ss (%esp), %xmm0
cvtss2sd %xmm0, %xmm0
movl 16(%esp), %eax
movsd %xmm0, (%eax)
addl $12, %esp
ret
Note that we generate slightly worse code on pr1505b.ll due to a scheduling
deficiency that is unrelated to this patch.
llvm-svn: 46307
2008-01-24 16:07:48 +08:00
|
|
|
}
|
|
|
|
|
2016-04-06 04:45:04 +08:00
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Emit any code that needs to be executed only in the main function.
|
2015-10-14 00:23:00 +08:00
|
|
|
void X86DAGToDAGISel::emitSpecialCodeForMain() {
|
2011-01-06 08:47:10 +08:00
|
|
|
if (Subtarget->isTargetCygMing()) {
|
2015-02-21 13:49:45 +08:00
|
|
|
TargetLowering::ArgListTy Args;
|
2015-07-09 10:09:04 +08:00
|
|
|
auto &DL = CurDAG->getDataLayout();
|
2015-02-21 13:49:45 +08:00
|
|
|
|
|
|
|
TargetLowering::CallLoweringInfo CLI(*CurDAG);
|
|
|
|
CLI.setChain(CurDAG->getRoot())
|
|
|
|
.setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
|
2015-07-09 10:09:04 +08:00
|
|
|
CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
|
2016-06-22 20:54:25 +08:00
|
|
|
std::move(Args));
|
2015-02-21 13:49:45 +08:00
|
|
|
const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
|
|
|
|
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
|
|
|
|
CurDAG->setRoot(Result.second);
|
2011-01-06 08:47:10 +08:00
|
|
|
}
|
2007-09-26 05:52:30 +08:00
|
|
|
}
|
|
|
|
|
2010-04-15 04:17:22 +08:00
|
|
|
void X86DAGToDAGISel::EmitFunctionEntryCode() {
|
2007-09-26 05:52:30 +08:00
|
|
|
// If this is main, emit special code for main.
|
2010-04-15 04:17:22 +08:00
|
|
|
if (const Function *Fn = MF->getFunction())
|
|
|
|
if (Fn->hasExternalLinkage() && Fn->getName() == "main")
|
2015-10-14 00:23:00 +08:00
|
|
|
emitSpecialCodeForMain();
|
2007-09-26 05:52:30 +08:00
|
|
|
}
|
|
|
|
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool isDispSafeForFrameIndex(int64_t Val) {
|
2011-07-14 05:29:53 +08:00
|
|
|
// On 64-bit platforms, we can run into an issue where a frame index
|
|
|
|
// includes a displacement that, when added to the explicit displacement,
|
|
|
|
// will overflow the displacement field. Assuming that the frame index
|
|
|
|
// displacement fits into a 31-bit integer (which is only slightly more
|
|
|
|
// aggressive than the current fundamental assumption that it fits into
|
|
|
|
// a 32-bit integer), a 31-bit disp should always be safe.
|
|
|
|
return isInt<31>(Val);
|
|
|
|
}
|
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
|
2011-07-14 04:44:23 +08:00
|
|
|
X86ISelAddressMode &AM) {
|
2015-05-05 07:22:36 +08:00
|
|
|
// Cannot combine ExternalSymbol displacements with integer offsets.
|
2015-06-23 01:46:53 +08:00
|
|
|
if (Offset != 0 && (AM.ES || AM.MCSym))
|
2015-05-05 07:22:36 +08:00
|
|
|
return true;
|
2011-07-14 04:44:23 +08:00
|
|
|
int64_t Val = AM.Disp + Offset;
|
|
|
|
CodeModel::Model M = TM.getCodeModel();
|
2011-07-14 05:29:53 +08:00
|
|
|
if (Subtarget->is64Bit()) {
|
|
|
|
if (!X86::isOffsetSuitableForCodeModel(Val, M,
|
|
|
|
AM.hasSymbolicDisplacement()))
|
|
|
|
return true;
|
|
|
|
// In addition to the checks required for a register base, check that
|
|
|
|
// we do not try to use an unsafe Disp with a frame index.
|
|
|
|
if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
|
|
|
|
!isDispSafeForFrameIndex(Val))
|
|
|
|
return true;
|
2011-07-14 04:44:23 +08:00
|
|
|
}
|
2011-07-14 05:29:53 +08:00
|
|
|
AM.Disp = Val;
|
|
|
|
return false;
|
2016-04-06 04:45:04 +08:00
|
|
|
|
2011-07-14 04:44:23 +08:00
|
|
|
}
|
2009-04-09 05:14:34 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
/// Try to fold a load of the constant address 0 in address space 256 or 257
/// into a segment register in \p AM. Returns false if AM.Segment was set,
/// true if nothing was matched.
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address.
  // For more information see http://people.redhat.com/drepper/tls.pdf

  // The loaded address has to be the constant 0.
  ConstantSDNode *AddrC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!AddrC || AddrC->getSExtValue() != 0)
    return true;

  // A segment must not have been chosen already, and the target has to be a
  // glibc one.
  if (AM.Segment.getNode() != nullptr || !Subtarget->isTargetGlibc())
    return true;

  const unsigned AddrSpace = N->getPointerInfo().getAddrSpace();
  if (AddrSpace == 256) {
    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
    return false;
  }
  if (AddrSpace == 257) {
    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
    return false;
  }

  // Address space 258 is not handled here, because it is not used to
  // address TLS areas.
  return true;
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
|
|
|
|
/// mode. These wrap things that will resolve down into a symbol reference.
|
|
|
|
/// If no match is possible, this returns true, otherwise it returns false.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
// If the addressing mode already has a symbol as the displacement, we can
|
|
|
|
// never match another symbol.
|
2009-04-13 05:55:03 +08:00
|
|
|
if (AM.hasSymbolicDisplacement())
|
|
|
|
return true;
|
|
|
|
|
|
|
|
SDValue N0 = N.getOperand(0);
|
2009-08-06 07:01:26 +08:00
|
|
|
CodeModel::Model M = TM.getCodeModel();
|
|
|
|
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
// Handle X86-64 rip-relative addresses. We check this before checking direct
|
|
|
|
// folding because RIP is preferable to non-RIP accesses.
|
Cleanup and relax a restriction on the matching of global offsets into
x86 addressing modes. This allows PIE-based TLS offsets to fit directly
into an addressing mode immediate offset, which is the last remaining
code quality issue from PR12380. With this patch, that PR is completely
fixed.
To understand why this patch is correct to match these offsets into
addressing mode immediates, break it down by cases:
1) 32-bit is trivially correct, and unmodified here.
2) 64-bit non-small mode is unchanged and never matches.
3) 64-bit small PIC code which is RIP-relative is handled specially in
the match to try to fit RIP into the base register. If it fails, it
now early exits. This behavior is unchanged by the patch.
4) 64-bit small non-PIC code which is not RIP-relative continues to work
as it did before. The reason these immediates are safe is because the
ABI ensures they fit in small mode. This behavior is unchanged.
5) 64-bit small PIC code which is *not* using RIP-relative addressing.
This is the only case changed by the patch, and the primary place you
see it is in TLS, either the win64 section offset TLS or Linux
local-exec TLS model in a PIC compilation. Here the ABI again ensures
that the immediates fit because we are in small mode, and any other
operations required due to the PIC relocation model have been handled
externally to the Wrapper node (extra loads etc are made around the
wrapper node in ISelLowering).
I've tested this as much as I can comparing it with GCC's output, and
everything appears safe. I discussed this with Anton and it made sense
to him at least at face value. That said, if there are issues with PIC
code after this patch, yell and we can revert it.
llvm-svn: 154304
2012-04-09 10:13:06 +08:00
|
|
|
if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
// Under X86-64 non-small code model, GV (and friends) are 64-bits, so
|
|
|
|
// they cannot be folded into immediate fields.
|
|
|
|
// FIXME: This can be improved for kernel and other models?
|
Cleanup and relax a restriction on the matching of global offsets into
x86 addressing modes. This allows PIE-based TLS offsets to fit directly
into an addressing mode immediate offset, which is the last remaining
code quality issue from PR12380. With this patch, that PR is completely
fixed.
To understand why this patch is correct to match these offsets into
addressing mode immediates, break it down by cases:
1) 32-bit is trivially correct, and unmodified here.
2) 64-bit non-small mode is unchanged and never matches.
3) 64-bit small PIC code which is RIP-relative is handled specially in
the match to try to fit RIP into the base register. If it fails, it
now early exits. This behavior is unchanged by the patch.
4) 64-bit small non-PIC code which is not RIP-relative continues to work
as it did before. The reason these immediates are safe is because the
ABI ensures they fit in small mode. This behavior is unchanged.
5) 64-bit small PIC code which is *not* using RIP-relative addressing.
This is the only case changed by the patch, and the primary place you
see it is in TLS, either the win64 section offset TLS or Linux
local-exec TLS model in a PIC compilation. Here the ABI again ensures
that the immediates fit because we are in small mode, and any other
operations required due to the PIC relocation model have been handled
externally to the Wrapper node (extra loads etc are made around the
wrapper node in ISelLowering).
I've tested this as much as I can comparing it with GCC's output, and
everything appears safe. I discussed this with Anton and it made sense
to him at least at face value. That said, if there are issues with PIC
code after this patch, yell and we can revert it.
llvm-svn: 154304
2012-04-09 10:13:06 +08:00
|
|
|
(M == CodeModel::Small || M == CodeModel::Kernel)) {
|
|
|
|
// Base and index reg must be 0 in order to use %rip as base.
|
|
|
|
if (AM.hasBaseOrIndexReg())
|
|
|
|
return true;
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
|
2011-07-14 04:44:23 +08:00
|
|
|
X86ISelAddressMode Backup = AM;
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
AM.GV = G->getGlobal();
|
2009-06-26 13:51:45 +08:00
|
|
|
AM.SymbolFlags = G->getTargetFlags();
|
2015-10-14 00:23:00 +08:00
|
|
|
if (foldOffsetIntoAddress(G->getOffset(), AM)) {
|
2011-07-14 04:44:23 +08:00
|
|
|
AM = Backup;
|
|
|
|
return true;
|
|
|
|
}
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
} else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
|
2011-07-14 04:44:23 +08:00
|
|
|
X86ISelAddressMode Backup = AM;
|
2009-04-13 05:55:03 +08:00
|
|
|
AM.CP = CP->getConstVal();
|
|
|
|
AM.Align = CP->getAlignment();
|
2009-06-26 13:56:49 +08:00
|
|
|
AM.SymbolFlags = CP->getTargetFlags();
|
2015-10-14 00:23:00 +08:00
|
|
|
if (foldOffsetIntoAddress(CP->getOffset(), AM)) {
|
2011-07-14 04:44:23 +08:00
|
|
|
AM = Backup;
|
|
|
|
return true;
|
|
|
|
}
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
|
|
|
|
AM.ES = S->getSymbol();
|
|
|
|
AM.SymbolFlags = S->getTargetFlags();
|
2015-06-23 01:46:53 +08:00
|
|
|
} else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
|
|
|
|
AM.MCSym = S->getMCSymbol();
|
2009-11-01 11:25:03 +08:00
|
|
|
} else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
AM.JT = J->getIndex();
|
|
|
|
AM.SymbolFlags = J->getTargetFlags();
|
2012-09-13 05:43:09 +08:00
|
|
|
} else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
|
|
|
|
X86ISelAddressMode Backup = AM;
|
|
|
|
AM.BlockAddr = BA->getBlockAddress();
|
|
|
|
AM.SymbolFlags = BA->getTargetFlags();
|
2015-10-14 00:23:00 +08:00
|
|
|
if (foldOffsetIntoAddress(BA->getOffset(), AM)) {
|
2012-09-13 05:43:09 +08:00
|
|
|
AM = Backup;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
llvm_unreachable("Unhandled symbol reference node.");
|
2009-08-06 07:01:26 +08:00
|
|
|
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
if (N.getOpcode() == X86ISD::WrapperRIP)
|
2009-08-12 04:47:22 +08:00
|
|
|
AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
|
2009-04-13 05:55:03 +08:00
|
|
|
return false;
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Handle the case when globals fit in our immediate field: This is true for
|
Cleanup and relax a restriction on the matching of global offsets into
x86 addressing modes. This allows PIE-based TLS offsets to fit directly
into an addressing mode immediate offset, which is the last remaining
code quality issue from PR12380. With this patch, that PR is completely
fixed.
To understand why this patch is correct to match these offsets into
addressing mode immediates, break it down by cases:
1) 32-bit is trivially correct, and unmodified here.
2) 64-bit non-small mode is unchanged and never matches.
3) 64-bit small PIC code which is RIP-relative is handled specially in
the match to try to fit RIP into the base register. If it fails, it
now early exits. This behavior is unchanged by the patch.
4) 64-bit small non-PIC code which is not RIP-relative continues to work
as it did before. The reason these immediates are safe is because the
ABI ensures they fit in small mode. This behavior is unchanged.
5) 64-bit small PIC code which is *not* using RIP-relative addressing.
This is the only case changed by the patch, and the primary place you
see it is in TLS, either the win64 section offset TLS or Linux
local-exec TLS model in a PIC compilation. Here the ABI again ensures
that the immediates fit because we are in small mode, and any other
operations required due to the PIC relocation model have been handled
externally to the Wrapper node (extra loads etc are made around the
wrapper node in ISelLowering).
I've tested this as much as I can comparing it with GCC's output, and
everything appears safe. I discussed this with Anton and it made sense
to him at least at face value. That said, if there are issues with PIC
code after this patch, yell and we can revert it.
llvm-svn: 154304
2012-04-09 10:13:06 +08:00
|
|
|
// X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit
|
|
|
|
// mode, this only applies to a non-RIP-relative computation.
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
if (!Subtarget->is64Bit() ||
|
Cleanup and relax a restriction on the matching of global offsets into
x86 addressing modes. This allows PIE-based TLS offsets to fit directly
into an addressing mode immediate offset, which is the last remaining
code quality issue from PR12380. With this patch, that PR is completely
fixed.
To understand why this patch is correct to match these offsets into
addressing mode immediates, break it down by cases:
1) 32-bit is trivially correct, and unmodified here.
2) 64-bit non-small mode is unchanged and never matches.
3) 64-bit small PIC code which is RIP-relative is handled specially in
the match to try to fit RIP into the base register. If it fails, it
now early exits. This behavior is unchanged by the patch.
4) 64-bit small non-PIC code which is not RIP-relative continues to work
as it did before. The reason these immediates are safe is because the
ABI ensures they fit in small mode. This behavior is unchanged.
5) 64-bit small PIC code which is *not* using RIP-relative addressing.
This is the only case changed by the patch, and the primary place you
see it is in TLS, either the win64 section offset TLS or Linux
local-exec TLS model in a PIC compilation. Here the ABI again ensures
that the immediates fit because we are in small mode, and any other
operations required due to the PIC relocation model have been handled
externally to the Wrapper node (extra loads etc are made around the
wrapper node in ISelLowering).
I've tested this as much as I can comparing it with GCC's output, and
everything appears safe. I discussed this with Anton and it made sense
to him at least at face value. That said, if there are issues with PIC
code after this patch, yell and we can revert it.
llvm-svn: 154304
2012-04-09 10:13:06 +08:00
|
|
|
M == CodeModel::Small || M == CodeModel::Kernel) {
|
|
|
|
assert(N.getOpcode() != X86ISD::WrapperRIP &&
|
|
|
|
"RIP-relative addressing already handled");
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
|
|
|
|
AM.GV = G->getGlobal();
|
|
|
|
AM.Disp += G->getOffset();
|
|
|
|
AM.SymbolFlags = G->getTargetFlags();
|
|
|
|
} else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
|
|
|
|
AM.CP = CP->getConstVal();
|
|
|
|
AM.Align = CP->getAlignment();
|
|
|
|
AM.Disp += CP->getOffset();
|
|
|
|
AM.SymbolFlags = CP->getTargetFlags();
|
|
|
|
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
|
|
|
|
AM.ES = S->getSymbol();
|
|
|
|
AM.SymbolFlags = S->getTargetFlags();
|
2015-06-23 01:46:53 +08:00
|
|
|
} else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
|
|
|
|
AM.MCSym = S->getMCSymbol();
|
2009-11-01 11:25:03 +08:00
|
|
|
} else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
AM.JT = J->getIndex();
|
|
|
|
AM.SymbolFlags = J->getTargetFlags();
|
2012-09-13 05:43:09 +08:00
|
|
|
} else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
|
|
|
|
AM.BlockAddr = BA->getBlockAddress();
|
|
|
|
AM.Disp += BA->getOffset();
|
|
|
|
AM.SymbolFlags = BA->getTargetFlags();
|
|
|
|
} else
|
|
|
|
llvm_unreachable("Unhandled symbol reference node.");
|
2009-04-13 05:55:03 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Add the specified node to the specified addressing mode, returning true if
|
|
|
|
/// it cannot be done. This just pattern matches for the addressing mode.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
|
|
|
|
if (matchAddressRecursively(N, AM, 0))
|
2009-07-23 07:26:55 +08:00
|
|
|
return true;
|
|
|
|
|
|
|
|
// Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
|
|
|
|
// a smaller encoding and avoids a scaled-index.
|
|
|
|
if (AM.Scale == 2 &&
|
|
|
|
AM.BaseType == X86ISelAddressMode::RegBase &&
|
2014-04-25 13:30:21 +08:00
|
|
|
AM.Base_Reg.getNode() == nullptr) {
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg = AM.IndexReg;
|
2009-07-23 07:26:55 +08:00
|
|
|
AM.Scale = 1;
|
|
|
|
}
|
|
|
|
|
2009-08-21 02:23:44 +08:00
|
|
|
// Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
|
|
|
|
// because it has a smaller encoding.
|
|
|
|
// TODO: Which other code models can use this?
|
|
|
|
if (TM.getCodeModel() == CodeModel::Small &&
|
|
|
|
Subtarget->is64Bit() &&
|
|
|
|
AM.Scale == 1 &&
|
|
|
|
AM.BaseType == X86ISelAddressMode::RegBase &&
|
2014-04-25 13:30:21 +08:00
|
|
|
AM.Base_Reg.getNode() == nullptr &&
|
|
|
|
AM.IndexReg.getNode() == nullptr &&
|
2009-08-26 01:47:44 +08:00
|
|
|
AM.SymbolFlags == X86II::MO_NO_FLAG &&
|
2009-08-21 02:23:44 +08:00
|
|
|
AM.hasSymbolicDisplacement())
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
|
2009-08-21 02:23:44 +08:00
|
|
|
|
2009-07-23 07:26:55 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-10-22 02:56:06 +08:00
|
|
|
/// Try to fold the two-operand node \p N (an add-like node) into \p AM.
/// First both operands are matched recursively in the order (0, 1), then in
/// the commuted order (1, 0). If neither order folds completely and the base
/// and index slots of \p AM are both still free, the operands are simply
/// placed in base and index with a scale of 1. Returns false on success and
/// true when nothing could be folded; on failure \p AM is restored to its
/// state on entry.
bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Keep an artificial use of the node alive so that we can still find it if
  // the recursive matching below causes it to be CSE'd with another node.
  HandleSDNode Handle(N);

  const X86ISelAddressMode Saved = AM;
  const unsigned NextDepth = Depth + 1;

  // Attempt 1: operand 0 first, then operand 1.
  if (!matchAddressRecursively(N.getOperand(0), AM, NextDepth)) {
    if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
                                 NextDepth))
      return false;
  }
  AM = Saved;

  // Attempt 2: same thing with the operands commuted.
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
                               NextDepth)) {
    if (!matchAddressRecursively(Handle.getValue().getOperand(0), AM,
                                 NextDepth))
      return false;
  }
  AM = Saved;

  // Folding both operands at once did not work. As a fallback, put each
  // operand in a register so that at least the add itself is folded; this
  // needs both the base and the index slot to be unused.
  const bool SlotsInUse = AM.BaseType != X86ISelAddressMode::RegBase ||
                          AM.Base_Reg.getNode() || AM.IndexReg.getNode();
  if (SlotsInUse)
    return true;

  // Re-read the node through the handle: it may have been replaced.
  const SDValue Current = Handle.getValue();
  AM.Base_Reg = Current.getOperand(0);
  AM.IndexReg = Current.getOperand(1);
  AM.Scale = 1;
  return false;
}
|
|
|
|
|
2012-01-11 19:04:36 +08:00
|
|
|
// Insert a node into the DAG at least before the Pos node's position. This
|
|
|
|
// will reposition the node as needed, and will assign it a node ID that is <=
|
|
|
|
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
|
|
|
|
// IDs! The selection DAG must no longer depend on their uniqueness when this
|
|
|
|
// is used.
|
2016-04-06 04:45:04 +08:00
|
|
|
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
|
2012-01-11 19:04:36 +08:00
|
|
|
if (N.getNode()->getNodeId() == -1 ||
|
|
|
|
N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
|
2015-10-20 05:48:29 +08:00
|
|
|
DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode());
|
2012-01-11 19:04:36 +08:00
|
|
|
N.getNode()->setNodeId(Pos.getNode()->getNodeId());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-09-17 01:14:10 +08:00
|
|
|
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
|
|
|
|
// safe. This allows us to convert the shift and and into an h-register
|
|
|
|
// extract and a scaled index. Returns false if the simplification is
|
|
|
|
// performed.
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
|
|
|
|
uint64_t Mask,
|
|
|
|
SDValue Shift, SDValue X,
|
|
|
|
X86ISelAddressMode &AM) {
|
2012-01-11 16:48:20 +08:00
|
|
|
if (Shift.getOpcode() != ISD::SRL ||
|
|
|
|
!isa<ConstantSDNode>(Shift.getOperand(1)) ||
|
|
|
|
!Shift.hasOneUse())
|
|
|
|
return true;
|
|
|
|
|
|
|
|
int ScaleLog = 8 - Shift.getConstantOperandVal(1);
|
|
|
|
if (ScaleLog <= 0 || ScaleLog >= 4 ||
|
|
|
|
Mask != (0xffu << ScaleLog))
|
|
|
|
return true;
|
|
|
|
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT VT = N.getSimpleValueType();
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc DL(N);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
|
|
|
|
SDValue NewMask = DAG.getConstant(0xff, DL, VT);
|
2012-01-11 16:48:20 +08:00
|
|
|
SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
|
|
|
|
SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
|
2012-01-11 16:48:20 +08:00
|
|
|
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
|
|
|
|
|
2012-01-12 09:34:44 +08:00
|
|
|
// Insert the new nodes into the topological ordering. We must do this in
|
|
|
|
// a valid topological ordering as nothing is going to go back and re-sort
|
|
|
|
// these nodes. We continually insert before 'N' in sequence as this is
|
|
|
|
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
|
|
|
|
// hierarchy left to express.
|
2015-10-14 00:23:00 +08:00
|
|
|
insertDAGNode(DAG, N, Eight);
|
|
|
|
insertDAGNode(DAG, N, Srl);
|
|
|
|
insertDAGNode(DAG, N, NewMask);
|
|
|
|
insertDAGNode(DAG, N, And);
|
|
|
|
insertDAGNode(DAG, N, ShlCount);
|
|
|
|
insertDAGNode(DAG, N, Shl);
|
2012-01-11 16:48:20 +08:00
|
|
|
DAG.ReplaceAllUsesWith(N, Shl);
|
|
|
|
AM.IndexReg = And;
|
|
|
|
AM.Scale = (1 << ScaleLog);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-01-11 17:35:00 +08:00
|
|
|
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
|
|
|
|
// allows us to fold the shift into this addressing mode. Returns false if the
|
|
|
|
// transform succeeded.
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
|
|
|
|
uint64_t Mask,
|
|
|
|
SDValue Shift, SDValue X,
|
|
|
|
X86ISelAddressMode &AM) {
|
2012-01-11 17:35:00 +08:00
|
|
|
if (Shift.getOpcode() != ISD::SHL ||
|
|
|
|
!isa<ConstantSDNode>(Shift.getOperand(1)))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Not likely to be profitable if either the AND or SHIFT node has more
|
|
|
|
// than one use (unless all uses are for address computation). Besides,
|
|
|
|
// isel mechanism requires their node ids to be reused.
|
|
|
|
if (!N.hasOneUse() || !Shift.hasOneUse())
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Verify that the shift amount is something we can fold.
|
|
|
|
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
|
|
|
|
if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
|
|
|
|
return true;
|
|
|
|
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT VT = N.getSimpleValueType();
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc DL(N);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
|
2012-01-11 17:35:00 +08:00
|
|
|
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
|
|
|
|
SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
|
|
|
|
|
2012-01-12 09:34:44 +08:00
|
|
|
// Insert the new nodes into the topological ordering. We must do this in
|
|
|
|
// a valid topological ordering as nothing is going to go back and re-sort
|
|
|
|
// these nodes. We continually insert before 'N' in sequence as this is
|
|
|
|
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
|
|
|
|
// hierarchy left to express.
|
2015-10-14 00:23:00 +08:00
|
|
|
insertDAGNode(DAG, N, NewMask);
|
|
|
|
insertDAGNode(DAG, N, NewAnd);
|
|
|
|
insertDAGNode(DAG, N, NewShift);
|
2012-01-11 17:35:00 +08:00
|
|
|
DAG.ReplaceAllUsesWith(N, NewShift);
|
|
|
|
|
|
|
|
AM.Scale = 1 << ShiftAmt;
|
|
|
|
AM.IndexReg = NewAnd;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-01-11 16:41:08 +08:00
|
|
|
// Implement some heroics to detect shifts of masked values where the mask can
|
|
|
|
// be replaced by extending the shift and undoing that in the addressing mode
|
|
|
|
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
|
|
|
|
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
|
|
|
|
// the addressing mode. This results in code such as:
|
|
|
|
//
|
|
|
|
// int f(short *y, int *lookup_table) {
|
|
|
|
// ...
|
|
|
|
// return *y + lookup_table[*y >> 11];
|
|
|
|
// }
|
|
|
|
//
|
|
|
|
// Turning into:
|
|
|
|
// movzwl (%rdi), %eax
|
|
|
|
// movl %eax, %ecx
|
|
|
|
// shrl $11, %ecx
|
|
|
|
// addl (%rsi,%rcx,4), %eax
|
|
|
|
//
|
|
|
|
// Instead of:
|
|
|
|
// movzwl (%rdi), %eax
|
|
|
|
// movl %eax, %ecx
|
|
|
|
// shrl $9, %ecx
|
|
|
|
// andl $124, %rcx
|
|
|
|
// addl (%rsi,%rcx), %eax
|
|
|
|
//
|
2012-01-11 17:35:02 +08:00
|
|
|
// Note that this function assumes the mask is provided as a mask *after* the
|
|
|
|
// value is shifted. The input chain may or may not match that, but computing
|
|
|
|
// such a mask is trivial.
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
|
|
|
|
uint64_t Mask,
|
|
|
|
SDValue Shift, SDValue X,
|
|
|
|
X86ISelAddressMode &AM) {
|
2012-01-11 17:35:02 +08:00
|
|
|
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
|
|
|
|
!isa<ConstantSDNode>(Shift.getOperand(1)))
|
2012-01-11 16:41:08 +08:00
|
|
|
return true;
|
|
|
|
|
|
|
|
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
|
2013-05-25 06:23:49 +08:00
|
|
|
unsigned MaskLZ = countLeadingZeros(Mask);
|
|
|
|
unsigned MaskTZ = countTrailingZeros(Mask);
|
2012-01-11 16:41:08 +08:00
|
|
|
|
|
|
|
// The amount of shift we're trying to fit into the addressing mode is taken
|
2012-01-11 17:35:02 +08:00
|
|
|
// from the trailing zeros of the mask.
|
|
|
|
unsigned AMShiftAmt = MaskTZ;
|
2012-01-11 16:41:08 +08:00
|
|
|
|
|
|
|
// There is nothing we can do here unless the mask is removing some bits.
|
|
|
|
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
|
|
|
|
if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
|
|
|
|
|
|
|
|
// We also need to ensure that mask is a continuous run of bits.
|
2015-02-12 23:35:40 +08:00
|
|
|
if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
|
2012-01-11 16:41:08 +08:00
|
|
|
|
|
|
|
// Scale the leading zero count down based on the actual size of the value.
|
2012-01-11 17:35:02 +08:00
|
|
|
// Also scale it down based on the size of the shift.
|
2013-08-15 13:57:07 +08:00
|
|
|
MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
|
2012-01-11 16:41:08 +08:00
|
|
|
|
|
|
|
// The final check is to ensure that any masked out high bits of X are
|
|
|
|
// already known to be zero. Otherwise, the mask has a semantic impact
|
|
|
|
// other than masking out a couple of low bits. Unfortunately, because of
|
|
|
|
// the mask, zero extensions will be removed from operands in some cases.
|
|
|
|
// This code works extra hard to look through extensions because we can
|
|
|
|
// replace them with zero extensions cheaply if necessary.
|
|
|
|
bool ReplacingAnyExtend = false;
|
|
|
|
if (X.getOpcode() == ISD::ANY_EXTEND) {
|
2013-08-15 13:57:07 +08:00
|
|
|
unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
|
|
|
|
X.getOperand(0).getSimpleValueType().getSizeInBits();
|
2012-01-11 16:41:08 +08:00
|
|
|
// Assume that we'll replace the any-extend with a zero-extend, and
|
|
|
|
// narrow the search to the extended value.
|
|
|
|
X = X.getOperand(0);
|
|
|
|
MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
|
|
|
|
ReplacingAnyExtend = true;
|
|
|
|
}
|
2013-08-15 13:57:07 +08:00
|
|
|
APInt MaskedHighBits =
|
|
|
|
APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
|
2012-01-11 16:41:08 +08:00
|
|
|
APInt KnownZero, KnownOne;
|
2014-05-15 05:14:37 +08:00
|
|
|
DAG.computeKnownBits(X, KnownZero, KnownOne);
|
2012-01-11 16:41:08 +08:00
|
|
|
if (MaskedHighBits != KnownZero) return true;
|
|
|
|
|
|
|
|
// We've identified a pattern that can be transformed into a single shift
|
|
|
|
// and an addressing mode. Make it so.
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT VT = N.getSimpleValueType();
|
2012-01-11 16:41:08 +08:00
|
|
|
if (ReplacingAnyExtend) {
|
|
|
|
assert(X.getValueType() != VT);
|
|
|
|
// We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
|
2013-05-25 10:42:55 +08:00
|
|
|
SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
|
2015-10-14 00:23:00 +08:00
|
|
|
insertDAGNode(DAG, N, NewX);
|
2012-01-11 16:41:08 +08:00
|
|
|
X = NewX;
|
|
|
|
}
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc DL(N);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
|
2012-01-11 16:41:08 +08:00
|
|
|
SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
|
2012-01-11 16:41:08 +08:00
|
|
|
SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
|
2012-01-12 09:34:44 +08:00
|
|
|
|
|
|
|
// Insert the new nodes into the topological ordering. We must do this in
|
|
|
|
// a valid topological ordering as nothing is going to go back and re-sort
|
|
|
|
// these nodes. We continually insert before 'N' in sequence as this is
|
|
|
|
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
|
|
|
|
// hierarchy left to express.
|
2015-10-14 00:23:00 +08:00
|
|
|
insertDAGNode(DAG, N, NewSRLAmt);
|
|
|
|
insertDAGNode(DAG, N, NewSRL);
|
|
|
|
insertDAGNode(DAG, N, NewSHLAmt);
|
|
|
|
insertDAGNode(DAG, N, NewSHL);
|
2012-01-11 16:41:08 +08:00
|
|
|
DAG.ReplaceAllUsesWith(N, NewSHL);
|
|
|
|
|
|
|
|
AM.Scale = 1 << AMShiftAmt;
|
|
|
|
AM.IndexReg = NewSRL;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
|
2009-07-23 07:26:55 +08:00
|
|
|
unsigned Depth) {
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(N);
|
2009-08-08 05:33:25 +08:00
|
|
|
DEBUG({
|
2010-01-05 09:29:08 +08:00
|
|
|
dbgs() << "MatchAddress: ";
|
2009-08-08 05:33:25 +08:00
|
|
|
AM.dump();
|
|
|
|
});
|
2007-08-14 04:03:06 +08:00
|
|
|
// Limit recursion.
|
|
|
|
if (Depth > 5)
|
2015-10-14 00:23:00 +08:00
|
|
|
return matchAddressBase(N, AM);
|
2009-08-06 07:01:26 +08:00
|
|
|
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
// If this is already a %rip relative address, we can only merge immediates
|
|
|
|
// into it. Instead of handling this in every case, we handle it here.
|
2006-09-08 14:48:29 +08:00
|
|
|
// RIP relative addressing: %rip + 32-bit displacement!
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
if (AM.isRIPRelative()) {
|
|
|
|
// FIXME: JumpTable and ExternalSymbol address currently don't like
|
|
|
|
// displacements. It isn't very important, but this should be fixed for
|
|
|
|
// consistency.
|
2015-06-23 01:46:53 +08:00
|
|
|
if (!(AM.ES || AM.MCSym) && AM.JT != -1)
|
|
|
|
return true;
|
2009-08-06 07:01:26 +08:00
|
|
|
|
2011-07-14 04:44:23 +08:00
|
|
|
if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
|
2006-09-08 14:48:29 +08:00
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2005-11-19 10:11:08 +08:00
|
|
|
switch (N.getOpcode()) {
|
|
|
|
default: break;
|
Rename llvm.frameescape and llvm.framerecover to localescape and localrecover
Summary:
Initially, these intrinsics seemed like part of a family of "frame"
related intrinsics, but now I think that's more confusing than helpful.
Initially, the LangRef specified that this would create a new kind of
allocation that would be allocated at a fixed offset from the frame
pointer (EBP/RBP). We ended up dropping that design, and leaving the
stack frame layout alone.
These intrinsics are really about sharing local stack allocations, not
frame pointers. I intend to go further and add an `llvm.localaddress()`
intrinsic that returns whatever register (EBP, ESI, ESP, RBX) is being
used to address locals, which should not be confused with the frame
pointer.
Naming suggestions at this point are welcome, I'm happy to re-run sed.
Reviewers: majnemer, nicholas
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D11011
llvm-svn: 241633
2015-07-08 06:25:32 +08:00
|
|
|
case ISD::LOCAL_RECOVER: {
|
2015-05-05 07:22:36 +08:00
|
|
|
if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
|
2015-06-23 01:46:53 +08:00
|
|
|
if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
|
|
|
|
// Use the symbol and don't prefix it.
|
|
|
|
AM.MCSym = ESNode->getMCSymbol();
|
|
|
|
return false;
|
|
|
|
}
|
2015-03-06 02:50:12 +08:00
|
|
|
break;
|
|
|
|
}
|
2006-09-08 14:48:29 +08:00
|
|
|
case ISD::Constant: {
|
2008-11-11 23:52:29 +08:00
|
|
|
uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldOffsetIntoAddress(Val, AM))
|
2006-09-08 14:48:29 +08:00
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
}
|
2005-12-08 10:01:35 +08:00
|
|
|
|
2009-04-13 05:55:03 +08:00
|
|
|
case X86ISD::Wrapper:
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
case X86ISD::WrapperRIP:
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!matchWrapper(N, AM))
|
2009-04-13 05:55:03 +08:00
|
|
|
return false;
|
2005-12-08 10:01:35 +08:00
|
|
|
break;
|
|
|
|
|
2009-04-09 05:14:34 +08:00
|
|
|
case ISD::LOAD:
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
|
2009-04-09 05:14:34 +08:00
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
|
2006-02-25 18:09:08 +08:00
|
|
|
case ISD::FrameIndex:
|
2011-07-14 05:29:53 +08:00
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase &&
|
2014-04-25 13:30:21 +08:00
|
|
|
AM.Base_Reg.getNode() == nullptr &&
|
2011-07-14 05:29:53 +08:00
|
|
|
(!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
|
2006-02-25 18:09:08 +08:00
|
|
|
AM.BaseType = X86ISelAddressMode::FrameIndexBase;
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
|
2005-12-17 17:13:43 +08:00
|
|
|
return false;
|
2005-11-19 10:11:08 +08:00
|
|
|
}
|
|
|
|
break;
|
2005-12-08 10:01:35 +08:00
|
|
|
|
2005-11-19 10:11:08 +08:00
|
|
|
case ISD::SHL:
|
2014-04-25 13:30:21 +08:00
|
|
|
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
|
2007-12-08 15:22:58 +08:00
|
|
|
break;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2008-08-31 23:37:04 +08:00
|
|
|
if (ConstantSDNode
|
|
|
|
*CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
|
2008-09-13 00:56:44 +08:00
|
|
|
unsigned Val = CN->getZExtValue();
|
2009-07-23 07:26:55 +08:00
|
|
|
// Note that we handle x<<1 as (,x,2) rather than (x,x) here so
|
|
|
|
// that the base operand remains free for further matching. If
|
|
|
|
// the base doesn't end up getting used, a post-processing step
|
|
|
|
// in MatchAddress turns (,x,2) into (x,x), which is cheaper.
|
2007-12-08 15:22:58 +08:00
|
|
|
if (Val == 1 || Val == 2 || Val == 3) {
|
|
|
|
AM.Scale = 1 << Val;
|
2008-08-29 05:40:38 +08:00
|
|
|
SDValue ShVal = N.getNode()->getOperand(0);
|
2007-12-08 15:22:58 +08:00
|
|
|
|
|
|
|
// Okay, we know that we have a scale by now. However, if the scaled
|
|
|
|
// value is an add of something and a constant, we can fold the
|
|
|
|
// constant into the disp field here.
|
2011-02-14 06:25:43 +08:00
|
|
|
if (CurDAG->isBaseWithConstantOffset(ShVal)) {
|
2008-08-29 05:40:38 +08:00
|
|
|
AM.IndexReg = ShVal.getNode()->getOperand(0);
|
2007-12-08 15:22:58 +08:00
|
|
|
ConstantSDNode *AddVal =
|
2008-08-29 05:40:38 +08:00
|
|
|
cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
|
2012-08-25 07:29:28 +08:00
|
|
|
uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldOffsetIntoAddress(Disp, AM))
|
2011-07-14 04:44:23 +08:00
|
|
|
return false;
|
2005-11-19 10:11:08 +08:00
|
|
|
}
|
2011-07-14 04:44:23 +08:00
|
|
|
|
|
|
|
AM.IndexReg = ShVal;
|
2007-12-08 15:22:58 +08:00
|
|
|
return false;
|
2005-11-19 10:11:08 +08:00
|
|
|
}
|
2007-12-08 15:22:58 +08:00
|
|
|
}
|
2013-01-05 07:01:26 +08:00
|
|
|
break;
|
2005-12-08 10:01:35 +08:00
|
|
|
|
2012-01-11 17:35:02 +08:00
|
|
|
case ISD::SRL: {
|
|
|
|
// Scale must not be used already.
|
2014-04-25 13:30:21 +08:00
|
|
|
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
|
2012-01-11 17:35:02 +08:00
|
|
|
|
|
|
|
SDValue And = N.getOperand(0);
|
|
|
|
if (And.getOpcode() != ISD::AND) break;
|
|
|
|
SDValue X = And.getOperand(0);
|
|
|
|
|
|
|
|
// We only handle up to 64-bit values here as those are what matter for
|
|
|
|
// addressing mode optimizations.
|
2013-08-15 13:57:07 +08:00
|
|
|
if (X.getSimpleValueType().getSizeInBits() > 64) break;
|
2012-01-11 17:35:02 +08:00
|
|
|
|
|
|
|
// The mask used for the transform is expected to be post-shift, but we
|
|
|
|
// found the shift first so just apply the shift to the mask before passing
|
|
|
|
// it down.
|
|
|
|
if (!isa<ConstantSDNode>(N.getOperand(1)) ||
|
|
|
|
!isa<ConstantSDNode>(And.getOperand(1)))
|
|
|
|
break;
|
|
|
|
uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
|
|
|
|
|
2012-01-11 16:41:08 +08:00
|
|
|
// Try to fold the mask and shift into the scale, and return false if we
|
|
|
|
// succeed.
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
|
2012-01-11 16:41:08 +08:00
|
|
|
return false;
|
|
|
|
break;
|
2012-01-11 17:35:02 +08:00
|
|
|
}
|
2012-01-11 16:41:08 +08:00
|
|
|
|
2007-10-23 04:22:24 +08:00
|
|
|
case ISD::SMUL_LOHI:
|
|
|
|
case ISD::UMUL_LOHI:
|
|
|
|
// A mul_lohi where we need the low part can be folded as a plain multiply.
|
2008-08-27 06:36:50 +08:00
|
|
|
if (N.getResNo() != 0) break;
|
2016-08-18 04:30:52 +08:00
|
|
|
LLVM_FALLTHROUGH;
|
2005-11-19 10:11:08 +08:00
|
|
|
case ISD::MUL:
|
2009-03-31 05:36:47 +08:00
|
|
|
case X86ISD::MUL_IMM:
|
2005-11-19 10:11:08 +08:00
|
|
|
// X*[3,5,9] -> X+X*[2,4,8]
|
Eliminate the ISel priority queue, which used the topological order for a
priority function. Instead, just iterate over the AllNodes list, which is
already in topological order. This eliminates a fair amount of bookkeeping,
and speeds up the isel phase by about 15% on many testcases.
The impact on most targets is that AddToISelQueue calls can be simply removed.
In the x86 target, there are two additional notable changes.
The rule-bending AND+SHIFT optimization in MatchAddress that creates new
pre-isel nodes during isel is now a little more verbose, but more robust.
Instead of either creating an invalid DAG or creating an invalid topological
sort, as it has historically done, it can now just insert the new nodes into
the node list at a position where they will be consistent with the topological
ordering.
Also, the address-matching code has logic that checked to see if a node was
"already selected". However, when a node is selected, it has all its uses
taken away via ReplaceAllUsesWith or equivalent, so it won't receive any
further visits from MatchAddress. This code is now removed.
llvm-svn: 58748
2008-11-05 12:14:16 +08:00
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase &&
|
2014-04-25 13:30:21 +08:00
|
|
|
AM.Base_Reg.getNode() == nullptr &&
|
|
|
|
AM.IndexReg.getNode() == nullptr) {
|
2008-08-31 23:37:04 +08:00
|
|
|
if (ConstantSDNode
|
|
|
|
*CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
|
2008-09-13 00:56:44 +08:00
|
|
|
if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
|
|
|
|
CN->getZExtValue() == 9) {
|
|
|
|
AM.Scale = unsigned(CN->getZExtValue())-1;
|
2005-11-19 10:11:08 +08:00
|
|
|
|
2008-08-29 05:40:38 +08:00
|
|
|
SDValue MulVal = N.getNode()->getOperand(0);
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue Reg;
|
2005-11-19 10:11:08 +08:00
|
|
|
|
|
|
|
// Okay, we know that we have a scale by now. However, if the scaled
|
|
|
|
// value is an add of something and a constant, we can fold the
|
|
|
|
// constant into the disp field here.
|
2008-08-29 05:40:38 +08:00
|
|
|
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
|
|
|
|
isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
|
|
|
|
Reg = MulVal.getNode()->getOperand(0);
|
2005-11-19 10:11:08 +08:00
|
|
|
ConstantSDNode *AddVal =
|
2008-08-29 05:40:38 +08:00
|
|
|
cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
|
2011-07-14 04:44:23 +08:00
|
|
|
uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
|
2015-10-14 00:23:00 +08:00
|
|
|
if (foldOffsetIntoAddress(Disp, AM))
|
2008-08-29 05:40:38 +08:00
|
|
|
Reg = N.getNode()->getOperand(0);
|
2005-11-19 10:11:08 +08:00
|
|
|
} else {
|
2008-08-29 05:40:38 +08:00
|
|
|
Reg = N.getNode()->getOperand(0);
|
2005-11-19 10:11:08 +08:00
|
|
|
}
|
|
|
|
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.IndexReg = AM.Base_Reg = Reg;
|
2005-11-19 10:11:08 +08:00
|
|
|
return false;
|
|
|
|
}
|
2007-02-05 04:18:17 +08:00
|
|
|
}
|
2005-11-19 10:11:08 +08:00
|
|
|
break;
|
|
|
|
|
2009-05-12 02:02:53 +08:00
|
|
|
case ISD::SUB: {
|
|
|
|
// Given A-B, if A can be completely folded into the address and
|
|
|
|
// the index field with the index field unused, use -B as the index.
|
|
|
|
// This is a win if A has multiple parts that can be folded into
|
|
|
|
// the address. Also, this saves a mov if the base register has
|
|
|
|
// other uses, since it avoids a two-address sub instruction, however
|
|
|
|
// it costs an additional mov if the index register has other uses.
|
|
|
|
|
2010-06-18 09:24:29 +08:00
|
|
|
// Add an artificial use to this node so that we can keep track of
|
|
|
|
// it if it gets CSE'd with a different node.
|
|
|
|
HandleSDNode Handle(N);
|
|
|
|
|
2009-05-12 02:02:53 +08:00
|
|
|
// Test if the LHS of the sub can be folded.
|
|
|
|
X86ISelAddressMode Backup = AM;
|
2015-10-14 00:23:00 +08:00
|
|
|
if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
|
2009-05-12 02:02:53 +08:00
|
|
|
AM = Backup;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
// Test if the index field is free for use.
|
Reimplement rip-relative addressing in the X86-64 backend. The new
implementation primarily differs from the former in that the asmprinter
doesn't make a zillion decisions about whether or not something will be
RIP relative or not. Instead, those decisions are made by isel lowering
and propagated through to the asm printer. To achieve this, we:
1. Represent RIP relative addresses by setting the base of the X86 addr
mode to X86::RIP.
2. When ISel Lowering decides that it is safe to use RIP, it lowers to
X86ISD::WrapperRIP. When it is unsafe to use RIP, it lowers to
X86ISD::Wrapper as before.
3. This removes isRIPRel from X86ISelAddressMode, representing it with
a basereg of RIP instead.
4. The addressing mode matching logic in isel is greatly simplified.
5. The asmprinter is greatly simplified, notably the "NotRIPRel" predicate
passed through various printoperand routines is gone now.
6. The various symbol printing routines in asmprinter now no longer infer
when to emit (%rip), they just print the symbol.
I think this is a big improvement over the previous situation. It does have
two small caveats though: 1. I implemented a horrible "no-rip" modifier for
the inline asm "P" constraint modifier. This is a short term hack, there is
a much better, but more involved, solution. 2. I had to xfail an
-aggressive-remat testcase because it isn't handling the use of RIP in the
constant-pool reading instruction. This specific test is easy to fix without
-aggressive-remat, which I intend to do next.
llvm-svn: 74372
2009-06-27 12:16:01 +08:00
|
|
|
if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
|
2009-05-12 02:02:53 +08:00
|
|
|
AM = Backup;
|
|
|
|
break;
|
|
|
|
}
|
2010-03-18 07:58:35 +08:00
|
|
|
|
2009-05-12 02:02:53 +08:00
|
|
|
int Cost = 0;
|
2010-06-18 09:24:29 +08:00
|
|
|
SDValue RHS = Handle.getValue().getNode()->getOperand(1);
|
2009-05-12 02:02:53 +08:00
|
|
|
// If the RHS involves a register with multiple uses, this
|
|
|
|
// transformation incurs an extra mov, due to the neg instruction
|
|
|
|
// clobbering its operand.
|
|
|
|
if (!RHS.getNode()->hasOneUse() ||
|
|
|
|
RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
|
|
|
|
RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
|
|
|
|
RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
|
|
|
|
(RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
|
2009-08-12 04:47:22 +08:00
|
|
|
RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
|
2009-05-12 02:02:53 +08:00
|
|
|
++Cost;
|
|
|
|
// If the base is a register with multiple uses, this
|
|
|
|
// transformation may save a mov.
|
|
|
|
if ((AM.BaseType == X86ISelAddressMode::RegBase &&
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg.getNode() &&
|
|
|
|
!AM.Base_Reg.getNode()->hasOneUse()) ||
|
2009-05-12 02:02:53 +08:00
|
|
|
AM.BaseType == X86ISelAddressMode::FrameIndexBase)
|
|
|
|
--Cost;
|
|
|
|
// If the folded LHS was interesting, this transformation saves
|
|
|
|
// address arithmetic.
|
|
|
|
if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
|
|
|
|
((AM.Disp != 0) && (Backup.Disp == 0)) +
|
|
|
|
(AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
|
|
|
|
--Cost;
|
|
|
|
// If it doesn't look like it may be an overall win, don't do it.
|
|
|
|
if (Cost >= 0) {
|
|
|
|
AM = Backup;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ok, the transformation is legal and appears profitable. Go for it.
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType());
|
2009-05-12 02:02:53 +08:00
|
|
|
SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
|
|
|
|
AM.IndexReg = Neg;
|
|
|
|
AM.Scale = 1;
|
|
|
|
|
|
|
|
// Insert the new nodes into the topological ordering.
|
2015-10-14 00:23:00 +08:00
|
|
|
insertDAGNode(*CurDAG, N, Zero);
|
|
|
|
insertDAGNode(*CurDAG, N, Neg);
|
2009-05-12 02:02:53 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-10-22 02:56:06 +08:00
|
|
|
case ISD::ADD:
|
|
|
|
if (!matchAdd(N, AM, Depth))
|
2010-06-18 09:24:29 +08:00
|
|
|
return false;
|
2005-11-19 10:11:08 +08:00
|
|
|
break;
|
2006-05-30 14:59:36 +08:00
|
|
|
|
2015-11-10 07:31:38 +08:00
|
|
|
case ISD::OR:
|
[x86] try harder to match bitwise 'or' into an LEA
The motivation for this patch starts with the epic fail example in PR18007:
https://llvm.org/bugs/show_bug.cgi?id=18007
...unfortunately, this patch makes no difference for that case, but it solves some
simpler cases. We'll get there some day. :)
The current 'or' matching code was using computeKnownBits() via
isBaseWithConstantOffset() -> MaskedValueIsZero(), but that's an unnecessarily limited use.
We can do more by copying the logic in ValueTracking's haveNoCommonBitsSet(), so we can
treat the 'or' as if it was an 'add'.
There's a TODO comment here because we should lift the bit-checking logic into a helper
function, so it's not duplicated in DAGCombiner.
An example of the better LEA matching:
leal (%rdi,%rdi), %eax
andl $1, %esi
orl %esi, %eax
Becomes:
andl $1, %esi
leal (%rsi,%rdi,2), %eax
Differential Revision: http://reviews.llvm.org/D13956
llvm-svn: 252515
2015-11-10 05:16:49 +08:00
|
|
|
// We want to look through a transform in InstCombine and DAGCombiner that
|
|
|
|
// turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
|
2015-11-10 07:31:38 +08:00
|
|
|
// Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
|
[x86] try harder to match bitwise 'or' into an LEA
The motivation for this patch starts with the epic fail example in PR18007:
https://llvm.org/bugs/show_bug.cgi?id=18007
...unfortunately, this patch makes no difference for that case, but it solves some
simpler cases. We'll get there some day. :)
The current 'or' matching code was using computeKnownBits() via
isBaseWithConstantOffset() -> MaskedValueIsZero(), but that's an unnecessarily limited use.
We can do more by copying the logic in ValueTracking's haveNoCommonBitsSet(), so we can
treat the 'or' as if it was an 'add'.
There's a TODO comment here because we should lift the bit-checking logic into a helper
function, so it's not duplicated in DAGCombiner.
An example of the better LEA matching:
leal (%rdi,%rdi), %eax
andl $1, %esi
orl %esi, %eax
Becomes:
andl $1, %esi
leal (%rsi,%rdi,2), %eax
Differential Revision: http://reviews.llvm.org/D13956
llvm-svn: 252515
2015-11-10 05:16:49 +08:00
|
|
|
// An 'lea' can then be used to match the shift (multiply) and add:
|
|
|
|
// and $1, %esi
|
|
|
|
// lea (%rsi, %rdi, 8), %rax
|
2015-11-10 07:31:38 +08:00
|
|
|
if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
|
|
|
|
!matchAdd(N, AM, Depth))
|
|
|
|
return false;
|
2006-05-30 14:59:36 +08:00
|
|
|
break;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2007-12-13 08:43:27 +08:00
|
|
|
case ISD::AND: {
|
Implement x86 h-register extract support.
- Add patterns for h-register extract, which avoids a shift and mask,
and in some cases a temporary register.
- Add address-mode matching for turning (X>>(8-n))&(255<<n), where
n is a valid address-mode scale value, into an h-register extract
and a scaled-offset address.
- Replace X86's MOV32to32_ and related instructions with the new
target-independent COPY_TO_SUBREG instruction.
On x86-64 there are complicated constraints on h registers, and
CodeGen doesn't currently provide a high-level way to express all of them,
so they are handled with a bunch of special code. This code currently only
supports extracts where the result is used by a zero-extend or a store,
though these are fairly common.
These transformations are not always beneficial; since there are only
4 h registers, they sometimes require extra move instructions, and
this sometimes increases register pressure because it can force out
values that would otherwise be in one of those registers. However,
this appears to be relatively uncommon.
llvm-svn: 68962
2009-04-14 00:09:41 +08:00
|
|
|
// Perform some heroic transforms on an and of a constant-count shift
|
|
|
|
// with a constant to enable use of the scaled offset field.
|
|
|
|
|
2007-12-13 08:43:27 +08:00
|
|
|
// Scale must not be used already.
|
2014-04-25 13:30:21 +08:00
|
|
|
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
|
Fix a x86-64 codegen deficiency. Allow gv + offset when using rip addressing mode.
Before:
_main:
subq $8, %rsp
leaq _X(%rip), %rax
movsd 8(%rax), %xmm1
movss _X(%rip), %xmm0
call _t
xorl %ecx, %ecx
movl %ecx, %eax
addq $8, %rsp
ret
Now:
_main:
subq $8, %rsp
movsd _X+8(%rip), %xmm1
movss _X(%rip), %xmm0
call _t
xorl %ecx, %ecx
movl %ecx, %eax
addq $8, %rsp
ret
Notice there is another idiotic codegen issue that needs to be fixed asap:
xorl %ecx, %ecx
movl %ecx, %eax
llvm-svn: 46850
2008-02-07 16:53:49 +08:00
|
|
|
|
2012-01-11 17:35:00 +08:00
|
|
|
SDValue Shift = N.getOperand(0);
|
|
|
|
if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
|
Implement x86 h-register extract support.
- Add patterns for h-register extract, which avoids a shift and mask,
and in some cases a temporary register.
- Add address-mode matching for turning (X>>(8-n))&(255<<n), where
n is a valid address-mode scale value, into an h-register extract
and a scaled-offset address.
- Replace X86's MOV32to32_ and related instructions with the new
target-independent COPY_TO_SUBREG instruction.
On x86-64 there are complicated constraints on h registers, and
CodeGen doesn't currently provide a high-level way to express all of them,
so they are handled with a bunch of special code. This code currently only
supports extracts where the result is used by a zero-extend or a store,
though these are fairly common.
These transformations are not always beneficial; since there are only
4 h registers, they sometimes require extra move instructions, and
this sometimes increases register pressure because it can force out
values that would otherwise be in one of those registers. However,
this appears to be relatively uncommon.
llvm-svn: 68962
2009-04-14 00:09:41 +08:00
|
|
|
SDValue X = Shift.getOperand(0);
|
2012-01-11 17:35:00 +08:00
|
|
|
|
|
|
|
// We only handle up to 64-bit values here as those are what matter for
|
|
|
|
// addressing mode optimizations.
|
2013-08-15 13:57:07 +08:00
|
|
|
if (X.getSimpleValueType().getSizeInBits() > 64) break;
|
2012-01-11 17:35:00 +08:00
|
|
|
|
2012-01-11 17:35:04 +08:00
|
|
|
if (!isa<ConstantSDNode>(N.getOperand(1)))
|
|
|
|
break;
|
|
|
|
uint64_t Mask = N.getConstantOperandVal(1);
|
2007-12-13 08:43:27 +08:00
|
|
|
|
2012-01-11 16:48:20 +08:00
|
|
|
// Try to fold the mask and shift into an extract and scale.
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
|
2012-01-11 16:48:20 +08:00
|
|
|
return false;
|
Implement x86 h-register extract support.
- Add patterns for h-register extract, which avoids a shift and mask,
and in some cases a temporary register.
- Add address-mode matching for turning (X>>(8-n))&(255<<n), where
n is a valid address-mode scale value, into an h-register extract
and a scaled-offset address.
- Replace X86's MOV32to32_ and related instructions with the new
target-independent COPY_TO_SUBREG instruction.
On x86-64 there are complicated constraints on h registers, and
CodeGen doesn't currently provide a high-level way to express all of them,
so they are handled with a bunch of special code. This code currently only
supports extracts where the result is used by a zero-extend or a store,
though these are fairly common.
These transformations are not always beneficial; since there are only
4 h registers, they sometimes require extra move instructions, and
this sometimes increases register pressure because it can force out
values that would otherwise be in one of those registers. However,
this appears to be relatively uncommon.
llvm-svn: 68962
2009-04-14 00:09:41 +08:00
|
|
|
|
2012-01-11 16:48:20 +08:00
|
|
|
// Try to fold the mask and shift directly into the scale.
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
|
2012-01-11 16:41:08 +08:00
|
|
|
return false;
|
|
|
|
|
2012-01-11 17:35:00 +08:00
|
|
|
// Try to swap the mask and shift to place shifts which can be done as
|
|
|
|
// a scale on the outside of the mask.
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
|
2012-01-11 17:35:00 +08:00
|
|
|
return false;
|
|
|
|
break;
|
2007-12-13 08:43:27 +08:00
|
|
|
}
|
2006-05-30 14:59:36 +08:00
|
|
|
}
|
2005-11-19 10:11:08 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
return matchAddressBase(N, AM);
|
2007-08-14 04:03:06 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Helper for MatchAddress. Add the specified node to the
|
2007-08-14 04:03:06 +08:00
|
|
|
/// specified addressing mode without any further recursion.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
|
2005-11-19 10:11:08 +08:00
|
|
|
// Is the base register already occupied?
|
2010-04-30 07:30:41 +08:00
|
|
|
if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
|
2005-11-19 10:11:08 +08:00
|
|
|
// If so, check to see if the scale index register is set.
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!AM.IndexReg.getNode()) {
|
2005-11-19 10:11:08 +08:00
|
|
|
AM.IndexReg = N;
|
|
|
|
AM.Scale = 1;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Otherwise, we cannot select it.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Default, generate it as a register.
|
|
|
|
AM.BaseType = X86ISelAddressMode::RegBase;
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg = N;
|
2005-11-19 10:11:08 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
|
2015-04-30 16:38:48 +08:00
|
|
|
SDValue &Scale, SDValue &Index,
|
|
|
|
SDValue &Disp, SDValue &Segment) {
|
2016-04-06 04:45:04 +08:00
|
|
|
|
2015-04-30 16:38:48 +08:00
|
|
|
MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent);
|
|
|
|
if (!Mgs)
|
|
|
|
return false;
|
|
|
|
X86ISelAddressMode AM;
|
|
|
|
unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace();
|
2016-05-04 04:16:08 +08:00
|
|
|
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
|
2015-04-30 16:38:48 +08:00
|
|
|
if (AddrSpace == 256)
|
|
|
|
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
|
|
|
|
if (AddrSpace == 257)
|
|
|
|
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
|
2016-05-04 04:16:08 +08:00
|
|
|
if (AddrSpace == 258)
|
|
|
|
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
|
2015-04-30 16:38:48 +08:00
|
|
|
|
|
|
|
SDLoc DL(N);
|
|
|
|
Base = Mgs->getBasePtr();
|
|
|
|
Index = Mgs->getIndex();
|
2016-09-14 23:43:44 +08:00
|
|
|
unsigned ScalarSize = Mgs->getValue().getScalarValueSizeInBits();
|
2015-04-30 16:38:48 +08:00
|
|
|
Scale = getI8Imm(ScalarSize/8, DL);
|
|
|
|
|
|
|
|
// If Base is 0, the whole address is in index and the Scale is 1
|
2015-04-30 17:01:21 +08:00
|
|
|
if (isa<ConstantSDNode>(Base)) {
|
2015-10-21 14:11:01 +08:00
|
|
|
assert(cast<ConstantSDNode>(Base)->isNullValue() &&
|
2015-04-30 17:01:21 +08:00
|
|
|
"Unexpected base in gather/scatter");
|
2015-04-30 16:38:48 +08:00
|
|
|
Scale = getI8Imm(1, DL);
|
|
|
|
Base = CurDAG->getRegister(0, MVT::i32);
|
|
|
|
}
|
|
|
|
if (AM.Segment.getNode())
|
|
|
|
Segment = AM.Segment;
|
|
|
|
else
|
|
|
|
Segment = CurDAG->getRegister(0, MVT::i32);
|
|
|
|
Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Returns true if it is able to pattern match an addressing mode.
|
2005-12-08 10:01:35 +08:00
|
|
|
/// It returns the operands which make up the maximal addressing mode it can
|
|
|
|
/// match by reference.
|
2010-09-22 06:07:31 +08:00
|
|
|
///
|
|
|
|
/// Parent is the parent node of the addr operand that is being matched. It
|
|
|
|
/// is always a load, store, atomic node, or null. It is only null when
|
|
|
|
/// checking memory operands for inline asm nodes.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue &Scale, SDValue &Index,
|
2009-04-09 05:14:34 +08:00
|
|
|
SDValue &Disp, SDValue &Segment) {
|
2005-12-08 10:01:35 +08:00
|
|
|
X86ISelAddressMode AM;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2010-09-22 06:07:31 +08:00
|
|
|
if (Parent &&
|
|
|
|
// This list of opcodes are all the nodes that have an "addr:$ptr" operand
|
|
|
|
// that are not a MemSDNode, and thus don't have proper addrspace info.
|
|
|
|
Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
|
2010-09-23 04:42:08 +08:00
|
|
|
Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
|
2012-10-16 06:39:43 +08:00
|
|
|
Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
|
|
|
|
Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
|
|
|
|
Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
|
2010-09-22 06:07:31 +08:00
|
|
|
unsigned AddrSpace =
|
|
|
|
cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
|
2016-05-04 04:16:08 +08:00
|
|
|
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
|
2010-09-22 06:07:31 +08:00
|
|
|
if (AddrSpace == 256)
|
2010-09-22 12:39:11 +08:00
|
|
|
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
|
2010-09-22 06:07:31 +08:00
|
|
|
if (AddrSpace == 257)
|
2010-09-22 12:39:11 +08:00
|
|
|
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
|
2016-05-04 04:16:08 +08:00
|
|
|
if (AddrSpace == 258)
|
|
|
|
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
|
2010-09-22 06:07:31 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
if (matchAddress(N, AM))
|
2010-09-22 12:39:11 +08:00
|
|
|
return false;
|
|
|
|
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT VT = N.getSimpleValueType();
|
2010-09-22 12:39:11 +08:00
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase) {
|
|
|
|
if (!AM.Base_Reg.getNode())
|
|
|
|
AM.Base_Reg = CurDAG->getRegister(0, VT);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!AM.IndexReg.getNode())
|
|
|
|
AM.IndexReg = CurDAG->getRegister(0, VT);
|
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
|
2006-01-11 14:09:51 +08:00
|
|
|
return true;
|
2005-12-08 10:01:35 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Match a scalar SSE load. In particular, we want to match a load whose top
|
|
|
|
/// elements are either undef or zeros. The load flavor is derived from the
|
|
|
|
/// type of N, which is either v4f32 or v2f64.
|
2010-02-17 14:07:47 +08:00
|
|
|
///
|
|
|
|
/// We also return:
|
2010-02-21 11:17:59 +08:00
|
|
|
/// PatternChainNode: this is the matched node that has a chain input and
|
|
|
|
/// output.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue N, SDValue &Base,
|
|
|
|
SDValue &Scale, SDValue &Index,
|
2009-04-09 05:14:34 +08:00
|
|
|
SDValue &Disp, SDValue &Segment,
|
2010-02-21 11:17:59 +08:00
|
|
|
SDValue &PatternNodeWithChain) {
|
2016-12-12 15:57:24 +08:00
|
|
|
// We can allow a full vector load here since narrowing a load is ok.
|
|
|
|
if (ISD::isNON_EXTLoad(N.getNode())) {
|
|
|
|
PatternNodeWithChain = N;
|
|
|
|
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
|
2016-12-19 16:35:56 +08:00
|
|
|
IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) {
|
2016-12-12 15:57:24 +08:00
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
|
|
|
|
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
|
|
|
|
Segment);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// We can also match the special zero extended load opcode.
|
|
|
|
if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
|
|
|
|
PatternNodeWithChain = N;
|
|
|
|
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
|
2016-12-19 16:35:56 +08:00
|
|
|
IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) {
|
2016-12-12 15:57:24 +08:00
|
|
|
auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
|
|
|
|
return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
|
|
|
|
Segment);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-11-27 01:29:25 +08:00
|
|
|
// Need to make sure that the SCALAR_TO_VECTOR and load are both only used
|
|
|
|
// once. Otherwise the load might get duplicated and the chain output of the
|
|
|
|
// duplicate load will not be observed by all dependencies.
|
|
|
|
if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
|
2010-02-21 11:17:59 +08:00
|
|
|
PatternNodeWithChain = N.getOperand(0);
|
|
|
|
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
|
2016-11-27 01:29:25 +08:00
|
|
|
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
|
|
|
|
IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
|
2010-02-21 11:17:59 +08:00
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
|
2016-11-27 02:43:21 +08:00
|
|
|
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
|
|
|
|
Segment);
|
2006-10-08 05:55:32 +08:00
|
|
|
}
|
|
|
|
}
|
Fold "zero extending vector loads" now that evan added the chain manip stuff.
This compiles both tests in X86/vec_ss_load_fold.ll into:
_test1:
movss 4(%esp), %xmm0
subss LCPI1_0, %xmm0
mulss LCPI1_1, %xmm0
minss LCPI1_2, %xmm0
xorps %xmm1, %xmm1
maxss %xmm1, %xmm0
cvttss2si %xmm0, %eax
andl $65535, %eax
ret
instead of:
_test1:
movss LCPI1_0, %xmm0
movss 4(%esp), %xmm1
subss %xmm0, %xmm1
movss LCPI1_1, %xmm0
mulss %xmm0, %xmm1
movss LCPI1_2, %xmm0
minss %xmm0, %xmm1
xorps %xmm0, %xmm0
maxss %xmm0, %xmm1
cvttss2si %xmm1, %eax
andl $65535, %eax
ret
llvm-svn: 30894
2006-10-12 06:09:58 +08:00
|
|
|
|
|
|
|
// Also handle the case where we explicitly require zeros in the top
|
2006-10-08 05:55:32 +08:00
|
|
|
// elements. This is a vector shuffle from the zero vector.
|
2008-08-29 05:40:38 +08:00
|
|
|
if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
|
Fix a long standing deficiency in the X86 backend: we would
sometimes emit "zero" and "all one" vectors multiple times,
for example:
_test2:
pcmpeqd %mm0, %mm0
movq %mm0, _M1
pcmpeqd %mm0, %mm0
movq %mm0, _M2
ret
instead of:
_test2:
pcmpeqd %mm0, %mm0
movq %mm0, _M1
movq %mm0, _M2
ret
This patch fixes this by always arranging for zero/one vectors
to be defined as v4i32 or v2i32 (SSE/MMX) instead of letting them be
any random type. This ensures they get trivially CSE'd on the dag.
This fix is also important for LegalizeDAGTypes, as it gets unhappy
when the x86 backend wants BUILD_VECTOR(i64 0) to be legal even when
'i64' isn't legal.
This patch makes the following changes:
1) X86TargetLowering::LowerBUILD_VECTOR now lowers 0/1 vectors into
their canonical types.
2) The now-dead patterns are removed from the SSE/MMX .td files.
3) All the patterns in the .td file that referred to immAllOnesV or
immAllZerosV in the wrong form now use *_bc to match them with a
bitcast wrapped around them.
4) X86DAGToDAGISel::SelectScalarSSELoad is generalized to handle
bitcast'd zero vectors, which simplifies the code actually.
5) getShuffleVectorZeroOrUndef is updated to generate a shuffle that
is legal, instead of generating one that is illegal and expecting
a later legalize pass to clean it up.
6) isZeroShuffle is generalized to handle bitcast of zeros.
7) several other minor tweaks.
This patch is definite goodness, but has the potential to cause random
code quality regressions. Please be on the lookout for these and let
me know if they happen.
llvm-svn: 44310
2007-11-25 08:24:49 +08:00
|
|
|
// Check to see if the top elements are all zeros (or bitcast of zeros).
|
2012-08-02 02:39:17 +08:00
|
|
|
N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
|
2016-11-27 02:43:24 +08:00
|
|
|
N.getOperand(0).getNode()->hasOneUse()) {
|
|
|
|
PatternNodeWithChain = N.getOperand(0).getOperand(0);
|
|
|
|
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
|
|
|
|
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
|
|
|
|
IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
|
|
|
|
// Okay, this is a zero extending load. Fold it.
|
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
|
|
|
|
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
|
|
|
|
Segment);
|
|
|
|
}
|
Fold "zero extending vector loads" now that evan added the chain manip stuff.
This compiles both tests in X86/vec_ss_load_fold.ll into:
_test1:
movss 4(%esp), %xmm0
subss LCPI1_0, %xmm0
mulss LCPI1_1, %xmm0
minss LCPI1_2, %xmm0
xorps %xmm1, %xmm1
maxss %xmm1, %xmm0
cvttss2si %xmm0, %eax
andl $65535, %eax
ret
instead of:
_test1:
movss LCPI1_0, %xmm0
movss 4(%esp), %xmm1
subss %xmm0, %xmm1
movss LCPI1_1, %xmm0
mulss %xmm0, %xmm1
movss LCPI1_2, %xmm0
minss %xmm0, %xmm1
xorps %xmm0, %xmm0
maxss %xmm0, %xmm1
cvttss2si %xmm1, %eax
andl $65535, %eax
ret
llvm-svn: 30894
2006-10-12 06:09:58 +08:00
|
|
|
}
|
2016-11-27 02:43:24 +08:00
|
|
|
|
2006-10-08 05:55:32 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-04-06 04:45:04 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
/// Match an operand that can be materialized with a 32-bit move in 64-bit
/// code and return it in \p Imm.
///
/// - A constant is accepted when its zero-extended value fits in 32 bits; Imm
///   becomes the corresponding i64 target constant.
/// - Otherwise N must be an X86ISD::Wrapper (asserted). A wrapped
///   TargetGlobalTLSAddress is rejected. For any other wrapped node Imm is set
///   to that node; a TargetGlobalAddress with a known absolute symbol range is
///   accepted when the range's unsigned maximum is below 2^32, and everything
///   else is accepted only under the small code model.
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
  if (const ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N)) {
    const uint64_t Value = Const->getZExtValue();
    // Reject anything that does not survive truncation to 32 bits.
    if (static_cast<uint32_t>(Value) != Value)
      return false;

    Imm = CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
    return true;
  }

  // In static codegen with small code model, we can get the address of a label
  // into a register with 'movl'. TableGen has already made sure we're looking
  // at a label of some kind.
  assert(N->getOpcode() == X86ISD::Wrapper &&
         "Unexpected node type for MOV32ri64");
  SDValue Label = N.getOperand(0);

  // At least GNU as does not accept 'movl' for TPOFF relocations.
  // FIXME: We could use 'movl' when we know we are targeting MC.
  if (Label->getOpcode() == ISD::TargetGlobalTLSAddress)
    return false;

  Imm = Label;

  // A global with an absolute symbol range is decided by that range alone.
  if (Label->getOpcode() == ISD::TargetGlobalAddress) {
    Optional<ConstantRange> AbsRange =
        cast<GlobalAddressSDNode>(Label)->getGlobal()->getAbsoluteSymbolRange();
    if (AbsRange)
      return AbsRange->getUnsignedMax().ult(1ull << 32);
  }

  // Everything else relies on the small code model.
  return TM.getCodeModel() == CodeModel::Small;
}
|
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
|
2013-06-11 04:43:49 +08:00
|
|
|
SDValue &Scale, SDValue &Index,
|
|
|
|
SDValue &Disp, SDValue &Segment) {
|
2016-04-13 05:34:24 +08:00
|
|
|
// Save the debug loc before calling selectLEAAddr, in case it invalidates N.
|
|
|
|
SDLoc DL(N);
|
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
|
2013-06-11 04:43:49 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
|
|
|
|
if (RN && RN->getReg() == 0)
|
|
|
|
Base = CurDAG->getRegister(0, MVT::i64);
|
2014-08-20 19:59:22 +08:00
|
|
|
else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) {
|
2013-06-11 04:43:49 +08:00
|
|
|
// Base could already be %rip, particularly in the x32 ABI.
|
|
|
|
Base = SDValue(CurDAG->getMachineNode(
|
|
|
|
TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(0, DL, MVT::i64),
|
2013-06-11 04:43:49 +08:00
|
|
|
Base,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
|
2013-06-11 04:43:49 +08:00
|
|
|
0);
|
|
|
|
}
|
|
|
|
|
|
|
|
RN = dyn_cast<RegisterSDNode>(Index);
|
|
|
|
if (RN && RN->getReg() == 0)
|
|
|
|
Index = CurDAG->getRegister(0, MVT::i64);
|
|
|
|
else {
|
|
|
|
assert(Index.getValueType() == MVT::i32 &&
|
|
|
|
"Expect to be extending 32-bit registers for use in LEA");
|
|
|
|
Index = SDValue(CurDAG->getMachineNode(
|
|
|
|
TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(0, DL, MVT::i64),
|
2013-06-11 04:43:49 +08:00
|
|
|
Index,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(X86::sub_32bit, DL,
|
|
|
|
MVT::i32)),
|
2013-06-11 04:43:49 +08:00
|
|
|
0);
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Calls SelectAddr and determines if the maximal addressing
|
2006-02-25 18:09:08 +08:00
|
|
|
/// mode it matches can be cost effectively emitted as an LEA instruction.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue &Base, SDValue &Scale,
|
2010-07-09 07:46:44 +08:00
|
|
|
SDValue &Index, SDValue &Disp,
|
|
|
|
SDValue &Segment) {
|
2006-02-25 18:09:08 +08:00
|
|
|
X86ISelAddressMode AM;
|
|
|
|
|
2016-04-13 05:34:24 +08:00
|
|
|
// Save the DL and VT before calling matchAddress, it can invalidate N.
|
|
|
|
SDLoc DL(N);
|
|
|
|
MVT VT = N.getSimpleValueType();
|
|
|
|
|
2009-04-10 18:09:34 +08:00
|
|
|
// Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
|
|
|
|
// segments.
|
|
|
|
SDValue Copy = AM.Segment;
|
2009-08-12 04:47:22 +08:00
|
|
|
SDValue T = CurDAG->getRegister(0, MVT::i32);
|
2009-04-10 18:09:34 +08:00
|
|
|
AM.Segment = T;
|
2015-10-14 00:23:00 +08:00
|
|
|
if (matchAddress(N, AM))
|
2009-04-09 05:14:34 +08:00
|
|
|
return false;
|
2009-04-10 18:09:34 +08:00
|
|
|
assert (T == AM.Segment);
|
|
|
|
AM.Segment = Copy;
|
2009-04-09 05:14:34 +08:00
|
|
|
|
2006-02-25 18:09:08 +08:00
|
|
|
unsigned Complexity = 0;
|
|
|
|
if (AM.BaseType == X86ISelAddressMode::RegBase)
|
2010-04-30 07:30:41 +08:00
|
|
|
if (AM.Base_Reg.getNode())
|
2006-02-25 18:09:08 +08:00
|
|
|
Complexity = 1;
|
|
|
|
else
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg = CurDAG->getRegister(0, VT);
|
2006-02-25 18:09:08 +08:00
|
|
|
else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
|
|
|
|
Complexity = 4;
|
|
|
|
|
2008-08-29 05:40:38 +08:00
|
|
|
if (AM.IndexReg.getNode())
|
2006-02-25 18:09:08 +08:00
|
|
|
Complexity++;
|
|
|
|
else
|
2006-09-08 14:48:29 +08:00
|
|
|
AM.IndexReg = CurDAG->getRegister(0, VT);
|
2006-02-25 18:09:08 +08:00
|
|
|
|
Two changes:
1) codegen a shift of a register as a shift, not an LEA.
2) teach the RA to convert a shift to an LEA instruction if it wants something
in three-address form.
This gives us asm diffs like:
- leal (,%eax,4), %eax
+ shll $2, %eax
which is faster on some processors and smaller on all of them.
and, more interestingly:
- movl 24(%esi), %eax
- leal (,%eax,4), %edi
+ movl 24(%esi), %edi
+ shll $2, %edi
Without #2, #1 was a significant pessimization in some cases.
This implements CodeGen/X86/shift-codegen.ll
llvm-svn: 35204
2007-03-20 14:08:29 +08:00
|
|
|
// Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
|
|
|
|
// a simple shift.
|
|
|
|
if (AM.Scale > 1)
|
2006-03-01 05:13:57 +08:00
|
|
|
Complexity++;
|
2006-02-25 18:09:08 +08:00
|
|
|
|
|
|
|
// FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
|
2015-10-13 00:09:59 +08:00
|
|
|
// to a LEA. This is determined with some experimentation but is by no means
|
2006-02-25 18:09:08 +08:00
|
|
|
// optimal (especially for code size consideration). LEA is nice because of
|
|
|
|
// its three-address nature. Tweak the cost function again when we can run
|
|
|
|
// convertToThreeAddress() at register allocation time.
|
2009-02-07 08:43:41 +08:00
|
|
|
if (AM.hasSymbolicDisplacement()) {
|
2015-10-13 00:09:59 +08:00
|
|
|
// For X86-64, always use LEA to materialize RIP-relative addresses.
|
2006-12-06 06:03:40 +08:00
|
|
|
if (Subtarget->is64Bit())
|
2006-09-08 14:48:29 +08:00
|
|
|
Complexity = 4;
|
|
|
|
else
|
|
|
|
Complexity += 2;
|
|
|
|
}
|
2006-02-25 18:09:08 +08:00
|
|
|
|
2010-04-30 07:30:41 +08:00
|
|
|
if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
|
2006-02-25 18:09:08 +08:00
|
|
|
Complexity++;
|
|
|
|
|
2009-07-12 06:50:33 +08:00
|
|
|
// If it isn't worth using an LEA, reject it.
|
2009-07-12 07:07:30 +08:00
|
|
|
if (Complexity <= 2)
|
2009-07-12 06:50:33 +08:00
|
|
|
return false;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2016-04-13 05:34:24 +08:00
|
|
|
getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
|
2009-07-12 06:50:33 +08:00
|
|
|
return true;
|
2006-02-25 18:09:08 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// This is only run on TargetGlobalTLSAddress nodes.
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
|
2009-06-21 04:38:48 +08:00
|
|
|
SDValue &Scale, SDValue &Index,
|
2010-07-09 07:46:44 +08:00
|
|
|
SDValue &Disp, SDValue &Segment) {
|
2009-06-21 04:38:48 +08:00
|
|
|
assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
|
|
|
|
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2009-06-21 04:38:48 +08:00
|
|
|
X86ISelAddressMode AM;
|
|
|
|
AM.GV = GA->getGlobal();
|
|
|
|
AM.Disp += GA->getOffset();
|
2010-04-30 07:30:41 +08:00
|
|
|
AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
|
2009-06-27 05:18:37 +08:00
|
|
|
AM.SymbolFlags = GA->getTargetFlags();
|
|
|
|
|
2009-08-12 04:47:22 +08:00
|
|
|
if (N.getValueType() == MVT::i32) {
|
2009-06-21 04:38:48 +08:00
|
|
|
AM.Scale = 1;
|
2009-08-12 04:47:22 +08:00
|
|
|
AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
|
2009-06-21 04:38:48 +08:00
|
|
|
} else {
|
2009-08-12 04:47:22 +08:00
|
|
|
AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
|
2009-06-21 04:38:48 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
|
2009-06-21 04:38:48 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-11-10 07:53:43 +08:00
|
|
|
/// Try to select \p N as a relocatable immediate, storing the operand to use
/// in \p Op. Accepts plain constants, the operand of an X86ISD::Wrapper, and
/// a truncated wrapped TargetGlobalAddress whose absolute symbol range fits
/// in the truncated type. Returns true on success.
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
  // A constant becomes a target constant of the same type.
  if (auto *ConstNode = dyn_cast<ConstantSDNode>(N)) {
    Op = CurDAG->getTargetConstant(ConstNode->getAPIntValue(),
                                   SDLoc(ConstNode), N.getValueType());
    return true;
  }

  // Remember the type requested by the user of N before looking through a
  // truncate. A truncation that only drops bits known to be zero still
  // allows a narrow reference.
  const EVT ResultVT = N.getValueType();
  const bool WasTruncated = N.getOpcode() == ISD::TRUNCATE;
  if (WasTruncated)
    N = N.getOperand(0);

  if (N.getOpcode() != X86ISD::Wrapper)
    return false;

  SDValue Wrapped = N.getOperand(0);

  // Without a truncate the wrapped operand can be selected directly.
  if (!WasTruncated) {
    Op = Wrapped;
    return true;
  }

  // Truncated non-GlobalValues carry no range information, so they cannot be
  // used. Op is still written here, as the caller-visible state on this
  // failure path.
  if (Wrapped->getOpcode() != ISD::TargetGlobalAddress) {
    Op = Wrapped;
    return false;
  }

  // Truncated global: the symbol's absolute range must fit into ResultVT.
  auto *GA = cast<GlobalAddressSDNode>(Wrapped);
  const Optional<ConstantRange> Range =
      GA->getGlobal()->getAbsoluteSymbolRange();
  if (!Range)
    return false;
  if (Range->getUnsignedMax().uge(1ull << ResultVT.getSizeInBits()))
    return false;

  // Okay, emit a narrow reference of the truncated type.
  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), ResultVT,
                                      GA->getOffset(), GA->getTargetFlags());
  return true;
}
|
2016-04-06 04:45:04 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N,
|
2008-07-28 05:46:04 +08:00
|
|
|
SDValue &Base, SDValue &Scale,
|
2009-04-09 05:14:34 +08:00
|
|
|
SDValue &Index, SDValue &Disp,
|
|
|
|
SDValue &Segment) {
|
2010-03-03 06:20:06 +08:00
|
|
|
if (!ISD::isNON_EXTLoad(N.getNode()) ||
|
|
|
|
!IsProfitableToFold(N, P, P) ||
|
2010-04-17 23:26:15 +08:00
|
|
|
!IsLegalToFold(N, P, P, OptLevel))
|
2010-03-03 06:20:06 +08:00
|
|
|
return false;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2015-10-14 00:23:00 +08:00
|
|
|
return selectAddr(N.getNode(),
|
2010-09-22 06:07:31 +08:00
|
|
|
N.getOperand(1), Base, Scale, Index, Disp, Segment);
|
2006-01-07 04:36:21 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Return an SDNode that returns the value of the global base register.
|
|
|
|
/// Output instructions required to initialize the global base register,
|
|
|
|
/// if necessary.
|
2006-08-26 13:34:46 +08:00
|
|
|
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
|
2009-06-04 04:20:00 +08:00
|
|
|
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
|
2015-07-09 10:09:04 +08:00
|
|
|
auto &DL = MF->getDataLayout();
|
|
|
|
return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
|
2006-02-18 08:15:05 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Test whether the given X86ISD::CMP node has any uses which require the SF
|
|
|
|
/// or OF bits to be accurate.
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool hasNoSignedComparisonUses(SDNode *N) {
|
2009-10-10 04:35:19 +08:00
|
|
|
// Examine each user of the node.
|
|
|
|
for (SDNode::use_iterator UI = N->use_begin(),
|
|
|
|
UE = N->use_end(); UI != UE; ++UI) {
|
|
|
|
// Only examine CopyToReg uses.
|
|
|
|
if (UI->getOpcode() != ISD::CopyToReg)
|
|
|
|
return false;
|
|
|
|
// Only examine CopyToReg uses that copy to EFLAGS.
|
|
|
|
if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
|
|
|
|
X86::EFLAGS)
|
|
|
|
return false;
|
|
|
|
// Examine each user of the CopyToReg use.
|
|
|
|
for (SDNode::use_iterator FlagUI = UI->use_begin(),
|
|
|
|
FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
|
|
|
|
// Only examine the Flag result.
|
|
|
|
if (FlagUI.getUse().getResNo() != 1) continue;
|
|
|
|
// Anything unusual: assume conservatively.
|
|
|
|
if (!FlagUI->isMachineOpcode()) return false;
|
|
|
|
// Examine the opcode of the user.
|
|
|
|
switch (FlagUI->getMachineOpcode()) {
|
|
|
|
// These comparisons don't treat the most significant bit specially.
|
|
|
|
case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
|
|
|
|
case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
|
|
|
|
case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
|
|
|
|
case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
|
2015-01-06 12:23:53 +08:00
|
|
|
case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1:
|
|
|
|
case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1:
|
2009-10-10 04:35:19 +08:00
|
|
|
case X86::CMOVA16rr: case X86::CMOVA16rm:
|
|
|
|
case X86::CMOVA32rr: case X86::CMOVA32rm:
|
|
|
|
case X86::CMOVA64rr: case X86::CMOVA64rm:
|
|
|
|
case X86::CMOVAE16rr: case X86::CMOVAE16rm:
|
|
|
|
case X86::CMOVAE32rr: case X86::CMOVAE32rm:
|
|
|
|
case X86::CMOVAE64rr: case X86::CMOVAE64rm:
|
|
|
|
case X86::CMOVB16rr: case X86::CMOVB16rm:
|
|
|
|
case X86::CMOVB32rr: case X86::CMOVB32rm:
|
|
|
|
case X86::CMOVB64rr: case X86::CMOVB64rm:
|
2010-10-06 07:00:14 +08:00
|
|
|
case X86::CMOVBE16rr: case X86::CMOVBE16rm:
|
|
|
|
case X86::CMOVBE32rr: case X86::CMOVBE32rm:
|
|
|
|
case X86::CMOVBE64rr: case X86::CMOVBE64rm:
|
2009-10-10 04:35:19 +08:00
|
|
|
case X86::CMOVE16rr: case X86::CMOVE16rm:
|
|
|
|
case X86::CMOVE32rr: case X86::CMOVE32rm:
|
|
|
|
case X86::CMOVE64rr: case X86::CMOVE64rm:
|
|
|
|
case X86::CMOVNE16rr: case X86::CMOVNE16rm:
|
|
|
|
case X86::CMOVNE32rr: case X86::CMOVNE32rm:
|
|
|
|
case X86::CMOVNE64rr: case X86::CMOVNE64rm:
|
|
|
|
case X86::CMOVNP16rr: case X86::CMOVNP16rm:
|
|
|
|
case X86::CMOVNP32rr: case X86::CMOVNP32rm:
|
|
|
|
case X86::CMOVNP64rr: case X86::CMOVNP64rm:
|
|
|
|
case X86::CMOVP16rr: case X86::CMOVP16rm:
|
|
|
|
case X86::CMOVP32rr: case X86::CMOVP32rm:
|
|
|
|
case X86::CMOVP64rr: case X86::CMOVP64rm:
|
|
|
|
continue;
|
|
|
|
// Anything else: assume conservatively.
|
|
|
|
default: return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Check whether or not the chain ending in StoreNode is suitable for doing
|
|
|
|
/// the {load; increment or decrement; store} to modify transformation.
|
2016-04-06 04:45:04 +08:00
|
|
|
static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
|
|
|
|
SDValue StoredVal, SelectionDAG *CurDAG,
|
|
|
|
LoadSDNode* &LoadNode, SDValue &InputChain) {
|
|
|
|
|
2012-03-29 13:45:48 +08:00
|
|
|
// is the value stored the result of a DEC or INC?
|
|
|
|
if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
|
|
|
|
|
|
|
|
// is the stored value result 0 of the load?
|
|
|
|
if (StoredVal.getResNo() != 0) return false;
|
|
|
|
|
|
|
|
// are there other uses of the loaded value than the inc or dec?
|
|
|
|
if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
|
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
// is the store non-extending and non-indexed?
|
|
|
|
if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
|
2012-03-29 13:45:48 +08:00
|
|
|
return false;
|
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
SDValue Load = StoredVal->getOperand(0);
|
|
|
|
// Is the stored value a non-extending and non-indexed load?
|
|
|
|
if (!ISD::isNormalLoad(Load.getNode())) return false;
|
2012-03-29 13:45:48 +08:00
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
// Return LoadNode by reference.
|
|
|
|
LoadNode = cast<LoadSDNode>(Load);
|
|
|
|
// is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
|
2012-08-02 02:39:17 +08:00
|
|
|
EVT LdVT = LoadNode->getMemoryVT();
|
|
|
|
if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
|
2012-04-13 03:14:21 +08:00
|
|
|
LdVT != MVT::i8)
|
2012-03-29 13:45:48 +08:00
|
|
|
return false;
|
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
// Is store the only read of the loaded value?
|
|
|
|
if (!Load.hasOneUse())
|
2012-03-29 13:45:48 +08:00
|
|
|
return false;
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
// Is the address of the store the same as the load?
|
|
|
|
if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
|
|
|
|
LoadNode->getOffset() != StoreNode->getOffset())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Check if the chain is produced by the load or is a TokenFactor with
|
|
|
|
// the load output chain as an operand. Return InputChain by reference.
|
|
|
|
SDValue Chain = StoreNode->getChain();
|
2012-03-29 13:45:48 +08:00
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
bool ChainCheck = false;
|
|
|
|
if (Chain == Load.getValue(1)) {
|
|
|
|
ChainCheck = true;
|
|
|
|
InputChain = LoadNode->getChain();
|
|
|
|
} else if (Chain.getOpcode() == ISD::TokenFactor) {
|
|
|
|
SmallVector<SDValue, 4> ChainOps;
|
|
|
|
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
|
|
|
|
SDValue Op = Chain.getOperand(i);
|
|
|
|
if (Op == Load.getValue(1)) {
|
|
|
|
ChainCheck = true;
|
|
|
|
continue;
|
|
|
|
}
|
2012-05-16 09:54:27 +08:00
|
|
|
|
|
|
|
// Make sure using Op as part of the chain would not cause a cycle here.
|
|
|
|
// In theory, we could check whether the chain node is a predecessor of
|
|
|
|
// the load. But that can be very expensive. Instead visit the uses and
|
|
|
|
// make sure they all have smaller node id than the load.
|
|
|
|
int LoadId = LoadNode->getNodeId();
|
|
|
|
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
|
|
|
|
UE = UI->use_end(); UI != UE; ++UI) {
|
|
|
|
if (UI.getUse().getResNo() != 0)
|
|
|
|
continue;
|
|
|
|
if (UI->getNodeId() > LoadId)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-04-13 03:14:21 +08:00
|
|
|
ChainOps.push_back(Op);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ChainCheck)
|
|
|
|
// Make a new TokenFactor with all the other input chains except
|
|
|
|
// for the load.
|
2013-05-25 10:42:55 +08:00
|
|
|
InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
|
2014-04-27 02:35:24 +08:00
|
|
|
MVT::Other, ChainOps);
|
2012-04-13 03:14:21 +08:00
|
|
|
}
|
|
|
|
if (!ChainCheck)
|
2012-03-29 13:45:48 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Get the appropriate X86 opcode for an in-memory increment or decrement.
|
|
|
|
/// Opc should be X86ISD::DEC or X86ISD::INC.
|
2016-04-06 04:45:04 +08:00
|
|
|
static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
|
2012-03-29 13:45:48 +08:00
|
|
|
if (Opc == X86ISD::DEC) {
|
|
|
|
if (LdVT == MVT::i64) return X86::DEC64m;
|
|
|
|
if (LdVT == MVT::i32) return X86::DEC32m;
|
|
|
|
if (LdVT == MVT::i16) return X86::DEC16m;
|
|
|
|
if (LdVT == MVT::i8) return X86::DEC8m;
|
2012-03-29 20:37:26 +08:00
|
|
|
} else {
|
|
|
|
assert(Opc == X86ISD::INC && "unrecognized opcode");
|
2012-03-29 13:45:48 +08:00
|
|
|
if (LdVT == MVT::i64) return X86::INC64m;
|
|
|
|
if (LdVT == MVT::i32) return X86::INC32m;
|
|
|
|
if (LdVT == MVT::i16) return X86::INC16m;
|
|
|
|
if (LdVT == MVT::i8) return X86::INC8m;
|
|
|
|
}
|
2012-03-29 20:37:26 +08:00
|
|
|
llvm_unreachable("unrecognized size for LdVT");
|
2012-03-29 13:45:48 +08:00
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// Customized ISel for GATHER operations.
|
2016-05-12 01:46:03 +08:00
|
|
|
bool X86DAGToDAGISel::tryGather(SDNode *Node, unsigned Opc) {
|
2012-06-27 03:47:59 +08:00
|
|
|
// Operands of Gather: VSrc, Base, VIdx, VMask, Scale
|
|
|
|
SDValue Chain = Node->getOperand(0);
|
|
|
|
SDValue VSrc = Node->getOperand(2);
|
|
|
|
SDValue Base = Node->getOperand(3);
|
|
|
|
SDValue VIdx = Node->getOperand(4);
|
|
|
|
SDValue VMask = Node->getOperand(5);
|
|
|
|
ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
|
2012-07-01 10:17:08 +08:00
|
|
|
if (!Scale)
|
2016-05-12 01:46:03 +08:00
|
|
|
return false;
|
2012-06-27 03:47:59 +08:00
|
|
|
|
2012-07-12 14:52:41 +08:00
|
|
|
SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
|
|
|
|
MVT::Other);
|
|
|
|
|
2015-04-28 22:05:47 +08:00
|
|
|
SDLoc DL(Node);
|
|
|
|
|
2012-06-27 03:47:59 +08:00
|
|
|
// Memory Operands: Base, Scale, Index, Disp, Segment
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
|
2012-06-27 03:47:59 +08:00
|
|
|
SDValue Segment = CurDAG->getRegister(0, MVT::i32);
|
2015-04-28 22:05:47 +08:00
|
|
|
const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx,
|
2012-06-27 03:47:59 +08:00
|
|
|
Disp, Segment, VMask, Chain};
|
2015-04-28 22:05:47 +08:00
|
|
|
SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
|
2012-07-12 14:52:41 +08:00
|
|
|
// Node has 2 outputs: VDst and MVT::Other.
|
|
|
|
// ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
|
|
|
|
// We replace VDst of Node with VDst of ResNode, and Other of Node with Other
|
|
|
|
// of ResNode.
|
|
|
|
ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
|
|
|
|
ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
|
2016-05-12 01:46:03 +08:00
|
|
|
CurDAG->RemoveDeadNode(Node);
|
|
|
|
return true;
|
2012-06-27 03:47:59 +08:00
|
|
|
}
|
|
|
|
|
2016-05-11 07:55:37 +08:00
|
|
|
void X86DAGToDAGISel::Select(SDNode *Node) {
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT NVT = Node->getSimpleValueType(0);
|
2006-01-07 04:36:21 +08:00
|
|
|
unsigned Opc, MOpc;
|
|
|
|
unsigned Opcode = Node->getOpcode();
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc dl(Node);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2010-03-02 14:34:30 +08:00
|
|
|
DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
|
2006-02-11 06:24:32 +08:00
|
|
|
|
2008-07-18 03:10:17 +08:00
|
|
|
if (Node->isMachineOpcode()) {
|
2010-03-02 14:34:30 +08:00
|
|
|
DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
|
2013-09-22 16:21:56 +08:00
|
|
|
Node->setNodeId(-1);
|
2016-05-11 07:55:37 +08:00
|
|
|
return; // Already selected.
|
2006-02-09 08:37:58 +08:00
|
|
|
}
|
2006-01-12 06:15:18 +08:00
|
|
|
|
2006-01-07 04:36:21 +08:00
|
|
|
switch (Opcode) {
|
2015-08-19 19:35:10 +08:00
|
|
|
default: break;
|
2015-08-20 00:17:08 +08:00
|
|
|
case ISD::BRIND: {
|
|
|
|
if (Subtarget->isTargetNaCl())
|
|
|
|
// NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
|
|
|
|
// leave the instruction alone.
|
|
|
|
break;
|
|
|
|
if (Subtarget->isTarget64BitILP32()) {
|
|
|
|
// Converts a 32-bit register to a 64-bit, zero-extended version of
|
|
|
|
// it. This is needed because x86-64 can do many things, but jmp %r32
|
|
|
|
// ain't one of them.
|
|
|
|
const SDValue &Target = Node->getOperand(1);
|
|
|
|
assert(Target.getSimpleValueType() == llvm::MVT::i32);
|
|
|
|
SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
|
|
|
|
SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
|
|
|
|
Node->getOperand(0), ZextTarget);
|
2016-05-14 07:26:28 +08:00
|
|
|
ReplaceNode(Node, Brind.getNode());
|
2015-08-20 00:17:08 +08:00
|
|
|
SelectCode(ZextTarget.getNode());
|
|
|
|
SelectCode(Brind.getNode());
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2015-08-20 00:17:08 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2012-06-27 03:47:59 +08:00
|
|
|
case ISD::INTRINSIC_W_CHAIN: {
|
|
|
|
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
|
|
|
|
switch (IntNo) {
|
|
|
|
default: break;
|
|
|
|
case Intrinsic::x86_avx2_gather_d_pd:
|
|
|
|
case Intrinsic::x86_avx2_gather_d_pd_256:
|
|
|
|
case Intrinsic::x86_avx2_gather_q_pd:
|
|
|
|
case Intrinsic::x86_avx2_gather_q_pd_256:
|
|
|
|
case Intrinsic::x86_avx2_gather_d_ps:
|
|
|
|
case Intrinsic::x86_avx2_gather_d_ps_256:
|
|
|
|
case Intrinsic::x86_avx2_gather_q_ps:
|
|
|
|
case Intrinsic::x86_avx2_gather_q_ps_256:
|
2012-06-29 08:54:20 +08:00
|
|
|
case Intrinsic::x86_avx2_gather_d_q:
|
|
|
|
case Intrinsic::x86_avx2_gather_d_q_256:
|
|
|
|
case Intrinsic::x86_avx2_gather_q_q:
|
|
|
|
case Intrinsic::x86_avx2_gather_q_q_256:
|
|
|
|
case Intrinsic::x86_avx2_gather_d_d:
|
|
|
|
case Intrinsic::x86_avx2_gather_d_d_256:
|
|
|
|
case Intrinsic::x86_avx2_gather_q_d:
|
2012-07-01 10:05:52 +08:00
|
|
|
case Intrinsic::x86_avx2_gather_q_d_256: {
|
2013-06-06 02:12:26 +08:00
|
|
|
if (!Subtarget->hasAVX2())
|
|
|
|
break;
|
2012-07-01 10:05:52 +08:00
|
|
|
unsigned Opc;
|
|
|
|
switch (IntNo) {
|
2012-07-01 10:55:34 +08:00
|
|
|
default: llvm_unreachable("Impossible intrinsic");
|
2012-07-01 10:05:52 +08:00
|
|
|
case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
|
|
|
|
case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
|
|
|
|
}
|
2016-05-12 01:46:03 +08:00
|
|
|
if (tryGather(Node, Opc))
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2012-07-01 10:18:18 +08:00
|
|
|
break;
|
2012-07-01 10:05:52 +08:00
|
|
|
}
|
2012-06-27 03:47:59 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2009-08-03 00:10:52 +08:00
|
|
|
case X86ISD::GlobalBaseReg:
|
2016-05-12 05:13:17 +08:00
|
|
|
ReplaceNode(Node, getGlobalBaseReg());
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2009-08-03 00:10:52 +08:00
|
|
|
|
2014-11-06 10:25:03 +08:00
|
|
|
case X86ISD::SHRUNKBLEND: {
|
|
|
|
// SHRUNKBLEND selects like a regular VSELECT.
|
|
|
|
SDValue VSelect = CurDAG->getNode(
|
|
|
|
ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
|
|
|
|
Node->getOperand(1), Node->getOperand(2));
|
|
|
|
ReplaceUses(SDValue(Node, 0), VSelect);
|
|
|
|
SelectCode(VSelect.getNode());
|
|
|
|
// We already called ReplaceUses.
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2014-11-06 10:25:03 +08:00
|
|
|
}
|
2012-07-01 10:55:34 +08:00
|
|
|
|
2015-08-19 19:35:10 +08:00
|
|
|
case ISD::AND:
|
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) & C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
with this patch we can fold the immediate into the and:
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
llvm-svn: 129990
2011-04-22 23:30:40 +08:00
|
|
|
case ISD::OR:
|
|
|
|
case ISD::XOR: {
|
|
|
|
// For operations of the form (x << C1) op C2, check if we can use a smaller
|
|
|
|
// encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
|
|
|
|
|
|
|
if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
|
|
|
|
break;
|
|
|
|
|
|
|
|
// i8 is unshrinkable, i16 should be promoted to i32.
|
|
|
|
if (NVT != MVT::i32 && NVT != MVT::i64)
|
|
|
|
break;
|
|
|
|
|
|
|
|
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
|
|
|
|
ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
|
|
|
|
if (!Cst || !ShlCst)
|
|
|
|
break;
|
|
|
|
|
|
|
|
int64_t Val = Cst->getSExtValue();
|
|
|
|
uint64_t ShlVal = ShlCst->getZExtValue();
|
|
|
|
|
|
|
|
// Make sure that we don't change the operation by removing bits.
|
|
|
|
// This only matters for OR and XOR, AND is unaffected.
|
2012-08-25 07:29:28 +08:00
|
|
|
uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
|
|
|
|
if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
|
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) & C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
with this patch we can fold the immediate into the and:
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
llvm-svn: 129990
2011-04-22 23:30:40 +08:00
|
|
|
break;
|
|
|
|
|
2015-04-02 03:01:09 +08:00
|
|
|
unsigned ShlOp, AddOp, Op;
|
2013-08-15 13:57:07 +08:00
|
|
|
MVT CstVT = NVT;
|
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) & C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
with this patch we can fold the immediate into the and:
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
llvm-svn: 129990
2011-04-22 23:30:40 +08:00
|
|
|
|
|
|
|
// Check the minimum bitwidth for the new constant.
|
|
|
|
// TODO: AND32ri is the same as AND64ri32 with zext imm.
|
|
|
|
// TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
|
|
|
|
// TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
|
|
|
|
if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
|
|
|
|
CstVT = MVT::i8;
|
|
|
|
else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
|
|
|
|
CstVT = MVT::i32;
|
|
|
|
|
|
|
|
// Bail if there is no smaller encoding.
|
|
|
|
if (NVT == CstVT)
|
|
|
|
break;
|
|
|
|
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) & C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
with this patch we can fold the immediate into the and:
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
llvm-svn: 129990
2011-04-22 23:30:40 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
|
|
|
case MVT::i32:
|
|
|
|
assert(CstVT == MVT::i8);
|
|
|
|
ShlOp = X86::SHL32ri;
|
2015-04-02 03:01:09 +08:00
|
|
|
AddOp = X86::ADD32rr;
|
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) & C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
with this patch we can fold the immediate into the and:
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
llvm-svn: 129990
2011-04-22 23:30:40 +08:00
|
|
|
|
|
|
|
switch (Opcode) {
|
2012-08-12 01:44:14 +08:00
|
|
|
default: llvm_unreachable("Impossible opcode");
|
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) & C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
with this patch we can fold the immediate into the and:
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
llvm-svn: 129990
2011-04-22 23:30:40 +08:00
|
|
|
case ISD::AND: Op = X86::AND32ri8; break;
|
|
|
|
case ISD::OR: Op = X86::OR32ri8; break;
|
|
|
|
case ISD::XOR: Op = X86::XOR32ri8; break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case MVT::i64:
|
|
|
|
assert(CstVT == MVT::i8 || CstVT == MVT::i32);
|
|
|
|
ShlOp = X86::SHL64ri;
|
2015-04-02 03:01:09 +08:00
|
|
|
AddOp = X86::ADD64rr;
|
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) & C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
with this patch we can fold the immediate into the and:
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
llvm-svn: 129990
2011-04-22 23:30:40 +08:00
|
|
|
|
|
|
|
switch (Opcode) {
|
2012-08-12 01:44:14 +08:00
|
|
|
default: llvm_unreachable("Impossible opcode");
|
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) & C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
with this patch we can fold the immediate into the and:
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
llvm-svn: 129990
2011-04-22 23:30:40 +08:00
|
|
|
case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
|
|
|
|
case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break;
|
|
|
|
case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Emit the smaller op and the shift.
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
|
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) << C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
with this patch we can fold the immediate into the and:
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
llvm-svn: 129990
2011-04-22 23:30:40 +08:00
|
|
|
SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
|
2015-04-02 03:01:09 +08:00
|
|
|
if (ShlVal == 1)
|
2016-05-11 07:55:37 +08:00
|
|
|
CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
|
|
|
|
SDValue(New, 0));
|
|
|
|
else
|
|
|
|
CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
|
|
|
|
getI8Imm(ShlVal, dl));
|
|
|
|
return;
|
X86: Try to use a smaller encoding by transforming (X << C1) & C2 into (X & (C2 >> C1)) << C1. (Part of PR5039)
This tends to happen a lot with bitfield code generated by clang. A simple example for x86_64 is
uint64_t foo(uint64_t x) { return (x&1) << 42; }
which used to compile into bloated code:
shlq $42, %rdi ## encoding: [0x48,0xc1,0xe7,0x2a]
movabsq $4398046511104, %rax ## encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00]
andq %rdi, %rax ## encoding: [0x48,0x21,0xf8]
ret ## encoding: [0xc3]
with this patch we can fold the immediate into the and:
andq $1, %rdi ## encoding: [0x48,0x83,0xe7,0x01]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
shlq $42, %rax ## encoding: [0x48,0xc1,0xe0,0x2a]
ret ## encoding: [0xc3]
It's possible to save another byte by using 'andl' instead of 'andq' but I currently see no way of doing
that without making this code even more complicated. See the TODOs in the code.
llvm-svn: 129990
2011-04-22 23:30:40 +08:00
|
|
|
}
|
[X86] Improve mul w/ overflow codegen, to MUL8+SETO.
Currently, @llvm.smul.with.overflow.i8 expands to 9 instructions, where
3 are really needed.
This adds X86ISD::UMUL8/SMUL8 SD nodes, and custom lowers them to
MUL8/IMUL8 + SETO.
i8 is a special case because there are no two/three-operand variants of
(I)MUL8, so the first operand and return value need to go in AL/AX.
Also, we can't write patterns for these instructions: TableGen refuses
patterns where output operands don't match SDNode results. In this case,
instructions where the output operand is an implicitly defined register.
A related special case (and FIXME) exists for MUL8 (X86InstrArith.td):
// FIXME: Used for 8-bit mul, ignore result upper 8 bits.
// This probably ought to be moved to a def : Pat<> if the
// syntax can be accepted.
[(set AL, (mul AL, GR8:$src)), (implicit EFLAGS)]
Ideally, these go away with UMUL8, but we still need to improve TableGen
support of implicit operands in patterns.
Before this change:
movsbl %sil, %eax
movsbl %dil, %ecx
imull %eax, %ecx
movb %cl, %al
sarb $7, %al
movzbl %al, %eax
movzbl %ch, %esi
cmpl %eax, %esi
setne %al
After:
movb %dil, %al
imulb %sil
seto %al
Also, remove a made-redundant testcase for PR19858, and enable more FastISel
ALU-overflow tests for SelectionDAG too.
Differential Revision: http://reviews.llvm.org/D5809
llvm-svn: 220516
2014-10-24 05:55:31 +08:00
|
|
|
case X86ISD::UMUL8:
|
|
|
|
case X86ISD::SMUL8: {
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
|
|
|
|
|
|
|
Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
|
|
|
|
|
|
|
|
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL,
|
|
|
|
N0, SDValue()).getValue(1);
|
|
|
|
|
|
|
|
SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32);
|
|
|
|
SDValue Ops[] = {N1, InFlag};
|
|
|
|
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
|
|
|
|
|
2016-05-12 05:13:17 +08:00
|
|
|
ReplaceNode(Node, CNode);
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
[X86] Improve mul w/ overflow codegen, to MUL8+SETO.
Currently, @llvm.smul.with.overflow.i8 expands to 9 instructions, where
3 are really needed.
This adds X86ISD::UMUL8/SMUL8 SD nodes, and custom lowers them to
MUL8/IMUL8 + SETO.
i8 is a special case because there are no two/three-operand variants of
(I)MUL8, so the first operand and return value need to go in AL/AX.
Also, we can't write patterns for these instructions: TableGen refuses
patterns where output operands don't match SDNode results. In this case,
instructions where the output operand is an implicitly defined register.
A related special case (and FIXME) exists for MUL8 (X86InstrArith.td):
// FIXME: Used for 8-bit mul, ignore result upper 8 bits.
// This probably ought to be moved to a def : Pat<> if the
// syntax can be accepted.
[(set AL, (mul AL, GR8:$src)), (implicit EFLAGS)]
Ideally, these go away with UMUL8, but we still need to improve TableGen
support of implicit operands in patterns.
Before this change:
movsbl %sil, %eax
movsbl %dil, %ecx
imull %eax, %ecx
movb %cl, %al
sarb $7, %al
movzbl %al, %eax
movzbl %ch, %esi
cmpl %eax, %esi
setne %al
After:
movb %dil, %al
imulb %sil
seto %al
Also, remove a made-redundant testcase for PR19858, and enable more FastISel
ALU-overflow tests for SelectionDAG too.
Differential Revision: http://reviews.llvm.org/D5809
llvm-svn: 220516
2014-10-24 05:55:31 +08:00
|
|
|
}
|
|
|
|
|
It turns out that when the ".with.overflow" intrinsics were added to the X86
backend, they were all implemented except umul. This one fell back
to the default implementation that did a hi/lo multiply and compared the
top. Fix this to check the overflow flag that the 'mul' instruction
sets, so we can avoid an explicit test. Now we compile:
void *func(long count) {
return new int[count];
}
into:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
seto %cl ## encoding: [0x0f,0x90,0xc1]
testb %cl, %cl ## encoding: [0x84,0xc9]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
instead of:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
testq %rdx, %rdx ## encoding: [0x48,0x85,0xd2]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
Other than the silly seto+test, this is using the o bit directly, so it's going in the right
direction.
llvm-svn: 120935
2010-12-05 15:30:36 +08:00
|
|
|
case X86ISD::UMUL: {
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2011-01-15 06:34:13 +08:00
|
|
|
unsigned LoReg;
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
It turns out that when the ".with.overflow" intrinsics were added to the X86
backend, they were all implemented except umul. This one fell back
to the default implementation that did a hi/lo multiply and compared the
top. Fix this to check the overflow flag that the 'mul' instruction
sets, so we can avoid an explicit test. Now we compile:
void *func(long count) {
return new int[count];
}
into:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
seto %cl ## encoding: [0x0f,0x90,0xc1]
testb %cl, %cl ## encoding: [0x84,0xc9]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
instead of:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
testq %rdx, %rdx ## encoding: [0x48,0x85,0xd2]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
Other than the silly seto+test, this is using the o bit directly, so it's going in the right
direction.
llvm-svn: 120935
2010-12-05 15:30:36 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2011-01-15 06:34:13 +08:00
|
|
|
case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break;
|
|
|
|
case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break;
|
|
|
|
case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
|
|
|
|
case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
|
It turns out that when the ".with.overflow" intrinsics were added to the X86
backend, they were all implemented except umul. This one fell back
to the default implementation that did a hi/lo multiply and compared the
top. Fix this to check the overflow flag that the 'mul' instruction
sets, so we can avoid an explicit test. Now we compile:
void *func(long count) {
return new int[count];
}
into:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
seto %cl ## encoding: [0x0f,0x90,0xc1]
testb %cl, %cl ## encoding: [0x84,0xc9]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
instead of:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
testq %rdx, %rdx ## encoding: [0x48,0x85,0xd2]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
Other than the silly seto+test, this is using the o bit directly, so it's going in the right
direction.
llvm-svn: 120935
2010-12-05 15:30:36 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
It turns out that when the ".with.overflow" intrinsics were added to the X86
backend, they were all implemented except umul. This one fell back
to the default implementation that did a hi/lo multiply and compared the
top. Fix this to check the overflow flag that the 'mul' instruction
sets, so we can avoid an explicit test. Now we compile:
void *func(long count) {
return new int[count];
}
into:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
seto %cl ## encoding: [0x0f,0x90,0xc1]
testb %cl, %cl ## encoding: [0x84,0xc9]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
instead of:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
testq %rdx, %rdx ## encoding: [0x48,0x85,0xd2]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
Other than the silly seto+test, this is using the o bit directly, so it's going in the right
direction.
llvm-svn: 120935
2010-12-05 15:30:36 +08:00
|
|
|
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
|
|
|
|
N0, SDValue()).getValue(1);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
It turns out that when the ".with.overflow" intrinsics were added to the X86
backend, they were all implemented except umul. This one fell back
to the default implementation that did a hi/lo multiply and compared the
top. Fix this to check the overflow flag that the 'mul' instruction
sets, so we can avoid an explicit test. Now we compile:
void *func(long count) {
return new int[count];
}
into:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
seto %cl ## encoding: [0x0f,0x90,0xc1]
testb %cl, %cl ## encoding: [0x84,0xc9]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
instead of:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
testq %rdx, %rdx ## encoding: [0x48,0x85,0xd2]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
Other than the silly seto+test, this is using the o bit directly, so it's going in the right
direction.
llvm-svn: 120935
2010-12-05 15:30:36 +08:00
|
|
|
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
|
|
|
|
SDValue Ops[] = {N1, InFlag};
|
2013-04-20 06:22:57 +08:00
|
|
|
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2016-05-12 06:21:50 +08:00
|
|
|
ReplaceNode(Node, CNode);
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
It turns out that when the ".with.overflow" intrinsics were added to the X86
backend, they were all implemented except umul. This one fell back
to the default implementation that did a hi/lo multiply and compared the
top. Fix this to check the overflow flag that the 'mul' instruction
sets, so we can avoid an explicit test. Now we compile:
void *func(long count) {
return new int[count];
}
into:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
seto %cl ## encoding: [0x0f,0x90,0xc1]
testb %cl, %cl ## encoding: [0x84,0xc9]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
instead of:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
testq %rdx, %rdx ## encoding: [0x48,0x85,0xd2]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
Other than the silly seto+test, this is using the o bit directly, so it's going in the right
direction.
llvm-svn: 120935
2010-12-05 15:30:36 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2009-08-03 00:10:52 +08:00
|
|
|
case ISD::SMUL_LOHI:
|
|
|
|
case ISD::UMUL_LOHI: {
|
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
2006-01-07 04:36:21 +08:00
|
|
|
|
2009-08-03 00:10:52 +08:00
|
|
|
bool isSigned = Opcode == ISD::SMUL_LOHI;
|
2012-09-26 16:22:37 +08:00
|
|
|
bool hasBMI2 = Subtarget->hasBMI2();
|
2009-08-08 05:33:25 +08:00
|
|
|
if (!isSigned) {
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2009-07-15 00:55:14 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
|
|
|
|
case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
|
2012-09-26 16:22:37 +08:00
|
|
|
case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
|
|
|
|
MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
|
|
|
|
case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
|
|
|
|
MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
|
2006-01-07 04:36:21 +08:00
|
|
|
}
|
2009-08-08 05:33:25 +08:00
|
|
|
} else {
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2009-08-03 00:10:52 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
|
|
|
|
case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
|
|
|
|
case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
|
|
|
|
case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
|
2006-01-07 07:19:29 +08:00
|
|
|
}
|
2009-08-08 05:33:25 +08:00
|
|
|
}
|
2006-01-07 07:19:29 +08:00
|
|
|
|
2012-09-26 16:22:37 +08:00
|
|
|
unsigned SrcReg, LoReg, HiReg;
|
|
|
|
switch (Opc) {
|
|
|
|
default: llvm_unreachable("Unknown MUL opcode!");
|
|
|
|
case X86::IMUL8r:
|
|
|
|
case X86::MUL8r:
|
|
|
|
SrcReg = LoReg = X86::AL; HiReg = X86::AH;
|
|
|
|
break;
|
|
|
|
case X86::IMUL16r:
|
|
|
|
case X86::MUL16r:
|
|
|
|
SrcReg = LoReg = X86::AX; HiReg = X86::DX;
|
|
|
|
break;
|
|
|
|
case X86::IMUL32r:
|
|
|
|
case X86::MUL32r:
|
|
|
|
SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
|
|
|
|
break;
|
|
|
|
case X86::IMUL64r:
|
|
|
|
case X86::MUL64r:
|
|
|
|
SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
|
|
|
|
break;
|
|
|
|
case X86::MULX32rr:
|
|
|
|
SrcReg = X86::EDX; LoReg = HiReg = 0;
|
|
|
|
break;
|
|
|
|
case X86::MULX64rr:
|
|
|
|
SrcReg = X86::RDX; LoReg = HiReg = 0;
|
|
|
|
break;
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
2006-01-07 04:36:21 +08:00
|
|
|
|
2009-08-03 00:10:52 +08:00
|
|
|
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
|
2015-10-14 00:23:00 +08:00
|
|
|
bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
|
2009-08-08 05:33:25 +08:00
|
|
|
// Multiply is commutative.
|
2009-08-03 00:10:52 +08:00
|
|
|
if (!foldedLoad) {
|
2015-10-14 00:23:00 +08:00
|
|
|
foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
|
2009-08-03 00:10:52 +08:00
|
|
|
if (foldedLoad)
|
|
|
|
std::swap(N0, N1);
|
|
|
|
}
|
|
|
|
|
2012-09-26 16:22:37 +08:00
|
|
|
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
|
2012-05-23 13:44:51 +08:00
|
|
|
N0, SDValue()).getValue(1);
|
2012-09-26 16:22:37 +08:00
|
|
|
SDValue ResHi, ResLo;
|
2009-08-03 00:10:52 +08:00
|
|
|
|
|
|
|
if (foldedLoad) {
|
2012-09-26 16:22:37 +08:00
|
|
|
SDValue Chain;
|
2016-06-24 05:40:35 +08:00
|
|
|
MachineSDNode *CNode = nullptr;
|
2009-08-03 00:10:52 +08:00
|
|
|
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
|
|
|
|
InFlag };
|
2012-09-26 16:22:37 +08:00
|
|
|
if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
|
|
|
|
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
|
2016-06-24 05:40:35 +08:00
|
|
|
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
|
2012-09-26 16:22:37 +08:00
|
|
|
ResHi = SDValue(CNode, 0);
|
|
|
|
ResLo = SDValue(CNode, 1);
|
|
|
|
Chain = SDValue(CNode, 2);
|
|
|
|
InFlag = SDValue(CNode, 3);
|
|
|
|
} else {
|
|
|
|
SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
|
2016-06-24 05:40:35 +08:00
|
|
|
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
|
2012-09-26 16:22:37 +08:00
|
|
|
Chain = SDValue(CNode, 0);
|
|
|
|
InFlag = SDValue(CNode, 1);
|
|
|
|
}
|
It turns out that when the ".with.overflow" intrinsics were added to the X86
backend, they were all implemented except umul. This one fell back
to the default implementation that did a hi/lo multiply and compared the
top. Fix this to check the overflow flag that the 'mul' instruction
sets, so we can avoid an explicit test. Now we compile:
void *func(long count) {
return new int[count];
}
into:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
seto %cl ## encoding: [0x0f,0x90,0xc1]
testb %cl, %cl ## encoding: [0x84,0xc9]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
instead of:
__Z4funcl: ## @_Z4funcl
movl $4, %ecx ## encoding: [0xb9,0x04,0x00,0x00,0x00]
movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
mulq %rcx ## encoding: [0x48,0xf7,0xe1]
testq %rdx, %rdx ## encoding: [0x48,0x85,0xd2]
movq $-1, %rdi ## encoding: [0x48,0xc7,0xc7,0xff,0xff,0xff,0xff]
cmoveq %rax, %rdi ## encoding: [0x48,0x0f,0x44,0xf8]
jmp __Znam ## TAILCALL
Other than the silly seto+test, this is using the o bit directly, so it's going in the right
direction.
llvm-svn: 120935
2010-12-05 15:30:36 +08:00
|
|
|
|
2009-08-03 00:10:52 +08:00
|
|
|
// Update the chain.
|
2012-09-26 16:22:37 +08:00
|
|
|
ReplaceUses(N1.getValue(1), Chain);
|
2016-06-24 05:40:35 +08:00
|
|
|
// Record the mem-refs
|
|
|
|
LoadSDNode *LoadNode = cast<LoadSDNode>(N1);
|
|
|
|
if (LoadNode) {
|
|
|
|
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
|
|
|
|
MemOp[0] = LoadNode->getMemOperand();
|
|
|
|
CNode->setMemRefs(MemOp, MemOp + 1);
|
|
|
|
}
|
2009-08-03 00:10:52 +08:00
|
|
|
} else {
|
2012-09-26 16:22:37 +08:00
|
|
|
SDValue Ops[] = { N1, InFlag };
|
|
|
|
if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
|
|
|
|
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
|
2013-04-20 06:22:57 +08:00
|
|
|
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
|
2012-09-26 16:22:37 +08:00
|
|
|
ResHi = SDValue(CNode, 0);
|
|
|
|
ResLo = SDValue(CNode, 1);
|
|
|
|
InFlag = SDValue(CNode, 2);
|
|
|
|
} else {
|
|
|
|
SDVTList VTs = CurDAG->getVTList(MVT::Glue);
|
2013-04-20 06:22:57 +08:00
|
|
|
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
|
2012-09-26 16:22:37 +08:00
|
|
|
InFlag = SDValue(CNode, 0);
|
|
|
|
}
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
|
2010-06-26 08:39:23 +08:00
|
|
|
// Prevent use of AH in a REX instruction by referencing AX instead.
|
|
|
|
if (HiReg == X86::AH && Subtarget->is64Bit() &&
|
|
|
|
!SDValue(Node, 1).use_empty()) {
|
|
|
|
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
|
|
|
|
X86::AX, MVT::i16, InFlag);
|
|
|
|
InFlag = Result.getValue(2);
|
|
|
|
// Get the low part if needed. Don't use getCopyFromReg for aliasing
|
|
|
|
// registers.
|
|
|
|
if (!SDValue(Node, 0).use_empty())
|
|
|
|
ReplaceUses(SDValue(Node, 1),
|
|
|
|
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
|
|
|
|
|
|
|
|
// Shift AX down 8 bits.
|
|
|
|
Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
|
|
|
|
Result,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(8, dl, MVT::i8)),
|
|
|
|
0);
|
2010-06-26 08:39:23 +08:00
|
|
|
// Then truncate it down to i8.
|
|
|
|
ReplaceUses(SDValue(Node, 1),
|
|
|
|
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
|
|
|
|
}
|
2009-08-03 00:10:52 +08:00
|
|
|
// Copy the low half of the result, if it is needed.
|
2010-01-05 09:24:18 +08:00
|
|
|
if (!SDValue(Node, 0).use_empty()) {
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!ResLo.getNode()) {
|
2012-09-26 16:22:37 +08:00
|
|
|
assert(LoReg && "Register for low half is not defined!");
|
|
|
|
ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
|
|
|
|
InFlag);
|
|
|
|
InFlag = ResLo.getValue(2);
|
|
|
|
}
|
|
|
|
ReplaceUses(SDValue(Node, 0), ResLo);
|
|
|
|
DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
// Copy the high half of the result, if it is needed.
|
2010-01-05 09:24:18 +08:00
|
|
|
if (!SDValue(Node, 1).use_empty()) {
|
2014-04-25 13:30:21 +08:00
|
|
|
if (!ResHi.getNode()) {
|
2012-09-26 16:22:37 +08:00
|
|
|
assert(HiReg && "Register for high half is not defined!");
|
|
|
|
ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
|
|
|
|
InFlag);
|
|
|
|
InFlag = ResHi.getValue(2);
|
|
|
|
}
|
|
|
|
ReplaceUses(SDValue(Node, 1), ResHi);
|
|
|
|
DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
case ISD::SDIVREM:
|
[X86] 8bit divrem: Improve codegen for AH register extraction.
For 8-bit divrems where the remainder is used, we used to generate:
divb %sil
shrw $8, %ax
movzbl %al, %eax
That was to avoid an H-reg access, which is problematic mainly because
it isn't possible in REX-prefixed instructions.
This patch optimizes that to:
divb %sil
movzbl %ah, %eax
To do that, we explicitly extend AH, and extract the L-subreg in the
resulting register. The extension is done using the NOREX variants of
MOVZX. To support signed operations, MOVSX_NOREX is also added.
Further, this introduces a new SDNode type, [us]divrem_ext_hreg, which is
then lowered to a sequence containing a single zext (rather than 2).
Differential Revision: http://reviews.llvm.org/D6064
llvm-svn: 221176
2014-11-04 04:26:35 +08:00
|
|
|
case ISD::UDIVREM:
|
|
|
|
case X86ISD::SDIVREM8_SEXT_HREG:
|
|
|
|
case X86ISD::UDIVREM8_ZEXT_HREG: {
|
2009-08-03 00:10:52 +08:00
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
2006-01-07 07:19:29 +08:00
|
|
|
|
[X86] 8bit divrem: Improve codegen for AH register extraction.
For 8-bit divrems where the remainder is used, we used to generate:
divb %sil
shrw $8, %ax
movzbl %al, %eax
That was to avoid an H-reg access, which is problematic mainly because
it isn't possible in REX-prefixed instructions.
This patch optimizes that to:
divb %sil
movzbl %ah, %eax
To do that, we explicitly extend AH, and extract the L-subreg in the
resulting register. The extension is done using the NOREX variants of
MOVZX. To support signed operations, MOVSX_NOREX is also added.
Further, this introduces a new SDNode type, [us]divrem_ext_hreg, which is
then lowered to a sequence containing a single zext (rather than 2).
Differential Revision: http://reviews.llvm.org/D6064
llvm-svn: 221176
2014-11-04 04:26:35 +08:00
|
|
|
bool isSigned = (Opcode == ISD::SDIVREM ||
|
|
|
|
Opcode == X86ISD::SDIVREM8_SEXT_HREG);
|
2009-08-08 05:33:25 +08:00
|
|
|
if (!isSigned) {
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2009-08-03 00:10:52 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
|
|
|
|
case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
|
|
|
|
case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
|
|
|
|
case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
2009-08-08 05:33:25 +08:00
|
|
|
} else {
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2009-07-15 00:55:14 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
|
|
|
|
case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
|
|
|
|
case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
|
|
|
|
case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
|
2006-01-07 07:19:29 +08:00
|
|
|
}
|
2009-08-08 05:33:25 +08:00
|
|
|
}
|
2006-01-07 07:19:29 +08:00
|
|
|
|
2009-12-23 09:45:04 +08:00
|
|
|
unsigned LoReg, HiReg, ClrReg;
|
2013-05-30 21:19:42 +08:00
|
|
|
unsigned SExtOpcode;
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2009-08-03 00:10:52 +08:00
|
|
|
default: llvm_unreachable("Unsupported VT!");
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i8:
|
2009-12-23 09:45:04 +08:00
|
|
|
LoReg = X86::AL; ClrReg = HiReg = X86::AH;
|
2009-08-03 00:10:52 +08:00
|
|
|
SExtOpcode = X86::CBW;
|
|
|
|
break;
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i16:
|
2009-08-03 00:10:52 +08:00
|
|
|
LoReg = X86::AX; HiReg = X86::DX;
|
2013-05-30 21:19:42 +08:00
|
|
|
ClrReg = X86::DX;
|
2009-08-03 00:10:52 +08:00
|
|
|
SExtOpcode = X86::CWD;
|
|
|
|
break;
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i32:
|
2009-12-23 09:45:04 +08:00
|
|
|
LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
|
2009-08-03 00:10:52 +08:00
|
|
|
SExtOpcode = X86::CDQ;
|
|
|
|
break;
|
2009-08-12 04:47:22 +08:00
|
|
|
case MVT::i64:
|
2009-12-23 09:45:04 +08:00
|
|
|
LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
|
2009-08-03 00:10:52 +08:00
|
|
|
SExtOpcode = X86::CQO;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
|
2015-10-14 00:23:00 +08:00
|
|
|
bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
|
2009-08-03 00:10:52 +08:00
|
|
|
bool signBitIsZero = CurDAG->SignBitIsZero(N0);
|
|
|
|
|
|
|
|
SDValue InFlag;
|
2009-08-12 04:47:22 +08:00
|
|
|
if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
|
2009-08-03 00:10:52 +08:00
|
|
|
// Special case for div8, just use a move with zero extension to AX to
|
|
|
|
// clear the upper 8 bits (AH).
|
|
|
|
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
|
2015-10-14 00:23:00 +08:00
|
|
|
if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
|
2009-08-03 00:10:52 +08:00
|
|
|
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
|
|
|
|
Move =
|
2011-05-21 03:04:40 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
|
2013-04-20 06:22:57 +08:00
|
|
|
MVT::Other, Ops), 0);
|
2009-08-03 00:10:52 +08:00
|
|
|
Chain = Move.getValue(1);
|
|
|
|
ReplaceUses(N0.getValue(1), Chain);
|
2006-11-18 06:10:14 +08:00
|
|
|
} else {
|
2009-08-03 00:10:52 +08:00
|
|
|
Move =
|
2011-05-21 03:04:40 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
|
2009-08-03 00:10:52 +08:00
|
|
|
Chain = CurDAG->getEntryNode();
|
|
|
|
}
|
2011-05-21 03:04:40 +08:00
|
|
|
Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
|
2009-08-03 00:10:52 +08:00
|
|
|
InFlag = Chain.getValue(1);
|
|
|
|
} else {
|
|
|
|
InFlag =
|
|
|
|
CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
|
|
|
|
LoReg, N0, SDValue()).getValue(1);
|
|
|
|
if (isSigned && !signBitIsZero) {
|
|
|
|
// Sign extend the low part into the high part.
|
2006-11-18 06:10:14 +08:00
|
|
|
InFlag =
|
2010-12-21 10:38:05 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
|
2009-08-03 00:10:52 +08:00
|
|
|
} else {
|
|
|
|
// Zero out the high part, effectively zero extending the input.
|
2014-12-04 13:20:33 +08:00
|
|
|
SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
|
2013-08-15 13:57:07 +08:00
|
|
|
switch (NVT.SimpleTy) {
|
2013-05-30 21:19:42 +08:00
|
|
|
case MVT::i16:
|
|
|
|
ClrNode =
|
|
|
|
SDValue(CurDAG->getMachineNode(
|
|
|
|
TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(X86::sub_16bit, dl,
|
|
|
|
MVT::i32)),
|
2013-05-30 21:19:42 +08:00
|
|
|
0);
|
|
|
|
break;
|
|
|
|
case MVT::i32:
|
|
|
|
break;
|
|
|
|
case MVT::i64:
|
|
|
|
ClrNode =
|
|
|
|
SDValue(CurDAG->getMachineNode(
|
|
|
|
TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
|
|
|
|
CurDAG->getTargetConstant(X86::sub_32bit, dl,
|
|
|
|
MVT::i32)),
|
2013-05-30 21:19:42 +08:00
|
|
|
0);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Unexpected division source");
|
|
|
|
}
|
|
|
|
|
2009-12-23 09:45:04 +08:00
|
|
|
InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
|
2009-08-03 00:10:52 +08:00
|
|
|
ClrNode, InFlag).getValue(1);
|
2006-01-07 07:19:29 +08:00
|
|
|
}
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
2006-01-07 07:19:29 +08:00
|
|
|
|
2009-08-03 00:10:52 +08:00
|
|
|
if (foldedLoad) {
|
|
|
|
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
|
|
|
|
InFlag };
|
|
|
|
SDNode *CNode =
|
2013-04-20 06:22:57 +08:00
|
|
|
CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
|
2009-08-03 00:10:52 +08:00
|
|
|
InFlag = SDValue(CNode, 1);
|
|
|
|
// Update the chain.
|
|
|
|
ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
|
|
|
|
} else {
|
|
|
|
InFlag =
|
2010-12-21 10:38:05 +08:00
|
|
|
SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
|
[X86] 8bit divrem: Improve codegen for AH register extraction.
For 8-bit divrems where the remainder is used, we used to generate:
divb %sil
shrw $8, %ax
movzbl %al, %eax
That was to avoid an H-reg access, which is problematic mainly because
it isn't possible in REX-prefixed instructions.
This patch optimizes that to:
divb %sil
movzbl %ah, %eax
To do that, we explicitly extend AH, and extract the L-subreg in the
resulting register. The extension is done using the NOREX variants of
MOVZX. To support signed operations, MOVSX_NOREX is also added.
Further, this introduces a new SDNode type, [us]divrem_ext_hreg, which is
then lowered to a sequence containing a single zext (rather than 2).
Differential Revision: http://reviews.llvm.org/D6064
llvm-svn: 221176
2014-11-04 04:26:35 +08:00
|
|
|
// Prevent use of AH in a REX instruction by explicitly copying it to
|
|
|
|
// an ABCD_L register.
|
2013-07-09 10:07:28 +08:00
|
|
|
//
|
|
|
|
// The current assumption of the register allocator is that isel
|
[X86] 8bit divrem: Improve codegen for AH register extraction.
For 8-bit divrems where the remainder is used, we used to generate:
divb %sil
shrw $8, %ax
movzbl %al, %eax
That was to avoid an H-reg access, which is problematic mainly because
it isn't possible in REX-prefixed instructions.
This patch optimizes that to:
divb %sil
movzbl %ah, %eax
To do that, we explicitly extend AH, and extract the L-subreg in the
resulting register. The extension is done using the NOREX variants of
MOVZX. To support signed operations, MOVSX_NOREX is also added.
Further, this introduces a new SDNode type, [us]divrem_ext_hreg, which is
then lowered to a sequence containing a single zext (rather than 2).
Differential Revision: http://reviews.llvm.org/D6064
llvm-svn: 221176
2014-11-04 04:26:35 +08:00
|
|
|
// won't generate explicit references to the GR8_ABCD_H registers. If
|
2013-07-09 10:07:28 +08:00
|
|
|
// the allocator and/or the backend get enhanced to be more robust in
|
|
|
|
// that regard, this can be, and should be, removed.
|
[X86] 8bit divrem: Improve codegen for AH register extraction.
For 8-bit divrems where the remainder is used, we used to generate:
divb %sil
shrw $8, %ax
movzbl %al, %eax
That was to avoid an H-reg access, which is problematic mainly because
it isn't possible in REX-prefixed instructions.
This patch optimizes that to:
divb %sil
movzbl %ah, %eax
To do that, we explicitly extend AH, and extract the L-subreg in the
resulting register. The extension is done using the NOREX variants of
MOVZX. To support signed operations, MOVSX_NOREX is also added.
Further, this introduces a new SDNode type, [us]divrem_ext_hreg, which is
then lowered to a sequence containing a single zext (rather than 2).
Differential Revision: http://reviews.llvm.org/D6064
llvm-svn: 221176
2014-11-04 04:26:35 +08:00
|
|
|
if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
|
|
|
|
SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
|
|
|
|
unsigned AHExtOpcode =
|
|
|
|
isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;
|
|
|
|
|
|
|
|
SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
|
|
|
|
MVT::Glue, AHCopy, InFlag);
|
|
|
|
SDValue Result(RNode, 0);
|
|
|
|
InFlag = SDValue(RNode, 1);
|
|
|
|
|
|
|
|
if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
|
|
|
|
Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
|
|
|
|
if (Node->getValueType(1) == MVT::i64) {
|
|
|
|
// It's not possible to directly movsx AH to a 64bit register, because
|
|
|
|
// the latter needs the REX prefix, but the former can't have it.
|
|
|
|
assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG &&
|
|
|
|
"Unexpected i64 sext of h-register");
|
|
|
|
Result =
|
|
|
|
SDValue(CurDAG->getMachineNode(
|
|
|
|
TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
|
2015-04-28 22:05:47 +08:00
|
|
|
CurDAG->getTargetConstant(0, dl, MVT::i64), Result,
|
|
|
|
CurDAG->getTargetConstant(X86::sub_32bit, dl,
|
|
|
|
MVT::i32)),
|
[X86] 8bit divrem: Improve codegen for AH register extraction.
For 8-bit divrems where the remainder is used, we used to generate:
divb %sil
shrw $8, %ax
movzbl %al, %eax
That was to avoid an H-reg access, which is problematic mainly because
it isn't possible in REX-prefixed instructions.
This patch optimizes that to:
divb %sil
movzbl %ah, %eax
To do that, we explicitly extend AH, and extract the L-subreg in the
resulting register. The extension is done using the NOREX variants of
MOVZX. To support signed operations, MOVSX_NOREX is also added.
Further, this introduces a new SDNode type, [us]divrem_ext_hreg, which is
then lowered to a sequence containing a single zext (rather than 2).
Differential Revision: http://reviews.llvm.org/D6064
llvm-svn: 221176
2014-11-04 04:26:35 +08:00
|
|
|
0);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
Result =
|
|
|
|
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
|
|
|
|
}
|
|
|
|
ReplaceUses(SDValue(Node, 1), Result);
|
|
|
|
DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
|
2010-06-26 08:39:23 +08:00
|
|
|
}
|
2009-08-03 00:10:52 +08:00
|
|
|
// Copy the division (low) result, if it is needed.
|
2010-01-05 09:24:18 +08:00
|
|
|
if (!SDValue(Node, 0).use_empty()) {
|
2009-08-03 00:10:52 +08:00
|
|
|
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
|
|
|
|
LoReg, NVT, InFlag);
|
|
|
|
InFlag = Result.getValue(2);
|
2010-01-05 09:24:18 +08:00
|
|
|
ReplaceUses(SDValue(Node, 0), Result);
|
2010-03-02 14:34:30 +08:00
|
|
|
DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
// Copy the remainder (high) result, if it is needed.
|
2010-01-05 09:24:18 +08:00
|
|
|
if (!SDValue(Node, 1).use_empty()) {
|
2010-06-26 08:39:23 +08:00
|
|
|
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
|
|
|
|
HiReg, NVT, InFlag);
|
|
|
|
InFlag = Result.getValue(2);
|
2010-01-05 09:24:18 +08:00
|
|
|
ReplaceUses(SDValue(Node, 1), Result);
|
2010-03-02 14:34:30 +08:00
|
|
|
DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2009-08-03 00:10:52 +08:00
|
|
|
}
|
|
|
|
|
2012-08-08 08:51:41 +08:00
|
|
|
case X86ISD::CMP:
|
|
|
|
case X86ISD::SUB: {
|
|
|
|
// Sometimes a SUB is used to perform comparison.
|
|
|
|
if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
|
|
|
|
// This node is not a CMP.
|
|
|
|
break;
|
2009-08-20 02:16:17 +08:00
|
|
|
SDValue N0 = Node->getOperand(0);
|
|
|
|
SDValue N1 = Node->getOperand(1);
|
|
|
|
|
2010-08-05 06:40:58 +08:00
|
|
|
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
|
2015-10-14 00:23:00 +08:00
|
|
|
hasNoSignedComparisonUses(Node))
|
2010-04-28 16:30:49 +08:00
|
|
|
N0 = N0.getOperand(0);
|
2015-02-12 16:40:34 +08:00
|
|
|
|
2014-08-18 19:59:06 +08:00
|
|
|
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
|
|
|
|
// use a smaller encoding.
|
|
|
|
// Look past the truncate if CMP is the only use of it.
|
2011-11-04 05:49:52 +08:00
|
|
|
if ((N0.getNode()->getOpcode() == ISD::AND ||
|
|
|
|
(N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) &&
|
|
|
|
N0.getNode()->hasOneUse() &&
|
2009-08-20 02:16:17 +08:00
|
|
|
N0.getValueType() != MVT::i8 &&
|
|
|
|
X86::isZeroNode(N1)) {
|
|
|
|
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
|
|
|
|
if (!C) break;
|
|
|
|
|
|
|
|
// For example, convert "testl %eax, $8" to "testb %al, $8"
|
2009-10-10 04:35:19 +08:00
|
|
|
if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
|
|
|
|
(!(C->getZExtValue() & 0x80) ||
|
2015-10-14 00:23:00 +08:00
|
|
|
hasNoSignedComparisonUses(Node))) {
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
|
2009-08-20 02:16:17 +08:00
|
|
|
SDValue Reg = N0.getNode()->getOperand(0);
|
|
|
|
|
|
|
|
// On x86-32, only the ABCD registers have 8-bit subregisters.
|
|
|
|
if (!Subtarget->is64Bit()) {
|
2012-02-22 15:28:11 +08:00
|
|
|
const TargetRegisterClass *TRC;
|
2013-08-15 10:33:50 +08:00
|
|
|
switch (N0.getSimpleValueType().SimpleTy) {
|
2009-08-20 02:16:17 +08:00
|
|
|
case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
|
|
|
|
case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
|
|
|
|
default: llvm_unreachable("Unsupported TEST operand type!");
|
|
|
|
}
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
|
2009-09-26 02:54:59 +08:00
|
|
|
Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
|
|
|
|
Reg.getValueType(), Reg, RC), 0);
|
2009-08-20 02:16:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Extract the l-register.
|
2010-05-24 22:48:17 +08:00
|
|
|
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
|
2009-08-20 02:16:17 +08:00
|
|
|
MVT::i8, Reg);
|
|
|
|
|
|
|
|
// Emit a testb.
|
2012-09-29 02:53:24 +08:00
|
|
|
SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
|
|
|
|
Subreg, Imm);
|
|
|
|
// Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
|
|
|
|
// one, do not call ReplaceAllUsesWith.
|
|
|
|
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
|
|
|
|
SDValue(NewNode, 0));
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2009-08-20 02:16:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// For example, "testl %eax, $2048" to "testb %ah, $8".
|
2009-10-10 04:35:19 +08:00
|
|
|
if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
|
|
|
|
(!(C->getZExtValue() & 0x8000) ||
|
2015-10-14 00:23:00 +08:00
|
|
|
hasNoSignedComparisonUses(Node))) {
|
2009-08-20 02:16:17 +08:00
|
|
|
// Shift the immediate right by 8 bits.
|
|
|
|
SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
|
2015-04-28 22:05:47 +08:00
|
|
|
dl, MVT::i8);
|
2009-08-20 02:16:17 +08:00
|
|
|
SDValue Reg = N0.getNode()->getOperand(0);
|
|
|
|
|
|
|
|
// Put the value in an ABCD register.
|
2012-02-22 15:28:11 +08:00
|
|
|
const TargetRegisterClass *TRC;
|
2013-08-15 10:33:50 +08:00
|
|
|
switch (N0.getSimpleValueType().SimpleTy) {
|
2009-08-20 02:16:17 +08:00
|
|
|
case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
|
|
|
|
case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
|
|
|
|
case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
|
|
|
|
default: llvm_unreachable("Unsupported TEST operand type!");
|
|
|
|
}
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
|
2009-09-26 02:54:59 +08:00
|
|
|
Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
|
|
|
|
Reg.getValueType(), Reg, RC), 0);
|
2009-08-20 02:16:17 +08:00
|
|
|
|
|
|
|
// Extract the h-register.
|
2010-05-24 22:48:17 +08:00
|
|
|
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
|
2009-08-20 02:16:17 +08:00
|
|
|
MVT::i8, Reg);
|
|
|
|
|
2011-10-09 02:28:28 +08:00
|
|
|
// Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
|
|
|
|
// target GR8_NOREX registers, so make sure the register class is
|
|
|
|
// forced.
|
2012-09-29 02:53:24 +08:00
|
|
|
SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
|
|
|
|
MVT::i32, Subreg, ShiftedImm);
|
|
|
|
// Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
|
|
|
|
// one, do not call ReplaceAllUsesWith.
|
|
|
|
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
|
|
|
|
SDValue(NewNode, 0));
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2009-08-20 02:16:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// For example, "testl %eax, $32776" to "testw %ax, $32776".
|
|
|
|
if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
|
2009-10-10 04:35:19 +08:00
|
|
|
N0.getValueType() != MVT::i16 &&
|
|
|
|
(!(C->getZExtValue() & 0x8000) ||
|
2015-10-14 00:23:00 +08:00
|
|
|
hasNoSignedComparisonUses(Node))) {
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
|
|
|
|
MVT::i16);
|
2009-08-20 02:16:17 +08:00
|
|
|
SDValue Reg = N0.getNode()->getOperand(0);
|
|
|
|
|
|
|
|
// Extract the 16-bit subregister.
|
2010-05-24 22:48:17 +08:00
|
|
|
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
|
2009-08-20 02:16:17 +08:00
|
|
|
MVT::i16, Reg);
|
|
|
|
|
|
|
|
// Emit a testw.
|
2012-09-29 02:53:24 +08:00
|
|
|
SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
|
|
|
|
Subreg, Imm);
|
|
|
|
// Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
|
|
|
|
// one, do not call ReplaceAllUsesWith.
|
|
|
|
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
|
|
|
|
SDValue(NewNode, 0));
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2009-08-20 02:16:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
|
|
|
|
if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
|
2009-10-10 04:35:19 +08:00
|
|
|
N0.getValueType() == MVT::i64 &&
|
|
|
|
(!(C->getZExtValue() & 0x80000000) ||
|
2015-10-14 00:23:00 +08:00
|
|
|
hasNoSignedComparisonUses(Node))) {
|
2015-04-28 22:05:47 +08:00
|
|
|
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
|
|
|
|
MVT::i32);
|
2009-08-20 02:16:17 +08:00
|
|
|
SDValue Reg = N0.getNode()->getOperand(0);
|
|
|
|
|
|
|
|
// Extract the 32-bit subregister.
|
2010-05-24 22:48:17 +08:00
|
|
|
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
|
2009-08-20 02:16:17 +08:00
|
|
|
MVT::i32, Reg);
|
|
|
|
|
|
|
|
// Emit a testl.
|
2012-09-29 02:53:24 +08:00
|
|
|
SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
|
|
|
|
Subreg, Imm);
|
|
|
|
// Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
|
|
|
|
// one, do not call ReplaceAllUsesWith.
|
|
|
|
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
|
|
|
|
SDValue(NewNode, 0));
|
2016-05-11 07:55:37 +08:00
|
|
|
return;
|
2009-08-20 02:16:17 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2011-11-16 05:57:53 +08:00
|
|
|
case ISD::STORE: {
|
2012-03-29 13:45:48 +08:00
|
|
|
// Change a chain of {load; incr or dec; store} of the same value into
|
|
|
|
// a simple increment or decrement through memory of that value, if the
|
|
|
|
// uses of the modified value and its address are suitable.
|
2011-11-17 03:03:23 +08:00
|
|
|
// The DEC64m tablegen pattern is currently not able to match the case where
|
2012-08-02 02:39:17 +08:00
|
|
|
// the EFLAGS on the original DEC are used. (This also applies to
|
2012-03-29 13:45:48 +08:00
|
|
|
// {INC,DEC}X{64,32,16,8}.)
|
|
|
|
// We'll need to improve tablegen to allow flags to be transferred from a
|
2011-11-17 03:03:23 +08:00
|
|
|
// node in the pattern to the result node. probably with a new keyword
|
|
|
|
// for example, we have this
|
|
|
|
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
|
|
|
|
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
|
|
|
|
// (implicit EFLAGS)]>;
|
|
|
|
// but maybe need something like this
|
|
|
|
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
|
|
|
|
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
|
|
|
|
// (transferrable EFLAGS)]>;
|
2012-03-29 13:45:48 +08:00
|
|
|
|
2011-11-16 05:57:53 +08:00
|
|
|
StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
|
|
|
|
SDValue StoredVal = StoreNode->getOperand(1);
|
2012-03-29 13:45:48 +08:00
|
|
|
unsigned Opc = StoredVal->getOpcode();
|
2011-11-16 05:57:53 +08:00
|
|
|
|
2014-04-25 13:30:21 +08:00
|
|
|
LoadSDNode *LoadNode = nullptr;
|
2012-04-13 03:14:21 +08:00
|
|
|
SDValue InputChain;
|
|
|
|
if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
|
|
|
|
LoadNode, InputChain))
|
|
|
|
break;
|
2011-11-16 05:57:53 +08:00
|
|
|
|
|
|
|
SDValue Base, Scale, Index, Disp, Segment;
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!selectAddr(LoadNode, LoadNode->getBasePtr(),
|
2011-11-16 05:57:53 +08:00
|
|
|
Base, Scale, Index, Disp, Segment))
|
|
|
|
break;
|
|
|
|
|
|
|
|
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
|
|
|
|
MemOp[0] = StoreNode->getMemOperand();
|
|
|
|
MemOp[1] = LoadNode->getMemOperand();
|
|
|
|
const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
|
2012-08-02 02:39:17 +08:00
|
|
|
EVT LdVT = LoadNode->getMemoryVT();
|
2012-03-29 13:45:48 +08:00
|
|
|
unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
|
|
|
|
MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
|
2013-05-25 10:42:55 +08:00
|
|
|
SDLoc(Node),
|
2013-04-20 06:22:57 +08:00
|
|
|
MVT::i32, MVT::Other, Ops);
|
2011-11-16 05:57:53 +08:00
|
|
|
Result->setMemRefs(MemOp, MemOp + 2);
|
|
|
|
|
|
|
|
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
|
|
|
|
ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
|
2016-05-11 07:55:37 +08:00
|
|
|
CurDAG->RemoveDeadNode(Node);
|
|
|
|
return;
|
2011-11-16 05:57:53 +08:00
|
|
|
}
|
2005-11-16 09:54:32 +08:00
|
|
|
}
|
|
|
|
|
2016-05-11 07:55:37 +08:00
|
|
|
SelectCode(Node);
|
2005-11-16 09:54:32 +08:00
|
|
|
}
|
|
|
|
|
2006-06-09 02:03:49 +08:00
|
|
|
bool X86DAGToDAGISel::
|
2015-03-13 20:45:09 +08:00
|
|
|
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
|
2008-08-23 10:25:05 +08:00
|
|
|
std::vector<SDValue> &OutOps) {
|
2009-04-09 05:14:34 +08:00
|
|
|
SDValue Op0, Op1, Op2, Op3, Op4;
|
2015-03-13 20:45:09 +08:00
|
|
|
switch (ConstraintID) {
|
2015-05-16 20:09:54 +08:00
|
|
|
default:
|
|
|
|
llvm_unreachable("Unexpected asm memory constraint");
|
|
|
|
case InlineAsm::Constraint_i:
|
|
|
|
// FIXME: It seems strange that 'i' is needed here since it's supposed to
|
|
|
|
// be an immediate and not a memory constraint.
|
2016-08-17 13:10:15 +08:00
|
|
|
LLVM_FALLTHROUGH;
|
2015-03-13 20:45:09 +08:00
|
|
|
case InlineAsm::Constraint_o: // offsetable ??
|
|
|
|
case InlineAsm::Constraint_v: // not offsetable ??
|
|
|
|
case InlineAsm::Constraint_m: // memory
|
2015-05-16 20:09:54 +08:00
|
|
|
case InlineAsm::Constraint_X:
|
2015-10-14 00:23:00 +08:00
|
|
|
if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
|
2006-06-09 02:03:49 +08:00
|
|
|
return true;
|
|
|
|
break;
|
|
|
|
}
|
2012-08-02 02:39:17 +08:00
|
|
|
|
2006-08-26 09:05:16 +08:00
|
|
|
OutOps.push_back(Op0);
|
|
|
|
OutOps.push_back(Op1);
|
|
|
|
OutOps.push_back(Op2);
|
|
|
|
OutOps.push_back(Op3);
|
2009-04-09 05:14:34 +08:00
|
|
|
OutOps.push_back(Op4);
|
2006-06-09 02:03:49 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-10-13 23:12:27 +08:00
|
|
|
/// This pass converts a legalized DAG into a X86-specific DAG,
|
|
|
|
/// ready for instruction scheduling.
|
2009-04-30 07:29:43 +08:00
|
|
|
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
|
2012-03-27 15:21:54 +08:00
|
|
|
CodeGenOpt::Level OptLevel) {
|
2009-04-29 08:15:41 +08:00
|
|
|
return new X86DAGToDAGISel(TM, OptLevel);
|
2005-11-16 09:54:32 +08:00
|
|
|
}
|